fix bugs in package data regeneration for ManyAnalysts manuscript (#148)

* Increment version number to 2.7.5.9000 * bug: fix threshold checking for multivariate model fitting #147 only count `mixed_model` after calling `distinct()` on analysis identifier column and `mixed_model` first. Update argument checking to fail if any id columns not present. docs: update `@details` section about required id column and add `any_of()` to `@importFrom` * docs: rearrange headings in NEWS.md * bug: ensure outlier subset creation occurs on all `exclusion_set` values in Zr #144 removed filter for `exclusion_set` == "complete" * bug: exclude analysis with non-count-based dependent variable from `yi` analysis #145 * feat: #146 add function for excluding extreme estimates based on a multiplier threshold for population parameter estimates * fix typo #146 * docs!: #146 `devtools::document()` * bug: #146 export function * build!: #146 apply exclusion function to Eucalyptus dataset in targets pipeline Note that this functionality was previously included in the manuscript * #146 increment dev version and news before rebuilding package and targets pipeline * - build!: don't forget to filter the corresponding diversity data after exclusions! * - build!: fix #146 regenerate yi data after excluding extreme values * Increment version number to 2.7.6
egouldo · Sep 5, 2024 · 967852b · 967852b
1 parent 8029b0d
commit 967852b
Show file tree

Hide file tree

Showing 13 changed files with 1,041 additions and 790 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: ManyEcoEvo
 Title: Meta-analyse data from 'Many-Analysts' style studies
-Version: 2.7.5
+Version: 2.7.6
 Authors@R: c(
     person("Elliot", "Gould", , "[email protected]", role = c("aut", "cre"),
            comment = c(ORCID = "https://orcid.org/0000-0002-6585-538X")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -35,6 +35,7 @@ export(cube_back)
 export(divide_back)
 export(est_to_zr)
 export(exclude_extreme_VZ)
+export(exclude_extreme_estimates)
 export(fit_MA_mv)
 export(fit_boxcox_ratings_cat)
 export(fit_boxcox_ratings_cont)
@@ -131,6 +132,7 @@ importFrom(cli,cli_alert_warning)
 importFrom(cli,cli_bullets)
 importFrom(cli,cli_h1)
 importFrom(cli,cli_h2)
+importFrom(cli,cli_h3)
 importFrom(cli,cli_ol)
 importFrom(cli,cli_warn)
 importFrom(cli,style_italic)
@@ -214,10 +216,14 @@ importFrom(recipes,update_role)
 importFrom(rlang,"!!")
 importFrom(rlang,":=")
 importFrom(rlang,as_function)
+importFrom(rlang,as_quosures)
 importFrom(rlang,as_string)
 importFrom(rlang,caller_env)
+importFrom(rlang,current_env)
 importFrom(rlang,enquo)
+importFrom(rlang,enquos)
 importFrom(rlang,ensym)
+importFrom(rlang,env)
 importFrom(rlang,exec)
 importFrom(rlang,expr)
 importFrom(rlang,exprs)
@@ -229,6 +235,7 @@ importFrom(rlang,is_na)
 importFrom(rlang,is_null)
 importFrom(rlang,na_chr)
 importFrom(rlang,new_formula)
+importFrom(rlang,quo_set_env)
 importFrom(sae,bxcx)
 importFrom(see,geom_jitter2)
 importFrom(see,scale_fill_material_d)
@@ -255,6 +262,7 @@ importFrom(tidyr,unite)
 importFrom(tidyr,unnest)
 importFrom(tidyr,unnest_longer)
 importFrom(tidyselect,all_of)
+importFrom(tidyselect,any_of)
 importFrom(tidyselect,where)
 importFrom(timetk,step_box_cox)
 importFrom(workflows,add_model)

diff --git a/NEWS.md b/NEWS.md
@@ -1,12 +1,28 @@
+# ManyEcoEvo 2.7.6
+
 <!-- NEWS.md is maintained by https://cynkra.github.io/fledge, do not edit -->
 
-- build!: `usethis::use_data()` update results of `make_viz()`
-* - build!: force `tar_make()` for #140
-* - build!: force `tar_make()` for #140, add targets meta
+- build!: fix #146 regenerate yi data after excluding extreme values
+- build!: don't forget to filter the corresponding diversity data after exclusions!
 
+* Increment version number to 2.7.5
+* docs: update changelog
+*  bug: #146 export function
+*  docs!: #146 `devtools::document()`
+* feat: #146 add function for excluding extreme estimates based on a multiplier threshold for population parameter estimates
+* bug: exclude analysis with non-count-based dependent variable from `yi` analysis #145
+* bug: ensure outlier subset creation occurs on all `exclusion_set` values in Zr #144
+* bug: fix threshold checking for multivariate model fitting #147
+- build!: `usethis::use_data()` update results of `make_viz()`
 
 # ManyEcoEvo 2.7.5
 
+<!-- NEWS.md is maintained by https://cynkra.github.io/fledge, do not edit -->
+
+- build!: `usethis::use_data()` update results of `make_viz()`
+* - build!: force `tar_make()` for #140
+* - build!: force `tar_make()` for #140, add targets meta
+
 # ManyEcoEvo 2.7.4
 
 - docs: Update function documentation #140

diff --git a/R/exclude_extreme_estimates.R b/R/exclude_extreme_estimates.R
@@ -0,0 +1,98 @@
+#' Exclude extreme estimates above a threshold parameter sd
+#' 
+#' @param data A dataframe of analyst estimates. Must contain the column
+#'   `response_variable_name`, which is matched against `variable` in `param_table`
+#' @param outcome_variable the name (a string) of the variable in `data` containing the analyst estimates
+#' @param outcome_SE the name (a string) of the variable in `data` containing analyst SE estimates
+#' @param sd_threshold A numeric threshold multiplier, see details. Defaults to `0`
+#' @param param_table A long-format dataframe of population parameters with
+#'   columns `variable`, `parameter` and `value`, where `parameter` takes the
+#'   values `mean` and `sd` for each `variable` in a given `dataset`
+#' @param .fn An optional function that will transform parameter estimates to the same scale as `outcome_variable` in `data`.
+#'   If `NULL` (the default), the `mean` and `sd` in `param_table` are used as-is
+#' @param ... Arguments supplied to `.fn`. These are evaluated row-wise against
+#'   the widened `param_table`, so they may refer to its `mean` and `sd` columns
+#' @import dplyr
+#' @importFrom rlang enquo env as_quosures enquos current_env quo_set_env is_null
+#' @importFrom cli cli_h3 cli_alert_success
+#' @importFrom purrr map list_c
+#' @importFrom tidyr pivot_wider hoist
+#' @details
+#' This function is used to exclude extreme estimates from a dataset. The function
+#' calculates a threshold for exclusion based on the mean and standard deviation of
+#' the population parameter estimates in `param_table`. The threshold is calculated
+#' as the mean of the population parameter plus `sd_threshold` times the standard
+#' deviation of the population parameter. Estimates in `data` that are greater than
+#' this threshold are excluded from the output.
+#' 
+#' Rows for which the comparison evaluates to `NA` are also excluded. This happens
+#' when the estimate itself is missing, or when `response_variable_name` has no
+#' matching `variable` in `param_table` (so no threshold can be computed).
+#' 
+#' If the user chooses to supply `.fn` and `...` arguments, the function will transform
+#' the population parameter estimates in `param_table` to the same scale as the
+#' `outcome_variable` in `data` using `.fn`, before calculating the threshold for exclusion.
+#' The value returned by `.fn` must contain elements named `outcome_variable` and
+#' `outcome_SE`, which are taken as the transformed mean and sd respectively.
+#' @export
+#' @return A dataframe of analyst estimates with extreme estimates excluded
+#' @examples
+#' # example code
+#' data <-   ManyEcoEvo_yi %>% 
+#' mutate(data = 
+#'          map_if(data, 
+#'                 ~ filter(.x, 
+#'                          stringr::str_detect(response_variable_name, 
+#'                                              "average.proportion.of.plots.containing",
+#'                                              negate = TRUE)),
+#'                 .p = dataset == "eucalyptus")) %>%   
+#'   mutate(
+#'     diversity_data =
+#'       map2(
+#'         .x = diversity_data,
+#'         .y = data,
+#'         .f = ~ semi_join(.x, .y, join_by(id_col)) %>% 
+#'           distinct()
+#'       )
+#'   ) %>% 
+#'   prepare_response_variables(
+#'     estimate_type = "yi",
+#'     param_table = 
+#'       ManyEcoEvo:::analysis_data_param_tables, 
+#'     dataset_standardise = "blue tit",
+#'     dataset_log_transform = "eucalyptus") %>%
+#'   generate_yi_subsets() %>% #TODO: must be run after prepare_response_variables??
+#'   apply_VZ_exclusions(
+#'     VZ_colname = list("eucalyptus" = "se_log", 
+#'                       "blue tit" = "VZ"), 
+#'     VZ_cutoff = 3) %>% 
+#'   filter(dataset == "eucalyptus", estimate_type == "y25")  %>% 
+#'   pluck("data", 1)
+#' sd_threshold <- 3
+#' param_table <- ManyEcoEvo:::analysis_data_param_tables
+#' exclude_extreme_estimates(
+#'   data, "mean_log", "se_log", sd_threshold, param_table,
+#'   .fn = log_transform, estimate = mean, std.error = sd
+#' )
+exclude_extreme_estimates <- function(data, outcome_variable, outcome_SE, sd_threshold = numeric(1L), param_table, .fn = NULL, ...) {
+  # FOR NOW: allow transformation here, but in future, we make sure that 
+  # `prepare_response_variables()` returns both `back_transformed_data` and the
+  # transformed / standardised data to separate list-columns to retain this data
+  # Then downstream functions operate off the list-column `analysis_data` or 
+  # some other named list-col like `transformed_data` etc.
+  dots <- rlang::enquos(...) %>% rlang::as_quosures(env = rlang::env())
+
+  # `param_table` arrives in long format: spread to one column per parameter
+  param_table <- pivot_wider(param_table, names_from = parameter, values_from = value) 
+
+  if (!is_null(.fn)) {
+    cli::cli_h3("Transforming {.arg param_table} using {.arg .fn}:")
+    # Columns of `param_table` consumed by `.fn`; dropped once transformed so
+    # that only the `param_*` columns are carried into the join below.
+    dot_cols <- as.character(unname(list_c(map(dots, rlang::as_name))))
+    param_table <- param_table %>% 
+      rowwise() %>%
+      mutate(transformed_values = list(.fn(!!!dots))) %>% 
+      hoist(transformed_values, 
+            param_mean = outcome_variable, 
+            param_sd = outcome_SE) %>% 
+      select(-transformed_values, -all_of(dot_cols)) %>% 
+      ungroup()
+  } else {
+    param_table <- param_table %>% 
+      rename_with(.cols = contains(c("mean", "sd")), ~ paste0("param_", .x))
+  }
+  cli::cli_h3("Excluding extreme estimates from data:")
+  out <- data %>% 
+    left_join(param_table, 
+              by = join_by(response_variable_name == variable)) %>% 
+    mutate(exclusion_threshold = param_mean + sd_threshold * param_sd) %>% 
+    # `filter()` also drops rows where the comparison is NA, i.e. estimates
+    # that are missing or have no matching parameters in `param_table`.
+    filter(if_any(all_of(outcome_variable), ~ .x <= exclusion_threshold)) %>% 
+    select(-starts_with("param_"))
+
+  cli::cli_alert_success("Removed {.val {nrow(data) - nrow(out)}} rows from data with {.arg sd_threshold} = {.val {sd_threshold}}")
+
+  out
+}
+
diff --git a/R/filt_multivar_MA.R b/R/filt_multivar_MA.R
@@ -19,6 +19,7 @@
 #' @importFrom tidyr unite
 #' @importFrom cli cli_alert_info cli_bullets cli_h2 style_italic
 #' @importFrom glue glue
+#' @importFrom tidyselect any_of
 #' @details
 #' Depending on whether enough analyses in `data_tbl` have been conducted with the `mixed_model` variable, the function will fit a model with or without the predictor `mixed_model`.
 #'
@@ -30,6 +31,7 @@
 #' - `box_cox_abs_deviation_score_estimate`: response variable, Box-Cox transformed deviation from the meta-analytic mean effect-size for each analysis
 #' - `mixed_model`: binary variable indicating whether the analysis used a mixed effects model or not
 #' - `ReviewerId`: reviewer identifier
+#' - one of `study_id` or `id_col` to uniquely identify each analysis for checking that the threshold `N` is met.
 #' @family Model fitting and meta-analysis
 fit_multivar_MA <- function(data_tbl, N = 5, ..., env = rlang::caller_env()) {
 
@@ -43,7 +45,8 @@ fit_multivar_MA <- function(data_tbl, N = 5, ..., env = rlang::caller_env()) {
         PublishableAsIs,
         mean_diversity_index,
         ReviewerId,
-        mixed_model
+        mixed_model,
+        any_of(c("id_col", "study_id"))
       ))
 
   # ----- Define Models -----
@@ -69,6 +72,7 @@ fit_multivar_MA <- function(data_tbl, N = 5, ..., env = rlang::caller_env()) {
 
   pass_threshold <-
     data_tbl %>%
+    distinct(pick(any_of(c("study_id", "id_col"))), mixed_model) %>% 
     count(mixed_model) %>%
     pointblank::test_col_vals_gte(n, N)
 

diff --git a/_targets.R b/_targets.R
@@ -112,8 +112,7 @@ list(tarchetypes::tar_file_read(name = euc_reviews,
                                rlang::exprs(
                                  collinearity_subset != "collinearity_removed", 
                                  expertise_subset != "expert", 
-                                 publishable_subset == "All", 
-                                 exclusion_set == "complete")) |>
+                                 publishable_subset == "All")) |>
                            compute_MA_inputs(estimate_type = "Zr") |> 
                            meta_analyse_datasets(
                              outcome_variable = "Zr", 
@@ -264,13 +263,52 @@ list(tarchetypes::tar_file_read(name = euc_reviews,
                                                          all_prediction_data)),
      targets::tar_target(name = ManyEcoEvo_yi_results,
                          command =  ManyEcoEvo_yi %>% 
+                           mutate(
+                             data = 
+                               map_if(data, 
+                                      ~ filter(.x, 
+                                               stringr::str_detect(
+                                                 response_variable_name, 
+                                                 "average.proportion.of.plots.containing",
+                                                 negate = TRUE)),
+                                      .p = dataset == "eucalyptus")) %>%   
+                           mutate(
+                             diversity_data =
+                               map2(
+                                 .x = diversity_data,
+                                 .y = data,
+                                 .f = ~ semi_join(.x, .y, join_by(id_col)) %>% 
+                                   distinct()
+                               )
+                           ) %>% 
                            prepare_response_variables(
                              estimate_type = "yi",
                              param_table = 
                                ManyEcoEvo:::analysis_data_param_tables, 
                              dataset_standardise = "blue tit",
                              dataset_log_transform = "eucalyptus") %>%
                            generate_yi_subsets() %>% #TODO: must be run after prepare_response_variables??
+                           rowwise() %>% 
+                           mutate(data = if (dataset == "eucalyptus") {
+                             list(
+                               exclude_extreme_estimates(
+                                 data, 
+                                 outcome_variable = "mean_log", 
+                                 outcome_SE = "se_log", 
+                                 param_table = ManyEcoEvo:::analysis_data_param_tables, 
+                                 sd_threshold = 3, 
+                                 .fn = log_transform, 
+                                 estimate = mean, 
+                                 std.error = sd))
+                           } else {list(data)},
+                           diversity_data = if (dataset == "eucalyptus") {
+                             list(
+                               semi_join(diversity_data, 
+                                         data, 
+                                         by = "id_col") %>% 
+                                 distinct())
+                           } else {list(diversity_data)}) %>% 
+                           ungroup %>% 
                            apply_VZ_exclusions(
                              VZ_colname = list("eucalyptus" = "se_log", 
                                                "blue tit" = "VZ"),