Merge pull request #39 from egouldo/collinearity_removal_analysis

Add post-hoc analysis exploring removal of collinear analyses
egouldo · Jun 13, 2024 · 7c11bf7 · 7c11bf7
2 parents f65dac5 + ec6eefa
commit 7c11bf7
Show file tree

Hide file tree

Showing 50 changed files with 1,528 additions and 3,042 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: ManyEcoEvo
 Title: Meta-analyse data from 'Many-Analysts' style studies
-Version: 1.1.0
+Version: 1.2.0.9000
 Authors@R: c(person(given = "Elliot",
                     family = "Gould", 
                     email =  "[email protected]", 
@@ -68,7 +68,7 @@ Remotes:
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 URL: https://github.com/egouldo/ManyEcoEvo,
     https://egouldo.github.io/ManyEcoEvo/
 BugReports: https://github.com/egouldo/ManyEcoEvo/issues

diff --git a/NAMESPACE b/NAMESPACE
@@ -37,6 +37,7 @@ export(fit_metafor_mv_reduced)
 export(fit_metafor_uni)
 export(fit_sorensen_glm)
 export(fit_uni_mixed_effects)
+export(generate_collinearity_subset)
 export(generate_exclusion_subsets)
 export(generate_expertise_subsets)
 export(generate_outlier_subsets)
@@ -56,6 +57,8 @@ export(meta_analyse_datasets)
 export(named_group_split)
 export(plot_cont_rating_effects)
 export(plot_effects_diversity)
+export(plot_model_means_box_cox_cat)
+export(plot_model_means_orchard)
 export(power_back)
 export(pred_to_Z)
 export(prepare_ManyEcoEvo)
@@ -89,7 +92,12 @@ export(summarise_variable_counts)
 export(validate_predictions)
 export(validate_predictions_df_blue_tit)
 export(validate_predictions_df_euc)
+import(dplyr)
+import(ggbeeswarm)
+import(ggplot2)
 import(metafor)
+import(see)
+importFrom(EnvStats,stat_n_text)
 importFrom(broom,tidy)
 importFrom(dplyr,across)
 importFrom(dplyr,case_when)
@@ -108,13 +116,15 @@ importFrom(dplyr,rename)
 importFrom(dplyr,right_join)
 importFrom(dplyr,select)
 importFrom(dplyr,summarise)
+importFrom(forcats,fct_relevel)
 importFrom(magrittr,"%>%")
 importFrom(pointblank,col_vals_not_null)
 importFrom(purrr,map)
 importFrom(purrr,map_dfr)
 importFrom(purrr,set_names)
 importFrom(rlang,is_na)
 importFrom(rlang,na_chr)
+importFrom(sae,bxcx)
 importFrom(tidyr,pivot_longer)
 importFrom(tidyr,pivot_wider)
 importFrom(tidyr,separate)

diff --git a/R/generate_collinearity_subset.R b/R/generate_collinearity_subset.R
@@ -0,0 +1,70 @@
+#' Generate Collinearity Data Subset
+#'
+#' Creates an additional subset of a `ManyEcoEvo` dataframe from which the
+#' analyses listed in `collinearity_subset` have been removed. Listed analyses
+#' are dropped from `data` by anti-joining on `response_id`, `id_col` and
+#' `dataset`, and `diversity_data` is then reduced to the rows matching `data`.
+#'
+#' @param ManyEcoEvo a ManyEcoEvo dataframe containing formatted raw `data`, formatted `diversity_data`, the `estimate_type`,  `dataset`, `publishable_subset`, `exclusion_set` and `expertise_subset`. See details.
+#' @param collinearity_subset a dataframe containing the columns `response_id`, `id_col` and `dataset`, identifying the analyses to be removed. Values of `id_col` must be unique.
+#'
+#' @return A ManyEcoEvo dataframe with added column `collinearity_subset`: the original rows are labelled `"All"`, and the appended rows, holding the reduced `data` and `diversity_data`, are labelled `"collinearity_removed"`.
+#' @export
+#' @details
+#' Note that this function needs to be run on `ManyEcoEvo` after the following functions have been run (See examples):
+#' - `prepare_response_variables()`
+#' - `generate_exclusion_subsets()`
+#' - `generate_rating_subsets()`
+#' - `generate_expertise_subsets()`
+#'
+#' `generate_collinearity_subset()` only creates collinearity subsets based on the full dataset where `exclusion_set == "complete"` and `publishable_subset == "All"` and `expertise_subset == "All"`, and only for the datasets named in the `dataset` column of `collinearity_subset`.
+#' @examples
+#' ManyEcoEvo %>%
+#' prepare_response_variables(estimate_type = "Zr") |>
+#' generate_exclusion_subsets(estimate_type = "Zr") |>
+#' generate_rating_subsets() |>
+#' generate_expertise_subsets(expert_subset) |>
+#' generate_collinearity_subset(collinearity_subset = collinearity_subset)
+generate_collinearity_subset <- function(ManyEcoEvo, collinearity_subset) {
+  # Check if the inputs are dataframes
+  if (!is.data.frame(collinearity_subset)) {
+    stop("collinearity_subset must be a dataframe.")
+  }
+  if (!is.data.frame(ManyEcoEvo)) {
+    stop("ManyEcoEvo must be a dataframe.")
+  }
+
+  # `dataset` is required too: it selects which rows of ManyEcoEvo are subset
+  # and is one of the join keys used below
+  if (!all(c("response_id", "id_col", "dataset") %in% colnames(collinearity_subset))) {
+    stop("The input dataframe must contain the columns 'response_id', 'id_col' and 'dataset'.")
+  }
+
+  # Check if the id_col column is unique
+  if (anyDuplicated(collinearity_subset[["id_col"]]) > 0) {
+    stop("The 'id_col' column in collinearity_subset must be unique.")
+  }
+
+  collinearity_subset_dataset <- unique(collinearity_subset[["dataset"]])
+
+  # Defined outside the pipeline so that `collinearity_subset` always refers to
+  # the function argument, never to a like-named column of ManyEcoEvo
+  drop_collinear <- function(x) {
+    dplyr::anti_join(x, collinearity_subset, by = join_by(response_id, id_col, dataset))
+  }
+
+  collinear_removed <- ManyEcoEvo %>%
+    filter(publishable_subset == "All" & exclusion_set == "complete" & expertise_subset == "All",
+           dataset %in% collinearity_subset_dataset) %>%
+    mutate(data = map(.x = data, .f = drop_collinear)) %>%
+    mutate(diversity_data = 
+             map2(.x = diversity_data, 
+                  .y = data, 
+                  .f = ~ semi_join(.x, .y) %>% distinct), # joins on all shared columns
+           collinearity_subset = "collinearity_removed")
+
+  bind_rows(
+    ManyEcoEvo %>% mutate(collinearity_subset = "All"),
+    collinear_removed
+  )
+}
diff --git a/R/generate_expertise_subsets.R b/R/generate_expertise_subsets.R
@@ -24,6 +24,8 @@
 #' generate_rating_subsets() |>
 #' generate_expertise_subsets(expert_subset)
 generate_expertise_subsets <- function(ManyEcoEvo, expert_subset) {
+  #TODO idea, allow ellipses arg in function and pass those expressions to filter.
+  # that way isn't hardcoded in the function. Repeat for all other generate / exclude map funs
   # NOTE: should be run *after* computing Zr with compute_MA_inputs() 
   out <- ManyEcoEvo %>% 
     filter(publishable_subset == "All" & exclusion_set == "complete") %>% 

diff --git a/R/generate_outlier_subsets.R b/R/generate_outlier_subsets.R
@@ -13,12 +13,12 @@ generate_outlier_subsets <- function(ManyEcoEvo){
   # TODO: will nolonger work on Zr dataset, because this doesn't contain an estimate_type col?
   # TODO: Don't run with the reduced publishability subset.... some of these already only have 10 data points!!
   # apply conditional behaviour to trigger both
-  # 
+  # TODO: do not run for collinearity_removed datasets
   if(str_detect(ManyEcoEvo$estimate_type, "Zr") %>% any(na.rm = TRUE)){
     ManyEcoEvo_Zr <-  ManyEcoEvo %>% 
       filter(estimate_type == "Zr") %>% 
       bind_rows(., {ManyEcoEvo %>% 
-          filter(estimate_type == "Zr") %>% 
+          filter(estimate_type == "Zr", collinearity_subset != "collinearity_removed") %>% 
           mutate(effects_analysis = map(effects_analysis, 
                                         ~ slice_max(.x, Zr, n = -2) %>% 
                                           slice_min(Zr, n = -2))) %>% 

diff --git a/R/make_viz.R b/R/make_viz.R
@@ -18,7 +18,7 @@ make_viz <- function(data) {
   if(any(str_detect(unique(data$estimate_type),pattern = "Zr"))){
     data_Zr <- data %>% 
       filter(estimate_type == "Zr") %>% 
-      group_by(exclusion_set, dataset, estimate_type, publishable_subset, expertise_subset, data) %>% 
+      group_by(exclusion_set, dataset, estimate_type, publishable_subset, expertise_subset, collinearity_subset, data) %>% 
       pivot_longer(names_to = "model_name", 
                    values_to = "model", 
                    cols = c(-exclusion_set, 
@@ -29,7 +29,8 @@ make_viz <- function(data) {
                             -diversity_indices, 
                             -effects_analysis,
                             -publishable_subset,
-                            -expertise_subset)) %>% 
+                            -expertise_subset,
+                            -collinearity_subset)) %>% 
       ungroup %>% 
       select(-data, 
              -diversity_data, 

diff --git a/R/sysdata.rda b/R/sysdata.rda
diff --git a/_targets.R b/_targets.R
@@ -29,7 +29,7 @@ tar_option_set(
   packages = pkgs,
   imports = "ManyEcoEvo",
   # debug = c("augmented_data_3efd9941")#, #augmented_data_a4d78efa
-  cue = tar_cue(mode = "always") #because we have silent errors!
+  # cue = tar_cue(mode = "always") #because we have silent errors!
 )
 
 list(tarchetypes::tar_file_read(name = euc_reviews, 
@@ -80,6 +80,7 @@ list(tarchetypes::tar_file_read(name = euc_reviews,
                            generate_exclusion_subsets(estimate_type = "Zr") |> 
                            generate_rating_subsets() |> 
                            generate_expertise_subsets(expert_subset) |>
+                           generate_collinearity_subset(ManyEcoEvo:::collinearity_subset) |>
                            compute_MA_inputs(estimate_type = "Zr") |> 
                            generate_outlier_subsets() |> # TODO run before MA_inputs? diversity indices need to be recalculated!!
                            filter(expertise_subset != "expert" | exclusion_set != "complete-rm_outliers") |> #TODO mv into generate_outlier_subsets() so aren't created in the first place