Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add post-hoc analysis exploring removal of collinear analyses #39

Merged
merged 27 commits into from
Jun 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
bb8d75d
#38 create function for generating collinearity subset
egouldo Jun 6, 2024
becaa9c
add todo comment
egouldo Jun 6, 2024
438dee4
#38 create internal dataset with ids of highly collinear bt analyses
egouldo Jun 6, 2024
45e3a2c
#38 regenerate documentation
egouldo Jun 6, 2024
d339076
regenerate documentaiton update description and namespace
egouldo Jun 6, 2024
79a4f50
#38 update make_viz() to reflect upstream changes - additional col co…
egouldo Jun 6, 2024
fcff297
Increment version number to 1.2.0
egouldo Jun 6, 2024
2bff3f1
#38 rm collinearity subset before generating outlier subsets
egouldo Jun 6, 2024
0f58bf2
#38 write collinearity_subset to pkg data - usethis::use_data on coll…
egouldo Jun 6, 2024
7a32ff3
#38 merge internal data creation together (https://r-pkgs.org/data.ht…
egouldo Jun 6, 2024
d6fd9bf
#38 regenerate sysdata after merging data-raw files
egouldo Jun 6, 2024
11fdb3b
comment out debug cue in targets
egouldo Jun 6, 2024
2fdbc5f
#38 regenerate targets pipeline and rebuild pkg
egouldo Jun 6, 2024
6d14781
#38 fix error in data subsetting -REMOVE analyses that are highly col…
egouldo Jun 6, 2024
a8d7637
- #38 regenerate pipeline and rebuild package after fixing bug in col…
egouldo Jun 6, 2024
fe02362
add todo comment consider making objects internal to speedup lazyloading
egouldo Jun 6, 2024
fdef97a
- #38 eval impact of removing analyses with highly collinear predicto…
egouldo Jun 6, 2024
b4c9767
update renv
egouldo Jun 12, 2024
71be604
#38 update data-raw script to incl. additional analyses for
egouldo Jun 13, 2024
854c32a
update comment for building internal objects from data-raw
egouldo Jun 13, 2024
b6affb9
rebuild internal objects #38
egouldo Jun 13, 2024
cbe8d15
#38 update variable coding:
egouldo Jun 13, 2024
b497dc4
#38 fix incorrectly coded analysis variables:
egouldo Jun 13, 2024
2c55d31
#38 rm investigatory script after running as reprex and gh commenting
egouldo Jun 13, 2024
19869e9
#38 update renv
egouldo Jun 13, 2024
1f67263
Increment version number to 1.2.0.9000
egouldo Jun 13, 2024
ec6eefa
#38 rerun targets
egouldo Jun 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: ManyEcoEvo
Title: Meta-analyse data from 'Many-Analysts' style studies
Version: 1.1.0
Version: 1.2.0.9000
Authors@R: c(person(given = "Elliot",
family = "Gould",
email = "[email protected]",
Expand Down Expand Up @@ -68,7 +68,7 @@ Remotes:
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
RoxygenNote: 7.3.1
URL: https://github.com/egouldo/ManyEcoEvo,
https://egouldo.github.io/ManyEcoEvo/
BugReports: https://github.com/egouldo/ManyEcoEvo/issues
Expand Down
10 changes: 10 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ export(fit_metafor_mv_reduced)
export(fit_metafor_uni)
export(fit_sorensen_glm)
export(fit_uni_mixed_effects)
export(generate_collinearity_subset)
export(generate_exclusion_subsets)
export(generate_expertise_subsets)
export(generate_outlier_subsets)
Expand All @@ -56,6 +57,8 @@ export(meta_analyse_datasets)
export(named_group_split)
export(plot_cont_rating_effects)
export(plot_effects_diversity)
export(plot_model_means_box_cox_cat)
export(plot_model_means_orchard)
export(power_back)
export(pred_to_Z)
export(prepare_ManyEcoEvo)
Expand Down Expand Up @@ -89,7 +92,12 @@ export(summarise_variable_counts)
export(validate_predictions)
export(validate_predictions_df_blue_tit)
export(validate_predictions_df_euc)
import(dplyr)
import(ggbeeswarm)
import(ggplot2)
import(metafor)
import(see)
importFrom(EnvStats,stat_n_text)
importFrom(broom,tidy)
importFrom(dplyr,across)
importFrom(dplyr,case_when)
Expand All @@ -108,13 +116,15 @@ importFrom(dplyr,rename)
importFrom(dplyr,right_join)
importFrom(dplyr,select)
importFrom(dplyr,summarise)
importFrom(forcats,fct_relevel)
importFrom(magrittr,"%>%")
importFrom(pointblank,col_vals_not_null)
importFrom(purrr,map)
importFrom(purrr,map_dfr)
importFrom(purrr,set_names)
importFrom(rlang,is_na)
importFrom(rlang,na_chr)
importFrom(sae,bxcx)
importFrom(tidyr,pivot_longer)
importFrom(tidyr,pivot_wider)
importFrom(tidyr,separate)
Expand Down
70 changes: 70 additions & 0 deletions R/generate_collinearity_subset.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#' Generate Collinearity Data Subset
#'
#' Creates an additional version of the full dataset in which the analyses
#' listed in `collinearity_subset` have been removed, so that the impact of
#' those analyses on downstream results can be evaluated. The original rows of
#' `ManyEcoEvo` are retained unchanged and labelled `"All"`; the new rows are
#' labelled `"collinearity_removed"`.
#'
#' @param ManyEcoEvo a ManyEcoEvo dataframe containing formatted raw `data`, formatted `diversity_data`, the `estimate_type`, `dataset`, `publishable_subset`, `exclusion_set` and `expertise_subset`. See details.
#' @param collinearity_subset a dataframe with the columns `response_id`, `id_col` and `dataset` identifying the analyses to be *removed*. Values of `id_col` must be unique.
#'
#' @return A ManyEcoEvo dataframe with added column `collinearity_subset`. Every row of the input is returned with `collinearity_subset == "All"`, and the rows described in Details are appended a second time with `collinearity_subset == "collinearity_removed"` and with the listed analyses removed from `data` and `diversity_data`.
#' @export
#' @details
#' Note that this function needs to be run on `ManyEcoEvo` after the following functions have been run (See examples):
#' - `prepare_response_variables()`
#' - `generate_exclusion_subsets()`
#' - `generate_rating_subsets()`
#' - `generate_expertise_subsets()` (this function filters on `expertise_subset`)
#'
#' `generate_collinearity_subset()` only creates the collinearity-removed subset based on the full dataset where `exclusion_set == "complete"` and `publishable_subset == "All"` and `expertise_subset == "All"`, and only for those values of `dataset` that occur in `collinearity_subset`.
#' @examples
#' ManyEcoEvo %>%
#'   prepare_response_variables(estimate_type = "Zr") |>
#'   generate_exclusion_subsets(estimate_type = "Zr") |>
#'   generate_rating_subsets() |>
#'   generate_expertise_subsets(expert_subset) |>
#'   generate_collinearity_subset(collinearity_subset = collinearity_subset)
generate_collinearity_subset <- function(ManyEcoEvo, collinearity_subset) {
  # Check that both inputs are dataframes
  if (!is.data.frame(collinearity_subset)) {
    stop("collinearity_subset must be a dataframe.", call. = FALSE)
  }

  if (!is.data.frame(ManyEcoEvo)) {
    stop("ManyEcoEvo must be a dataframe.", call. = FALSE)
  }

  # Check that collinearity_subset has every column used below: `dataset` is
  # needed both to select the affected datasets and as a join key.
  required_cols <- c("response_id", "id_col", "dataset")
  if (!all(required_cols %in% colnames(collinearity_subset))) {
    stop(
      "The input dataframe must contain the columns 'response_id', 'id_col' and 'dataset'.",
      call. = FALSE
    )
  }

  # Check that the id_col column is unique
  if (anyDuplicated(collinearity_subset[["id_col"]]) > 0) {
    stop(
      "The 'id_col' column in collinearity_subset must be unique.",
      call. = FALSE
    )
  }

  # Use a distinctly named local copy: inside `mutate()` a column called
  # `collinearity_subset` (e.g. if this function has already been run on
  # `ManyEcoEvo`) would otherwise mask the function argument.
  collinear_ids <- collinearity_subset
  collinear_datasets <- unique(collinear_ids[["dataset"]])

  collinear_removed <- ManyEcoEvo %>%
    dplyr::filter(
      publishable_subset == "All",
      exclusion_set == "complete",
      expertise_subset == "All",
      dataset %in% collinear_datasets
    ) %>%
    # REMOVE the listed analyses from each nested `data` frame
    dplyr::mutate(
      data = purrr::map(
        .x = data,
        .f = dplyr::anti_join,
        collinear_ids,
        by = dplyr::join_by(response_id, id_col, dataset)
      )
    ) %>%
    # Keep only the diversity rows that still have a match in the reduced
    # `data`; no `by` is given, so the join uses all columns the two share.
    dplyr::mutate(
      diversity_data = purrr::map2(
        .x = diversity_data,
        .y = data,
        .f = ~ dplyr::distinct(dplyr::semi_join(.x, .y))
      ),
      collinearity_subset = "collinearity_removed"
    )

  dplyr::bind_rows(
    dplyr::mutate(ManyEcoEvo, collinearity_subset = "All"),
    collinear_removed
  )
}
2 changes: 2 additions & 0 deletions R/generate_expertise_subsets.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
#' generate_rating_subsets() |>
#' generate_expertise_subsets(expert_subset)
generate_expertise_subsets <- function(ManyEcoEvo, expert_subset) {
#TODO idea, allow ellipses arg in function and pass those expressions to filter.
# that way isn't hardcoded in the function. Repeat for all other generate / exclude map funs
# NOTE: should be run *after* computing Zr with compute_MA_inputs()
out <- ManyEcoEvo %>%
filter(publishable_subset == "All" & exclusion_set == "complete") %>%
Expand Down
4 changes: 2 additions & 2 deletions R/generate_outlier_subsets.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@ generate_outlier_subsets <- function(ManyEcoEvo){
# TODO: will no longer work on Zr dataset, because this doesn't contain an estimate_type col?
# TODO: Don't run with the reduced publishability subset.... some of these already only have 10 data points!!
# apply conditional behaviour to trigger both
#
# TODO: do not run for collinearity_removed datasets
if(str_detect(ManyEcoEvo$estimate_type, "Zr") %>% any(na.rm = TRUE)){
ManyEcoEvo_Zr <- ManyEcoEvo %>%
filter(estimate_type == "Zr") %>%
bind_rows(., {ManyEcoEvo %>%
filter(estimate_type == "Zr") %>%
filter(estimate_type == "Zr", collinearity_subset != "collinearity_removed") %>%
mutate(effects_analysis = map(effects_analysis,
~ slice_max(.x, Zr, n = -2) %>%
slice_min(Zr, n = -2))) %>%
Expand Down
5 changes: 3 additions & 2 deletions R/make_viz.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ make_viz <- function(data) {
if(any(str_detect(unique(data$estimate_type),pattern = "Zr"))){
data_Zr <- data %>%
filter(estimate_type == "Zr") %>%
group_by(exclusion_set, dataset, estimate_type, publishable_subset, expertise_subset, data) %>%
group_by(exclusion_set, dataset, estimate_type, publishable_subset, expertise_subset, collinearity_subset, data) %>%
pivot_longer(names_to = "model_name",
values_to = "model",
cols = c(-exclusion_set,
Expand All @@ -29,7 +29,8 @@ make_viz <- function(data) {
-diversity_indices,
-effects_analysis,
-publishable_subset,
-expertise_subset)) %>%
-expertise_subset,
-collinearity_subset)) %>%
ungroup %>%
select(-data,
-diversity_data,
Expand Down
Binary file modified R/sysdata.rda
Binary file not shown.
3 changes: 2 additions & 1 deletion _targets.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ tar_option_set(
packages = pkgs,
imports = "ManyEcoEvo",
# debug = c("augmented_data_3efd9941")#, #augmented_data_a4d78efa
cue = tar_cue(mode = "always") #because we have silent errors!
# cue = tar_cue(mode = "always") #because we have silent errors!
)

list(tarchetypes::tar_file_read(name = euc_reviews,
Expand Down Expand Up @@ -80,6 +80,7 @@ list(tarchetypes::tar_file_read(name = euc_reviews,
generate_exclusion_subsets(estimate_type = "Zr") |>
generate_rating_subsets() |>
generate_expertise_subsets(expert_subset) |>
generate_collinearity_subset(ManyEcoEvo:::collinearity_subset) |>
compute_MA_inputs(estimate_type = "Zr") |>
generate_outlier_subsets() |> # TODO run before MA_inputs? diversity indices need to be recalculated!!
filter(expertise_subset != "expert" | exclusion_set != "complete-rm_outliers") |> #TODO mv into generate_outlier_subsets() so aren't created in the first place
Expand Down
Loading