From b624c7e90736b80cc95dac4394a88e805a4acdf9 Mon Sep 17 00:00:00 2001 From: Pranav Anbarasu Date: Fri, 13 Sep 2024 00:44:05 +0000 Subject: [PATCH 1/2] Add internal set of params for config --- config/config.yml | 5 +++++ pipeline/run-pipeline.R | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/config/config.yml b/config/config.yml index fd6484c..865e102 100644 --- a/config/config.yml +++ b/config/config.yml @@ -22,3 +22,8 @@ staging: prod: inherits: staging synFolderID: syn51406700 + +internal: + inherits: staging + parquetDirID: syn63135213 + s3basekey: staging/2024-09-10 diff --git a/pipeline/run-pipeline.R b/pipeline/run-pipeline.R index 4b9d2ea..5a7f0e7 100644 --- a/pipeline/run-pipeline.R +++ b/pipeline/run-pipeline.R @@ -2,7 +2,7 @@ tictoc::tic(msg = "INFO: Total execution time") # Get config variables list2env( x = config::get(file = "config/config.yml", - config = "staging"), + config = "internal"), envir = .GlobalEnv ) From 27b316f29a10c0fa223377222f65b821abebaa28 Mon Sep 17 00:00:00 2001 From: Pranav Anbarasu Date: Fri, 13 Sep 2024 00:45:55 +0000 Subject: [PATCH 2/2] Update cat() progress statements to make them more readable --- scripts/process-data/fitbitactivitylogs.R | 9 ++++++--- scripts/process-data/fitbitdailydata.R | 9 ++++++--- scripts/process-data/fitbitecg.R | 10 ++++++---- scripts/process-data/fitbitintradaycombined.R | 9 ++++++--- scripts/process-data/fitbitsleeplogs.R | 10 ++++++---- scripts/process-data/healthkitv2electrocardiogram.R | 10 ++++++---- scripts/process-data/healthkitv2samples.R | 6 ++++-- scripts/process-data/healthkitv2statistics.R | 6 ++++-- scripts/process-data/participant_devices.R | 3 ++- 9 files changed, 46 insertions(+), 26 deletions(-) diff --git a/scripts/process-data/fitbitactivitylogs.R b/scripts/process-data/fitbitactivitylogs.R index efc0475..3558186 100644 --- a/scripts/process-data/fitbitactivitylogs.R +++ b/scripts/process-data/fitbitactivitylogs.R @@ -49,6 +49,7 @@ for (col_name in names(df_filtered)) { } # Pivot data frame from long to wide +cat("recoverutils::melt_df()....") df_melted_filtered <- df_filtered %>% recoverutils::melt_df(excluded_concepts = excluded_concepts) %>% @@ -58,17 +59,19 @@ df_melted_filtered <- if("value" %in% colnames(.)) "value") %>% tidyr::drop_na("value") %>% mutate(value = as.numeric(value)) -cat("recoverutils::melt_df() completed.\n") +cat("OK\n") # Generate i2b2 summaries +cat("recoverutils::stat_summarize()....") df_summarized <- df_melted_filtered %>% select(all_of(c("participantidentifier", "startdate", "enddate", "concept", "value"))) %>% recoverutils::stat_summarize() %>% distinct() -cat("recoverutils::stat_summarize() completed.\n") +cat("OK\n") # Add i2b2 columns from concept map (ontology file) and clean the output +cat("recoverutils::process_df()....") output_concepts <- recoverutils::process_df(df_summarized, concept_map, concept_replacements_reversed, concept_map_concepts = "CONCEPT_CD", concept_map_units = "UNITS_CD") %>% dplyr::mutate(nval_num = signif(nval_num, 9)) %>% @@ -76,7 +79,7 @@ output_concepts <- dplyr::mutate(dplyr::across(.cols = dplyr::everything(), .fns = as.character)) %>% replace(is.na(.), "") %>% dplyr::filter(nval_num != "" | tval_char != "") -cat("recoverutils::process_df() completed.\n") +cat("OK\n") # Identify the participants who have output concepts derived from fitbit variables fitbit_participants <- diff --git a/scripts/process-data/fitbitdailydata.R b/scripts/process-data/fitbitdailydata.R index 3c5ed07..51296c7 100644 --- a/scripts/process-data/fitbitdailydata.R +++ b/scripts/process-data/fitbitdailydata.R @@ -54,6 +54,7 @@ for (col_name in names(df_filtered)) { } # Pivot data frame from long to wide +cat("recoverutils::melt_df()....") df_melted_filtered <- df_filtered %>% recoverutils::melt_df(excluded_concepts = excluded_concepts) %>% @@ -63,9 +64,10 @@ df_melted_filtered <- if("value" %in% colnames(.)) "value") %>% tidyr::drop_na("value") %>% mutate(value = as.numeric(value)) -cat("recoverutils::melt_df() completed.\n") +cat("OK\n") # Generate i2b2 summaries +cat("recoverutils::stat_summarize()....") df_summarized <- df_melted_filtered %>% rename(startdate = dplyr::any_of(c("date", "datetime"))) %>% @@ -73,9 +75,10 @@ df_summarized <- select(all_of(c("participantidentifier", "startdate", "enddate", "concept", "value"))) %>% recoverutils::stat_summarize() %>% distinct() -cat("recoverutils::stat_summarize() completed.\n") +cat("OK\n") # Add i2b2 columns from concept map (ontology file) and clean the output +cat("recoverutils::process_df()....") output_concepts <- recoverutils::process_df(df_summarized, concept_map, @@ -87,7 +90,7 @@ output_concepts <- dplyr::mutate(dplyr::across(.cols = dplyr::everything(), .fns = as.character)) %>% replace(is.na(.), "") %>% dplyr::filter(nval_num != "" | tval_char != "") -cat("recoverutils::process_df() completed.\n") +cat("OK\n") # Identify the participants who have output concepts derived from fitbit variables curr_fitbit_participants <- diff --git a/scripts/process-data/fitbitecg.R b/scripts/process-data/fitbitecg.R index 5fee30b..f683420 100644 --- a/scripts/process-data/fitbitecg.R +++ b/scripts/process-data/fitbitecg.R @@ -15,7 +15,6 @@ #' `participantidentifier` and `concept` for all data, and #' `participantidentifier`, `concept`, `year`, `week` for weekly summaries). ecg_stat_summarize <- function(df) { - cat("Running ecg_stat_summarize()...\n") if (!is.data.frame(df)) stop("df must be a data frame") if (!all(c("participantidentifier", "concept", "value") %in% colnames(df))) stop("'participantidentifier', 'concept', and 'value' columns must be present in df") @@ -131,6 +130,7 @@ approved_concepts_summarized <- ) # Pivot data frame from long to wide +cat("recoverutils::melt_df()....") df_melted_filtered <- df %>% mutate("SinusRhythm" = if_else(resultclassification == "Normal Sinus Rhythm", 1, NA), @@ -143,9 +143,10 @@ df_melted_filtered <- if("value" %in% colnames(.)) "value") %>% tidyr::drop_na("value") %>% mutate(value = as.numeric(value)) -cat("recoverutils::melt_df() completed.\n") +cat("OK\n") # Generate i2b2 summaries +cat("ecg_stat_summarize()....") df_summarized <- df_melted_filtered %>% rename(startdate = dplyr::any_of(c("date", "datetime"))) %>% @@ -154,9 +155,10 @@ df_summarized <- ecg_stat_summarize() %>% mutate(value = as.numeric(value)) %>% distinct() -cat("ecg_stat_summarize() completed.\n") +cat("OK\n") # Add i2b2 columns from concept map (ontology file) and clean the output +cat("recoverutils::process_df()....") output_concepts <- recoverutils::process_df(df_summarized, concept_map, @@ -168,7 +170,7 @@ output_concepts <- dplyr::mutate(dplyr::across(.cols = dplyr::everything(), .fns = as.character)) %>% replace(is.na(.), "") %>% dplyr::filter(nval_num != "" | tval_char != "") -cat("recoverutils::process_df() completed.\n") +cat("OK\n") # Identify the participants who have output concepts derived from fitbit variables curr_fitbit_participants <- diff --git a/scripts/process-data/fitbitintradaycombined.R b/scripts/process-data/fitbitintradaycombined.R index 503b8a1..127af6a 100644 --- a/scripts/process-data/fitbitintradaycombined.R +++ b/scripts/process-data/fitbitintradaycombined.R @@ -72,6 +72,7 @@ for (col_name in names(df_filtered)) { } # Pivot data frame from long to wide +cat("recoverutils::melt_df()....") df_melted_filtered <- df_filtered %>% recoverutils::melt_df(excluded_concepts = excluded_concepts) %>% @@ -81,9 +82,10 @@ df_melted_filtered <- if("value" %in% colnames(.)) "value") %>% tidyr::drop_na("value") %>% mutate(value = as.numeric(value)) -cat("recoverutils::melt_df() completed.\n") +cat("OK\n") # Generate i2b2 summaries +cat("recoverutils::stat_summarize()....") df_summarized <- df_melted_filtered %>% rename(startdate = dplyr::any_of(c("date", "datetime"))) %>% @@ -91,9 +93,10 @@ df_summarized <- select(all_of(c("participantidentifier", "startdate", "enddate", "concept", "value"))) %>% recoverutils::stat_summarize() %>% distinct() -cat("recoverutils::stat_summarize() completed.\n") +cat("OK\n") # Add i2b2 columns from concept map (ontology file) and clean the output +cat("recoverutils::process_df()....") output_concepts <- recoverutils::process_df(df_summarized, concept_map, concept_replacements_reversed, concept_map_concepts = "CONCEPT_CD", concept_map_units = "UNITS_CD") %>% dplyr::mutate(nval_num = signif(nval_num, 9)) %>% @@ -101,7 +104,7 @@ output_concepts <- dplyr::mutate(dplyr::across(.cols = dplyr::everything(), .fns = as.character)) %>% replace(is.na(.), "") %>% dplyr::filter(nval_num != "" | tval_char != "") -cat("recoverutils::process_df() completed.\n") +cat("OK\n") # Identify the participants who have output concepts derived from fitbit variables curr_fitbit_participants <- diff --git a/scripts/process-data/fitbitsleeplogs.R b/scripts/process-data/fitbitsleeplogs.R index ea122b4..255a39f 100644 --- a/scripts/process-data/fitbitsleeplogs.R +++ b/scripts/process-data/fitbitsleeplogs.R @@ -15,7 +15,6 @@ #' `participantidentifier` and `concept` for all data, and #' `participantidentifier`, `concept`, `year`, `week` for weekly summaries). sleeplogs_stat_summarize <- function(df) { - cat("Running sleeplogs_stat_summarize()...\n") if (!is.data.frame(df)) stop("df must be a data frame") if (!all(c("participantidentifier", "concept", "value") %in% colnames(df))) stop("'participantidentifier', 'concept', and 'value' columns must be present in df") @@ -332,6 +331,7 @@ for (col_name in names(df_filtered)) { } # Pivot data frames from long to wide +cat("recoverutils::melt_df()....") df_melted_filtered <- df_filtered %>% recoverutils::melt_df(excluded_concepts = excluded_concepts) %>% @@ -366,9 +366,10 @@ numepisodes_df_melted_filtered_weekly <- tidyr::drop_na("value") %>% mutate(value = as.numeric(value)) -cat("recoverutils::melt_df() completed.\n") +cat("OK\n") # Generate i2b2 summaries +cat("sleeplogs_stat_summarize()....") df_summarized <- df_melted_filtered %>% select(all_of(c("participantidentifier", "startdate", "enddate", "concept", "value"))) %>% @@ -397,9 +398,10 @@ final_df_summarized <- numepisodes_df_summarized_weekly) %>% dplyr::distinct() -cat("sleeplogs_stat_summarize() completed.\n") +cat("OK\n") # Add i2b2 columns from concept map (ontology file) and clean the output +cat("recoverutils::process_df()....") output_concepts <- recoverutils::process_df(final_df_summarized, concept_map, concept_replacements_reversed, concept_map_concepts = "CONCEPT_CD", concept_map_units = "UNITS_CD") %>% dplyr::mutate(nval_num = signif(nval_num, 9)) %>% @@ -407,7 +409,7 @@ output_concepts <- dplyr::mutate(dplyr::across(.cols = dplyr::everything(), .fns = as.character)) %>% replace(is.na(.), "") %>% dplyr::filter(nval_num != "" | tval_char != "") -cat("recoverutils::process_df() completed.\n") +cat("OK\n") # Identify the participants who have output concepts derived from fitbit variables curr_fitbit_participants <- diff --git a/scripts/process-data/healthkitv2electrocardiogram.R b/scripts/process-data/healthkitv2electrocardiogram.R index cfde8cc..1cf9cc3 100644 --- a/scripts/process-data/healthkitv2electrocardiogram.R +++ b/scripts/process-data/healthkitv2electrocardiogram.R @@ -15,7 +15,6 @@ #' `participantidentifier` and `concept` for all data, and #' `participantidentifier`, `concept`, `year`, `week` for weekly summaries). ecg_stat_summarize <- function(df) { - cat("Running ecg_stat_summarize()...\n") if (!is.data.frame(df)) stop("df must be a data frame") if (!all(c("participantidentifier", "concept", "value") %in% colnames(df))) stop("'participantidentifier', 'concept', and 'value' columns must be present in df") @@ -136,6 +135,7 @@ approved_concepts_summarized <- ) # Pivot data frame from long to wide +cat("recoverutils::melt_df()....") df_melted_filtered <- df %>% mutate("SinusRhythm" = if_else(classification == "SinusRhythm", 1, NA), @@ -148,18 +148,20 @@ df_melted_filtered <- if("value" %in% colnames(.)) "value") %>% tidyr::drop_na("value") %>% mutate(value = as.numeric(value)) -cat("recoverutils::melt_df() completed.\n") +cat("OK\n") # Generate i2b2 summaries +cat("ecg_stat_summarize()....") df_summarized <- df_melted_filtered %>% select(all_of(c("participantidentifier", "startdate", "enddate", "concept", "value"))) %>% ecg_stat_summarize() %>% mutate(value = as.numeric(value)) %>% distinct() -cat("ecg_stat_summarize() completed.\n") +cat("OK\n") # Add i2b2 columns from concept map (ontology file) and clean the output +cat("recoverutils::process_df()....") output_concepts <- recoverutils::process_df(df_summarized, concept_map, @@ -171,7 +173,7 @@ output_concepts <- dplyr::mutate(dplyr::across(.cols = dplyr::everything(), .fns = as.character)) %>% replace(is.na(.), "") %>% dplyr::filter(nval_num != "" | tval_char != "") -cat("recoverutils::process_df() completed.\n") +cat("OK\n") # Identify the participants who have output concepts derived from healthkit variables curr_hk_participants <- diff --git a/scripts/process-data/healthkitv2samples.R b/scripts/process-data/healthkitv2samples.R index af4c9e5..7aec755 100644 --- a/scripts/process-data/healthkitv2samples.R +++ b/scripts/process-data/healthkitv2samples.R @@ -74,13 +74,14 @@ df_melted_filtered <- cat("Melt and filtering step completed.\n") # Generate i2b2 summaries +cat("recoverutils::stat_summarize()....") df_summarized <- df_melted_filtered %>% rename(enddate = "date") %>% select(all_of(c("participantidentifier", "startdate", "enddate", "concept", "value"))) %>% recoverutils::stat_summarize() %>% distinct() -cat("recoverutils::stat_summarize() completed.\n") +cat("OK\n") tmp_concept_replacements <- c("respiratoryrate" = "breathingrate", "heartratevariability" = "hrv", @@ -88,6 +89,7 @@ tmp_concept_replacements <- c("respiratoryrate" = "breathingrate", "oxygensaturation" = "spo2avg") # Add i2b2 columns from concept map (ontology file) and clean the output +cat("recoverutils::process_df()....") output_concepts <- recoverutils::process_df(df_summarized, concept_map, @@ -99,7 +101,7 @@ output_concepts <- dplyr::mutate(dplyr::across(.cols = dplyr::everything(), .fns = as.character)) %>% replace(is.na(.), "") %>% dplyr::filter(nval_num != "" | tval_char != "") -cat("recoverutils::process_df() completed.\n") +cat("OK\n") # Identify the participants who have output concepts derived from healthkit variables curr_hk_participants <- diff --git a/scripts/process-data/healthkitv2statistics.R b/scripts/process-data/healthkitv2statistics.R index 1bf0a2d..1861273 100644 --- a/scripts/process-data/healthkitv2statistics.R +++ b/scripts/process-data/healthkitv2statistics.R @@ -49,17 +49,19 @@ df_melted_filtered <- cat("Melt and filtering step completed.\n") # Generate i2b2 summaries +cat("recoverutils::stat_summarize()....") df_summarized <- df_melted_filtered %>% rename(enddate = "date") %>% select(all_of(c("participantidentifier", "startdate", "enddate", "concept", "value"))) %>% recoverutils::stat_summarize() %>% distinct() -cat("recoverutils::stat_summarize() completed.\n") +cat("OK\n") tmp_concept_replacements <- c("dailysteps" = "steps") # Add i2b2 columns from concept map (ontology file) and clean the output +cat("recoverutils::process_df()....") output_concepts <- recoverutils::process_df(df_summarized, concept_map, @@ -71,7 +73,7 @@ output_concepts <- dplyr::mutate(dplyr::across(.cols = dplyr::everything(), .fns = as.character)) %>% replace(is.na(.), "") %>% dplyr::filter(nval_num != "" | tval_char != "") -cat("recoverutils::process_df() completed.\n") +cat("OK\n") # Identify the participants who have output concepts derived from healthkit variables curr_hk_participants <- diff --git a/scripts/process-data/participant_devices.R b/scripts/process-data/participant_devices.R index 1e569a7..a0d2fec 100644 --- a/scripts/process-data/participant_devices.R +++ b/scripts/process-data/participant_devices.R @@ -66,6 +66,7 @@ df_joined <- # Add i2b2 columns from concept map (ontology file) and clean the output concept_map_concepts <- "CONCEPT_CD" concept_map_units <- "UNITS_CD" +cat("recoverutils::process_df()....") output_concepts <- df_joined %>% dplyr::mutate(valtype_cd = dplyr::case_when(class(value) == "numeric" ~ "N", @@ -84,7 +85,7 @@ output_concepts <- dplyr::select(participantidentifier, startdate, enddate, concept, valtype_cd, nval_num, tval_char, UNITS_CD) %>% dplyr::rename(units_cd = UNITS_CD) -cat("recoverutils::process_df() completed.\n") +cat("OK\n") # Write the output output_concepts %>%