From b624c7e90736b80cc95dac4394a88e805a4acdf9 Mon Sep 17 00:00:00 2001
From: Pranav Anbarasu <pranavanba@gmail.com>
Date: Fri, 13 Sep 2024 00:44:05 +0000
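Subject: [PATCH 1/2] Add internal config profile and switch pipeline to it

Add an `internal` profile to config/config.yml that inherits from `staging`
and sets parquetDirID and s3basekey. pipeline/run-pipeline.R now loads the
`internal` profile instead of `staging`.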
---
 config/config.yml       | 5 +++++
 pipeline/run-pipeline.R | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/config/config.yml b/config/config.yml
index fd6484c..865e102 100644
--- a/config/config.yml
+++ b/config/config.yml
@@ -22,3 +22,8 @@ staging:
 prod:
   inherits: staging
   synFolderID: syn51406700
+
+internal:
+  inherits: staging
+  parquetDirID: syn63135213
+  s3basekey: staging/2024-09-10
diff --git a/pipeline/run-pipeline.R b/pipeline/run-pipeline.R
index 4b9d2ea..5a7f0e7 100644
--- a/pipeline/run-pipeline.R
+++ b/pipeline/run-pipeline.R
@@ -2,7 +2,7 @@ tictoc::tic(msg = "INFO: Total execution time")
 # Get config variables
 list2env(
   x = config::get(file = "config/config.yml", 
-                  config = "staging"),
+                  config = "internal"),
   envir = .GlobalEnv
 )
 

From 27b316f29a10c0fa223377222f65b821abebaa28 Mon Sep 17 00:00:00 2001
From: Pranav Anbarasu <pranavanba@gmail.com>
Date: Fri, 13 Sep 2024 00:45:55 +0000
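Subject: [PATCH 2/2] Update cat() progress statements to make them more
 readable

Print each step's name before it runs and "OK" once it finishes, replacing
the single "<step> completed." line that was printed afterwards. Also remove
the "Running ..._stat_summarize()..." messages that were printed inside
ecg_stat_summarize() and sleeplogs_stat_summarize().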
---
 scripts/process-data/fitbitactivitylogs.R           |  9 ++++++---
 scripts/process-data/fitbitdailydata.R              |  9 ++++++---
 scripts/process-data/fitbitecg.R                    | 10 ++++++----
 scripts/process-data/fitbitintradaycombined.R       |  9 ++++++---
 scripts/process-data/fitbitsleeplogs.R              | 10 ++++++----
 scripts/process-data/healthkitv2electrocardiogram.R | 10 ++++++----
 scripts/process-data/healthkitv2samples.R           |  6 ++++--
 scripts/process-data/healthkitv2statistics.R        |  6 ++++--
 scripts/process-data/participant_devices.R          |  3 ++-
 9 files changed, 46 insertions(+), 26 deletions(-)

diff --git a/scripts/process-data/fitbitactivitylogs.R b/scripts/process-data/fitbitactivitylogs.R
index efc0475..3558186 100644
--- a/scripts/process-data/fitbitactivitylogs.R
+++ b/scripts/process-data/fitbitactivitylogs.R
@@ -49,6 +49,7 @@ for (col_name in names(df_filtered)) {
 }
 
 # Pivot data frame from long to wide
+cat("recoverutils::melt_df()....")
 df_melted_filtered <- 
   df_filtered %>% 
   recoverutils::melt_df(excluded_concepts = excluded_concepts) %>% 
@@ -58,17 +59,19 @@ df_melted_filtered <-
          if("value" %in% colnames(.)) "value") %>% 
   tidyr::drop_na("value") %>% 
   mutate(value = as.numeric(value))
-cat("recoverutils::melt_df() completed.\n")
+cat("OK\n")
 
 # Generate i2b2 summaries
+cat("recoverutils::stat_summarize()....")
 df_summarized <- 
   df_melted_filtered %>% 
   select(all_of(c("participantidentifier", "startdate", "enddate", "concept", "value"))) %>% 
   recoverutils::stat_summarize() %>% 
   distinct()
-cat("recoverutils::stat_summarize() completed.\n")
+cat("OK\n")
 
 # Add i2b2 columns from concept map (ontology file) and clean the output
+cat("recoverutils::process_df()....")
 output_concepts <- 
   recoverutils::process_df(df_summarized, concept_map, concept_replacements_reversed, concept_map_concepts = "CONCEPT_CD", concept_map_units = "UNITS_CD") %>% 
   dplyr::mutate(nval_num = signif(nval_num, 9)) %>% 
@@ -76,7 +79,7 @@ output_concepts <-
   dplyr::mutate(dplyr::across(.cols = dplyr::everything(), .fns = as.character)) %>% 
   replace(is.na(.), "<null>") %>% 
   dplyr::filter(nval_num != "<null>" | tval_char != "<null>")
-cat("recoverutils::process_df() completed.\n")
+cat("OK\n")
 
 # Identify the participants who have output concepts derived from fitbit variables
 fitbit_participants <- 
diff --git a/scripts/process-data/fitbitdailydata.R b/scripts/process-data/fitbitdailydata.R
index 3c5ed07..51296c7 100644
--- a/scripts/process-data/fitbitdailydata.R
+++ b/scripts/process-data/fitbitdailydata.R
@@ -54,6 +54,7 @@ for (col_name in names(df_filtered)) {
 }
 
 # Pivot data frame from long to wide
+cat("recoverutils::melt_df()....")
 df_melted_filtered <- 
   df_filtered %>% 
   recoverutils::melt_df(excluded_concepts = excluded_concepts) %>% 
@@ -63,9 +64,10 @@ df_melted_filtered <-
          if("value" %in% colnames(.)) "value") %>% 
   tidyr::drop_na("value") %>% 
   mutate(value = as.numeric(value))
-cat("recoverutils::melt_df() completed.\n")
+cat("OK\n")
 
 # Generate i2b2 summaries
+cat("recoverutils::stat_summarize()....")
 df_summarized <- 
   df_melted_filtered %>% 
   rename(startdate = dplyr::any_of(c("date", "datetime"))) %>% 
@@ -73,9 +75,10 @@ df_summarized <-
   select(all_of(c("participantidentifier", "startdate", "enddate", "concept", "value"))) %>% 
   recoverutils::stat_summarize() %>% 
   distinct()
-cat("recoverutils::stat_summarize() completed.\n")
+cat("OK\n")
 
 # Add i2b2 columns from concept map (ontology file) and clean the output
+cat("recoverutils::process_df()....")
 output_concepts <- 
   recoverutils::process_df(df_summarized, 
              concept_map, 
@@ -87,7 +90,7 @@ output_concepts <-
   dplyr::mutate(dplyr::across(.cols = dplyr::everything(), .fns = as.character)) %>% 
   replace(is.na(.), "<null>") %>% 
   dplyr::filter(nval_num != "<null>" | tval_char != "<null>")
-cat("recoverutils::process_df() completed.\n")
+cat("OK\n")
 
 # Identify the participants who have output concepts derived from fitbit variables
 curr_fitbit_participants <- 
diff --git a/scripts/process-data/fitbitecg.R b/scripts/process-data/fitbitecg.R
index 5fee30b..f683420 100644
--- a/scripts/process-data/fitbitecg.R
+++ b/scripts/process-data/fitbitecg.R
@@ -15,7 +15,6 @@
 #' `participantidentifier` and `concept` for all data, and 
 #' `participantidentifier`, `concept`, `year`, `week` for weekly summaries).
 ecg_stat_summarize <- function(df) {
-  cat("Running ecg_stat_summarize()...\n")
   
   if (!is.data.frame(df)) stop("df must be a data frame")
   if (!all(c("participantidentifier", "concept", "value") %in% colnames(df))) stop("'participantidentifier', 'concept', and 'value' columns must be present in df")
@@ -131,6 +130,7 @@ approved_concepts_summarized <-
   )
 
 # Pivot data frame from long to wide
+cat("recoverutils::melt_df()....")
 df_melted_filtered <- 
   df %>% 
   mutate("SinusRhythm" = if_else(resultclassification == "Normal Sinus Rhythm", 1, NA),
@@ -143,9 +143,10 @@ df_melted_filtered <-
          if("value" %in% colnames(.)) "value") %>% 
   tidyr::drop_na("value") %>% 
   mutate(value = as.numeric(value))
-cat("recoverutils::melt_df() completed.\n")
+cat("OK\n")
 
 # Generate i2b2 summaries
+cat("ecg_stat_summarize()....")
 df_summarized <- 
   df_melted_filtered %>% 
   rename(startdate = dplyr::any_of(c("date", "datetime"))) %>% 
@@ -154,9 +155,10 @@ df_summarized <-
   ecg_stat_summarize() %>% 
   mutate(value = as.numeric(value)) %>% 
   distinct()
-cat("ecg_stat_summarize() completed.\n")
+cat("OK\n")
 
 # Add i2b2 columns from concept map (ontology file) and clean the output
+cat("recoverutils::process_df()....")
 output_concepts <- 
   recoverutils::process_df(df_summarized, 
                                 concept_map, 
@@ -168,7 +170,7 @@ output_concepts <-
   dplyr::mutate(dplyr::across(.cols = dplyr::everything(), .fns = as.character)) %>% 
   replace(is.na(.), "<null>") %>% 
   dplyr::filter(nval_num != "<null>" | tval_char != "<null>")
-cat("recoverutils::process_df() completed.\n")
+cat("OK\n")
 
 # Identify the participants who have output concepts derived from fitbit variables
 curr_fitbit_participants <- 
diff --git a/scripts/process-data/fitbitintradaycombined.R b/scripts/process-data/fitbitintradaycombined.R
index 503b8a1..127af6a 100644
--- a/scripts/process-data/fitbitintradaycombined.R
+++ b/scripts/process-data/fitbitintradaycombined.R
@@ -72,6 +72,7 @@ for (col_name in names(df_filtered)) {
 }
 
 # Pivot data frame from long to wide
+cat("recoverutils::melt_df()....")
 df_melted_filtered <- 
   df_filtered %>% 
   recoverutils::melt_df(excluded_concepts = excluded_concepts) %>% 
@@ -81,9 +82,10 @@ df_melted_filtered <-
          if("value" %in% colnames(.)) "value") %>% 
   tidyr::drop_na("value") %>% 
   mutate(value = as.numeric(value))
-cat("recoverutils::melt_df() completed.\n")
+cat("OK\n")
 
 # Generate i2b2 summaries
+cat("recoverutils::stat_summarize()....")
 df_summarized <- 
   df_melted_filtered %>% 
   rename(startdate = dplyr::any_of(c("date", "datetime"))) %>% 
@@ -91,9 +93,10 @@ df_summarized <-
   select(all_of(c("participantidentifier", "startdate", "enddate", "concept", "value"))) %>% 
   recoverutils::stat_summarize() %>% 
   distinct()
-cat("recoverutils::stat_summarize() completed.\n")
+cat("OK\n")
 
 # Add i2b2 columns from concept map (ontology file) and clean the output
+cat("recoverutils::process_df()....")
 output_concepts <- 
   recoverutils::process_df(df_summarized, concept_map, concept_replacements_reversed, concept_map_concepts = "CONCEPT_CD", concept_map_units = "UNITS_CD") %>% 
   dplyr::mutate(nval_num = signif(nval_num, 9)) %>% 
@@ -101,7 +104,7 @@ output_concepts <-
   dplyr::mutate(dplyr::across(.cols = dplyr::everything(), .fns = as.character)) %>% 
   replace(is.na(.), "<null>") %>% 
   dplyr::filter(nval_num != "<null>" | tval_char != "<null>")
-cat("recoverutils::process_df() completed.\n")
+cat("OK\n")
 
 # Identify the participants who have output concepts derived from fitbit variables
 curr_fitbit_participants <- 
diff --git a/scripts/process-data/fitbitsleeplogs.R b/scripts/process-data/fitbitsleeplogs.R
index ea122b4..255a39f 100644
--- a/scripts/process-data/fitbitsleeplogs.R
+++ b/scripts/process-data/fitbitsleeplogs.R
@@ -15,7 +15,6 @@
 #' `participantidentifier` and `concept` for all data, and 
 #' `participantidentifier`, `concept`, `year`, `week` for weekly summaries).
 sleeplogs_stat_summarize <- function(df) {
-  cat("Running sleeplogs_stat_summarize()...\n")
   
   if (!is.data.frame(df)) stop("df must be a data frame")
   if (!all(c("participantidentifier", "concept", "value") %in% colnames(df))) stop("'participantidentifier', 'concept', and 'value' columns must be present in df")
@@ -332,6 +331,7 @@ for (col_name in names(df_filtered)) {
 }
 
 # Pivot data frames from long to wide
+cat("recoverutils::melt_df()....")
 df_melted_filtered <- 
   df_filtered %>% 
   recoverutils::melt_df(excluded_concepts = excluded_concepts) %>% 
@@ -366,9 +366,10 @@ numepisodes_df_melted_filtered_weekly <-
   tidyr::drop_na("value") %>%
   mutate(value = as.numeric(value))
 
-cat("recoverutils::melt_df() completed.\n")
+cat("OK\n")
 
 # Generate i2b2 summaries
+cat("sleeplogs_stat_summarize()....")
 df_summarized <- 
   df_melted_filtered %>% 
   select(all_of(c("participantidentifier", "startdate", "enddate", "concept", "value"))) %>% 
@@ -397,9 +398,10 @@ final_df_summarized <-
                    numepisodes_df_summarized_weekly) %>% 
   dplyr::distinct()
 
-cat("sleeplogs_stat_summarize() completed.\n")
+cat("OK\n")
 
 # Add i2b2 columns from concept map (ontology file) and clean the output
+cat("recoverutils::process_df()....")
 output_concepts <- 
   recoverutils::process_df(final_df_summarized, concept_map, concept_replacements_reversed, concept_map_concepts = "CONCEPT_CD", concept_map_units = "UNITS_CD") %>% 
   dplyr::mutate(nval_num = signif(nval_num, 9)) %>% 
@@ -407,7 +409,7 @@ output_concepts <-
   dplyr::mutate(dplyr::across(.cols = dplyr::everything(), .fns = as.character)) %>% 
   replace(is.na(.), "<null>") %>% 
   dplyr::filter(nval_num != "<null>" | tval_char != "<null>")
-cat("recoverutils::process_df() completed.\n")
+cat("OK\n")
 
 # Identify the participants who have output concepts derived from fitbit variables
 curr_fitbit_participants <- 
diff --git a/scripts/process-data/healthkitv2electrocardiogram.R b/scripts/process-data/healthkitv2electrocardiogram.R
index cfde8cc..1cf9cc3 100644
--- a/scripts/process-data/healthkitv2electrocardiogram.R
+++ b/scripts/process-data/healthkitv2electrocardiogram.R
@@ -15,7 +15,6 @@
 #' `participantidentifier` and `concept` for all data, and 
 #' `participantidentifier`, `concept`, `year`, `week` for weekly summaries).
 ecg_stat_summarize <- function(df) {
-  cat("Running ecg_stat_summarize()...\n")
   
   if (!is.data.frame(df)) stop("df must be a data frame")
   if (!all(c("participantidentifier", "concept", "value") %in% colnames(df))) stop("'participantidentifier', 'concept', and 'value' columns must be present in df")
@@ -136,6 +135,7 @@ approved_concepts_summarized <-
   )
 
 # Pivot data frame from long to wide
+cat("recoverutils::melt_df()....")
 df_melted_filtered <- 
   df %>% 
   mutate("SinusRhythm" = if_else(classification == "SinusRhythm", 1, NA),
@@ -148,18 +148,20 @@ df_melted_filtered <-
          if("value" %in% colnames(.)) "value") %>% 
   tidyr::drop_na("value") %>% 
   mutate(value = as.numeric(value))
-cat("recoverutils::melt_df() completed.\n")
+cat("OK\n")
 
 # Generate i2b2 summaries
+cat("ecg_stat_summarize()....")
 df_summarized <- 
   df_melted_filtered %>% 
   select(all_of(c("participantidentifier", "startdate", "enddate", "concept", "value"))) %>% 
   ecg_stat_summarize() %>% 
   mutate(value = as.numeric(value)) %>% 
   distinct()
-cat("ecg_stat_summarize() completed.\n")
+cat("OK\n")
 
 # Add i2b2 columns from concept map (ontology file) and clean the output
+cat("recoverutils::process_df()....")
 output_concepts <- 
   recoverutils::process_df(df_summarized, 
                                 concept_map, 
@@ -171,7 +173,7 @@ output_concepts <-
   dplyr::mutate(dplyr::across(.cols = dplyr::everything(), .fns = as.character)) %>% 
   replace(is.na(.), "<null>") %>% 
   dplyr::filter(nval_num != "<null>" | tval_char != "<null>")
-cat("recoverutils::process_df() completed.\n")
+cat("OK\n")
 
 # Identify the participants who have output concepts derived from healthkit variables
 curr_hk_participants <- 
diff --git a/scripts/process-data/healthkitv2samples.R b/scripts/process-data/healthkitv2samples.R
index af4c9e5..7aec755 100644
--- a/scripts/process-data/healthkitv2samples.R
+++ b/scripts/process-data/healthkitv2samples.R
@@ -74,13 +74,14 @@ df_melted_filtered <-
 cat("Melt and filtering step completed.\n")
 
 # Generate i2b2 summaries
+cat("recoverutils::stat_summarize()....")
 df_summarized <- 
   df_melted_filtered %>% 
   rename(enddate = "date") %>% 
   select(all_of(c("participantidentifier", "startdate", "enddate", "concept", "value"))) %>% 
   recoverutils::stat_summarize() %>% 
   distinct()
-cat("recoverutils::stat_summarize() completed.\n")
+cat("OK\n")
 
 tmp_concept_replacements <- c("respiratoryrate" = "breathingrate",
                               "heartratevariability" = "hrv",
@@ -88,6 +89,7 @@ tmp_concept_replacements <- c("respiratoryrate" = "breathingrate",
                               "oxygensaturation" = "spo2avg")
 
 # Add i2b2 columns from concept map (ontology file) and clean the output
+cat("recoverutils::process_df()....")
 output_concepts <- 
   recoverutils::process_df(df_summarized, 
              concept_map, 
@@ -99,7 +101,7 @@ output_concepts <-
   dplyr::mutate(dplyr::across(.cols = dplyr::everything(), .fns = as.character)) %>% 
   replace(is.na(.), "<null>") %>% 
   dplyr::filter(nval_num != "<null>" | tval_char != "<null>")
-cat("recoverutils::process_df() completed.\n")
+cat("OK\n")
 
 # Identify the participants who have output concepts derived from healthkit variables
 curr_hk_participants <- 
diff --git a/scripts/process-data/healthkitv2statistics.R b/scripts/process-data/healthkitv2statistics.R
index 1bf0a2d..1861273 100644
--- a/scripts/process-data/healthkitv2statistics.R
+++ b/scripts/process-data/healthkitv2statistics.R
@@ -49,17 +49,19 @@ df_melted_filtered <-
 cat("Melt and filtering step completed.\n")
 
 # Generate i2b2 summaries
+cat("recoverutils::stat_summarize()....")
 df_summarized <- 
   df_melted_filtered %>% 
   rename(enddate = "date") %>% 
   select(all_of(c("participantidentifier", "startdate", "enddate", "concept", "value"))) %>% 
   recoverutils::stat_summarize() %>% 
   distinct()
-cat("recoverutils::stat_summarize() completed.\n")
+cat("OK\n")
 
 tmp_concept_replacements <- c("dailysteps" = "steps")
 
 # Add i2b2 columns from concept map (ontology file) and clean the output
+cat("recoverutils::process_df()....")
 output_concepts <- 
   recoverutils::process_df(df_summarized, 
              concept_map, 
@@ -71,7 +73,7 @@ output_concepts <-
   dplyr::mutate(dplyr::across(.cols = dplyr::everything(), .fns = as.character)) %>% 
   replace(is.na(.), "<null>") %>% 
   dplyr::filter(nval_num != "<null>" | tval_char != "<null>")
-cat("recoverutils::process_df() completed.\n")
+cat("OK\n")
 
 # Identify the participants who have output concepts derived from healthkit variables
 curr_hk_participants <- 
diff --git a/scripts/process-data/participant_devices.R b/scripts/process-data/participant_devices.R
index 1e569a7..a0d2fec 100644
--- a/scripts/process-data/participant_devices.R
+++ b/scripts/process-data/participant_devices.R
@@ -66,6 +66,7 @@ df_joined <-
 # Add i2b2 columns from concept map (ontology file) and clean the output
 concept_map_concepts <- "CONCEPT_CD"
 concept_map_units <- "UNITS_CD"
+cat("recoverutils::process_df()....")
 output_concepts <- 
   df_joined %>% 
   dplyr::mutate(valtype_cd = dplyr::case_when(class(value) == "numeric" ~ "N", 
@@ -84,7 +85,7 @@ output_concepts <-
   dplyr::select(participantidentifier, startdate, enddate, 
                 concept, valtype_cd, nval_num, tval_char, UNITS_CD) %>% 
   dplyr::rename(units_cd = UNITS_CD)
-cat("recoverutils::process_df() completed.\n")
+cat("OK\n")
 
 # Write the output
 output_concepts %>%