Pass S3 object path instead of local file path, since S3 objects are no longer synced to a local directory and are instead read directly from the S3 bucket connection
Sage-Bionetworks · Jul 11, 2024 · 612d859 · 612d859
1 parent 5d18968
commit 612d859
Show file tree

Hide file tree

Showing 9 changed files with 13 additions and 31 deletions.
diff --git a/scripts/process-data/fitbitactivitylogs.R b/scripts/process-data/fitbitactivitylogs.R
@@ -1,5 +1,3 @@
-library(dplyr)
-
 dataset <- "fitbitactivitylogs"
 
 cat(glue::glue("Transforming data for {dataset}"),"\n")
@@ -12,7 +10,7 @@ vars <-
 
 # Load the desired subset of this dataset in memory
 df <- 
-  arrow::open_dataset(file.path(downloadLocation, glue::glue("dataset_{dataset}"))) %>% 
+  arrow::open_dataset(s3$path(str_subset(dataset_paths, dataset))) %>% 
   select(all_of(vars)) %>% 
   collect()
 

diff --git a/scripts/process-data/fitbitdailydata.R b/scripts/process-data/fitbitdailydata.R
@@ -1,5 +1,3 @@
-library(dplyr)
-
 dataset <- "fitbitdailydata"
 
 cat(glue::glue("Transforming data for {dataset}"),"\n")
@@ -12,7 +10,7 @@ vars <-
 
 # Load the desired subset of this dataset in memory
 df <- 
-  arrow::open_dataset(file.path(downloadLocation, glue::glue("dataset_{dataset}"))) %>% 
+  arrow::open_dataset(s3$path(str_subset(dataset_paths, dataset))) %>% 
   mutate(Steps = as.numeric(Steps),
          HeartRateIntradayMinuteCount = as.numeric(HeartRateIntradayMinuteCount)) %>% 
   select(all_of(c(vars, "HeartRateIntradayMinuteCount"))) %>% 

diff --git a/scripts/process-data/fitbitecg.R b/scripts/process-data/fitbitecg.R
@@ -100,9 +100,7 @@ ecg_stat_summarize <- function(df) {
   return(result)
 }
 
-library(dplyr)
-
-dataset <- "fitbitecg"
+dataset <- "fitbitecg$"
 
 cat(glue::glue("Transforming data for {dataset}"),"\n")
 
@@ -114,7 +112,7 @@ vars <-
 
 # Load the desired subset of this dataset in memory and do some feature engineering for derived variables
 df <- 
-  arrow::open_dataset(file.path(downloadLocation, glue::glue("dataset_{dataset}"))) %>% 
+  arrow::open_dataset(s3$path(str_subset(dataset_paths, dataset))) %>% 
   select(all_of(c(vars))) %>% 
   filter(ResultClassification %in% c("Normal Sinus Rhythm", "Atrial Fibrillation")) %>% 
   rename(StartDate = StartTime) %>% 

diff --git a/scripts/process-data/fitbitintradaycombined.R b/scripts/process-data/fitbitintradaycombined.R
@@ -1,5 +1,3 @@
-library(dplyr)
-
 dataset <- "fitbitintradaycombined"
 
 cat(glue::glue("Transforming data for {dataset}"),"\n")
@@ -12,7 +10,7 @@ vars <-
 
 # Load the desired subset of this dataset in memory
 df <- 
-  arrow::open_dataset(file.path(downloadLocation, glue::glue("dataset_{dataset}"))) %>% 
+  arrow::open_dataset(s3$path(str_subset(dataset_paths, dataset))) %>% 
   select(all_of(vars)) %>% 
   mutate(
     DeepSleepSummaryBreathRate = as.numeric(DeepSleepSummaryBreathRate),

diff --git a/scripts/process-data/fitbitsleeplogs.R b/scripts/process-data/fitbitsleeplogs.R
@@ -126,9 +126,7 @@ sleeplogs_stat_summarize <- function(df) {
   return(result)
 }
 
-library(dplyr)
-
-dataset <- "fitbitsleeplogs"
+dataset <- "fitbitsleeplogs$"
 
 cat(glue::glue("Transforming data for {dataset}"),"\n")
 
@@ -140,7 +138,7 @@ vars <-
 
 # Load the desired subset of this dataset in memory and do some feature engineering for derived variables
 df <- 
-  arrow::open_dataset(file.path(downloadLocation, glue::glue("dataset_{dataset}"))) %>% 
+  arrow::open_dataset(s3$path(str_subset(dataset_paths, dataset))) %>% 
   select(all_of(c(vars, "LogId"))) %>% 
   collect() %>% 
   distinct() %>% 
@@ -212,7 +210,7 @@ sleeplogsdetails_vars <-
   pull(Variable)
 
 sleeplogsdetails_df <- 
-  arrow::open_dataset(file.path(downloadLocation, "dataset_fitbitsleeplogs_sleeplogdetails")) %>% 
+  arrow::open_dataset(s3$path(str_subset(dataset_paths, "sleeplogdetails"))) %>% 
   select(all_of(sleeplogsdetails_vars)) %>% 
   collect() %>% 
   distinct() %>% 

diff --git a/scripts/process-data/healthkitv2electrocardiogram.R b/scripts/process-data/healthkitv2electrocardiogram.R
@@ -100,9 +100,7 @@ ecg_stat_summarize <- function(df) {
   return(result)
 }
 
-library(dplyr)
-
-dataset <- "healthkitv2electrocardiogram"
+dataset <- "healthkitv2electrocardiogram$"
 
 cat(glue::glue("Transforming data for {dataset}"),"\n")
 
@@ -119,7 +117,7 @@ participants_to_exclude <-
 
 # Load the desired subset of this dataset in memory and do some feature engineering for derived variables
 df <- 
-  arrow::open_dataset(file.path(downloadLocation, glue::glue("dataset_{dataset}"))) %>% 
+  arrow::open_dataset(s3$path(str_subset(dataset_paths, dataset))) %>% 
   select(all_of(c(vars))) %>% 
   dplyr::filter(!(ParticipantIdentifier %in% participants_to_exclude)) %>% 
   filter(Classification %in% c("SinusRhythm", "AtrialFibrillation")) %>% 

diff --git a/scripts/process-data/healthkitv2samples.R b/scripts/process-data/healthkitv2samples.R
@@ -1,5 +1,3 @@
-library(dplyr)
-
 dataset <- "healthkitv2samples"
 
 cat(glue::glue("Transforming data for {dataset}"),"\n")
@@ -17,7 +15,7 @@ participants_to_exclude <-
 
 # Load the desired subset of this dataset in memory
 df <- 
-  arrow::open_dataset(file.path(downloadLocation, glue::glue("dataset_{dataset}"))) %>% 
+  arrow::open_dataset(s3$path(str_subset(dataset_paths, dataset))) %>% 
   select(all_of(vars)) %>% 
   dplyr::filter(!(ParticipantIdentifier %in% participants_to_exclude)) %>%
   dplyr::filter(

diff --git a/scripts/process-data/healthkitv2statistics.R b/scripts/process-data/healthkitv2statistics.R
@@ -1,5 +1,3 @@
-library(dplyr)
-
 dataset <- "healthkitv2statistics"
 
 cat(glue::glue("Transforming data for {dataset}"),"\n")
@@ -17,7 +15,7 @@ participants_to_exclude <-
 
 # Load the desired subset of this dataset in memory
 df <- 
-  arrow::open_dataset(file.path(downloadLocation, glue::glue("dataset_{dataset}"))) %>% 
+  arrow::open_dataset(s3$path(str_subset(dataset_paths, dataset))) %>% 
   select(all_of(vars)) %>% 
   dplyr::filter(Type=="DailySteps") %>% 
   dplyr::filter(!(ParticipantIdentifier %in% participants_to_exclude)) %>% 

diff --git a/scripts/process-data/participant_devices.R b/scripts/process-data/participant_devices.R
@@ -1,5 +1,3 @@
-library(dplyr)
-
 dataset <- c("fitbitdevices", "healthkitv2samples")
 
 cat(glue::glue("Transforming device data for {dataset}"),"\n")
@@ -15,7 +13,7 @@ vars <- list(fitbitdevices = c("ParticipantIdentifier",
 df <- 
   lapply(dataset, function(x) {
     tmp <- vars[[x]]
-    arrow::open_dataset(file.path(downloadLocation, glue::glue("dataset_{x}"))) %>% 
+    arrow::open_dataset(s3$path(str_subset(dataset_paths, x))) %>% # match only the current dataset `x`, not the whole `dataset` vector
       select(all_of(tmp)) %>% 
       dplyr::rename_with(tolower) %>% 
       collect()