Skip to content

Commit

Permalink
Pass S3 object path instead of local file path since S3 objects are no longer synced to local directory and are instead directly read from S3 bucket connection
Browse files Browse the repository at this point in the history
  • Loading branch information
pranavanba committed Jul 11, 2024
1 parent 5d18968 commit 612d859
Show file tree
Hide file tree
Showing 9 changed files with 13 additions and 31 deletions.
4 changes: 1 addition & 3 deletions scripts/process-data/fitbitactivitylogs.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
library(dplyr)

dataset <- "fitbitactivitylogs"

cat(glue::glue("Transforming data for {dataset}"),"\n")
Expand All @@ -12,7 +10,7 @@ vars <-

# Load the desired subset of this dataset in memory
df <-
arrow::open_dataset(file.path(downloadLocation, glue::glue("dataset_{dataset}"))) %>%
arrow::open_dataset(s3$path(str_subset(dataset_paths, dataset))) %>%
select(all_of(vars)) %>%
collect()

Expand Down
4 changes: 1 addition & 3 deletions scripts/process-data/fitbitdailydata.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
library(dplyr)

dataset <- "fitbitdailydata"

cat(glue::glue("Transforming data for {dataset}"),"\n")
Expand All @@ -12,7 +10,7 @@ vars <-

# Load the desired subset of this dataset in memory
df <-
arrow::open_dataset(file.path(downloadLocation, glue::glue("dataset_{dataset}"))) %>%
arrow::open_dataset(s3$path(str_subset(dataset_paths, dataset))) %>%
mutate(Steps = as.numeric(Steps),
HeartRateIntradayMinuteCount = as.numeric(HeartRateIntradayMinuteCount)) %>%
select(all_of(c(vars, "HeartRateIntradayMinuteCount"))) %>%
Expand Down
6 changes: 2 additions & 4 deletions scripts/process-data/fitbitecg.R
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,7 @@ ecg_stat_summarize <- function(df) {
return(result)
}

library(dplyr)

dataset <- "fitbitecg"
dataset <- "fitbitecg$"

cat(glue::glue("Transforming data for {dataset}"),"\n")

Expand All @@ -114,7 +112,7 @@ vars <-

# Load the desired subset of this dataset in memory and do some feature engineering for derived variables
df <-
arrow::open_dataset(file.path(downloadLocation, glue::glue("dataset_{dataset}"))) %>%
arrow::open_dataset(s3$path(str_subset(dataset_paths, dataset))) %>%
select(all_of(c(vars))) %>%
filter(ResultClassification %in% c("Normal Sinus Rhythm", "Atrial Fibrillation")) %>%
rename(StartDate = StartTime) %>%
Expand Down
4 changes: 1 addition & 3 deletions scripts/process-data/fitbitintradaycombined.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
library(dplyr)

dataset <- "fitbitintradaycombined"

cat(glue::glue("Transforming data for {dataset}"),"\n")
Expand All @@ -12,7 +10,7 @@ vars <-

# Load the desired subset of this dataset in memory
df <-
arrow::open_dataset(file.path(downloadLocation, glue::glue("dataset_{dataset}"))) %>%
arrow::open_dataset(s3$path(str_subset(dataset_paths, dataset))) %>%
select(all_of(vars)) %>%
mutate(
DeepSleepSummaryBreathRate = as.numeric(DeepSleepSummaryBreathRate),
Expand Down
8 changes: 3 additions & 5 deletions scripts/process-data/fitbitsleeplogs.R
Original file line number Diff line number Diff line change
Expand Up @@ -126,9 +126,7 @@ sleeplogs_stat_summarize <- function(df) {
return(result)
}

library(dplyr)

dataset <- "fitbitsleeplogs"
dataset <- "fitbitsleeplogs$"

cat(glue::glue("Transforming data for {dataset}"),"\n")

Expand All @@ -140,7 +138,7 @@ vars <-

# Load the desired subset of this dataset in memory and do some feature engineering for derived variables
df <-
arrow::open_dataset(file.path(downloadLocation, glue::glue("dataset_{dataset}"))) %>%
arrow::open_dataset(s3$path(str_subset(dataset_paths, dataset))) %>%
select(all_of(c(vars, "LogId"))) %>%
collect() %>%
distinct() %>%
Expand Down Expand Up @@ -212,7 +210,7 @@ sleeplogsdetails_vars <-
pull(Variable)

sleeplogsdetails_df <-
arrow::open_dataset(file.path(downloadLocation, "dataset_fitbitsleeplogs_sleeplogdetails")) %>%
arrow::open_dataset(s3$path(str_subset(dataset_paths, "sleeplogdetails"))) %>%
select(all_of(sleeplogsdetails_vars)) %>%
collect() %>%
distinct() %>%
Expand Down
6 changes: 2 additions & 4 deletions scripts/process-data/healthkitv2electrocardiogram.R
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,7 @@ ecg_stat_summarize <- function(df) {
return(result)
}

library(dplyr)

dataset <- "healthkitv2electrocardiogram"
dataset <- "healthkitv2electrocardiogram$"

cat(glue::glue("Transforming data for {dataset}"),"\n")

Expand All @@ -119,7 +117,7 @@ participants_to_exclude <-

# Load the desired subset of this dataset in memory and do some feature engineering for derived variables
df <-
arrow::open_dataset(file.path(downloadLocation, glue::glue("dataset_{dataset}"))) %>%
arrow::open_dataset(s3$path(str_subset(dataset_paths, dataset))) %>%
select(all_of(c(vars))) %>%
dplyr::filter(!(ParticipantIdentifier %in% participants_to_exclude)) %>%
filter(Classification %in% c("SinusRhythm", "AtrialFibrillation")) %>%
Expand Down
4 changes: 1 addition & 3 deletions scripts/process-data/healthkitv2samples.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
library(dplyr)

dataset <- "healthkitv2samples"

cat(glue::glue("Transforming data for {dataset}"),"\n")
Expand All @@ -17,7 +15,7 @@ participants_to_exclude <-

# Load the desired subset of this dataset in memory
df <-
arrow::open_dataset(file.path(downloadLocation, glue::glue("dataset_{dataset}"))) %>%
arrow::open_dataset(s3$path(str_subset(dataset_paths, dataset))) %>%
select(all_of(vars)) %>%
dplyr::filter(!(ParticipantIdentifier %in% participants_to_exclude)) %>%
dplyr::filter(
Expand Down
4 changes: 1 addition & 3 deletions scripts/process-data/healthkitv2statistics.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
library(dplyr)

dataset <- "healthkitv2statistics"

cat(glue::glue("Transforming data for {dataset}"),"\n")
Expand All @@ -17,7 +15,7 @@ participants_to_exclude <-

# Load the desired subset of this dataset in memory
df <-
arrow::open_dataset(file.path(downloadLocation, glue::glue("dataset_{dataset}"))) %>%
arrow::open_dataset(s3$path(str_subset(dataset_paths, dataset))) %>%
select(all_of(vars)) %>%
dplyr::filter(Type=="DailySteps") %>%
dplyr::filter(!(ParticipantIdentifier %in% participants_to_exclude)) %>%
Expand Down
4 changes: 1 addition & 3 deletions scripts/process-data/participant_devices.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
library(dplyr)

dataset <- c("fitbitdevices", "healthkitv2samples")

cat(glue::glue("Transforming device data for {dataset}"),"\n")
Expand All @@ -15,7 +13,7 @@ vars <- list(fitbitdevices = c("ParticipantIdentifier",
df <-
lapply(dataset, function(x) {
tmp <- vars[[x]]
arrow::open_dataset(file.path(downloadLocation, glue::glue("dataset_{x}"))) %>%
arrow::open_dataset(s3$path(str_subset(dataset_paths, x))) %>%
select(all_of(tmp)) %>%
dplyr::rename_with(tolower) %>%
collect()
Expand Down

0 comments on commit 612d859

Please sign in to comment.