From 4fd82d868e965a67e7b2c4d62239821afda64618 Mon Sep 17 00:00:00 2001 From: Pranav Anbarasu Date: Wed, 1 Nov 2023 21:36:03 +0000 Subject: [PATCH 1/4] Specify input and partition arguments in function calls --- filtering.R | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/filtering.R b/filtering.R index a8397d0..4e0ab6f 100644 --- a/filtering.R +++ b/filtering.R @@ -56,7 +56,10 @@ drop_cols_datasets <- function(dataset, columns=c(), input = AWS_PARQUET_DOWNLOA # Filtering --------------------------------------------------------------- -dob2age("dataset_enrolledparticipants", "DateOfBirth") +dob2age(dataset = "dataset_enrolledparticipants", + column = "DateOfBirth", + input = AWS_PARQUET_DOWNLOAD_LOCATION, + partitions = "cohort") unlink(PARQUET_FILTERED_LOCATION, recursive = T, force = T) @@ -67,7 +70,11 @@ pii_to_drop <- synGet('syn52523394')$path %>% read.csv() tmp <- lapply(seq_len(nrow(pii_to_drop)), function(i) { cat(i, "Dropping", pii_to_drop$column_to_be_dropped[[i]], "from", pii_to_drop$dataset[[i]], "\n") - drop_cols_datasets(dataset = pii_to_drop$dataset[[i]], columns = pii_to_drop$column_to_be_dropped[[i]]) + drop_cols_datasets(dataset = pii_to_drop$dataset[[i]], + columns = pii_to_drop$column_to_be_dropped[[i]], + input = AWS_PARQUET_DOWNLOAD_LOCATION, + output = PARQUET_FILTERED_LOCATION, + partitions = "cohort") }) rm(pii_to_drop) From 8d4d09fa312f1828cee028eeac7e486af39a3058 Mon Sep 17 00:00:00 2001 From: Pranav Anbarasu Date: Wed, 1 Nov 2023 21:36:24 +0000 Subject: [PATCH 2/4] Minor fixes --- sts_synindex_external.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sts_synindex_external.R b/sts_synindex_external.R index da084ae..8d3a5ff 100644 --- a/sts_synindex_external.R +++ b/sts_synindex_external.R @@ -170,14 +170,14 @@ system(manifest_cmd) # Index files in Synapse -------------------------------------------------- # Get a list of all files to upload and their synapse locations (parentId) -STR_LEN_PARQUET_FINAL_LOCATION <- stringr::str_length(PARQUET_FINAL_LOCATION) +STR_LEN_PARQUET_FINAL_LOCATION <- stringr::str_length(AWS_ARCHIVE_DOWNLOAD_LOCATION) ## List all local files present (from manifest) synapse_manifest <- read.csv('./current_manifest.tsv', sep = '\t', stringsAsFactors = F) %>% dplyr::filter(!grepl('owner.txt', path)) %>% dplyr::rowwise() %>% - dplyr::mutate(file_key = stringr::str_sub(string = path, start = STR_LEN_PARQUET_FINAL_LOCATION)) %>% + dplyr::mutate(file_key = stringr::str_sub(string = path, start = STR_LEN_PARQUET_FINAL_LOCATION+2)) %>% dplyr::mutate(s3_file_key = paste0(PARQUET_BUCKET_BASE_KEY_ARCHIVE, file_key)) %>% dplyr::mutate(md5_hash = as.character(tools::md5sum(path))) %>% dplyr::ungroup() @@ -232,8 +232,8 @@ if(nrow(synapse_manifest_to_upload) > 0){ name = new_fileName) f <- synStore(f, - activity = "Indexing", - activityDescription = "Indexing external parquet datasets", + activityName = "Indexing", + activityDescription = "Indexing external parquet datasets", used = PARQUET_FOLDER_INTERNAL, executed = latest_commit_tree_url) From e1cdb3e29bb3d7ab949085b0a3cf264ba34b4ab8 Mon Sep 17 00:00:00 2001 From: Pranav Anbarasu Date: Tue, 7 Nov 2023 22:31:02 +0000 Subject: [PATCH 3/4] Ignore files and dirs whose names contain "pilot" --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index df8fb74..f2c5f43 100644 --- a/.gitignore +++ b/.gitignore @@ -38,3 +38,4 @@ dictionaries dev* misc* *temp* +*pilot* From 1b2c2db6983c7db310fb67330d8c8f4220584383 Mon Sep 17 00:00:00 2001 From: Pranav Anbarasu Date: Wed, 8 Nov 2023 17:16:09 +0000 Subject: [PATCH 4/4] Use basename_template when writing parquet datasets --- filtering.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/filtering.R b/filtering.R index 4e0ab6f..aa92142 100644 --- a/filtering.R +++ b/filtering.R @@ -50,7 +50,8 @@ drop_cols_datasets <- function(dataset, columns=c(), input = AWS_PARQUET_DOWNLOA arrow::write_dataset(path = final_path, max_rows_per_file = 100000, partitioning = partitions, - existing_data_behavior = 'delete_matching') + existing_data_behavior = 'delete_matching', + basename_template = paste0("part-0000{i}.", as.character("parquet"))) } }