From 4fd82d868e965a67e7b2c4d62239821afda64618 Mon Sep 17 00:00:00 2001
From: Pranav Anbarasu <pranavanba@gmail.com>
Date: Wed, 1 Nov 2023 21:36:03 +0000
Subject: [PATCH 1/4] Specify input, output, and partitions arguments in function calls

---
 filtering.R | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/filtering.R b/filtering.R
index a8397d0..4e0ab6f 100644
--- a/filtering.R
+++ b/filtering.R
@@ -56,7 +56,10 @@ drop_cols_datasets <- function(dataset, columns=c(), input = AWS_PARQUET_DOWNLOA
 
 
 # Filtering ---------------------------------------------------------------
-dob2age("dataset_enrolledparticipants", "DateOfBirth")
+dob2age(dataset = "dataset_enrolledparticipants", 
+        column = "DateOfBirth", 
+        input = AWS_PARQUET_DOWNLOAD_LOCATION, 
+        partitions = "cohort")
 
 unlink(PARQUET_FILTERED_LOCATION, recursive = T, force = T)
 
@@ -67,7 +70,11 @@ pii_to_drop <- synGet('syn52523394')$path %>% read.csv()
 tmp <- 
   lapply(seq_len(nrow(pii_to_drop)), function(i) {
     cat(i, "Dropping", pii_to_drop$column_to_be_dropped[[i]], "from", pii_to_drop$dataset[[i]], "\n")
-    drop_cols_datasets(dataset = pii_to_drop$dataset[[i]], columns = pii_to_drop$column_to_be_dropped[[i]])
+    drop_cols_datasets(dataset = pii_to_drop$dataset[[i]], 
+                       columns = pii_to_drop$column_to_be_dropped[[i]], 
+                       input = AWS_PARQUET_DOWNLOAD_LOCATION, 
+                       output = PARQUET_FILTERED_LOCATION, 
+                       partitions = "cohort")
     })
 
 rm(pii_to_drop)

From 8d4d09fa312f1828cee028eeac7e486af39a3058 Mon Sep 17 00:00:00 2001
From: Pranav Anbarasu <pranavanba@gmail.com>
Date: Wed, 1 Nov 2023 21:36:24 +0000
Subject: [PATCH 2/4] Fix manifest file_key prefix offset and synStore activityName argument

---
 sts_synindex_external.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sts_synindex_external.R b/sts_synindex_external.R
index da084ae..8d3a5ff 100644
--- a/sts_synindex_external.R
+++ b/sts_synindex_external.R
@@ -170,14 +170,14 @@ system(manifest_cmd)
 
 # Index files in Synapse --------------------------------------------------
 # Get a list of all files to upload and their synapse locations (parentId)
-STR_LEN_PARQUET_FINAL_LOCATION <- stringr::str_length(PARQUET_FINAL_LOCATION)
+STR_LEN_PARQUET_FINAL_LOCATION <- stringr::str_length(AWS_ARCHIVE_DOWNLOAD_LOCATION)
 
 ## List all local files present (from manifest)
 synapse_manifest <- 
   read.csv('./current_manifest.tsv', sep = '\t', stringsAsFactors = F) %>%
   dplyr::filter(!grepl('owner.txt', path)) %>%
   dplyr::rowwise() %>%
-  dplyr::mutate(file_key = stringr::str_sub(string = path, start = STR_LEN_PARQUET_FINAL_LOCATION)) %>%
+  dplyr::mutate(file_key = stringr::str_sub(string = path, start = STR_LEN_PARQUET_FINAL_LOCATION+2)) %>%
   dplyr::mutate(s3_file_key = paste0(PARQUET_BUCKET_BASE_KEY_ARCHIVE, file_key)) %>%
   dplyr::mutate(md5_hash = as.character(tools::md5sum(path))) %>%
   dplyr::ungroup()
@@ -232,8 +232,8 @@ if(nrow(synapse_manifest_to_upload) > 0){
               name = new_fileName)
     
     f <- synStore(f, 
-                  activity = "Indexing", 
-                  activityDescription = "Indexing external parquet datasets", 
+                  activityName = "Indexing", 
+                  activityDescription = "Indexing external parquet datasets",
                   used = PARQUET_FOLDER_INTERNAL, 
                   executed = latest_commit_tree_url)
     

From e1cdb3e29bb3d7ab949085b0a3cf264ba34b4ab8 Mon Sep 17 00:00:00 2001
From: Pranav Anbarasu <pranavanba@gmail.com>
Date: Tue, 7 Nov 2023 22:31:02 +0000
Subject: [PATCH 3/4] Ignore files and dirs whose names contain "pilot"

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index df8fb74..f2c5f43 100644
--- a/.gitignore
+++ b/.gitignore
@@ -38,3 +38,4 @@ dictionaries
 dev*
 misc*
 *temp*
+*pilot*

From 1b2c2db6983c7db310fb67330d8c8f4220584383 Mon Sep 17 00:00:00 2001
From: Pranav Anbarasu <pranavanba@gmail.com>
Date: Wed, 8 Nov 2023 17:16:09 +0000
Subject: [PATCH 4/4] Use basename_template when writing parquet datasets

---
 filtering.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/filtering.R b/filtering.R
index 4e0ab6f..aa92142 100644
--- a/filtering.R
+++ b/filtering.R
@@ -50,7 +50,8 @@ drop_cols_datasets <- function(dataset, columns=c(), input = AWS_PARQUET_DOWNLOA
       arrow::write_dataset(path = final_path, 
                            max_rows_per_file = 100000,
                            partitioning = partitions, 
-                           existing_data_behavior = 'delete_matching')
+                           existing_data_behavior = 'delete_matching',
+                           basename_template = paste0("part-0000{i}.", as.character("parquet")))
   }
 }