Skip to content

Commit

Permalink
Reduce max_rows_per_file for newly written partitioned parquet datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
pranavanba committed Sep 20, 2023
1 parent 09f5415 commit 0819f3c
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 3 deletions.
2 changes: 1 addition & 1 deletion deidentification.R
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ for (i in seq_along(deidentified_results$deidentified_datasets)) {

arrow::write_dataset(dataset = deidentified_results$deidentified_datasets[[i]],
path = file.path(PARQUET_FINAL_LOCATION, names(deidentified_results$deidentified_datasets)[[i]]),
-                     max_rows_per_file = 900000,
+                     max_rows_per_file = 100000,
partitioning = c('cohort'),
existing_data_behavior = 'delete_matching')
}
Expand Down
4 changes: 2 additions & 2 deletions filtering.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ dob2age <- function(dataset, column, output=PARQUET_FILTERED_LOCATION) {
arrow::open_dataset(sources = input_path) %>%
dplyr::mutate(age = lubridate::year(lubridate::today())-lubridate::year(lubridate::as_date(!!sym(column)))) %>%
arrow::write_dataset(path = input_path,
-                       max_rows_per_file = 900000,
+                       max_rows_per_file = 100000,
partitioning = c('cohort'),
existing_data_behavior = 'delete_matching')
}
Expand All @@ -23,7 +23,7 @@ drop_cols_datasets <- function(dataset, columns=c(), output=PARQUET_FILTERED_LOC
arrow::open_dataset(sources = input_path) %>%
dplyr::select(!columns) %>%
arrow::write_dataset(path = final_path,
-                       max_rows_per_file = 900000,
+                       max_rows_per_file = 100000,
partitioning = c('cohort'),
existing_data_behavior = 'delete_matching')
}
Expand Down

0 comments on commit 0819f3c

Please sign in to comment.