Skip to content

Commit

Permalink
Reduce max_rows_per_file for newly written partitioned parquet datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
pranavanba committed Sep 20, 2023
1 parent 09f5415 commit 0819f3c
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 3 deletions.
2 changes: 1 addition & 1 deletion deidentification.R
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ for (i in seq_along(deidentified_results$deidentified_datasets)) {

arrow::write_dataset(dataset = deidentified_results$deidentified_datasets[[i]],
path = file.path(PARQUET_FINAL_LOCATION, names(deidentified_results$deidentified_datasets)[[i]]),
-                     max_rows_per_file = 900000,
+                     max_rows_per_file = 100000,
partitioning = c('cohort'),
existing_data_behavior = 'delete_matching')
}
Expand Down
4 changes: 2 additions & 2 deletions filtering.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ dob2age <- function(dataset, column, output=PARQUET_FILTERED_LOCATION) {
arrow::open_dataset(sources = input_path) %>%
dplyr::mutate(age = lubridate::year(lubridate::today())-lubridate::year(lubridate::as_date(!!sym(column)))) %>%
arrow::write_dataset(path = input_path,
-                       max_rows_per_file = 900000,
+                       max_rows_per_file = 100000,
partitioning = c('cohort'),
existing_data_behavior = 'delete_matching')
}
Expand All @@ -23,7 +23,7 @@ drop_cols_datasets <- function(dataset, columns=c(), output=PARQUET_FILTERED_LOC
arrow::open_dataset(sources = input_path) %>%
dplyr::select(!columns) %>%
arrow::write_dataset(path = final_path,
-                       max_rows_per_file = 900000,
+                       max_rows_per_file = 100000,
partitioning = c('cohort'),
existing_data_behavior = 'delete_matching')
}
Expand Down

0 comments on commit 0819f3c

Please sign in to comment.