Skip to content

Commit

Permalink
Merge pull request #10 from pranavanba/fix-filtering
Browse files Browse the repository at this point in the history
Fix filtering step
  • Loading branch information
pranavanba authored Nov 8, 2023
2 parents 945ad1e + b1b2af2 commit 38c544e
Showing 1 changed file with 10 additions and 6 deletions.
16 changes: 10 additions & 6 deletions filtering.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ drop_cols_datasets <- function(dataset, columns=c(), input = AWS_PARQUET_DOWNLOA
# Destination directory for this dataset's filtered copy.
final_path <- paste0(output, '/', dataset, '/')

# Stream the parquet dataset, drop the requested PII columns, and rewrite it
# partitioned output. dplyr::any_of() tolerates names absent from this
# dataset's schema, whereas a bare `!columns` selection errors on them.
# (The old pre-fix select call was a diff artifact and is removed here.)
arrow::open_dataset(sources = input_path) %>%
  dplyr::select(!dplyr::any_of(columns)) %>%
  arrow::write_dataset(path = final_path,
                       max_rows_per_file = 100000,
                       partitioning = partitions,
Expand All @@ -68,14 +68,18 @@ synLogin()

# Manifest of PII to remove: one row per (dataset, column_to_be_dropped)
# pair, fetched from Synapse.
pii_to_drop <- synGet('syn52523394')$path %>% read.csv()

# Collapse the manifest so each dataset is rewritten exactly once:
# `datasets_to_filter` holds the distinct dataset names, and
# `cols_to_drop[[i]]` holds every PII column for datasets_to_filter[[i]].
datasets_to_filter <- pii_to_drop$dataset %>% unique()
cols_to_drop <- lapply(datasets_to_filter, function(x) {
  pii_to_drop$column_to_be_dropped[which(pii_to_drop$dataset == x)]
})

# Drop the PII columns from each dataset and write the filtered parquet.
# seq_along() is used because `datasets_to_filter` is a vector:
# nrow() on a vector returns NULL, and seq_len(NULL) errors.
tmp <-
  lapply(seq_along(datasets_to_filter), function(i) {
    cat(i, "Dropping", cols_to_drop[[i]], "from", datasets_to_filter[[i]], "\n")
    drop_cols_datasets(dataset = datasets_to_filter[[i]],
                       columns = cols_to_drop[[i]],
                       input = AWS_PARQUET_DOWNLOAD_LOCATION,
                       output = PARQUET_FILTERED_LOCATION,
                       partitions = "cohort")
  })

rm(pii_to_drop)

0 comments on commit 38c544e

Please sign in to comment.