Skip to content

Commit

Permalink
Handle missing newlines and non-ASCII characters, and fix status checking logic to handle unmatched dictionary values
Browse files Browse the repository at this point in the history
  • Loading branch information
pranavanba committed Oct 2, 2024
1 parent 730efcb commit 5c7260f
Showing 1 changed file with 18 additions and 4 deletions.
22 changes: 18 additions & 4 deletions scripts/deidentification/deidentification.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,14 @@ list.files("./dictionaries", full.names = T) %>%
junk <- lapply(list.files("./dictionaries/", full.names = T), function(f) {
lines <- readLines(f)

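# Intended to ensure the file ends with a newline.
# NOTE(review): readLines() returns lines without their terminators, so the
# check below cannot detect an existing trailing newline; it is TRUE for any
# non-empty file. writeLines() already terminates every element with "\n", so
# the appended "\n" presumably yields an extra blank line at the end - confirm.
# NOTE(review): for an empty file, lines[length(lines)] is character(0) and the
# if() condition has length zero, which is an error - confirm whether empty
# dictionary files can occur.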
if (substr(lines[length(lines)], nchar(lines[length(lines)]), nchar(lines[length(lines)])) != "\n") {
lines[length(lines)] <- paste0(lines[length(lines)] , "\n")
}

# Process and clean the lines
modified_lines <- lapply(lines, function(line) {
line <- iconv(line, from = "UTF-8", to = "ASCII//TRANSLIT", sub = "")
line <- gsub('"', '', line)
if (grepl(",APPROVED|,UNAPPROVED", line)) {
line <- gsub('(.*?)"?(,APPROVED|,approved|,UNAPPROVED|,unapproved)', '"\\1"\\2', line)
Expand All @@ -32,6 +39,7 @@ junk <- lapply(list.files("./dictionaries/", full.names = T), function(f) {
writeLines(modified_lines, f)
})


store_dicts <- function(files_dir) {
dicts <- list()

Expand Down Expand Up @@ -66,18 +74,24 @@ deidentify <- function(dicts_list, parquet_dir) {

out <- df
out[[var_name]] <- trimws(out[[var_name]])

needs_review <- character(0)

for (j in 1:nrow(out)) {
val <- out[[var_name]][j]
status <- dicts_list[[i]][[2]][which(dicts_list[[i]][[1]]==val)]

if (val %in% dicts_list[[i]][[var_name]]) {
# Find the status for the value
status_idx <- which(dicts_list[[i]][[1]] == val)

if (length(status_idx) > 0) {
status <- dicts_list[[i]][[2]][status_idx]

# Check the status and apply de-identification
if (status == "UNAPPROVED") {
out[[var_name]][j] <- NA
}
} else {
# No match in dictionary, mark for review and set value to NA
needs_review <- c(needs_review, val)
out[[var_name]][j] <- NA
}
Expand All @@ -88,6 +102,7 @@ deidentify <- function(dicts_list, parquet_dir) {
out_list[[i]] <- out
review_list[[i]] <- needs_review
}

names(out_list) <- names(dicts_list)
names(review_list) <- names(dicts_list)

Expand All @@ -99,7 +114,6 @@ deidentify <- function(dicts_list, parquet_dir) {

deidentified_results <- deidentify(dicts, PARQUET_FINAL_LOCATION)


# Write de-identified datasets to parquet datasets dir --------------------
for (i in seq_along(deidentified_results$deidentified_datasets)) {
dir <- file.path(PARQUET_FINAL_LOCATION, names(deidentified_results$deidentified_datasets)[[i]])
Expand Down

0 comments on commit 5c7260f

Please sign in to comment.