Skip to content

Commit

Permalink
Add in support for mixed-mixed join, to verify
Browse files Browse the repository at this point in the history
  • Loading branch information
rsh52 committed Sep 23, 2024
1 parent 7f64232 commit 593254d
Showing 1 changed file with 74 additions and 9 deletions.
83 changes: 74 additions & 9 deletions R/join_data_tibbles.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,27 +11,35 @@ join_data_tibbles <- function(suprtbl,
suprtbl <- suprtbl |>
mutate(pks = purrr::map_chr(.data$redcap_data, ~ extract_keys(., record_id_field = record_id_field))) %>%
select(.data$redcap_form_name, .data$redcap_form_label, .data$redcap_data,
.data$redcap_metadata, .data$structure, .data$pks)
.data$redcap_metadata, .data$structure, .data$pks, .data$redcap_events)

tbl_x <- extract_tibble(suprtbl, x)
tbl_x_type <- get_structure(suprtbl, x)
tbl_x_structure <- get_structure(suprtbl, x)
tbl_y <- extract_tibble(suprtbl, y)
tbl_y_type <- get_structure(suprtbl, y)
tbl_y_structure <- get_structure(suprtbl, y)

# Mixed structure requires special handling
is_mixed <- any(c(tbl_x_type, tbl_y_type) == "mixed")
is_mixed <- any(c(tbl_x_structure, tbl_y_structure) == "mixed")

if (is_mixed) {
required_columns <- c("redcap_event_instance", "redcap_form_instance")
tbl_x <- add_missing_columns(tbl_x, required_columns)
tbl_y <- add_missing_columns(tbl_y, required_columns)
# TODO: Determine if ok to remove
# required_columns <- c("redcap_event_instance", "redcap_form_instance") # nolint: commented_code_linter
# tbl_x <- add_missing_columns(tbl_x, required_columns) # nolint: commented_code_linter
# tbl_y <- add_missing_columns(tbl_y, required_columns) # nolint: commented_code_linter

tbl_x_type <- get_type(suprtbl, x)
tbl_y_type <- get_type(suprtbl, y)

tbl_x <- left_join(tbl_x, tbl_x_type, by = "redcap_event")
tbl_y <- left_join(tbl_y, tbl_y_type, by = "redcap_event")
}

join_fn <- get_join_fn(type)
by <- build_by(suprtbl, x, y, is_mixed)

join_fn(tbl_x, tbl_y, by = by, suffix = suffix) %>%
relocate(starts_with("form_status_complete"), .after = everything())
join_tbls(tbl_x, tbl_y, join_fn, by, suffix, is_mixed) %>%
relocate(starts_with("form_status_complete"), .after = everything()) %>%
select(-starts_with(".repeat_type"))
}

extract_keys <- function(suprtbl, record_id_field) {
Expand All @@ -50,6 +58,16 @@ get_structure <- function(suprtbl, tbl_name) {
suprtbl$structure[suprtbl$redcap_form_name == tbl_name]
}

get_type <- function(suprtbl, tbl_name) {
suprtbl %>%
filter(.data$redcap_form_name == tbl_name) %>%
pull(.data$redcap_events) %>%
pluck(1) %>%
select(.data$redcap_event,
".repeat_type" = .data$repeat_type) %>%
unique()
}

get_join_fn <- function(type) {
join_functions <- list(
left = dplyr::left_join,
Expand Down Expand Up @@ -84,8 +102,55 @@ build_by <- function(suprtbl, x, y, is_mixed) {
out
}

# TODO: Determine if ok to remove
add_missing_columns <- function(tbl, columns) {
missing_cols <- setdiff(columns, names(tbl))
tbl[missing_cols] <- NA
return(tbl)
}

join_tbls <- function(x, y, join_fn, by, suffix, is_mixed) {
if (is_mixed) {
# Filter based on .repeat_type
x_together <- x %>% filter(.data$.repeat_type == "repeat_together")
y_together <- y %>% filter(.data$.repeat_type == "repeat_together")

x_separate <- x %>% filter(.data$.repeat_type == "repeat_separate")
y_separate <- y %>% filter(.data$.repeat_type == "repeat_separate")

# Join together sets
joined_together <- x_together %>%
join_fn(y_together, by = by[by != "redcap_form_instance"], suffix = suffix)

# Join separate sets
joined_separate <- x_separate %>%
join_fn(y_separate, by = by[by != "redcap_form_instance"], suffix = suffix)

# Bind rows together
result <- bind_rows(joined_together, joined_separate) %>%
drop_non_suffix_columns()
} else {
result <- join_fn(x, y, by = by, suffix = suffix)
}

result
}

drop_non_suffix_columns <- function(data) {
# Extract column names that contain a "."
# Note: We can look for periods because REDCap will not allow variables to made
# with them. Only user tampering with column names in the output would result in this.
dot_columns <- names(data)[grepl("\\.", names(data))]

# Extract the base column names without the suffixes (everything before the ".")
base_columns <- unique(sub("\\..*", "", dot_columns))

# Filter out base columns that do not exist without a suffix
columns_to_drop <- base_columns[base_columns %in% names(data)]

# Drop only those base columns that exist both with and without suffixes
data <- data %>%
select(-all_of(columns_to_drop))

return(data)
}

0 comments on commit 593254d

Please sign in to comment.