Skip to content

Benchmarks

Aviezer Lifshitz edited this page May 9, 2022 · 16 revisions
# Load the in-development package (devtools::load_all() instead of library()).
devtools::load_all()
# Set the emr_max.data.size option to 1e9 for this session.
options(emr_max.data.size = 1e9)
# Connect to the database used for the timings below.
# NOTE(review): the path looks like a mock database on an internal mount — confirm.
emr_db.connect("/net/mraid14/export/tgdata/db/tgdb/emr/mock/")

Important note: this document is for benchmarking only, and it is not intended for learning naryn!

Logical tracks

Extract physical track

# Track names used by the extraction benchmarks in this section.
ptrack <- "WZMN.dx.1.250.11"
ltrack <- "ltrack"
ltrack_values <- "dx.250.1.star"

# Total time for 100 repeated extractions of `ptrack`.
system.time(replicate(100, emr_extract(ptrack)))
  • vanilla: 1.059
  • dev: 1.096 (the overhead is due to loading with devtools::load_all() instead of library())
  • logical tracks: 0.928

Extract logical track (no values)

# Total time for 100 repeated extractions of `ltrack`.
system.time(replicate(100, emr_extract(ltrack)))
  • logical tracks: 0.914

Extract logical track (with values)

# Note: 10 repetitions here, not 100, so this total is not directly
# comparable with the 100-repetition timings above.
system.time(replicate(10, emr_extract(ltrack_values)))
  • logical tracks: 3.916

Extract virtual track with values

# Virtual track "vt" over "diagnosis.250", created with params = 11.
emr_vtrack.create("vt", "diagnosis.250", params = c(11))
# Total time for 10 repeated extractions of the virtual track.
system.time(replicate(10, emr_extract("vt")))
  • vanilla: 13.729
  • dev: 15.897
  • logical tracks: 13.548

Extract tracks with explicit iterator

# Same physical-track extraction, but with the track itself passed explicitly
# as the iterator (100 repetitions).
system.time(replicate(100, emr_extract(ptrack, iterator = ptrack)))
  • vanilla: 0.955
  • dev: 1.024
  • logical tracks: 0.857
# Same logical-track extraction, but with the track itself passed explicitly
# as the iterator (100 repetitions).
system.time(replicate(100, emr_extract(ltrack, iterator = ltrack)))
  • logical tracks: 0.848

Value filters

# Fresh session setup for the value-filter benchmarks.
devtools::load_all()
# glue is used below to build the screening expression string.
library(glue)
options(emr_max.data.size = 1e9)
emr_db.connect("/net/mraid14/export/tgdata/db/tgdb/emr/mock/")
# Track that is extracted in every benchmark of this section.
dx_track <- "diagnosis.250"
# Track whose values drive the filter.
lab_track <- "lab.103"
# Threshold compared against `lab_track` values with ">=".
val <- 15 # around the 90th percentile

Compare with screen and then extract

# Baseline strategy: screen `lab_track` for points whose value is >= `val`,
# register those points as filter "f" (time.shift from five years back to 0),
# and extract `dx_track` through that filter.
# Reads the globals `lab_track`, `val` and `dx_track`; returns the result of
# emr_extract().
screen_and_extract <- function() {
    screen_expr <- glue("{lab_track} >= {val}")
    lab_points <- emr_screen(screen_expr, iterator = lab_track)

    look_back <- c(-years(5), 0)
    emr_filter.create("f", lab_points, time.shift = look_back)

    emr_extract(dx_track, filter = "f")
}

# Value-filter strategy: define filter "f" directly on `lab_track` with the
# condition `>= val` (time.shift from five years back to 0) and extract
# `dx_track` through it, without a separate screening pass.
# Reads the globals `lab_track`, `val` and `dx_track`; returns the result of
# emr_extract().
value_filter <- function() {
    look_back <- c(-years(5), 0)
    emr_filter.create(
        "f", lab_track,
        val = val,
        operator = ">=",
        time.shift = look_back
    )
    emr_extract(dx_track, filter = "f")
}
# Total time for 10 runs of the screen-then-extract strategy.
system.time(replicate(10, screen_and_extract()))
  • screen and extract: 35.871
# Total time for 10 runs of the value-filter strategy.
system.time(replicate(10, value_filter()))
  • value filter: 11.725

Compare with disk caching (for multiple extractions)

This was our previous strategy, mostly for relative risk computations.

# Reconnect with two database directories: the mock database and the
# session's temporary directory.
# NOTE(review): presumably the temporary directory is where the cached track
# imported below is written — confirm.
tmp <- tempdir()
emr_db.connect(c("/net/mraid14/export/tgdata/db/tgdb/emr/mock/", tmp))

# Caching step: screen `lab_track` for points whose value is >= `val`, give
# every screened point the value 1, and import the result as the categorical
# track "temp" in the "user" space.
# Reads the globals `lab_track` and `val`.
cache_track <- function() {
    screen_expr <- glue("{lab_track} >= {val}")
    screened <- emr_screen(screen_expr, iterator = lab_track)
    screened$value <- 1
    emr_track.import(
        "temp", "user",
        categorical = TRUE,
        src = screened
    )
}

# Extraction step of the caching strategy: define filter "f" on the cached
# track "temp" (time.shift from five years back to 0) and extract `dx_track`
# through it. Expects cache_track() to have been run first.
extract_cache <- function() {
    look_back <- c(-years(5), 0)
    emr_filter.create("f", "temp", time.shift = look_back)
    emr_extract(dx_track, filter = "f")
}

# Full caching strategy: build the cached track once, then run the cached
# extraction `n` times.
#
# n: number of extractions performed after the single caching pass.
# Returns whatever replicate() collects from the `n` extract_cache() calls.
screen_and_extract_cache <- function(n = 10) {
    cache_track()
    replicate(n = n, expr = extract_cache())
}
# Total time for one caching pass followed by 10 cached extractions.
system.time(screen_and_extract_cache(10))
  • screen, cache and extract: 11.649

The caching itself takes 2.496 seconds, and each subsequent extraction takes 0.839 seconds, compared with 1.126 seconds for each extraction that uses a track value filter.

Some examples (there are definitely better ways to do each example)

Anemia

# "Before" version: screen lab.103 per sex, then turn the screened points
# into filters.
withr::local_options(list(emr_max.data.size = 1e9))
# Sex filters, each with a time.shift reaching 120 years back.
emr_filter.create("female", "patients.female", time.shift = c(-years(120), 0))
emr_filter.create("male", "patients.male", time.shift = c(-years(120), 0))
# Points where lab.103 is below the sex-specific threshold (12 under the
# "female" filter, 14 under the "male" filter).
hgb_female <- emr_screen("lab.103 < 12", filter = "female", keepref = TRUE)
hgb_male <- emr_screen("lab.103 < 14", filter = "male", keepref = TRUE)
# Filters built from the distinct (id, time) pairs of the screened points.
# NOTE(review): `%>%` is not attached by any snippet visible on this page —
# presumably it comes in via devtools::load_all(); confirm.
emr_filter.create("abnormal_hgb_female", hgb_female %>% dplyr::distinct(id, time))
emr_filter.create("abnormal_hgb_male", hgb_male %>% dplyr::distinct(id, time))
# Create the track "anemia" from lab.103, restricted to points passing either
# of the two filters.
emr_track.create("anemia",
    categorical = FALSE, expr = "lab.103",
    filter = "abnormal_hgb_female | abnormal_hgb_male"
)

before: 14.642 seconds

# "After" version: no screening pass; the thresholds are expressed as value
# filters directly on lab.103.
withr::local_options(list(emr_max.data.size = 1e9))
# Sex filters, each with a time.shift reaching 120 years back.
emr_filter.create("female", "patients.female", time.shift = c(-years(120), 0))
emr_filter.create("male", "patients.male", time.shift = c(-years(120), 0))
# Value filters: lab.103 < 12 and lab.103 < 14.
emr_filter.create("abnormal_hgb_female", src = "lab.103", val = 12, operator = "<")
emr_filter.create("abnormal_hgb_male", src = "lab.103", val = 14, operator = "<")
# Create the track "anemia1" from lab.103; each value filter is combined with
# its sex filter inside the filter expression.
emr_track.create("anemia1",
    categorical = FALSE, expr = "lab.103",
    filter = "(female & abnormal_hgb_female) | (male & abnormal_hgb_male)"
)

after: 5.504 seconds

Abnormal hematocrit, second time

# "Before" version: screen lab.104 per sex, then build "past" and "current"
# filters from the screened points.
withr::local_options(list(emr_max.data.size = 1e9))
# Sex filters, each with a time.shift reaching 120 years back.
emr_filter.create("female", "patients.female", time.shift = c(-years(120), 0))
emr_filter.create("male", "patients.male", time.shift = c(-years(120), 0))
# Points where lab.104 is above the sex-specific threshold (48 under the
# "female" filter, 49 under the "male" filter).
hct_female_48 <- emr_screen("lab.104 > 48", filter = "female", keepref = TRUE)
hct_male_49 <- emr_screen("lab.104 > 49", filter = "male", keepref = TRUE)
# "Past" filters: the screened points with a time.shift from three years back
# up to -1, i.e. excluding the current time point.
emr_filter.create("abnormal_hct_female_past", hct_female_48 %>% dplyr::distinct(id, time), time.shift = c(-years(3), -1))
emr_filter.create("abnormal_hct_male_past", hct_male_49 %>% dplyr::distinct(id, time), time.shift = c(-years(3), -1))
# "Current" filters: the same screened points with no time.shift.
emr_filter.create("abnormal_hct_female_current", hct_female_48 %>% dplyr::distinct(id, time))
emr_filter.create("abnormal_hct_male_current", hct_male_49 %>% dplyr::distinct(id, time))
# Create the track from lab.104 at points that pass both the "past" and the
# "current" filter of the same sex.
emr_track.create("abnormal_hct_second_time",
    categorical = FALSE, expr = "lab.104",
    filter = "(abnormal_hct_female_past & abnormal_hct_female_current) | (abnormal_hct_male_past & abnormal_hct_male_current)"
)

before: 3.339 seconds

# "After" version: start from a clean set of filters and express the
# thresholds as value filters on lab.104 instead of screening first.
emr_filter.clear()
withr::local_options(list(emr_max.data.size = 1e9))
# Sex filters, each with a time.shift reaching 120 years back.
emr_filter.create("female", "patients.female", time.shift = c(-years(120), 0))
emr_filter.create("male", "patients.male", time.shift = c(-years(120), 0))
# Value filters: lab.104 > 48 and lab.104 > 49.
emr_filter.create("abnormal_hct_female", src = "lab.104", val = 48, operator = ">")
emr_filter.create("abnormal_hct_male", src = "lab.104", val = 49, operator = ">")
# Virtual track over lab.104 carrying the combined sex/value filter, with a
# time.shift from three years back up to -1.
emr_vtrack.create("abnormal_hct_past", src = "lab.104", filter = "(female & abnormal_hct_female) | (male & abnormal_hct_male)", time.shift = c(-years(3), -1))
# Create the track from lab.104, iterating over lab.104 itself.
# NOTE(review): `abnormal_hct_past` is defined with emr_vtrack.create() but is
# referenced inside the `filter` expression — confirm that a virtual-track name
# is accepted there and that this matches the "before" semantics.
emr_track.create("abnormal_hct_second_time1",
    categorical = FALSE, expr = "lab.104",
    filter = "abnormal_hct_past & ((female & abnormal_hct_female) | (male & abnormal_hct_male))", iterator = "lab.104"
)

after: 2.222 seconds