get benchmark results

OHDSI · Sep 9, 2024 · 5d3c5e6 · 5d3c5e6
1 parent 1a3eb7d
commit 5d3c5e6
Show file tree

Hide file tree

Showing 4 changed files with 117 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -10,3 +10,4 @@ inst/doc
 docs
 /sql
 /sql_1
+extras/data/*
diff --git a/extras/benchmark.RData b/extras/benchmark.RData
diff --git a/extras/getBenchmarkResults.R b/extras/getBenchmarkResults.R
@@ -0,0 +1,55 @@
+library(readr)
+library(omopgenerics)
+library(here)
+library(dplyr)
+library(tidyr)
+
+# Read every CSV stored inside the zip archives found in a folder.
+#
+# path: folder searched (non-recursively) for zip archives.
+# Returns a named list with one element per CSV, as produced by readFiles(),
+# or NULL when no archive contributes a CSV.
+readData <- function(path) {
+  # Anchor the pattern: list.files() treats it as a regular expression, so the
+  # bare ".zip" also matched names such as "unzipped.csv".
+  zipFiles <- list.files(path = path, pattern = "\\.zip$", full.names = TRUE)
+  tempfolder <- tempdir()
+  data <- NULL
+  for (zipFile in zipFiles) {
+    fname <- unzip(zipFile, list = TRUE)$Name
+    fname <- fname[tools::file_ext(fname) == "csv"]
+    if (length(fname) == 0) {
+      # Nothing to extract or read from this archive.
+      next
+    }
+    unzip(zipFile, files = fname, exdir = tempfolder, overwrite = TRUE)
+    data <- c(data, readFiles(file.path(tempfolder, fname)))
+  }
+  data
+}
+
+# Read CSV files as all-character tables, promoting OMOP results.
+#
+# files: character vector of CSV paths.
+# Returns a list named after each file (basename without extension). A table
+# whose columns all belong to omopgenerics::resultColumns() and whose
+# variable_name column contains "settings" is converted with
+# omopgenerics::newSummarisedResult(); any other table is kept as read.
+readFiles <- function(files) {
+  data <- list()
+  for (file in files) {
+    tbl <- readr::read_csv(
+      file,
+      col_types = readr::cols(.default = readr::col_character())
+    )
+    # `&&` rather than `&`: the condition is scalar, and short-circuiting
+    # avoids touching `variable_name` on tables that are not result-shaped.
+    isResult <- all(colnames(tbl) %in% omopgenerics::resultColumns()) &&
+      "settings" %in% tbl[["variable_name"]]
+    if (isResult) {
+      tbl <- omopgenerics::newSummarisedResult(tbl)
+    }
+    data[[file]] <- tbl
+  }
+  names(data) <- basename(tools::file_path_sans_ext(names(data)))
+  data
+}
+
+# Merge the tables in `data` whose names match each pattern.
+#
+# data: named list of tables, as returned by readData().
+# patterns: character vector of regular expressions matched against
+#   names(data).
+# Returns a list keyed by pattern. When every match is a summarised_result
+# they are combined with omopgenerics::bind(); when none is, rows are stacked
+# with dplyr::bind_rows() and de-duplicated. A mix of both aborts.
+mergeData <- function(data, patterns) {
+  x <- list()
+  for (pat in patterns) {
+    dataSubset <- data[grepl(pat, names(data))]
+    # inherits() copes with multi-element class vectors, unlike `class(x) ==`.
+    isSr <- vapply(dataSubset, inherits, logical(1), what = "summarised_result")
+    if (any(isSr) && !all(isSr)) {
+      cli::cli_abort("Not all results with pattern {pat} have class summarised result.")
+    }
+    if (any(isSr)) {
+      x[[pat]] <- omopgenerics::bind(dataSubset)
+    } else {
+      x[[pat]] <- dplyr::distinct(dplyr::bind_rows(dataSubset))
+    }
+  }
+  x
+}
+
+# Name fragments identifying each family of benchmark result files.
+result_patterns <- c(
+  "time", "comparison", "details", "omop", "index_counts", "sql_indexes"
+)
+# Read the exported archives, merge them per pattern, and store the result.
+data <- mergeData(readData(here("extras", "data")), result_patterns)
+save(data, file = here("extras", "benchmark.RData"))
diff --git a/vignettes/a11_benchmark.Rmd b/vignettes/a11_benchmark.Rmd
@@ -0,0 +1,61 @@
+---
+title: "CohortConstructor benchmark"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{a11_benchmark}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r, include = FALSE}
+# Chunk defaults applied to the whole vignette.
+knitr::opts_chunk$set(collapse = TRUE, comment = "#>", warning = FALSE)
+```
+
+```{r, echo=FALSE}
+library(visOmopResults)
+library(readr)
+library(omopgenerics)
+library(ggplot2)
+library(CohortCharacteristics)
+library(stringr)
+library(here)
+library(dplyr)
+library(tidyr)
+library(gt)
+
+# Set to TRUE to rebuild extras/benchmark.RData from the raw exports.
+createRData <- FALSE
+if (createRData) {
+  # source() takes a single file path; its second positional argument is
+  # `local`, so the path has to be assembled with here() first.
+  source(here("extras", "getBenchmarkResults.R"))
+} else {
+  load(here("extras", "benchmark.RData"))
+}
+```
+
+# Introduction
+
+Cohorts are a fundamental building block for studies that use the OMOP CDM, identifying people who satisfy one or more inclusion criteria for a duration of time based on their clinical records. Currently cohorts are typically built using [CIRCE](https://github.com/OHDSI/circe-be) which allows complex cohorts to be represented using JSON. This JSON is then converted to SQL for execution against a database containing data mapped to the OMOP CDM. CIRCE JSON can be created via the [ATLAS](https://github.com/OHDSI/Atlas) graphical user interface or programmatically via the [Capr](https://github.com/OHDSI/Capr) R package. However, although a powerful tool for expressing and operationalising cohort definitions, the SQL generated can be cumbersome especially for complex cohort definitions. Moreover, when multiple cohorts are defined these are typically instantiated independently which can lead to duplication of work.
+
+The CohortConstructor package provides an alternative approach to building cohorts in data mapped to the OMOP CDM. It promotes building cohorts in a pipeline fashion: base cohorts are created first, and these cohorts are then manipulated to apply specific inclusion criteria.
+
+Rather than constructing cohorts "by definition" where cohorts are built independently, the CohortConstructor introduces the idea of constructing cohorts "by domain", with the aim of reducing repetitive calls to large OMOP tables. More about this approach is described in the [Introduction](https://ohdsi.github.io/CohortConstructor/articles/a00_introduction.html) vignette.
+
+To test the performance of the package we have created a benchmarking script in which we selected 9 phenotypes from the OHDSI library that cover a range of concept domains, entry and inclusion criteria, and cohort exit options. These have been replicated using [CodelistGenerator](https://github.com/darwin-eu/CodelistGenerator) and CohortConstructor, following two approaches: first, to emulate the Atlas+CIRCE workflow, we instantiated each cohort separately ("by definition"); second, we created the 9 cohorts as a set ("by domain").
+
+
+# Timings
+
+## Cohort construction by definition
+
+## Cohort construction by domain
+
+## Use of SQL indexes
+
+# Cohort similarities
+
+# Conclusions
+
+# Methods
-Original file line number
+Diff line change
@@ Expand Up / @@ -10,3 +10,4 @@ inst/doc @@
     docs
     /sql
     /sql_1
+    extras/data/*