Merge pull request #450 from OHDSI/develop

v2.2.0 Release
OHDSI · May 5, 2023 · 162e709 · 162e709
2 parents 1197d5d + 8918738
commit 162e709
Show file tree

Hide file tree

Showing 78 changed files with 1,590 additions and 1,376 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: DataQualityDashboard
 Type: Package
 Title: Execute and View Data Quality Checks on OMOP CDM Database
-Version: 2.1.2
-Date: 2023-03-10
+Version: 2.2.0
+Date: 2023-05-05
 Authors@R: c(
   person("Katy", "Sadowski", email = "[email protected]", role = c("aut", "cre")),
   person("Clair", "Blacketer", role = c("aut")),
@@ -31,7 +31,8 @@ Imports:
     plyr,
     stringr,
     rlang,
-    tidyselect
+    tidyselect,
+    readr
 Suggests:
     testthat,
     knitr,

diff --git a/NAMESPACE b/NAMESPACE
@@ -8,7 +8,10 @@ export(writeJsonResultsToCsv)
 export(writeJsonResultsToTable)
 import(DatabaseConnector)
 import(magrittr)
+importFrom(dplyr,case_when)
+importFrom(dplyr,mutate)
 importFrom(magrittr,"%>%")
+importFrom(readr,read_csv)
 importFrom(rlang,.data)
 importFrom(stats,na.omit)
 importFrom(stats,setNames)
@@ -18,5 +21,4 @@ importFrom(tidyselect,all_of)
 importFrom(utils,install.packages)
 importFrom(utils,menu)
 importFrom(utils,packageVersion)
-importFrom(utils,read.csv)
 importFrom(utils,write.table)
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,25 @@
+DataQualityDashboard 2.2.0
+==========================
+This release includes:
+
+### New features
+
+- `cohortTableName` parameter added to `executeDqChecks`. Allows the user to specify the name of the cohort table when running DQD on a cohort. Defaults to `"cohort"`.
+
+
+### Bugfixes
+
+- Fixed several bugs in the default threshold files:
+  - Updated plausible low value for specimen quantity from 1 to 0
+  - Removed foreign key domains for episode object concept ID (the multitude of plausible domains makes checking this field infeasible)
+  - Updated date format for hard-coded dates to `YYYYMMDD` to conform to SqlRender standard
+  - Added DEATH checks to v5.2 and v5.3
+  - Fixed field level checks to incorporate user-specified `vocabDatabaseSchema` and `cohortDatabaseSchema` where appropriate
+- Removed `outputFile` parameter from DQD setup vignette (variable not set in script)
+- Removed hidden BOM character from several threshold CSV files, and updated the CSV read method to handle a BOM character going forward. This character caused an error on some operating systems.
+
+And some minor documentation updates for clarity/accuracy.
+
 DataQualityDashboard 2.1.2
 ==========================
 

diff --git a/R/executeDqChecks.R b/R/executeDqChecks.R
@@ -34,9 +34,10 @@
 #' @param csvFile                   (OPTIONAL) CSV file to write results
 #' @param checkLevels               Choose which DQ check levels to execute. Default is all 3 (TABLE, FIELD, CONCEPT)
 #' @param checkNames                (OPTIONAL) Choose which check names to execute. Names can be found in inst/csv/OMOP_CDMv[cdmVersion]_Check_Descriptions.csv. Note that "cdmTable", "cdmField" and "measureValueCompleteness" are always executed.
-#' @param cohortDefinitionId        The cohort definition id for the cohort you wish to run the DQD on. The package assumes a standard OHDSI cohort table called 'Cohort'
+#' @param cohortDefinitionId        The cohort definition id for the cohort you wish to run the DQD on. The package assumes a standard OHDSI cohort table
 #'                                  with the fields cohort_definition_id and subject_id.
 #' @param cohortDatabaseSchema      The schema where the cohort table is located.
+#' @param cohortTableName           The name of the cohort table. Defaults to `cohort`.
 #' @param tablesToExclude           (OPTIONAL) Choose which CDM tables to exclude from the execution.
 #' @param cdmVersion                The CDM version to target for the data source. Options are "5.2", "5.3", or "5.4". By default, "5.3" is used.
 #' @param tableCheckThresholdLoc    The location of the threshold file for evaluating the table checks. If not specified the default thresholds will be applied.
@@ -48,9 +49,11 @@
 #' @importFrom magrittr %>%
 #' @import DatabaseConnector
 #' @importFrom stringr str_detect regex
-#' @importFrom utils packageVersion read.csv write.table
+#' @importFrom utils packageVersion write.table
 #' @importFrom rlang .data
 #' @importFrom tidyselect all_of
+#' @importFrom readr read_csv
+#' @importFrom dplyr mutate case_when
 #'
 #' @export
 #'
@@ -72,6 +75,7 @@ executeDqChecks <- function(connectionDetails,
                             checkNames = c(),
                             cohortDefinitionId = c(),
                             cohortDatabaseSchema = resultsDatabaseSchema,
+                            cohortTableName = "cohort",
                             tablesToExclude = c("CONCEPT", "VOCABULARY", "CONCEPT_ANCESTOR", "CONCEPT_RELATIONSHIP", "CONCEPT_CLASS", "CONCEPT_SYNONYM", "RELATIONSHIP", "DOMAIN"),
                             cdmVersion = "5.3",
                             tableCheckThresholdLoc = "default",
@@ -89,6 +93,7 @@ executeDqChecks <- function(connectionDetails,
   stopifnot(is.character(cdmDatabaseSchema), is.character(resultsDatabaseSchema), is.numeric(numThreads))
   stopifnot(is.character(cdmSourceName), is.logical(sqlOnly), is.character(outputFolder), is.logical(verboseMode))
   stopifnot(is.logical(writeToTable), is.character(checkLevels))
+  stopifnot(is.character(cohortDatabaseSchema), is.character(cohortTableName))
 
   if (!all(checkLevels %in% c("TABLE", "FIELD", "CONCEPT"))) {
     stop('checkLevels argument must be a subset of c("TABLE", "FIELD", "CONCEPT").
@@ -169,14 +174,14 @@ executeDqChecks <- function(connectionDetails,
 
   startTime <- Sys.time()
 
-  checkDescriptionsDf <- read.csv(
+  checkDescriptionsDf <- read_csv(
     file = system.file(
       "csv",
       sprintf("OMOP_CDMv%s_Check_Descriptions.csv", cdmVersion),
       package = "DataQualityDashboard"
-    ),
-    stringsAsFactors = FALSE
+    )
   )
+  checkDescriptionsDf <- as.data.frame(checkDescriptionsDf)
 
   tableChecks <- .readThresholdFile(
     checkThresholdLoc = tableCheckThresholdLoc,
@@ -202,9 +207,18 @@ executeDqChecks <- function(connectionDetails,
     conceptChecks <- conceptChecks[!conceptChecks$cdmTableName %in% tablesToExclude, ]
   }
 
-  ## remove offset from being checked
+  ## remove offset from being checked as it is a reserved word in some databases
   fieldChecks <- subset(fieldChecks, fieldChecks$cdmFieldName != "offset")
 
+  tableChecks <- dplyr::mutate(tableChecks, schema = dplyr::case_when(
+    schema == "CDM" ~ cdmDatabaseSchema,
+    schema == "VOCAB" ~ vocabDatabaseSchema,
+    schema == "COHORT" ~ cohortDatabaseSchema,
+    TRUE ~ cdmDatabaseSchema
+  ))
+
+  fieldChecks <- merge(x = fieldChecks, y = tableChecks[, c("cdmTableName", "schema")], by = "cdmTableName", all.x = TRUE)
+
   checksToInclude <- checkDescriptionsDf$checkName[sapply(checkDescriptionsDf$checkName, function(check) {
     !is.null(eval(parse(text = sprintf("tableChecks$%s", check)))) |
       !is.null(eval(parse(text = sprintf("fieldChecks$%s", check)))) |
@@ -246,6 +260,7 @@ executeDqChecks <- function(connectionDetails,
     cdmDatabaseSchema,
     vocabDatabaseSchema,
     cohortDatabaseSchema,
+    cohortTableName,
     cohortDefinitionId,
     outputFolder,
     sqlOnly,

diff --git a/R/listChecks.R b/R/listChecks.R
@@ -24,75 +24,75 @@
 #' @param fieldCheckThresholdLoc    The location of the threshold file for evaluating the field checks. If not specified the default thresholds will be applied.
 #' @param conceptCheckThresholdLoc  The location of the threshold file for evaluating the concept checks. If not specified the default thresholds will be applied.
 #'
+#' @importFrom readr read_csv
 #'
 #' @export
 listDqChecks <- function(cdmVersion = "5.3", tableCheckThresholdLoc = "default", fieldCheckThresholdLoc = "default", conceptCheckThresholdLoc = "default") {
   dqChecks <- {}
   dqChecks$checkDescriptions <-
-    read.csv(system.file(
+    read_csv(system.file(
       "csv",
       sprintf("OMOP_CDMv%s_Check_Descriptions.csv", cdmVersion),
       package = "DataQualityDashboard"
-    ),
-    stringsAsFactors = FALSE
-    )
+    ))
+  dqChecks$checkDescriptions <- as.data.frame(dqChecks$checkDescriptions)
 
 
   if (tableCheckThresholdLoc == "default") {
     dqChecks$tableChecks <-
-      read.csv(
+      read_csv(
         system.file(
           "csv",
           sprintf("OMOP_CDMv%s_Table_Level.csv", cdmVersion),
           package = "DataQualityDashboard"
         ),
-        stringsAsFactors = FALSE,
-        na.strings = c(" ", "")
+        na = c(" ", "")
       )
+    dqChecks$tableChecks <- as.data.frame(dqChecks$tableChecks)
   } else {
-    dqChecks$tableChecks <- read.csv(
+    dqChecks$tableChecks <- read_csv(
       tableCheckThresholdLoc,
-      stringsAsFactors = FALSE,
-      na.strings = c(" ", "")
+      na = c(" ", "")
     )
+    dqChecks$tableChecks <- as.data.frame(dqChecks$tableChecks)
   }
 
   if (fieldCheckThresholdLoc == "default") {
     dqChecks$fieldChecks <-
-      read.csv(
+      read_csv(
         system.file(
           "csv",
           sprintf("OMOP_CDMv%s_Field_Level.csv", cdmVersion),
           package = "DataQualityDashboard"
         ),
-        stringsAsFactors = FALSE,
-        na.strings = c(" ", "")
+        na = c(" ", "")
       )
+    dqChecks$fieldChecks <- as.data.frame(dqChecks$fieldChecks)
   } else {
-    dqChecks$fieldChecks <- read.csv(
+    dqChecks$fieldChecks <- read_csv(
       fieldCheckThresholdLoc,
-      stringsAsFactors = FALSE,
-      na.strings = c(" ", "")
+      na = c(" ", "")
     )
+    dqChecks$fieldChecks <- as.data.frame(dqChecks$fieldChecks)
   }
 
   if (conceptCheckThresholdLoc == "default") {
     dqChecks$conceptChecks <-
-      read.csv(
+      read_csv(
         system.file(
           "csv",
           sprintf("OMOP_CDMv%s_Concept_Level.csv", cdmVersion),
           package = "DataQualityDashboard"
         ),
-        stringsAsFactors = FALSE,
-        na.strings = c(" ", "")
+        na = c(" ", "")
       )
+    dqChecks$conceptChecks <- as.data.frame(dqChecks$conceptChecks)
   } else {
-    dqChecks$conceptChecks <- read.csv(
+    dqChecks$conceptChecks <- read_csv(
       conceptCheckThresholdLoc,
-      stringsAsFactors = FALSE,
-      na.strings = c(" ", "")
+      na = c(" ", "")
     )
+    dqChecks$conceptChecks <- as.data.frame(dqChecks$conceptChecks)
   }
 
   return(dqChecks)

diff --git a/R/readThresholdFile.R b/R/readThresholdFile.R
@@ -15,22 +15,28 @@
 # limitations under the License.
 
 .readThresholdFile <- function(checkThresholdLoc, defaultLoc) {
+  thresholdFile <- checkThresholdLoc
+
   if (checkThresholdLoc == "default") {
-    result <- read.csv(
-      file = system.file(
-        "csv",
-        defaultLoc,
-        package = "DataQualityDashboard"
-      ),
-      stringsAsFactors = FALSE,
-      na.strings = c(" ", "")
-    )
-  } else {
-    result <- read.csv(
-      file = checkThresholdLoc,
-      stringsAsFactors = FALSE,
-      na.strings = c(" ", "")
+    thresholdFile <- system.file(
+      "csv",
+      defaultLoc,
+      package = "DataQualityDashboard"
     )
   }
+
+  colspec <- readr::spec_csv(thresholdFile)
+
+  # plausibleUnitConceptIds is a comma-separated list of concept ids, but the guessed column spec interprets it as col_double(), so it is overridden to col_character() below
+  if ("plausibleUnitConceptIds" %in% names(colspec$cols)) {
+    colspec$cols$plausibleUnitConceptIds <- readr::col_character()
+  }
+
+  result <- read_csv(
+    file = thresholdFile,
+    col_types = colspec,
+    na = c(" ", "")
+  )
+  result <- as.data.frame(result)
   return(result)
 }
diff --git a/R/runCheck.R b/R/runCheck.R
@@ -25,6 +25,7 @@
 #' @param cdmDatabaseSchema         The fully qualified database name of the CDM schema
 #' @param vocabDatabaseSchema       The fully qualified database name of the vocabulary schema (default is to set it as the cdmDatabaseSchema)
 #' @param cohortDatabaseSchema      The schema where the cohort table is located.
+#' @param cohortTableName           The name of the cohort table.
 #' @param cohortDefinitionId        The cohort definition id for the cohort you wish to run the DQD on. The package assumes a standard OHDSI cohort table (named by cohortTableName) with the fields cohort_definition_id and subject_id.
 #' @param outputFolder              The folder to output logs and SQL files to
 #' @param sqlOnly                   Should the SQLs be executed (FALSE) or just returned (TRUE)?
@@ -42,6 +43,7 @@
                       cdmDatabaseSchema,
                       vocabDatabaseSchema,
                       cohortDatabaseSchema,
+                      cohortTableName,
                       cohortDefinitionId,
                       outputFolder,
                       sqlOnly) {
@@ -77,6 +79,7 @@
         list(warnOnMissingParameters = FALSE),
         list(cdmDatabaseSchema = cdmDatabaseSchema),
         list(cohortDatabaseSchema = cohortDatabaseSchema),
+        list(cohortTableName = cohortTableName),
         list(cohortDefinitionId = cohortDefinitionId),
         list(vocabDatabaseSchema = vocabDatabaseSchema),
         list(cohort = cohort),

diff --git a/docs/404.html b/docs/404.html
diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html
diff --git a/docs/articles/AddNewCheck.html b/docs/articles/AddNewCheck.html