Merge pull request #559 from OHDSI/develop

Develop
OHDSI · Jul 13, 2024 · 98c825a · 98c825a
2 parents 6ef7ee2 + 3a2c48e
commit 98c825a
Show file tree

Hide file tree

Showing 115 changed files with 5,017 additions and 1,032 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: DataQualityDashboard
 Type: Package
 Title: Execute and View Data Quality Checks on OMOP CDM Database
-Version: 2.6.0
-Date: 2024-02-21
+Version: 2.6.1
+Date: 2024-07-12
 Authors@R: c(
   person("Katy", "Sadowski", email = "[email protected]", role = c("aut", "cre")),
   person("Clair", "Blacketer", role = c("aut")),
@@ -44,7 +44,5 @@ Suggests:
     ggplot2,
     Eunomia,
     R.utils
-Remotes:
-    ohdsi/Eunomia
-RoxygenNote: 7.2.2
+RoxygenNote: 7.3.1
 Encoding: UTF-8
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,35 @@
+DataQualityDashboard 2.6.1
+==========================
+This release includes: 
+
+### Bugfixes
+
+- Checks
+  - `plausibleStartBeforeEnd` was failing if SOURCE_RELEASE_DATE was before CDM_RELEASE_DATE in the CDM_SOURCE table. This is the opposite of the correct logic!  The check is now updated to fail if the CDM_RELEASE_DATE is before the SOURCE_RELEASE_DATE.
+  - `plausibleTemporalAfter` was throwing a syntax error in BigQuery due to the format of a hardcoded date in the SQL query.  This query has now been updated to be compliant with SqlRender and the issue has been resolved.
+- A dependency issue was causing `viewDqDashboard` to error out in newer versions of R.  This has now been resolved.
+- `SqlOnly` mode was failing due to the format of the new check `plausibleGenderUseDescendants`, which takes multiple concepts as an input.  This has now been fixed.
+
+### New Results Field
+
+- A new field has been added to the DQD results output - `executionTimeSeconds`.  This field stores the execution time in seconds of each check in numeric format.  (The existing `executionTime` field stores execution time as a string, making it difficult to use in analysis.)
+
+### Check Threshold Updates
+
+The default thresholds for 2 checks were discovered to be inconsistently populated and occasionally set to illogical levels.  These have now been fixed as detailed below.
+
+- The default thresholds for `sourceValueCompleteness` have been updated as follows:
+  - 10% for `_source_value` columns in condition_occurrence, measurement, procedure_occurrence, drug_exposure, and visit_occurrence tables
+  - 100% for all other `_source_value` columns
+- The default thresholds for `sourceConceptRecordCompleteness` have been updated as follows:
+  - 10% for `_source_concept_id` columns in condition_occurrence, drug_exposure, measurement, procedure_occurrence, device_exposure, and observation tables
+  - 100% for all other `_source_concept_id` columns
+
+### New Documentation
+We have continued (and nearly completed) our initiative to add more comprehensive user documentation at the data quality check level.  A dedicated documentation page is being created for each check type.  Each check's page includes detailed information about how its result is generated and what to do if it fails.  Guidance is provided for both ETL developers and data users.
+
+Check out the newly added pages [here](https://ohdsi.github.io/DataQualityDashboard/articles/checkIndex.html) and please reach out with feedback as we continue improving our documentation!
+
 DataQualityDashboard 2.6.0
 ==========================
 This release includes: 

diff --git a/R/calculateNotApplicableStatus.R b/R/calculateNotApplicableStatus.R
@@ -0,0 +1,195 @@
+# Copyright 2024 Observational Health Data Sciences and Informatics
+#
+# This file is part of DataQualityDashboard
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#' Checks whether the results contain every check needed to calculate the 'Not Applicable' status
+#'
+#' Collects the distinct check names found in the results and delegates the
+#' actual membership test to `.containsNAchecks`.
+#'
+#' @param checkResults A dataframe containing the results of the data quality checks
+#'
+#' @keywords internal
+.hasNAchecks <- function(checkResults) {
+  .containsNAchecks(unique(checkResults$checkName))
+}
+
+#' Tests whether every check required for the 'Not Applicable' status is listed
+#'
+#' The required checks are cdmTable, cdmField and measureValueCompleteness.
+#'
+#' @param checkNames A character vector of check names
+#'
+#' @keywords internal
+.containsNAchecks <- function(checkNames) {
+  requiredChecks <- c("cdmTable", "cdmField", "measureValueCompleteness")
+  # TRUE only when none of the required checks is absent from checkNames
+  all(requiredChecks %in% checkNames)
+}
+
+#' Computes the 'Not Applicable' flag for a single check result
+#'
+#' The rules are evaluated in order and the first one that matches decides the
+#' outcome: 1 means the check is Not Applicable, 0 means it is applicable.
+#'
+#' @param x Results from a single check
+#'
+#' @keywords internal
+.applyNotApplicable <- function(x) {
+  notApplicable <- if (x$isError == 1) {
+    # An error takes precedence over every other status
+    0
+  } else if (x$checkName == "cdmTable" || x$checkName == "cdmField") {
+    # cdmTable and cdmField never become Not Applicable, even when the table/field is missing
+    0
+  } else if (any(x$tableIsMissing, x$fieldIsMissing, x$tableIsEmpty, na.rm = TRUE)) {
+    1
+  } else if (x$checkName == "measureValueCompleteness") {
+    # measureValueCompleteness never becomes Not Applicable because of an empty field
+    0
+  } else if (any(x$fieldIsEmpty, x$conceptIsMissing, x$conceptAndUnitAreMissing, na.rm = TRUE)) {
+    1
+  } else {
+    0
+  }
+
+  notApplicable
+}
+
+#' Determines if check should be notApplicable and the notApplicableReason
+#'
+#' Derives table- and field-level flags (missing / empty) from the cdmTable,
+#' cdmField and measureValueCompleteness results, joins them onto every check
+#' result and uses them to set `notApplicable` and `notApplicableReason`.
+#' Checks that end up Not Applicable get `failed` reset to 0, and `passed` is
+#' recomputed from `failed`, `isError` and `notApplicable`.
+#'
+#' The columns read from `checkResults` are: checkName, checkLevel,
+#' cdmTableName, cdmFieldName, conceptId, unitConceptId, numDenominatorRows,
+#' numViolatedRows, failed and isError.
+#'
+#' @param checkResults A dataframe containing the results of the data quality checks
+#'
+#' @return `checkResults` with the columns `notApplicable` and
+#'   `notApplicableReason` added and `failed` / `passed` updated
+#'
+#' @keywords internal
+.calculateNotApplicableStatus <- function(checkResults) {
+  # Look up missing tables and add variable tableIsMissing to checkResults
+  missingTables <- checkResults %>%
+    dplyr::filter(
+      .data$checkName == "cdmTable"
+    ) %>%
+    dplyr::mutate(
+      .data$cdmTableName,
+      tableIsMissing = .data$failed == 1,
+      .keep = "none"
+    )
+
+  # Look up missing fields and add variable fieldIsMissing to checkResults
+  missingFields <- checkResults %>%
+    dplyr::filter(
+      .data$checkName == "cdmField"
+    ) %>%
+    dplyr::mutate(
+      .data$cdmTableName,
+      .data$cdmFieldName,
+      fieldIsMissing = .data$failed == 1,
+      .keep = "none"
+    )
+
+  # Look up empty tables and add variable tableIsEmpty to checkResults
+  emptyTables <- checkResults %>%
+    dplyr::filter(
+      .data$checkName == "measureValueCompleteness"
+    ) %>%
+    dplyr::mutate(
+      .data$cdmTableName,
+      tableIsEmpty = .data$numDenominatorRows == 0,
+      .keep = "none"
+    ) %>%
+    dplyr::distinct()
+
+  # Look up empty fields and add variable fieldIsEmpty to checkResults
+  emptyFields <- checkResults %>%
+    dplyr::filter(
+      .data$checkName == "measureValueCompleteness"
+    ) %>%
+    dplyr::mutate(
+      .data$cdmTableName,
+      .data$cdmFieldName,
+      fieldIsEmpty = .data$numDenominatorRows == .data$numViolatedRows,
+      .keep = "none"
+    )
+
+  # Assign notApplicable status
+  checkResults <- checkResults %>%
+    dplyr::left_join(
+      missingTables,
+      by = "cdmTableName"
+    ) %>%
+    dplyr::left_join(
+      missingFields,
+      by = c("cdmTableName", "cdmFieldName")
+    ) %>%
+    dplyr::left_join(
+      emptyTables,
+      by = "cdmTableName"
+    ) %>%
+    dplyr::left_join(
+      emptyFields,
+      by = c("cdmTableName", "cdmFieldName")
+    ) %>%
+    dplyr::mutate(
+      conceptIsMissing = .data$checkLevel == "CONCEPT" & is.na(.data$unitConceptId) & .data$numDenominatorRows == 0,
+      conceptAndUnitAreMissing = .data$checkLevel == "CONCEPT" & !is.na(.data$unitConceptId) & .data$numDenominatorRows == 0,
+      # When the join found no cdmField / measureValueCompleteness row for a
+      # field-level check, the flag falls back to TRUE (a field name is present);
+      # rows without a field name fall back to FALSE
+      fieldIsMissing = dplyr::coalesce(.data$fieldIsMissing, !is.na(.data$cdmFieldName)),
+      fieldIsEmpty = dplyr::coalesce(.data$fieldIsEmpty, !is.na(.data$cdmFieldName))
+    )
+
+  checkResults$notApplicable <- NA
+  checkResults$notApplicableReason <- NA
+
+  conditionOccurrenceIsMissing <- missingTables %>%
+    dplyr::filter(.data$cdmTableName == "CONDITION_OCCURRENCE") %>%
+    dplyr::pull("tableIsMissing")
+  conditionOccurrenceIsEmpty <- emptyTables %>%
+    dplyr::filter(.data$cdmTableName == "CONDITION_OCCURRENCE") %>%
+    dplyr::pull("tableIsEmpty")
+  # pull() may return zero elements (no cdmTable / measureValueCompleteness row
+  # for CONDITION_OCCURRENCE), NA values, or more than one element. Feeding any
+  # of those straight into `||` / `if` raises an error, so collapse both
+  # vectors into a single TRUE/FALSE once, outside the loop. With no usable
+  # information the table is treated as available.
+  conditionOccurrenceUnavailable <- any(
+    conditionOccurrenceIsMissing,
+    conditionOccurrenceIsEmpty,
+    na.rm = TRUE
+  )
+
+  for (i in seq_len(nrow(checkResults))) {
+    # Special rule for measureConditionEraCompleteness, which should be notApplicable if CONDITION_OCCURRENCE is empty
+    if (checkResults$checkName[i] == "measureConditionEraCompleteness") {
+      if (conditionOccurrenceUnavailable) {
+        checkResults$notApplicable[i] <- 1
+        checkResults$notApplicableReason[i] <- "Table CONDITION_OCCURRENCE is empty."
+      } else {
+        checkResults$notApplicable[i] <- 0
+      }
+    } else {
+      checkResults$notApplicable[i] <- .applyNotApplicable(checkResults[i, ])
+    }
+  }
+
+  checkResults <- checkResults %>%
+    dplyr::mutate(
+      # A reason is only reported for Not Applicable checks; a reason already
+      # set above wins, otherwise the first matching flag determines the text
+      notApplicableReason = ifelse(
+        .data$notApplicable == 1,
+        dplyr::case_when(
+          !is.na(.data$notApplicableReason) ~ .data$notApplicableReason,
+          .data$tableIsMissing ~ sprintf("Table %s does not exist.", .data$cdmTableName),
+          .data$fieldIsMissing ~ sprintf("Field %s.%s does not exist.", .data$cdmTableName, .data$cdmFieldName),
+          .data$tableIsEmpty ~ sprintf("Table %s is empty.", .data$cdmTableName),
+          .data$fieldIsEmpty ~ sprintf("Field %s.%s is not populated.", .data$cdmTableName, .data$cdmFieldName),
+          .data$conceptIsMissing ~ sprintf("%s=%s is missing from the %s table.", .data$cdmFieldName, .data$conceptId, .data$cdmTableName),
+          .data$conceptAndUnitAreMissing ~ sprintf("Combination of %s=%s, unitConceptId=%s and VALUE_AS_NUMBER IS NOT NULL is missing from the %s table.", .data$cdmFieldName, .data$conceptId, .data$unitConceptId, .data$cdmTableName) # nolint
+        ),
+        NA
+      ),
+      # Not Applicable checks never count as failed; passed requires no failure, no error and applicability
+      failed = ifelse(.data$notApplicable == 1, 0, .data$failed),
+      passed = ifelse(.data$failed == 0 & .data$isError == 0 & .data$notApplicable == 0, 1, 0)
+    ) %>%
+    dplyr::select(-c("tableIsMissing", "fieldIsMissing", "tableIsEmpty", "fieldIsEmpty", "conceptIsMissing", "conceptAndUnitAreMissing"))
+
+  return(checkResults)
+}
diff --git a/R/evaluateThresholds.R b/R/evaluateThresholds.R
@@ -163,119 +163,9 @@
     }
   }
 
-  missingTables <- dplyr::select(
-    dplyr::filter(checkResults, .data$checkName == "cdmTable" & .data$failed == 1),
-    "cdmTableName"
-  )
-  if (nrow(missingTables) > 0) {
-    missingTables$tableIsMissing <- 1
-    checkResults <- dplyr::mutate(
-      dplyr::left_join(checkResults, missingTables, by = "cdmTableName"),
-      tableIsMissing = ifelse(.data$checkName != "cdmTable" & .data$isError == 0, .data$tableIsMissing, NA)
-    )
-  } else {
-    checkResults$tableIsMissing <- NA
+  if (.hasNAchecks(checkResults)) {
+    checkResults <- .calculateNotApplicableStatus(checkResults)
   }
 
-  missingFields <- dplyr::select(
-    dplyr::filter(checkResults, .data$checkName == "cdmField" & .data$failed == 1 & is.na(.data$tableIsMissing)),
-    "cdmTableName", "cdmFieldName"
-  )
-  if (nrow(missingFields) > 0) {
-    missingFields$fieldIsMissing <- 1
-    checkResults <- dplyr::mutate(
-      dplyr::left_join(checkResults, missingFields, by = c("cdmTableName", "cdmFieldName")),
-      fieldIsMissing = ifelse(.data$checkName != "cdmField" & .data$isError == 0, .data$fieldIsMissing, NA)
-    )
-  } else {
-    checkResults$fieldIsMissing <- NA
-  }
-
-  emptyTables <- dplyr::distinct(
-    dplyr::select(
-      dplyr::filter(checkResults, .data$checkName == "measureValueCompleteness" &
-        .data$numDenominatorRows == 0 &
-        .data$isError == 0 &
-        is.na(.data$tableIsMissing) &
-        is.na(.data$fieldIsMissing)),
-      "cdmTableName"
-    )
-  )
-  if (nrow(emptyTables) > 0) {
-    emptyTables$tableIsEmpty <- 1
-    checkResults <- dplyr::mutate(
-      dplyr::left_join(checkResults, emptyTables, by = c("cdmTableName")),
-      tableIsEmpty = ifelse(.data$checkName != "cdmField" & .data$checkName != "cdmTable" & .data$isError == 0, .data$tableIsEmpty, NA)
-    )
-  } else {
-    checkResults$tableIsEmpty <- NA
-  }
-
-  emptyFields <-
-    dplyr::select(
-      dplyr::filter(checkResults, .data$checkName == "measureValueCompleteness" &
-        .data$numDenominatorRows == .data$numViolatedRows &
-        is.na(.data$tableIsMissing) & is.na(.data$fieldIsMissing) & is.na(.data$tableIsEmpty)),
-      "cdmTableName", "cdmFieldName"
-    )
-  if (nrow(emptyFields) > 0) {
-    emptyFields$fieldIsEmpty <- 1
-    checkResults <- dplyr::mutate(
-      dplyr::left_join(checkResults, emptyFields, by = c("cdmTableName", "cdmFieldName")),
-      fieldIsEmpty = ifelse(.data$checkName != "measureValueCompleteness" & .data$checkName != "cdmField" & .data$checkName != "isRequired" & .data$isError == 0, .data$fieldIsEmpty, NA)
-    )
-  } else {
-    checkResults$fieldIsEmpty <- NA
-  }
-
-  checkResults <- dplyr::mutate(
-    checkResults,
-    conceptIsMissing = ifelse(
-      .data$isError == 0 &
-        is.na(.data$tableIsMissing) &
-        is.na(.data$fieldIsMissing) &
-        is.na(.data$tableIsEmpty) &
-        is.na(.data$fieldIsEmpty) &
-        .data$checkLevel == "CONCEPT" &
-        is.na(.data$unitConceptId) &
-        .data$numDenominatorRows == 0,
-      1,
-      NA
-    )
-  )
-
-  checkResults <- dplyr::mutate(
-    checkResults,
-    conceptAndUnitAreMissing = ifelse(
-      .data$isError == 0 &
-        is.na(.data$tableIsMissing) &
-        is.na(.data$fieldIsMissing) &
-        is.na(.data$tableIsEmpty) &
-        is.na(.data$fieldIsEmpty) &
-        .data$checkLevel == "CONCEPT" &
-        !is.na(.data$unitConceptId) &
-        .data$numDenominatorRows == 0,
-      1,
-      NA
-    )
-  )
-
-  checkResults <- dplyr::mutate(
-    checkResults,
-    notApplicable = dplyr::coalesce(.data$tableIsMissing, .data$fieldIsMissing, .data$tableIsEmpty, .data$fieldIsEmpty, .data$conceptIsMissing, .data$conceptAndUnitAreMissing, 0),
-    notApplicableReason = dplyr::case_when(
-      !is.na(.data$tableIsMissing) ~ sprintf("Table %s does not exist.", .data$cdmTableName),
-      !is.na(.data$fieldIsMissing) ~ sprintf("Field %s.%s does not exist.", .data$cdmTableName, .data$cdmFieldName),
-      !is.na(.data$tableIsEmpty) ~ sprintf("Table %s is empty.", .data$cdmTableName),
-      !is.na(.data$fieldIsEmpty) ~ sprintf("Field %s.%s is not populated.", .data$cdmTableName, .data$cdmFieldName),
-      !is.na(.data$conceptIsMissing) ~ sprintf("%s=%s is missing from the %s table.", .data$cdmFieldName, .data$conceptId, .data$cdmTableName),
-      !is.na(.data$conceptAndUnitAreMissing) ~ sprintf("Combination of %s=%s, unitConceptId=%s and VALUE_AS_NUMBER IS NOT NULL is missing from the %s table.", .data$cdmFieldName, .data$conceptId, .data$unitConceptId, .data$cdmTableName)
-    )
-  )
-
-  checkResults <- dplyr::select(checkResults, -c("tableIsMissing", "fieldIsMissing", "tableIsEmpty", "fieldIsEmpty", "conceptIsMissing", "conceptAndUnitAreMissing"))
-  checkResults <- dplyr::mutate(checkResults, failed = ifelse(.data$notApplicable == 1, 0, .data$failed))
-  checkResults <- dplyr::mutate(checkResults, passed = ifelse(.data$failed == 0 & .data$isError == 0 & .data$notApplicable == 0, 1, 0))
-
   checkResults
 }