From d53f2e2c931ec20f448f68982adb7455cb17ba8d Mon Sep 17 00:00:00 2001 From: Sadowski Date: Wed, 4 Oct 2023 14:50:12 -0400 Subject: [PATCH] add instruction for excluding checks --- extras/codeToRun.R | 105 +++++++++++++++++------------ vignettes/DataQualityDashboard.rmd | 8 +++ 2 files changed, 69 insertions(+), 44 deletions(-) diff --git a/extras/codeToRun.R b/extras/codeToRun.R index 4807d3c2..81cc0e45 100644 --- a/extras/codeToRun.R +++ b/extras/codeToRun.R @@ -19,11 +19,11 @@ library(DatabaseConnector) # fill out the connection details ----------------------------------------------------------------------- connectionDetails <- DatabaseConnector::createConnectionDetails( - dbms = "", - user = "", - password = "", - server = "", - port = "", + dbms = "", + user = "", + password = "", + server = "", + port = "", extraSettings = "", pathToDriver = "" ) @@ -31,30 +31,47 @@ connectionDetails <- DatabaseConnector::createConnectionDetails( cdmDatabaseSchema <- "yourCdmSchema" # the fully qualified database schema name of the CDM resultsDatabaseSchema <- "yourResultsSchema" # the fully qualified database schema name of the results schema (that you can write to) cdmSourceName <- "Your CDM Source" # a human readable name for your CDM source -cdmVersion <- "5.4" # the CDM version you are targetting. Currently supporst 5.2.2, 5.3.1, and 5.4 +cdmVersion <- "5.4" # the CDM version you are targeting. Currently supports 5.2, 5.3, and 5.4 # determine how many threads (concurrent SQL sessions) to use ---------------------------------------- numThreads <- 1 # on Redshift, 3 seems to work well # specify if you want to execute the queries or inspect them ------------------------------------------ -sqlOnly <- FALSE # set to TRUE if you just want to get the SQL scripts and not actually run the queries. 
See codeToRun_sqlOnly.R for other sqlOnly parameters +sqlOnly <- FALSE # set to TRUE if you just want to get the SQL scripts and not actually run the queries +sqlOnlyIncrementalInsert <- FALSE # set to TRUE if you want the generated SQL queries to calculate DQD results and insert them into a database table (@resultsDatabaseSchema.@writeTableName) +sqlOnlyUnionCount <- 1 # in sqlOnlyIncrementalInsert mode, the number of check sqls to union in a single query; higher numbers can improve performance in some DBMS (e.g. a value of 25 may be 25x faster) + +# NOTES specific to sqlOnly <- TRUE option ------------------------------------------------------------ +# 1. You do not need a live database connection. Instead, connectionDetails only needs these parameters: +# connectionDetails <- DatabaseConnector::createConnectionDetails( +# dbms = "", # specify your dbms +# pathToDriver = "/" +# ) +# 2. Since these are fully functional queries, this can help with debugging. +# 3. In the results output by the sqlOnlyIncrementalInsert queries, placeholders are populated for execution_time, query_text, and warnings/errors; and the NOT_APPLICABLE rules are not applied. +# 4. In order to use the generated SQL to insert metadata and check results into output table, you must set sqlOnlyIncrementalInsert = TRUE. Otherwise sqlOnly is backwards compatible with <= v2.2.0, generating queries which run the checks but don't store the results. # where should the results and logs go? ---------------------------------------------------------------- outputFolder <- "output" outputFile <- "results.json" + # logging type ------------------------------------------------------------------------------------- verboseMode <- TRUE # set to FALSE if you don't want the logs to be printed to the console -# write results to table? 
----------------------------------------------------------------------- -writeToTable <- FALSE # set to TRUE if you want to write to a SQL table in the results schema +# write results to table? ------------------------------------------------------------------------------ +writeToTable <- TRUE # set to FALSE if you want to skip writing to a SQL table in the results schema + +# specify the name of the results table (used when writeToTable = TRUE and when sqlOnlyIncrementalInsert = TRUE) +writeTableName <- "dqdashboard_results" # write results to a csv file? ----------------------------------------------------------------------- writeToCsv <- FALSE # set to FALSE if you want to skip writing to csv file csvFile <- "" # only needed if writeToCsv is set to TRUE # if writing to table and using Redshift, bulk loading can be initialized ------------------------------- + # Sys.setenv("AWS_ACCESS_KEY_ID" = "", # "AWS_SECRET_ACCESS_KEY" = "", # "AWS_DEFAULT_REGION" = "", @@ -67,46 +84,46 @@ csvFile <- "" # only needed if writeToCsv is set to TRUE checkLevels <- c("TABLE", "FIELD", "CONCEPT") # which DQ checks to run? ------------------------------------ -checkNames <- c() # Names can be found in inst/csv/OMOP_CDM_v5.3.1_Check_Desciptions.csv +checkNames <- c() # Names can be found in inst/csv/OMOP_CDM_v5.3_Check_Descriptions.csv + +# want to EXCLUDE a pre-specified list of checks? run the following code: +# +# checksToExclude <- c() # Names of check types to exclude from your DQD run +# allChecks <- DataQualityDashboard::listDqChecks() +# checkNames <- allChecks$checkDescriptions %>% +# subset(!(checkName %in% checksToExclude)) %>% +# pull(checkName) # which CDM tables to exclude? 
------------------------------------ -tablesToExclude <- c() +tablesToExclude <- c("CONCEPT", "VOCABULARY", "CONCEPT_ANCESTOR", "CONCEPT_RELATIONSHIP", "CONCEPT_CLASS", "CONCEPT_SYNONYM", "RELATIONSHIP", "DOMAIN") # list of CDM table names to skip evaluating checks against; by default DQD excludes the vocab tables # run the job -------------------------------------------------------------------------------------- -DataQualityDashboard::executeDqChecks( - connectionDetails = connectionDetails, - cdmDatabaseSchema = cdmDatabaseSchema, - resultsDatabaseSchema = resultsDatabaseSchema, - cdmSourceName = cdmSourceName, - cdmVersion = cdmVersion - numThreads = numThreads, - sqlOnly = sqlOnly, - outputFolder = outputFolder, - outputFile = outputFile, - verboseMode = verboseMode, - writeToTable = writeToTable, - writeToCsv = writeToCsv, - csvFile = csvFile, - checkLevels = checkLevels, - tablesToExclude = tablesToExclude, - checkNames = checkNames -) +DataQualityDashboard::executeDqChecks(connectionDetails = connectionDetails, + cdmDatabaseSchema = cdmDatabaseSchema, + resultsDatabaseSchema = resultsDatabaseSchema, + cdmSourceName = cdmSourceName, + cdmVersion = cdmVersion, + numThreads = numThreads, + sqlOnly = sqlOnly, + sqlOnlyUnionCount = sqlOnlyUnionCount, + sqlOnlyIncrementalInsert = sqlOnlyIncrementalInsert, + outputFolder = outputFolder, + outputFile = outputFile, + verboseMode = verboseMode, + writeToTable = writeToTable, + writeToCsv = writeToCsv, + csvFile = csvFile, + checkLevels = checkLevels, + tablesToExclude = tablesToExclude, + checkNames = checkNames) # inspect logs ---------------------------------------------------------------------------- -ParallelLogger::launchLogViewer( - logFileName = file.path(outputFolder, - sprintf("log_DqDashboard_%s.txt", cdmSourceName)) -) - -# View the Data Quality Dashboard using the integrated shiny application ------------------------------------ -DataQualityDashboard::viewDqDashboard( - jsonPath = file.path(getwd(), 
outputFolder, outputFile) -) +ParallelLogger::launchLogViewer(logFileName = file.path(outputFolder, cdmSourceName, + sprintf("log_DqDashboard_%s.txt", cdmSourceName))) # (OPTIONAL) if you want to write the JSON file to the results table separately ----------------------------- -jsonFilePath <- "" # put the path to the outputted JSON file -DataQualityDashboard::writeJsonResultsToTable( - connectionDetails = connectionDetails, - resultsDatabaseSchema = resultsDatabaseSchema, - jsonFilePath = jsonFilePath -) +jsonFilePath <- "" +DataQualityDashboard::writeJsonResultsToTable(connectionDetails = connectionDetails, + resultsDatabaseSchema = resultsDatabaseSchema, + jsonFilePath = jsonFilePath) + diff --git a/vignettes/DataQualityDashboard.rmd b/vignettes/DataQualityDashboard.rmd index 4013bcef..1a60e78a 100644 --- a/vignettes/DataQualityDashboard.rmd +++ b/vignettes/DataQualityDashboard.rmd @@ -111,6 +111,14 @@ checkLevels <- c("TABLE", "FIELD", "CONCEPT") # which DQ checks to run? ------------------------------------ checkNames <- c() # Names can be found in inst/csv/OMOP_CDM_v5.3_Check_Descriptions.csv +# want to EXCLUDE a pre-specified list of checks? run the following code: +# +# checksToExclude <- c() # Names of check types to exclude from your DQD run +# allChecks <- DataQualityDashboard::listDqChecks() +# checkNames <- allChecks$checkDescriptions %>% +# subset(!(checkName %in% checksToExclude)) %>% +# pull(checkName) + # which CDM tables to exclude? ------------------------------------ tablesToExclude <- c("CONCEPT", "VOCABULARY", "CONCEPT_ANCESTOR", "CONCEPT_RELATIONSHIP", "CONCEPT_CLASS", "CONCEPT_SYNONYM", "RELATIONSHIP", "DOMAIN") # list of CDM table names to skip evaluating checks against; by default DQD excludes the vocab tables