Skip to content

Commit

Permalink
add instruction for excluding checks
Browse files Browse the repository at this point in the history
  • Loading branch information
katy-sadowski committed Oct 4, 2023
1 parent 7a2b489 commit d53f2e2
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 44 deletions.
105 changes: 61 additions & 44 deletions extras/codeToRun.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,42 +19,59 @@ library(DatabaseConnector)

# fill out the connection details -----------------------------------------------------------------------
connectionDetails <- DatabaseConnector::createConnectionDetails(
dbms = "",
user = "",
password = "",
server = "",
port = "",
dbms = "",
user = "",
password = "",
server = "",
port = "",
extraSettings = "",
pathToDriver = ""
)

cdmDatabaseSchema <- "yourCdmSchema" # the fully qualified database schema name of the CDM
resultsDatabaseSchema <- "yourResultsSchema" # the fully qualified database schema name of the results schema (that you can write to)
cdmSourceName <- "Your CDM Source" # a human readable name for your CDM source
cdmVersion <- "5.4" # the CDM version you are targetting. Currently supporst 5.2.2, 5.3.1, and 5.4
cdmVersion <- "5.4" # the CDM version you are targetting. Currently supports 5.2, 5.3, and 5.4

# determine how many threads (concurrent SQL sessions) to use ----------------------------------------
numThreads <- 1 # on Redshift, 3 seems to work well

# specify if you want to execute the queries or inspect them ------------------------------------------
sqlOnly <- FALSE # set to TRUE if you just want to get the SQL scripts and not actually run the queries. See codeToRun_sqlOnly.R for other sqlOnly parameters
sqlOnly <- FALSE # set to TRUE if you just want to get the SQL scripts and not actually run the queries
sqlOnlyIncrementalInsert <- FALSE # set to TRUE if you want the generated SQL queries to calculate DQD results and insert them into a database table (@resultsDatabaseSchema.@writeTableName)
sqlOnlyUnionCount <- 1 # in sqlOnlyIncrementalInsert mode, the number of check sqls to union in a single query; higher numbers can improve performance in some DBMS (e.g. a value of 25 may be 25x faster)

# NOTES specific to sqlOnly <- TRUE option ------------------------------------------------------------
# 1. You do not need a live database connection. Instead, connectionDetails only needs these parameters:
# connectionDetails <- DatabaseConnector::createConnectionDetails(
# dbms = "", # specify your dbms
# pathToDriver = "/"
# )
# 2. Since these are fully functional queries, this can help with debugging.
# 3. In the results output by the sqlOnlyIncrementalInsert queries, placeholders are populated for execution_time, query_text, and warnings/errors; and the NOT_APPLICABLE rules are not applied.
# 4. In order to use the generated SQL to insert metadata and check results into output table, you must set sqlOnlyIncrementalInsert = TRUE. Otherwise sqlOnly is backwards compatable with <= v2.2.0, generating queries which run the checks but don't store the results.


# where should the results and logs go? ----------------------------------------------------------------
outputFolder <- "output"
outputFile <- "results.json"


# logging type -------------------------------------------------------------------------------------
verboseMode <- TRUE # set to FALSE if you don't want the logs to be printed to the console

# write results to table? -----------------------------------------------------------------------
writeToTable <- FALSE # set to TRUE if you want to write to a SQL table in the results schema
# write results to table? ------------------------------------------------------------------------------
writeToTable <- TRUE # set to FALSE if you want to skip writing to a SQL table in the results schema

# specify the name of the results table (used when writeToTable = TRUE and when sqlOnlyIncrementalInsert = TRUE)
writeTableName <- "dqdashboard_results"

# write results to a csv file? -----------------------------------------------------------------------
writeToCsv <- FALSE # set to FALSE if you want to skip writing to csv file
csvFile <- "" # only needed if writeToCsv is set to TRUE

# if writing to table and using Redshift, bulk loading can be initialized -------------------------------

# Sys.setenv("AWS_ACCESS_KEY_ID" = "",
# "AWS_SECRET_ACCESS_KEY" = "",
# "AWS_DEFAULT_REGION" = "",
Expand All @@ -67,46 +84,46 @@ csvFile <- "" # only needed if writeToCsv is set to TRUE
checkLevels <- c("TABLE", "FIELD", "CONCEPT")

# which DQ checks to run? ------------------------------------
checkNames <- c() # Names can be found in inst/csv/OMOP_CDM_v5.3.1_Check_Desciptions.csv
checkNames <- c() # Names can be found in inst/csv/OMOP_CDM_v5.3_Check_Descriptions.csv

# want to EXCLUDE a pre-specified list of checks? run the following code:
#
# checksToExclude <- c() # Names of check types to exclude from your DQD run
# allChecks <- DataQualityDashboard::listDqChecks()
# checkNames <- allChecks$checkDescriptions %>%
# subset(!(checkName %in% checksToExclude)) %>%
# select(checkName)

# which CDM tables to exclude? ------------------------------------
tablesToExclude <- c()
tablesToExclude <- c("CONCEPT", "VOCABULARY", "CONCEPT_ANCESTOR", "CONCEPT_RELATIONSHIP", "CONCEPT_CLASS", "CONCEPT_SYNONYM", "RELATIONSHIP", "DOMAIN") # list of CDM table names to skip evaluating checks against; by default DQD excludes the vocab tables

# run the job --------------------------------------------------------------------------------------
DataQualityDashboard::executeDqChecks(
connectionDetails = connectionDetails,
cdmDatabaseSchema = cdmDatabaseSchema,
resultsDatabaseSchema = resultsDatabaseSchema,
cdmSourceName = cdmSourceName,
cdmVersion = cdmVersion
numThreads = numThreads,
sqlOnly = sqlOnly,
outputFolder = outputFolder,
outputFile = outputFile,
verboseMode = verboseMode,
writeToTable = writeToTable,
writeToCsv = writeToCsv,
csvFile = csvFile,
checkLevels = checkLevels,
tablesToExclude = tablesToExclude,
checkNames = checkNames
)
DataQualityDashboard::executeDqChecks(connectionDetails = connectionDetails,
cdmDatabaseSchema = cdmDatabaseSchema,
resultsDatabaseSchema = resultsDatabaseSchema,
cdmSourceName = cdmSourceName,
cdmVersion = cdmVersion,
numThreads = numThreads,
sqlOnly = sqlOnly,
sqlOnlyUnionCount = sqlOnlyUnionCount,
sqlOnlyIncrementalInsert = sqlOnlyIncrementalInsert,
outputFolder = outputFolder,
outputFile = outputFile,
verboseMode = verboseMode,
writeToTable = writeToTable,
writeToCsv = writeToCsv,
csvFile = csvFile,
checkLevels = checkLevels,
tablesToExclude = tablesToExclude,
checkNames = checkNames)

# inspect logs ----------------------------------------------------------------------------
ParallelLogger::launchLogViewer(
logFileName = file.path(outputFolder,
sprintf("log_DqDashboard_%s.txt", cdmSourceName))
)

# View the Data Quality Dashboard using the integrated shiny application ------------------------------------
DataQualityDashboard::viewDqDashboard(
jsonPath = file.path(getwd(), outputFolder, outputFile)
)
ParallelLogger::launchLogViewer(logFileName = file.path(outputFolder, cdmSourceName,
sprintf("log_DqDashboard_%s.txt", cdmSourceName)))

# (OPTIONAL) if you want to write the JSON file to the results table separately -----------------------------
jsonFilePath <- "" # put the path to the outputted JSON file
DataQualityDashboard::writeJsonResultsToTable(
connectionDetails = connectionDetails,
resultsDatabaseSchema = resultsDatabaseSchema,
jsonFilePath = jsonFilePath
)
jsonFilePath <- ""
DataQualityDashboard::writeJsonResultsToTable(connectionDetails = connectionDetails,
resultsDatabaseSchema = resultsDatabaseSchema,
jsonFilePath = jsonFilePath)

8 changes: 8 additions & 0 deletions vignettes/DataQualityDashboard.rmd
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,14 @@ checkLevels <- c("TABLE", "FIELD", "CONCEPT")
# which DQ checks to run? ------------------------------------
checkNames <- c() # Names can be found in inst/csv/OMOP_CDM_v5.3_Check_Descriptions.csv

# want to EXCLUDE a pre-specified list of checks? run the following code:
#
# checksToExclude <- c() # Names of check types to exclude from your DQD run
# allChecks <- DataQualityDashboard::listDqChecks()
# checkNames <- allChecks$checkDescriptions %>%
# subset(!(checkName %in% checksToExclude)) %>%
# select(checkName)

# which CDM tables to exclude? ------------------------------------
tablesToExclude <- c("CONCEPT", "VOCABULARY", "CONCEPT_ANCESTOR", "CONCEPT_RELATIONSHIP", "CONCEPT_CLASS", "CONCEPT_SYNONYM", "RELATIONSHIP", "DOMAIN") # list of CDM table names to skip evaluating checks against; by default DQD excludes the vocab tables

Expand Down

0 comments on commit d53f2e2

Please sign in to comment.