Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add instruction for excluding checks #492

Merged
merged 3 commits into from
Oct 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 61 additions & 44 deletions extras/codeToRun.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,42 +19,59 @@ library(DatabaseConnector)

# fill out the connection details -----------------------------------------------------------------------
connectionDetails <- DatabaseConnector::createConnectionDetails(
dbms = "",
user = "",
password = "",
server = "",
port = "",
dbms = "",
user = "",
password = "",
server = "",
port = "",
extraSettings = "",
pathToDriver = ""
)

cdmDatabaseSchema <- "yourCdmSchema" # the fully qualified database schema name of the CDM
resultsDatabaseSchema <- "yourResultsSchema" # the fully qualified database schema name of the results schema (that you can write to)
cdmSourceName <- "Your CDM Source" # a human readable name for your CDM source
cdmVersion <- "5.4" # the CDM version you are targetting. Currently supporst 5.2.2, 5.3.1, and 5.4
cdmVersion <- "5.4" # the CDM version you are targetting. Currently supports 5.2, 5.3, and 5.4

# determine how many threads (concurrent SQL sessions) to use ----------------------------------------
numThreads <- 1 # on Redshift, 3 seems to work well

# specify if you want to execute the queries or inspect them ------------------------------------------
sqlOnly <- FALSE # set to TRUE if you just want to get the SQL scripts and not actually run the queries. See codeToRun_sqlOnly.R for other sqlOnly parameters
sqlOnly <- FALSE # set to TRUE if you just want to get the SQL scripts and not actually run the queries
sqlOnlyIncrementalInsert <- FALSE # set to TRUE if you want the generated SQL queries to calculate DQD results and insert them into a database table (@resultsDatabaseSchema.@writeTableName)
sqlOnlyUnionCount <- 1 # in sqlOnlyIncrementalInsert mode, the number of check sqls to union in a single query; higher numbers can improve performance in some DBMS (e.g. a value of 25 may be 25x faster)

# NOTES specific to sqlOnly <- TRUE option ------------------------------------------------------------
# 1. You do not need a live database connection. Instead, connectionDetails only needs these parameters:
# connectionDetails <- DatabaseConnector::createConnectionDetails(
# dbms = "", # specify your dbms
# pathToDriver = "/"
# )
# 2. Since these are fully functional queries, this can help with debugging.
# 3. In the results output by the sqlOnlyIncrementalInsert queries, placeholders are populated for execution_time, query_text, and warnings/errors; and the NOT_APPLICABLE rules are not applied.
# 4. In order to use the generated SQL to insert metadata and check results into output table, you must set sqlOnlyIncrementalInsert = TRUE. Otherwise sqlOnly is backwards compatable with <= v2.2.0, generating queries which run the checks but don't store the results.


# where should the results and logs go? ----------------------------------------------------------------
outputFolder <- "output"
outputFile <- "results.json"


# logging type -------------------------------------------------------------------------------------
verboseMode <- TRUE # set to FALSE if you don't want the logs to be printed to the console

# write results to table? -----------------------------------------------------------------------
writeToTable <- FALSE # set to TRUE if you want to write to a SQL table in the results schema
# write results to table? ------------------------------------------------------------------------------
writeToTable <- TRUE # set to FALSE if you want to skip writing to a SQL table in the results schema

# specify the name of the results table (used when writeToTable = TRUE and when sqlOnlyIncrementalInsert = TRUE)
writeTableName <- "dqdashboard_results"

# write results to a csv file? -----------------------------------------------------------------------
writeToCsv <- FALSE # set to FALSE if you want to skip writing to csv file
csvFile <- "" # only needed if writeToCsv is set to TRUE

# if writing to table and using Redshift, bulk loading can be initialized -------------------------------

# Sys.setenv("AWS_ACCESS_KEY_ID" = "",
# "AWS_SECRET_ACCESS_KEY" = "",
# "AWS_DEFAULT_REGION" = "",
Expand All @@ -67,46 +84,46 @@ csvFile <- "" # only needed if writeToCsv is set to TRUE
checkLevels <- c("TABLE", "FIELD", "CONCEPT")

# which DQ checks to run? ------------------------------------
checkNames <- c() # Names can be found in inst/csv/OMOP_CDM_v5.3.1_Check_Desciptions.csv
checkNames <- c() # Names can be found in inst/csv/OMOP_CDM_v5.3_Check_Descriptions.csv

# want to EXCLUDE a pre-specified list of checks? run the following code:
#
# checksToExclude <- c() # Names of check types to exclude from your DQD run
# allChecks <- DataQualityDashboard::listDqChecks()
# checkNames <- allChecks$checkDescriptions %>%
# subset(!(checkName %in% checksToExclude)) %>%
# select(checkName)

# which CDM tables to exclude? ------------------------------------
tablesToExclude <- c()
tablesToExclude <- c("CONCEPT", "VOCABULARY", "CONCEPT_ANCESTOR", "CONCEPT_RELATIONSHIP", "CONCEPT_CLASS", "CONCEPT_SYNONYM", "RELATIONSHIP", "DOMAIN") # list of CDM table names to skip evaluating checks against; by default DQD excludes the vocab tables

# run the job --------------------------------------------------------------------------------------
DataQualityDashboard::executeDqChecks(
connectionDetails = connectionDetails,
cdmDatabaseSchema = cdmDatabaseSchema,
resultsDatabaseSchema = resultsDatabaseSchema,
cdmSourceName = cdmSourceName,
cdmVersion = cdmVersion
numThreads = numThreads,
sqlOnly = sqlOnly,
outputFolder = outputFolder,
outputFile = outputFile,
verboseMode = verboseMode,
writeToTable = writeToTable,
writeToCsv = writeToCsv,
csvFile = csvFile,
checkLevels = checkLevels,
tablesToExclude = tablesToExclude,
checkNames = checkNames
)
DataQualityDashboard::executeDqChecks(connectionDetails = connectionDetails,
cdmDatabaseSchema = cdmDatabaseSchema,
resultsDatabaseSchema = resultsDatabaseSchema,
cdmSourceName = cdmSourceName,
cdmVersion = cdmVersion,
numThreads = numThreads,
sqlOnly = sqlOnly,
sqlOnlyUnionCount = sqlOnlyUnionCount,
sqlOnlyIncrementalInsert = sqlOnlyIncrementalInsert,
outputFolder = outputFolder,
outputFile = outputFile,
verboseMode = verboseMode,
writeToTable = writeToTable,
writeToCsv = writeToCsv,
csvFile = csvFile,
checkLevels = checkLevels,
tablesToExclude = tablesToExclude,
checkNames = checkNames)

# inspect logs ----------------------------------------------------------------------------
ParallelLogger::launchLogViewer(
logFileName = file.path(outputFolder,
sprintf("log_DqDashboard_%s.txt", cdmSourceName))
)

# View the Data Quality Dashboard using the integrated shiny application ------------------------------------
DataQualityDashboard::viewDqDashboard(
jsonPath = file.path(getwd(), outputFolder, outputFile)
)
ParallelLogger::launchLogViewer(logFileName = file.path(outputFolder, cdmSourceName,
sprintf("log_DqDashboard_%s.txt", cdmSourceName)))

# (OPTIONAL) if you want to write the JSON file to the results table separately -----------------------------
jsonFilePath <- "" # put the path to the outputted JSON file
DataQualityDashboard::writeJsonResultsToTable(
connectionDetails = connectionDetails,
resultsDatabaseSchema = resultsDatabaseSchema,
jsonFilePath = jsonFilePath
)
jsonFilePath <- ""
DataQualityDashboard::writeJsonResultsToTable(connectionDetails = connectionDetails,
resultsDatabaseSchema = resultsDatabaseSchema,
jsonFilePath = jsonFilePath)

8 changes: 8 additions & 0 deletions vignettes/DataQualityDashboard.rmd
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,14 @@ checkLevels <- c("TABLE", "FIELD", "CONCEPT")
# which DQ checks to run? ------------------------------------
checkNames <- c() # Names can be found in inst/csv/OMOP_CDM_v5.3_Check_Descriptions.csv

# want to EXCLUDE a pre-specified list of checks? run the following code:
#
# checksToExclude <- c() # Names of check types to exclude from your DQD run
# allChecks <- DataQualityDashboard::listDqChecks()
# checkNames <- allChecks$checkDescriptions %>%
# subset(!(checkName %in% checksToExclude)) %>%
# select(checkName)

# which CDM tables to exclude? ------------------------------------
tablesToExclude <- c("CONCEPT", "VOCABULARY", "CONCEPT_ANCESTOR", "CONCEPT_RELATIONSHIP", "CONCEPT_CLASS", "CONCEPT_SYNONYM", "RELATIONSHIP", "DOMAIN") # list of CDM table names to skip evaluating checks against; by default DQD excludes the vocab tables

Expand Down
Loading