Skip to content

Commit

Permalink
Add support for minCellCount
Browse files Browse the repository at this point in the history
  • Loading branch information
anthonysena committed Aug 29, 2024
1 parent 2a50514 commit dac1b94
Show file tree
Hide file tree
Showing 5 changed files with 72 additions and 0 deletions.
45 changes: 45 additions & 0 deletions R/Export.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
#'
#' @param databaseId Optional - when specified, the databaseId will be added
#' to the exported results
#' @template minCellCount
#'
#' @template CohortDefinitionSet
#'
Expand All @@ -61,6 +62,7 @@ exportCohortStatsTables <- function(connectionDetails,
fileNamesInSnakeCase = FALSE,
incremental = FALSE,
databaseId = NULL,
minCellCount = 5,
cohortDefinitionSet = NULL,
tablePrefix = "") {
if (is.null(connection)) {
Expand All @@ -80,7 +82,18 @@ exportCohortStatsTables <- function(connectionDetails,
tablePrefix) {
fullFileName <- file.path(cohortStatisticsFolder, paste0(tablePrefix, fileName))
primaryKeyColumns <- getPrimaryKey(resultsDataModelTableName)
columnsToCensor <- getColumnsToCensor(resultsDataModelTableName)
rlang::inform(paste0("- Saving data to - ", fullFileName))

# Make sure the data is censored before saving
if (length(columnsToCensor) > 0) {
for (i in seq_along(columnsToCensor)) {
colName <- ifelse(isTRUE(snakeCaseToCamelCase), yes = columnsToCensor[i], no = SqlRender::camelCaseToSnakeCase(columnsToCensor[i]))
data <- data %>%
enforceMinCellValue(colName, minCellCount)
}
}

if (incremental) {
# Dynamically build the arguments to the saveIncremental
# to specify the primary key(s) for the file
Expand Down Expand Up @@ -226,3 +239,35 @@ getPrimaryKey <- function(tableName) {
SqlRender::snakeCaseToCamelCase()
return(columns)
}

getColumnsToCensor <- function(tableName) {
columns <- readCsv(
file = system.file("csv", "resultsDataModelSpecification.csv", package = "CohortGenerator")
) %>%
dplyr::filter(.data$tableName == !!tableName & tolower(.data$minCellCount) == "yes") %>%
dplyr::pull(.data$columnName) %>%
SqlRender::snakeCaseToCamelCase()
return(columns)
}

enforceMinCellValue <- function(data, fieldName, minValues, silent = FALSE) {
toCensor <- !is.na(pull(data, fieldName)) & pull(data, fieldName) < minValues & pull(data, fieldName) != 0
if (!silent) {
percent <- round(100 * sum(toCensor) / nrow(data), 1)
message(
" censoring ",
sum(toCensor),
" values (",
percent,
"%) from ",
fieldName,
" because value below minimum"
)
}
if (length(minValues) == 1) {
data[toCensor, fieldName] <- -minValues
} else {
data[toCensor, fieldName] <- -minValues[toCensor]
}
return(data)
}
14 changes: 14 additions & 0 deletions R/RunCohortGeneration.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@
#'
#' @param databaseId A unique ID for the database. This will be appended to
#' most tables.
#'
#' @template minCellCount
#'
#' @param incremental Create only cohorts that haven't been created before?
#'
Expand All @@ -78,6 +80,7 @@ runCohortGeneration <- function(connectionDetails,
stopOnError = TRUE,
outputFolder,
databaseId = 1,
minCellCount = 5,
incremental = FALSE,
incrementalFolder = NULL) {
if (is.null(cohortDefinitionSet) && is.null(negativeControlOutcomeCohortSet)) {
Expand Down Expand Up @@ -127,6 +130,7 @@ runCohortGeneration <- function(connectionDetails,
stopOnError = stopOnError,
outputFolder = outputFolder,
databaseId = databaseId,
minCellCount = minCellCount,
incremental = incremental,
incrementalFolder = incrementalFolder
)
Expand All @@ -142,6 +146,7 @@ runCohortGeneration <- function(connectionDetails,
detectOnDescendants = detectOnDescendants,
outputFolder = outputFolder,
databaseId = databaseId,
minCellCount = minCellCount,
incremental = incremental,
incrementalFolder = incrementalFolder
)
Expand All @@ -164,6 +169,7 @@ generateAndExportCohorts <- function(connection,
stopOnError,
outputFolder,
databaseId,
minCellCount,
incremental,
incrementalFolder) {
# Generate the cohorts
Expand Down Expand Up @@ -220,6 +226,9 @@ generateAndExportCohorts <- function(connection,
}

rlang::inform("Saving cohort counts")
cohortCounts <- cohortCounts %>%
enforceMinCellValue("cohortEntries", minCellCount) %>%
enforceMinCellValue("cohortSubjects", minCellCount)
writeCsv(
x = cohortCounts,
file = cohortCountsFileName
Expand All @@ -235,6 +244,7 @@ generateAndExportCohorts <- function(connection,
fileNamesInSnakeCase = TRUE,
incremental = incremental,
databaseId = databaseId,
minCellCount = minCellCount,
cohortDefinitionSet = cohortDefinitionSet,
tablePrefix = "cg_"
)
Expand All @@ -254,6 +264,7 @@ generateAndExportNegativeControls <- function(connection,
detectOnDescendants,
outputFolder,
databaseId,
minCellCount,
incremental,
incrementalFolder) {
# Generate any negative controls
Expand Down Expand Up @@ -299,6 +310,9 @@ generateAndExportNegativeControls <- function(connection,
)

rlang::inform("Saving negative control outcome cohort counts")
cohortCountsNegativeControlOutcomes <- cohortCountsNegativeControlOutcomes %>%
enforceMinCellValue("cohortEntries", minCellCount) %>%
enforceMinCellValue("cohortSubjects", minCellCount)
writeCsv(
x = cohortCountsNegativeControlOutcomes,
file = cohortCountsNegativeControlOutcomesFileName
Expand Down
3 changes: 3 additions & 0 deletions man-roxygen/MinCellCount.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#' @param minCellCount To preserve privacy: the minimum number of subjects contributing
#' to a count before it can be included in the results. If the
#' count is below this threshold, it will be set to `-minCellCount`.
5 changes: 5 additions & 0 deletions man/exportCohortStatsTables.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions man/runCohortGeneration.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit dac1b94

Please sign in to comment.