From bb0c91956a35e11a81679e7adf9592ab31c2fd10 Mon Sep 17 00:00:00 2001 From: Admin_mschuemi Date: Fri, 18 Aug 2023 09:08:10 -0400 Subject: [PATCH] Porting computation of correction to C++ for speed --- NAMESPACE | 4 ++ R/Diagnostics.R | 69 ++++++++++++++------------- R/Export.R | 13 +++-- R/RcppExports.R | 4 ++ R/SelfControlledCaseSeries.R | 2 +- extras/AgeAndSeasonSimulations.R | 2 +- man/computeTimeStability.Rd | 20 ++++---- man/createSccsDiagnosticThresholds.Rd | 2 +- src/RcppExports.cpp | 13 +++++ src/RcppWrapper.cpp | 35 ++++++++++++++ 10 files changed, 113 insertions(+), 51 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index f3b981b..f5beb82 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -77,7 +77,11 @@ importFrom(stats,aggregate) importFrom(stats,coef) importFrom(stats,confint) importFrom(stats,dgamma) +importFrom(stats,dpois) +importFrom(stats,integrate) importFrom(stats,nlm) +importFrom(stats,optim) +importFrom(stats,pchisq) importFrom(stats,pgamma) importFrom(stats,pnorm) importFrom(stats,ppois) diff --git a/R/Diagnostics.R b/R/Diagnostics.R index 9d179f7..e04a2a7 100644 --- a/R/Diagnostics.R +++ b/R/Diagnostics.R @@ -16,8 +16,14 @@ # source("~/git/SelfControlledCaseSeries/R/SccsDataConversion.R") computeOutcomeRatePerMonth <- function(studyPopulation, sccsModel = NULL) { + # Computes not only the observed outcome rate per calendar month, but also + # observed / expected rate assuming everything is constant, and if sccsModel is + # provided, observed / expected using the adjustments in the model. + # To compute the expected rate for a month, we must consider only those cases + # observed during that month, summing their respective incidence rates. + if (nrow(studyPopulation$cases) == 0) { - tibble( + result <- tibble( month = 1.0, observedCount = 1.0, observationPeriodCount = 1.0, @@ -25,8 +31,8 @@ computeOutcomeRatePerMonth <- function(studyPopulation, sccsModel = NULL) { adjustedRate = 1.0, monthStartDate = as.Date("2000-01-01"), monthEndDate = as.Date("2000-01-01")) %>% - filter(month == 0) %>% - return() + filter(month == 0) + return(result) } cases <- studyPopulation$cases %>% mutate(startDate = .data$observationPeriodStartDate + .data$startDay, @@ -78,23 +84,11 @@ computeOutcomeRatePerMonth <- function(studyPopulation, sccsModel = NULL) { select(-"monthOfYear") %>% mutate(totalRr = .data$totalRr * .data$seasonRr) } - computeCorrection <- function(startMonth, endMonth, startMonthFraction, endMonthFraction) { - monthAdjustments %>% - filter(.data$month >= startMonth, .data$month <= endMonth) %>% - mutate(weight = if_else(.data$month == startMonth, - startMonthFraction, - if_else(.data$month == endMonth, - endMonthFraction, - 1))) %>% - summarize(correction = sum(.data$weight * .data$totalRr) / sum(.data$weight)) %>% - pull(.data$correction) %>% - return() - } if (hasAdjustment) { + # Need to correct for the fact that a person may have seen only part of the spline, so + # techincally has a different intercept: cases <- cases %>% - rowwise() %>% - mutate(correction = computeCorrection(.data$startMonth, .data$endMonth, .data$startMonthFraction, .data$endMonthFraction)) %>% - ungroup() + mutate(correction = computeCorrections(cases, monthAdjustments)) } else { cases <- cases %>% mutate(correction = 1) @@ -134,24 +128,22 @@ computeOutcomeRatePerMonth <- function(studyPopulation, sccsModel = NULL) { #' Compute stability of outcome rate over time #' #' @details -#' Computes for each calendar month the rate of the outcome, and evaluates whether that rate is constant over time. If -#' splines are used to adjust for seasonality and/or calendar time, these adjustments are taken into consideration. For each -#' month a two-sided p-value is computed against the null hypothesis that the rate in that month deviates from the mean rate -#' no more than `maxRatio`. This p-value is compared to an alpha value, using a Bonferroni correction to adjust for the -#' multiple testing across months. +#' Computes for each month the observed and expected count, and computes the (weighted) mean ratio between the two. If +#' splines are used to adjust for seasonality and/or calendar time, these adjustments are taken into consideration when +#' considering the expected count. A one-sided p-value is computed against the null hypothesis that the ratio is smaller +#' than `maxRatio`. If this p-value exceeds the specified alpha value, the series is considered stable. #' #' @template StudyPopulation #' @param sccsModel Optional: A fitted SCCS model as created using [fitSccsModel()]. If the #' model contains splines for seasonality and or calendar time these will be adjusted #' for before computing stability. -#' @param maxRatio The maximum ratio between the (adjusted) rate in a month, and the mean (adjusted) rate that -#' we would consider to be irrelevant. -#' @param alpha The alpha (type 1 error) used to test for stability. A Bonferroni correction will -#' be applied for the number of months tested. +#' @param maxRatio The maximum global ratio between the observed and expected count. +#' @param alpha The alpha (type 1 error) used to test for stability. #' #' @return -#' A tibble with information on the temporal stability per month. The column `stable` indicates whether the rate -#' of the outcome is within the expected range for that month, assuming the rate is constant over time. +#' A tibble with one row and three columns: `ratio` indicates the estimated mean ratio between observed and expected. +#' `p` is the p-value against the null-hypothesis that the ratio is smaller than `maxRatio`, and `stable` is `TRUE` +#' if `p` is greater than `alpha`. #' #' @export computeTimeStability <- function(studyPopulation, sccsModel = NULL, maxRatio = 1.25, alpha = 0.05) { @@ -162,12 +154,25 @@ computeTimeStability <- function(studyPopulation, sccsModel = NULL, maxRatio = 1 checkmate::assertNumeric(alpha, lower = 0, upper = 1, len = 1, add = errorMessages) checkmate::reportAssertions(collection = errorMessages) - data <- computeOutcomeRatePerMonth(studyPopulation, sccsModel) + data <- SelfControlledCaseSeries:::computeOutcomeRatePerMonth(studyPopulation, sccsModel) + if (nrow(data) == 0) { + result <- tibble(ratio = NA, + p = 1, + stable = TRUE) + return(result) + } o <- data$observedCount e <- data$observedCount / data$adjustedRate + # logLikelihood <- function(x) { + # return(-sum(log(dpois(o, e*x) + dpois(o, e/x)))) + # } + # From https://cdsmithus.medium.com/the-logarithm-of-a-sum-69dd76199790 + smoothMax <- function(x, y) { + return(ifelse(abs(x-y) > 100, pmax(x,y), x + log(1 + exp(y-x)))) + } logLikelihood <- function(x) { - return(-sum(log(dpois(o, e*x) + dpois(o, e/x)))) + return(-sum(smoothMax(dpois(o, e*x, log = TRUE), dpois(o, e/x, log = TRUE)))) } likelihood <- function(x) { return(exp(-logLikelihood(x))) @@ -175,7 +180,7 @@ computeTimeStability <- function(studyPopulation, sccsModel = NULL, maxRatio = 1 vectorLikelihood <- function(x) { return(sapply(x, likelihood)) } - x <- seq(1,10, by = 0.1) + x <- seq(1, 10, by = 0.1) ll <- sapply(x, logLikelihood) maxX <- x[max(which(!is.na(ll) & !is.infinite(ll)))] minX <- x[min(which(!is.na(ll) & !is.infinite(ll)))] diff --git a/R/Export.R b/R/Export.R index 2261901..d75c6e9 100644 --- a/R/Export.R +++ b/R/Export.R @@ -26,7 +26,7 @@ #' (MDRR)? #' @param easeThreshold What is the maximum allowed expected absolute systematic error #' (EASE). -#' @param timeTrendPThreshold What family-wise p-value threshold (alpha) will be used to determine +#' @param timeTrendPThreshold What p-value threshold (alpha) will be used to determine #' temporal instability? #' @param preExposurePThreshold What p-value threshold (alpha) will be used to determine whether the #' rate of the outcome was higher just before exposure initiation? @@ -568,15 +568,17 @@ exportFromSccsDataStudyPopSccsModel <- function(outputFolder, exportFolder, data ) } - # sccsTimeTrend table - timeTrendData <- computeTimeStability( + # sccsTimeTrend table. No longer computing stability per month. Set flag + # to TRUE for all months for backwards compatability: + timeTrendData <- computeOutcomeRatePerMonth( studyPopulation = studyPop, sccsModel = sccsModel ) %>% mutate( calendarYear = floor(.data$month / 12), calendarMonth = floor(.data$month %% 12) + 1, - stable = as.integer(.data$stable) + stable = 1, + p = 1 ) %>% select( "calendarYear", @@ -596,11 +598,12 @@ exportFromSccsDataStudyPopSccsModel <- function(outputFolder, exportFolder, data bind_cols(timeTrendData) # sccsDiagnosticsSummary table + stability <- computeTimeStability(studyPopulation = studyPop, sccsModel = sccsModel) table <- ease %>% filter(.data$exposuresOutcomeSetId == refRow$exposuresOutcomeSetId & .data$analysisId == refRow$analysisId) %>% mutate( mdrr = as.numeric(NA), - timeTrendP = min(timeTrendData$p * nrow(timeTrendData)), + timeTrendP = stability$p, preExposureP = as.numeric(NA) ) for (j in seq_len(nrow(table))) { diff --git a/R/RcppExports.R b/R/RcppExports.R index 8585b25..479a93f 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -25,3 +25,7 @@ testEgid <- function(p, present, astart, aend, start, end) { .Call('_SelfControlledCaseSeries_testEgid', PACKAGE = 'SelfControlledCaseSeries', p, present, astart, aend, start, end) } +computeCorrections <- function(cases, monthAdjustments) { + .Call('_SelfControlledCaseSeries_computeCorrections', PACKAGE = 'SelfControlledCaseSeries', cases, monthAdjustments) +} + diff --git a/R/SelfControlledCaseSeries.R b/R/SelfControlledCaseSeries.R index f539dc0..28b0ca3 100644 --- a/R/SelfControlledCaseSeries.R +++ b/R/SelfControlledCaseSeries.R @@ -23,7 +23,7 @@ #' @importFrom SqlRender loadRenderTranslateSql translateSql #' @importFrom grDevices rgb #' @importFrom methods is -#' @importFrom stats aggregate coef confint dgamma nlm pgamma pnorm printCoefmat qnorm rnorm rpois runif splinefun quantile ppois +#' @importFrom stats aggregate coef confint dgamma nlm pgamma pnorm printCoefmat qnorm rnorm rpois runif splinefun quantile ppois dpois integrate optim pchisq #' @importFrom utils head tail setTxtProgressBar txtProgressBar packageVersion #' @import dplyr #' @import Cyclops diff --git a/extras/AgeAndSeasonSimulations.R b/extras/AgeAndSeasonSimulations.R index 5b85045..6b080c5 100644 --- a/extras/AgeAndSeasonSimulations.R +++ b/extras/AgeAndSeasonSimulations.R @@ -1,6 +1,6 @@ library(SelfControlledCaseSeries) options(andromedaTempFolder = "d:/andromedaTemp") -settings <- createSccsSimulationSettings(includeAgeEffect = F, +settings <- createSccsSimulationSettings(includeAgeEffect = T, includeCalendarTimeEffect = T, includeSeasonality = T) set.seed(123) diff --git a/man/computeTimeStability.Rd b/man/computeTimeStability.Rd index fa135dd..e395c0a 100644 --- a/man/computeTimeStability.Rd +++ b/man/computeTimeStability.Rd @@ -18,23 +18,21 @@ computeTimeStability( model contains splines for seasonality and or calendar time these will be adjusted for before computing stability.} -\item{maxRatio}{The maximum ratio between the (adjusted) rate in a month, and the mean (adjusted) rate that -we would consider to be irrelevant.} +\item{maxRatio}{The maximum global ratio between the observed and expected count.} -\item{alpha}{The alpha (type 1 error) used to test for stability. A Bonferroni correction will -be applied for the number of months tested.} +\item{alpha}{The alpha (type 1 error) used to test for stability.} } \value{ -A tibble with information on the temporal stability per month. The column \code{stable} indicates whether the rate -of the outcome is within the expected range for that month, assuming the rate is constant over time. +A tibble with one row and three columns: \code{ratio} indicates the estimated mean ratio between observed and expected. +\code{p} is the p-value against the null-hypothesis that the ratio is smaller than \code{maxRatio}, and \code{stable} is \code{TRUE} +if \code{p} is greater than \code{alpha}. } \description{ Compute stability of outcome rate over time } \details{ -Computes for each calendar month the rate of the outcome, and evaluates whether that rate is constant over time. If -splines are used to adjust for seasonality and/or calendar time, these adjustments are taken into consideration. For each -month a two-sided p-value is computed against the null hypothesis that the rate in that month deviates from the mean rate -no more than \code{maxRatio}. This p-value is compared to an alpha value, using a Bonferroni correction to adjust for the -multiple testing across months. +Computes for each month the observed and expected count, and computes the (weighted) mean ratio between the two. If +splines are used to adjust for seasonality and/or calendar time, these adjustments are taken into consideration when +considering the expected count. A one-sided p-value is computed against the null hypothesis that the ratio is smaller +than \code{maxRatio}. If this p-value exceeds the specified alpha value, the series is considered stable. } diff --git a/man/createSccsDiagnosticThresholds.Rd b/man/createSccsDiagnosticThresholds.Rd index ffc4271..0d1c019 100644 --- a/man/createSccsDiagnosticThresholds.Rd +++ b/man/createSccsDiagnosticThresholds.Rd @@ -18,7 +18,7 @@ createSccsDiagnosticThresholds( \item{easeThreshold}{What is the maximum allowed expected absolute systematic error (EASE).} -\item{timeTrendPThreshold}{What family-wise p-value threshold (alpha) will be used to determine +\item{timeTrendPThreshold}{What p-value threshold (alpha) will be used to determine temporal instability?} \item{preExposurePThreshold}{What p-value threshold (alpha) will be used to determine whether the diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 902146b..b6410f7 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -123,6 +123,18 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// computeCorrections +std::vector computeCorrections(const DataFrame& cases, const DataFrame& monthAdjustments); +RcppExport SEXP _SelfControlledCaseSeries_computeCorrections(SEXP casesSEXP, SEXP monthAdjustmentsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const DataFrame& >::type cases(casesSEXP); + Rcpp::traits::input_parameter< const DataFrame& >::type monthAdjustments(monthAdjustmentsSEXP); + rcpp_result_gen = Rcpp::wrap(computeCorrections(cases, monthAdjustments)); + return rcpp_result_gen; +END_RCPP +} static const R_CallMethodDef CallEntries[] = { {"_SelfControlledCaseSeries_convertToSccs", (DL_FUNC) &_SelfControlledCaseSeries_convertToSccs, 18}, @@ -131,6 +143,7 @@ static const R_CallMethodDef CallEntries[] = { {"_SelfControlledCaseSeries_testEwid", (DL_FUNC) &_SelfControlledCaseSeries_testEwid, 6}, {"_SelfControlledCaseSeries_testEgad", (DL_FUNC) &_SelfControlledCaseSeries_testEgad, 6}, {"_SelfControlledCaseSeries_testEgid", (DL_FUNC) &_SelfControlledCaseSeries_testEgid, 6}, + {"_SelfControlledCaseSeries_computeCorrections", (DL_FUNC) &_SelfControlledCaseSeries_computeCorrections, 2}, {NULL, NULL, 0} }; diff --git a/src/RcppWrapper.cpp b/src/RcppWrapper.cpp index 240bf31..4f745c4 100644 --- a/src/RcppWrapper.cpp +++ b/src/RcppWrapper.cpp @@ -141,4 +141,39 @@ double testEgid(std::vector p, double present, double astart, double aen return weight; } +// [[Rcpp::export]] +std::vector computeCorrections(const DataFrame& cases, const DataFrame& monthAdjustments) { + NumericVector startMonths = cases["startMonth"]; + NumericVector startMonthFractions = cases["startMonthFraction"]; + NumericVector endMonths = cases["endMonth"]; + NumericVector endMonthFractions = cases["endMonthFraction"]; + NumericVector months = monthAdjustments["month"]; + NumericVector totalRrs = monthAdjustments["totalRr"]; + std::vector corrections(cases.nrows()); + for (int i = 0; i < cases.nrows(); i++) { + double startMonth = startMonths[i]; + double startMonthFraction = startMonthFractions[i]; + double endMonth = endMonths[i]; + double endMonthFraction = endMonthFractions[i]; + double sumW = 0; + double sumRr = 0; + for (int j = 0; j < monthAdjustments.nrows(); j++) { + double month = months[j]; + if (month >= startMonth) { + double w = 1; + if (month == startMonth) + w = startMonthFraction; + else if (month == endMonth) + w = endMonthFraction; + sumW += w; + sumRr += w * totalRrs[j]; + } + if (month == endMonth) + break; + } + corrections[i] = sumRr / sumW; + } + return corrections; +} + #endif // __RcppWrapper_cpp__