From bb0c91956a35e11a81679e7adf9592ab31c2fd10 Mon Sep 17 00:00:00 2001
From: Admin_mschuemi <Admin_mschuemi@its.jnj.com>
Date: Fri, 18 Aug 2023 09:08:10 -0400
Subject: [PATCH] Porting computation of correction to C++ for speed

---
 NAMESPACE                             |  4 ++
 R/Diagnostics.R                       | 69 ++++++++++++++-------------
 R/Export.R                            | 13 +++--
 R/RcppExports.R                       |  4 ++
 R/SelfControlledCaseSeries.R          |  2 +-
 extras/AgeAndSeasonSimulations.R      |  2 +-
 man/computeTimeStability.Rd           | 20 ++++----
 man/createSccsDiagnosticThresholds.Rd |  2 +-
 src/RcppExports.cpp                   | 13 +++++
 src/RcppWrapper.cpp                   | 35 ++++++++++++++
 10 files changed, 113 insertions(+), 51 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index f3b981b..f5beb82 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -77,7 +77,11 @@ importFrom(stats,aggregate)
 importFrom(stats,coef)
 importFrom(stats,confint)
 importFrom(stats,dgamma)
+importFrom(stats,dpois)
+importFrom(stats,integrate)
 importFrom(stats,nlm)
+importFrom(stats,optim)
+importFrom(stats,pchisq)
 importFrom(stats,pgamma)
 importFrom(stats,pnorm)
 importFrom(stats,ppois)
diff --git a/R/Diagnostics.R b/R/Diagnostics.R
index 9d179f7..e04a2a7 100644
--- a/R/Diagnostics.R
+++ b/R/Diagnostics.R
@@ -16,8 +16,14 @@
 # source("~/git/SelfControlledCaseSeries/R/SccsDataConversion.R")
 
 computeOutcomeRatePerMonth <- function(studyPopulation, sccsModel = NULL) {
+  # Computes not only the observed outcome rate per calendar month, but also
+  # observed / expected rate assuming everything is constant, and if sccsModel is
+  # provided, observed / expected using the adjustments in the model.
+  # To compute the expected rate for a month, we must consider only those cases
+  # observed during that month, summing their respective incidence rates.
+
   if (nrow(studyPopulation$cases) == 0) {
-    tibble(
+    result <- tibble(
       month = 1.0,
       observedCount = 1.0,
       observationPeriodCount = 1.0,
@@ -25,8 +31,8 @@ computeOutcomeRatePerMonth <- function(studyPopulation, sccsModel = NULL) {
       adjustedRate = 1.0,
       monthStartDate = as.Date("2000-01-01"),
       monthEndDate = as.Date("2000-01-01")) %>%
-      filter(month == 0) %>%
-      return()
+      filter(month == 0)
+    return(result)
   }
   cases <- studyPopulation$cases %>%
     mutate(startDate = .data$observationPeriodStartDate + .data$startDay,
@@ -78,23 +84,11 @@ computeOutcomeRatePerMonth <- function(studyPopulation, sccsModel = NULL) {
       select(-"monthOfYear") %>%
       mutate(totalRr = .data$totalRr * .data$seasonRr)
   }
-  computeCorrection <- function(startMonth, endMonth, startMonthFraction, endMonthFraction) {
-    monthAdjustments %>%
-      filter(.data$month >= startMonth, .data$month <= endMonth) %>%
-      mutate(weight = if_else(.data$month == startMonth,
-                              startMonthFraction,
-                              if_else(.data$month == endMonth,
-                                      endMonthFraction,
-                                      1))) %>%
-      summarize(correction = sum(.data$weight * .data$totalRr) / sum(.data$weight)) %>%
-      pull(.data$correction) %>%
-      return()
-  }
   if (hasAdjustment) {
+    # Need to correct for the fact that a person may have seen only part of the spline, so
+    # techincally has a different intercept:
     cases <- cases %>%
-      rowwise() %>%
-      mutate(correction = computeCorrection(.data$startMonth, .data$endMonth, .data$startMonthFraction, .data$endMonthFraction)) %>%
-      ungroup()
+      mutate(correction = computeCorrections(cases, monthAdjustments))
   } else {
     cases <- cases %>%
       mutate(correction = 1)
@@ -134,24 +128,22 @@ computeOutcomeRatePerMonth <- function(studyPopulation, sccsModel = NULL) {
 #' Compute stability of outcome rate over time
 #'
 #' @details
-#' Computes for each calendar month the rate of the outcome, and evaluates whether that rate is constant over time. If
-#' splines are used to adjust for seasonality and/or calendar time, these adjustments are taken into consideration. For each
-#' month a two-sided p-value is computed against the null hypothesis that the rate in that month deviates from the mean rate
-#' no more than `maxRatio`. This p-value is compared to an alpha value, using a Bonferroni correction to adjust for the
-#' multiple testing across months.
+#' Computes for each month the observed and expected count, and computes the (weighted) mean ratio between the two. If
+#' splines are used to adjust for seasonality and/or calendar time, these adjustments are taken into consideration when
+#' considering the expected count. A one-sided p-value is computed against the null hypothesis that the ratio is smaller
+#' than `maxRatio`. If this p-value exceeds the specified alpha value, the series is considered stable.
 #'
 #' @template StudyPopulation
 #' @param sccsModel         Optional: A fitted SCCS model as created using [fitSccsModel()]. If the
 #'                          model contains splines for seasonality and or calendar time these will be adjusted
 #'                          for before computing stability.
-#' @param maxRatio          The maximum ratio between the (adjusted) rate in a month, and the mean (adjusted) rate that
-#'                          we would consider to be irrelevant.
-#' @param alpha             The alpha (type 1 error) used to test for stability. A Bonferroni correction will
-#'                          be applied for the number of months tested.
+#' @param maxRatio          The maximum global ratio between the observed and expected count.
+#' @param alpha             The alpha (type 1 error) used to test for stability.
 #'
 #' @return
-#' A tibble with information on the temporal stability per month. The column `stable` indicates whether the rate
-#' of the outcome is within the expected range for that month, assuming the rate is constant over time.
+#' A tibble with one row and three columns: `ratio` indicates the estimated mean ratio between observed and expected.
+#' `p` is the p-value against the null-hypothesis that the ratio is smaller than `maxRatio`, and `stable` is `TRUE`
+#' if `p` is greater than `alpha`.
 #'
 #' @export
 computeTimeStability <- function(studyPopulation, sccsModel = NULL, maxRatio = 1.25, alpha = 0.05) {
@@ -162,12 +154,25 @@ computeTimeStability <- function(studyPopulation, sccsModel = NULL, maxRatio = 1
   checkmate::assertNumeric(alpha, lower = 0, upper = 1, len = 1, add = errorMessages)
   checkmate::reportAssertions(collection = errorMessages)
 
-  data <- computeOutcomeRatePerMonth(studyPopulation, sccsModel)
+  data <- SelfControlledCaseSeries:::computeOutcomeRatePerMonth(studyPopulation, sccsModel)
+  if (nrow(data) == 0) {
+    result <- tibble(ratio = NA,
+                     p = 1,
+                     stable = TRUE)
+    return(result)
+  }
   o <- data$observedCount
   e <- data$observedCount / data$adjustedRate
 
+  # logLikelihood <- function(x) {
+  #   return(-sum(log(dpois(o, e*x) + dpois(o, e/x))))
+  # }
+  # From https://cdsmithus.medium.com/the-logarithm-of-a-sum-69dd76199790
+  smoothMax <- function(x, y) {
+    return(ifelse(abs(x-y) > 100, pmax(x,y), x + log(1 + exp(y-x))))
+  }
   logLikelihood <- function(x) {
-    return(-sum(log(dpois(o, e*x) + dpois(o, e/x))))
+    return(-sum(smoothMax(dpois(o, e*x, log = TRUE), dpois(o, e/x, log = TRUE))))
   }
   likelihood <- function(x) {
     return(exp(-logLikelihood(x)))
@@ -175,7 +180,7 @@ computeTimeStability <- function(studyPopulation, sccsModel = NULL, maxRatio = 1
   vectorLikelihood <- function(x) {
     return(sapply(x, likelihood))
   }
-  x <- seq(1,10, by = 0.1)
+  x <- seq(1, 10, by = 0.1)
   ll <- sapply(x, logLikelihood)
   maxX <- x[max(which(!is.na(ll) & !is.infinite(ll)))]
   minX <- x[min(which(!is.na(ll) & !is.infinite(ll)))]
diff --git a/R/Export.R b/R/Export.R
index 2261901..d75c6e9 100644
--- a/R/Export.R
+++ b/R/Export.R
@@ -26,7 +26,7 @@
 #'                              (MDRR)?
 #' @param easeThreshold         What is the maximum allowed expected absolute systematic error
 #'                              (EASE).
-#' @param timeTrendPThreshold   What family-wise p-value threshold (alpha) will be used to determine
+#' @param timeTrendPThreshold   What p-value threshold (alpha) will be used to determine
 #'                              temporal instability?
 #' @param preExposurePThreshold What p-value threshold (alpha) will be used to determine whether the
 #'                              rate of the outcome was higher just before exposure initiation?
@@ -568,15 +568,17 @@ exportFromSccsDataStudyPopSccsModel <- function(outputFolder, exportFolder, data
         )
     }
 
-    # sccsTimeTrend table
-    timeTrendData <- computeTimeStability(
+    # sccsTimeTrend table. No longer computing stability per month. Set flag
+    # to TRUE for all months for backwards compatability:
+    timeTrendData <- computeOutcomeRatePerMonth(
       studyPopulation = studyPop,
       sccsModel = sccsModel
     ) %>%
       mutate(
         calendarYear = floor(.data$month / 12),
         calendarMonth = floor(.data$month %% 12) + 1,
-        stable = as.integer(.data$stable)
+        stable = 1,
+        p = 1
       ) %>%
       select(
         "calendarYear",
@@ -596,11 +598,12 @@ exportFromSccsDataStudyPopSccsModel <- function(outputFolder, exportFolder, data
       bind_cols(timeTrendData)
 
     # sccsDiagnosticsSummary table
+    stability <- computeTimeStability(studyPopulation = studyPop, sccsModel = sccsModel)
     table <- ease %>%
       filter(.data$exposuresOutcomeSetId == refRow$exposuresOutcomeSetId & .data$analysisId == refRow$analysisId) %>%
       mutate(
         mdrr = as.numeric(NA),
-        timeTrendP = min(timeTrendData$p * nrow(timeTrendData)),
+        timeTrendP = stability$p,
         preExposureP = as.numeric(NA)
       )
     for (j in seq_len(nrow(table))) {
diff --git a/R/RcppExports.R b/R/RcppExports.R
index 8585b25..479a93f 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -25,3 +25,7 @@ testEgid <- function(p, present, astart, aend, start, end) {
     .Call('_SelfControlledCaseSeries_testEgid', PACKAGE = 'SelfControlledCaseSeries', p, present, astart, aend, start, end)
 }
 
+computeCorrections <- function(cases, monthAdjustments) {
+    .Call('_SelfControlledCaseSeries_computeCorrections', PACKAGE = 'SelfControlledCaseSeries', cases, monthAdjustments)
+}
+
diff --git a/R/SelfControlledCaseSeries.R b/R/SelfControlledCaseSeries.R
index f539dc0..28b0ca3 100644
--- a/R/SelfControlledCaseSeries.R
+++ b/R/SelfControlledCaseSeries.R
@@ -23,7 +23,7 @@
 #' @importFrom SqlRender loadRenderTranslateSql translateSql
 #' @importFrom grDevices rgb
 #' @importFrom methods is
-#' @importFrom stats aggregate coef confint dgamma nlm pgamma pnorm printCoefmat qnorm rnorm rpois runif splinefun quantile ppois
+#' @importFrom stats aggregate coef confint dgamma nlm pgamma pnorm printCoefmat qnorm rnorm rpois runif splinefun quantile ppois dpois integrate optim pchisq
 #' @importFrom utils head tail setTxtProgressBar txtProgressBar packageVersion
 #' @import dplyr
 #' @import Cyclops
diff --git a/extras/AgeAndSeasonSimulations.R b/extras/AgeAndSeasonSimulations.R
index 5b85045..6b080c5 100644
--- a/extras/AgeAndSeasonSimulations.R
+++ b/extras/AgeAndSeasonSimulations.R
@@ -1,6 +1,6 @@
 library(SelfControlledCaseSeries)
 options(andromedaTempFolder = "d:/andromedaTemp")
-settings <- createSccsSimulationSettings(includeAgeEffect = F,
+settings <- createSccsSimulationSettings(includeAgeEffect = T,
                                          includeCalendarTimeEffect = T,
                                          includeSeasonality = T)
 set.seed(123)
diff --git a/man/computeTimeStability.Rd b/man/computeTimeStability.Rd
index fa135dd..e395c0a 100644
--- a/man/computeTimeStability.Rd
+++ b/man/computeTimeStability.Rd
@@ -18,23 +18,21 @@ computeTimeStability(
 model contains splines for seasonality and or calendar time these will be adjusted
 for before computing stability.}
 
-\item{maxRatio}{The maximum ratio between the (adjusted) rate in a month, and the mean (adjusted) rate that
-we would consider to be irrelevant.}
+\item{maxRatio}{The maximum global ratio between the observed and expected count.}
 
-\item{alpha}{The alpha (type 1 error) used to test for stability. A Bonferroni correction will
-be applied for the number of months tested.}
+\item{alpha}{The alpha (type 1 error) used to test for stability.}
 }
 \value{
-A tibble with information on the temporal stability per month. The column \code{stable} indicates whether the rate
-of the outcome is within the expected range for that month, assuming the rate is constant over time.
+A tibble with one row and three columns: \code{ratio} indicates the estimated mean ratio between observed and expected.
+\code{p} is the p-value against the null-hypothesis that the ratio is smaller than \code{maxRatio}, and \code{stable} is \code{TRUE}
+if \code{p} is greater than \code{alpha}.
 }
 \description{
 Compute stability of outcome rate over time
 }
 \details{
-Computes for each calendar month the rate of the outcome, and evaluates whether that rate is constant over time. If
-splines are used to adjust for seasonality and/or calendar time, these adjustments are taken into consideration. For each
-month a two-sided p-value is computed against the null hypothesis that the rate in that month deviates from the mean rate
-no more than \code{maxRatio}. This p-value is compared to an alpha value, using a Bonferroni correction to adjust for the
-multiple testing across months.
+Computes for each month the observed and expected count, and computes the (weighted) mean ratio between the two. If
+splines are used to adjust for seasonality and/or calendar time, these adjustments are taken into consideration when
+considering the expected count. A one-sided p-value is computed against the null hypothesis that the ratio is smaller
+than \code{maxRatio}. If this p-value exceeds the specified alpha value, the series is considered stable.
 }
diff --git a/man/createSccsDiagnosticThresholds.Rd b/man/createSccsDiagnosticThresholds.Rd
index ffc4271..0d1c019 100644
--- a/man/createSccsDiagnosticThresholds.Rd
+++ b/man/createSccsDiagnosticThresholds.Rd
@@ -18,7 +18,7 @@ createSccsDiagnosticThresholds(
 \item{easeThreshold}{What is the maximum allowed expected absolute systematic error
 (EASE).}
 
-\item{timeTrendPThreshold}{What family-wise p-value threshold (alpha) will be used to determine
+\item{timeTrendPThreshold}{What p-value threshold (alpha) will be used to determine
 temporal instability?}
 
 \item{preExposurePThreshold}{What p-value threshold (alpha) will be used to determine whether the
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 902146b..b6410f7 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -123,6 +123,18 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
+// computeCorrections
+std::vector<double> computeCorrections(const DataFrame& cases, const DataFrame& monthAdjustments);
+RcppExport SEXP _SelfControlledCaseSeries_computeCorrections(SEXP casesSEXP, SEXP monthAdjustmentsSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< const DataFrame& >::type cases(casesSEXP);
+    Rcpp::traits::input_parameter< const DataFrame& >::type monthAdjustments(monthAdjustmentsSEXP);
+    rcpp_result_gen = Rcpp::wrap(computeCorrections(cases, monthAdjustments));
+    return rcpp_result_gen;
+END_RCPP
+}
 
 static const R_CallMethodDef CallEntries[] = {
     {"_SelfControlledCaseSeries_convertToSccs", (DL_FUNC) &_SelfControlledCaseSeries_convertToSccs, 18},
@@ -131,6 +143,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"_SelfControlledCaseSeries_testEwid", (DL_FUNC) &_SelfControlledCaseSeries_testEwid, 6},
     {"_SelfControlledCaseSeries_testEgad", (DL_FUNC) &_SelfControlledCaseSeries_testEgad, 6},
     {"_SelfControlledCaseSeries_testEgid", (DL_FUNC) &_SelfControlledCaseSeries_testEgid, 6},
+    {"_SelfControlledCaseSeries_computeCorrections", (DL_FUNC) &_SelfControlledCaseSeries_computeCorrections, 2},
     {NULL, NULL, 0}
 };
 
diff --git a/src/RcppWrapper.cpp b/src/RcppWrapper.cpp
index 240bf31..4f745c4 100644
--- a/src/RcppWrapper.cpp
+++ b/src/RcppWrapper.cpp
@@ -141,4 +141,39 @@ double testEgid(std::vector<double> p, double present, double astart, double aen
   return weight;
 }
 
+// [[Rcpp::export]]
+std::vector<double> computeCorrections(const DataFrame& cases, const DataFrame& monthAdjustments) {
+  NumericVector startMonths = cases["startMonth"];
+  NumericVector startMonthFractions = cases["startMonthFraction"];
+  NumericVector endMonths = cases["endMonth"];
+  NumericVector endMonthFractions = cases["endMonthFraction"];
+  NumericVector months = monthAdjustments["month"];
+  NumericVector totalRrs = monthAdjustments["totalRr"];
+  std::vector<double> corrections(cases.nrows());
+  for (int i = 0; i < cases.nrows(); i++) {
+    double startMonth = startMonths[i];
+    double startMonthFraction = startMonthFractions[i];
+    double endMonth = endMonths[i];
+    double endMonthFraction = endMonthFractions[i];
+    double sumW = 0;
+    double sumRr = 0;
+    for (int j = 0; j < monthAdjustments.nrows(); j++) {
+      double month = months[j];
+      if (month >= startMonth) {
+        double w = 1;
+        if (month == startMonth)
+          w = startMonthFraction;
+        else if (month == endMonth)
+          w = endMonthFraction;
+        sumW += w;
+        sumRr += w * totalRrs[j];
+      }
+      if (month == endMonth)
+        break;
+    }
+    corrections[i] = sumRr / sumW;
+  }
+  return corrections;
+}
+
 #endif // __RcppWrapper_cpp__