remove estimate_coef, ame to amp

helske · Sep 4, 2024 · e97e0fe · e97e0fe
1 parent 5a7cdcd
commit e97e0fe
Show file tree

Hide file tree

Showing 22 changed files with 490 additions and 461 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -29,6 +29,8 @@ S3method(posterior_probs,hmm)
 S3method(posterior_probs,mhmm)
 S3method(posterior_probs,mnhmm)
 S3method(posterior_probs,nhmm)
+S3method(predict,mnhmm)
+S3method(predict,nhmm)
 S3method(print,hmm)
 S3method(print,mhmm)
 S3method(print,mnhmm)
@@ -47,14 +49,13 @@ S3method(vcov,mhmm)
 export("cluster_names<-")
 export("state_names<-")
 export(alphabet)
-export(average_marginal_effect)
+export(average_marginal_prediction)
 export(build_hmm)
 export(build_lcm)
 export(build_mhmm)
 export(build_mm)
 export(build_mmm)
 export(cluster_names)
-export(estimate_coef)
 export(estimate_mnhmm)
 export(estimate_nhmm)
 export(fit_model)

diff --git a/NEWS.md b/NEWS.md
@@ -1,13 +1,17 @@
-seqHMM 1.3.0
+seqHMM 2.0.0
 ==============
-  * Rewrote sequence visualization functions using `ggseqplot` and `patchwork` 
-  packages. Old plotting functions are deprecated and will be removed in the 
-  future.
   * Added support for non-homogeneous HMMs where initial, transition, and 
-  emission probabilities can depend on individual-specific covariates. 
-  Penalized maximum likelihood estimation of these models is based on 
-  L-BFGS-B with automatic differentiation with `Stan`.
+emission probabilities can depend on individual-specific covariates. 
+Penalized maximum likelihood estimation of these models is based on 
+L-BFGS-B with automatic differentiation with `Stan`.
+  * Rewrote sequence visualization functions using `ggseqplot` and `patchwork` 
+packages. Old plotting functions are deprecated and will be removed in the 
+future.
   * Warning and error messages were rewritten using `cli` package.
+  * Removed the function `estimate_coef`. A portion of the code was 
+accidentally commented out, rendering the function non-functional for 
+several years. Rather than correcting the code, the function was removed as it 
+was deemed unnecessary.
   * Added automatic tests using `testthat` package.
   * Internally switched from Rd syntax to Markdown in Roxygen documentation.
 

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -145,10 +145,6 @@ objectivex <- function(transition, emission, init, obs, ANZ, BNZ, INZ, nSymbols,
     .Call(`_seqHMM_objectivex`, transition, emission, init, obs, ANZ, BNZ, INZ, nSymbols, coef, X, numberOfStates, threads)
 }
 
-estimate_coefs <- function(transition, emission, init, obs, nSymbols, coef, X, numberOfStates, itermax, tol, trace, threads) {
-    .Call(`_seqHMM_estimate_coefs`, transition, emission, init, obs, nSymbols, coef, X, numberOfStates, itermax, tol, trace, threads)
-}
-
 softmax <- function(x, logspace) {
     .Call(`_seqHMM_softmax`, x, logspace)
 }

diff --git a/R/average_marginal_effect.R b/R/average_marginal_effect.R
diff --git a/R/average_marginal_prediction.R b/R/average_marginal_prediction.R
@@ -0,0 +1,146 @@
+#' Average Marginal Predictions for Non-homogenous Hidden Markov Models
+#' 
+#' @param model A Hidden Markov Model of class `nhmm` or `mnhmm`.
+#' @param variable Name of the variable of interest.
+#' @param values Vector containing one or two values for `variable`.
+#' @param newdata Optional data frame which is used for marginalization.
+#' @param nsim Non-negative integer defining the number of samples from the 
+#' normal approximation of the model parameters used in 
+#' computing the approximate quantiles of the estimates. If `0`, only point 
+#' estimates are returned.
+#' @param probs Vector defining the quantiles of interest. Default is 
+#' `c(0.025, 0.5, 0.975)`.
+#' @export
+average_marginal_prediction <- function(
+    model, variable, values, marginalize_B_over = "sequences", newdata = NULL, 
+    nsim = 0, probs = c(0.025, 0.5, 0.975)) {
+  marginalize_over <- match.arg(
+    marginalize_over, c("sequences", "states", "clusters"), several.ok = TRUE)
+  stopifnot_(
+    checkmate::test_count(nsim),
+    "Argument {.arg nsim} should be a single non-negative integer."
+  )
+  stopifnot_(
+    checkmate::test_string(x = variable), 
+    "Argument {.arg variable} must be a single character string."
+  )
+  stopifnot_(
+    length(values) != 2, 
+    "Argument {.arg values} should contain two values for 
+    variable {.var variable}.")
+  if (is.null(newdata)) {
+    time <- model$time_variable
+    id <- model$id_variable
+    stopifnot_(
+      is.data.frame(newdata), 
+      "Argument {.arg newdata} must be a {.cls data.frame} object."
+    )
+    stopifnot_(
+      !is.null(newdata[[id]]), 
+      "Can't find grouping variable {.var {id}} in {.arg newdata}."
+    )
+    stopifnot_(
+      !is.null(newdata[[time]]), 
+      "Can't find time index variable {.var {time}} in {.arg newdata}."
+    )
+    stopifnot_(
+      !is.null(newdata[[variable]]), 
+      "Can't find time variable {.var {variable}} in {.arg newdata}."
+    )
+  } else {
+    stopifnot(
+      !is.null(model$data),
+      "Model does not contain original data and argument {.arg newdata} is 
+      {.var NULL}."
+    )
+    newdata <- model$data
+  }
+  # use same RNG seed so that the same samples of coefficients are drawn
+  newdata[[variable]] <- values[1]
+  seed <- sample(.Machine$integer.max, 1)
+  set.seed(seed)
+  pred <- predict(model, newdata, nsim, return_samples = TRUE)
+  if (length(values) == 2) {
+    newdata[[variable]] <- values[2]
+    set.seed(seed)
+    pred2 <- predict(model, newdata, nsim, return_samples = TRUE)
+    pred <- mapply("-", pred, pred2, SIMPLIFY = FALSE)
+  }
+  T <- model$length_of_sequences
+  N <- model$n_sequences
+  S <- model$n_states
+  M <- model$n_symbols
+  C <- model$n_channels
+  D <- model$n_clusters
+  if (C == 1) {
+    ids <- rownames(model$observations)
+    times <- colnames(model$observations)
+    symbol_names <- list(model$symbol_names)
+  } else {
+    ids <- rownames(model$observations[[1]])
+    times <- colnames(model$observations[[1]])
+    symbol_names <- model$symbol_names
+  }
+  marginalize <- dplyr::recode(
+    marginalize_B_over,
+    "clusters" = "cluster",
+    "states" = "state",
+    "sequences" = "id")
+  id_var <- model$id_variable
+  time_var <- model$time_variable
+
+  pi <- data.frame(
+    cluster = rep(model$cluster_names, each = S * N),
+    id = rep(ids, each = S),
+    state = model$state_names,
+    estimate = unlist(pred$pi)
+  ) |> 
+    dplyr::group_by(.data[[marginalize]]) |>
+    summarise(estimate = mean(estimate))
+  colnames(pi)[2] <- id_var
+
+  A <- data.frame(
+    cluster = rep(model$cluster_names, each = S^2 * T * N),
+    id = rep(ids, each = S^2 * T),
+    time = rep(times, each = S^2),
+    state_from = model$state_names,
+    state_to = rep(model$state_names, each = S),
+    estimate = unlist(pred$A)
+  )
+  colnames(A)[2] <- id_var
+  colnames(A)[3] <- time_var
+
+  B <- data.frame(
+    cluster =  rep(model$cluster_names, each = S * sum(M) * T * N),
+    id = unlist(lapply(seq_len(C), function(i) rep(ids, each = S * M[i] * T))),
+    time = unlist(lapply(seq_len(C), function(i) rep(times, each = S * M[i]))),
+    state = model$state_names,
+    channel = unlist(lapply(seq_len(C), function(i) {
+      rep(model$channel_names[i], each = S * M[i]* T * N)
+    })),
+    observation = unlist(lapply(seq_len(C), function(i) {
+      rep(symbol_names[[i]], each = S)
+    })),
+    estimate = unlist(pred$B)
+  ) |> 
+    dplyr::group_by(.data[[marginalize]]) |>
+    summarise(estimate = mean(estimate))
+  idx <- which(names(B) == "id")
+  if(length(idx) == 1) names(B)[idx] <- id_var
+  idx <- which(names(B) == "time")
+  if(length(idx) == 1) names(B)[idx] <- time_var
+  if (C == 1) emission_probs$channel <- NULL
+
+  if (D > 1) {
+    omega <- data.frame(
+      cluster = model$cluster_names,
+      id = rep(ids, each = D),
+      estimate = c(pred$omega)
+    )
+    out <- list(omega = omega, pi = pi, A = A, B = B)  
+  } else {
+    out <- list(pi = pi, A = A, B = B)
+  }
+  class(out) <- "amp"
+  out
+}