Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add LFO-CV to projpred #467

Open
wants to merge 20 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
468c9ae
add skeleton LFO varsel method
yannmclatchie Oct 6, 2023
99f728c
update comment
yannmclatchie Oct 6, 2023
63b0475
re-document
fweber144 Oct 26, 2023
2dcdc2b
minor cleaning
fweber144 Oct 26, 2023
5ba8e06
Merge remote-tracking branch 'upstream/master' into lfo
fweber144 Oct 26, 2023
7050b82
update `lfo_varsel()` to the most recent version of `kfold_varsel()`
fweber144 Oct 26, 2023
11ba4c8
Add alias `forced_search_terms()` for `force_search_terms()`.
fweber144 Oct 26, 2023
190d0bc
Add function `ordered_search_terms()`.
fweber144 Oct 26, 2023
186047b
Docs: Use a consistent location for arguments of functions documented…
fweber144 Oct 26, 2023
62276ef
In R, using `T` as an object name is bad practice (because `T` is ide…
fweber144 Oct 26, 2023
e852590
Docs: Minor enhancements; add `TODO`s.
fweber144 Oct 26, 2023
bdb6475
fixup! In R, using `T` as an object name is bad practice (because `T`…
fweber144 Oct 26, 2023
20a06a0
Use a consistent naming scheme for the `validate_num_folds[_lfo]()` f…
fweber144 Oct 26, 2023
94ab848
Use a consistent argument order for the `validate_num_folds[_lfo]()` …
fweber144 Oct 26, 2023
5c44bd9
`lfo_folds()` doesn't involve PRNG, so seed-related code is removed h…
fweber144 Oct 26, 2023
8900dd0
`lfo_folds()`: Fix a bug.
fweber144 Oct 26, 2023
5c19f7d
Fix typos in internal documentation.
fweber144 Oct 26, 2023
9d04ef3
Merge remote-tracking branch 'upstream/master' into lfo
fweber144 Nov 30, 2023
c8efad5
Update the LFO functions to their most recent K-fold analogs.
fweber144 Nov 30, 2023
bf11472
Merge remote-tracking branch 'upstream/master' into lfo
fweber144 Dec 16, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,11 @@ export(cvfolds)
export(do_call)
export(extend_family)
export(force_search_terms)
export(forced_search_terms)
export(get_refmodel)
export(init_refmodel)
export(lfo_folds)
export(ordered_search_terms)
export(performances)
export(predictor_terms)
export(proj_linpred)
Expand Down
243 changes: 243 additions & 0 deletions R/cv_varsel.R
Original file line number Diff line number Diff line change
Expand Up @@ -1584,6 +1584,249 @@ run_cvfun.refmodel <- function(object,
return(structure(cvfits, folds = folds))
}

# Exact LFO CV ------------------------------------------------------------

# Workhorse for exact leave-future-out (LFO) CV within cv_varsel(): for each
# of the K folds created by get_lfo(), run (or re-use) the search, run the
# performance evaluation for the submodels along the predictor ranking as well
# as for the fold's reference model, and finally re-assemble the fold-wise
# results in the original order of the observations.
#
# NOTE(review): this function was derived from kfold_varsel() (see the commit
# messages above, e.g. "update `lfo_varsel()` to the most recent version of
# `kfold_varsel()`"), so its structure deliberately parallels that function.
#
# Arguments (mirroring kfold_varsel(), except for `L`):
#   refmodel: the full-data reference model object (class `refmodel`).
#   L: number of initial observations (time points); passed on to get_lfo().
#   cvfits: pre-computed fold-wise reference model fits (or `NULL`); passed on
#     to get_lfo().
#   validate_search: if `FALSE`, the full-data search results
#     (`search_path_fulldata`) are re-used for every fold.
#   search_out_rks: fold-wise predictor rankings from a previous run (or
#     `NULL`); if non-`NULL`, the fold-wise searches are skipped.
#   ...: passed through to select() and perf_eval().
#
# Returns: a list with elements `predictor_ranking_cv` (only if
#   `validate_search` is `TRUE`), `summaries` (sub- and reference-model
#   performance summaries), `y_wobs_test`, `clust_used_eval`, and
#   `nprjdraws_eval`.
lfo_varsel <- function(refmodel, method, nterms_max, ndraws, nclusters,
                       ndraws_pred, nclusters_pred, refit_prj, penalty,
                       verbose, search_control, L, cvfits, validate_search,
                       search_path_fulldata, search_terms, search_out_rks,
                       parallel, ...) {
  # Fetch the K reference model fits (or fit them now if not already done) and
  # create objects of class `refmodel` from them (and also store the `omitted`
  # indices):
  list_cv <- get_lfo(refmodel, L = L, cvfits = cvfits, verbose = verbose)
  K <- length(list_cv)

  # If no pre-computed fold-wise predictor rankings were supplied, use a list
  # of `NULL`s so that `rk` is `NULL` inside one_fold() and the search is run
  # there:
  search_out_rks_was_null <- is.null(search_out_rks)
  if (search_out_rks_was_null) {
    search_out_rks <- replicate(K, NULL, simplify = FALSE)
  }

  if (refmodel$family$for_latent) {
    # Need to set the latent response values in `refmodel$y` to `NA`s because
    # `refmodel$y` resulted from applying `colMeans(posterior_linpred())` to the
    # original (full-data) reference model fit, so using the `fold$omitted`
    # subset of `refmodel$y` as (latent) response values in fold k of K would
    # induce a dependency between training and test data:
    refmodel$y <- rep(NA, refmodel$nobs)
  }
  # Test-data response values and observation weights (one row per
  # observation); subset per fold via `fold$omitted` below:
  y_wobs_test <- as.data.frame(refmodel[nms_y_wobs_test()])

  if (verbose) {
    verb_txt_start <- "-----\nRunning "
    if (!search_out_rks_was_null || !validate_search) {
      verb_txt_mid <- ""
    } else {
      verb_txt_mid <- "the search and "
    }
    verb_out(verb_txt_start, verb_txt_mid, "the performance evaluation with ",
             "`refit_prj = ", refit_prj, "` for each of the K = ", K, " CV ",
             "folds separately ...")
  }
  # Process a single fold: run (or re-use) the search, evaluate the submodels
  # along the predictor ranking, and evaluate the fold's reference model on
  # the fold's held-out (`omitted`) observations.
  one_fold <- function(fold,
                       rk,
                       verbose_search = verbose &&
                         getOption("projpred.extra_verbose", FALSE),
                       ...) {
    # Run the search for the current fold:
    if (!validate_search) {
      # Re-use the full-data search results:
      search_path <- search_path_fulldata
    } else if (!search_out_rks_was_null) {
      # Re-use the pre-computed predictor ranking for this fold:
      search_path <- list(predictor_ranking = rk)
    } else {
      search_path <- select(
        refmodel = fold$refmodel, ndraws = ndraws, nclusters = nclusters,
        method = method, nterms_max = nterms_max, penalty = penalty,
        verbose = verbose_search, search_control = search_control,
        search_terms = search_terms, est_runtime = FALSE, ...
      )
    }

    # Run the performance evaluation for the submodels along the predictor
    # ranking:
    perf_eval_out <- perf_eval(
      search_path = search_path, refmodel = fold$refmodel,
      refit_prj = refit_prj, ndraws = ndraws_pred, nclusters = nclusters_pred,
      refmodel_fulldata = refmodel, indices_test = fold$omitted, ...
    )

    # Performance evaluation for the reference model of the current fold:
    # predict on the link scale (including offsets) for the held-out
    # observations, then transform to the response scale:
    eta_test <- fold$refmodel$ref_predfun(
      fold$refmodel$fit,
      newdata = refmodel$fetch_data(obs = fold$omitted),
      excl_offs = FALSE
    )
    mu_test <- fold$refmodel$family$linkinv(eta_test)
    summaries_ref <- weighted_summary_means(
      y_wobs_test = y_wobs_test[fold$omitted, , drop = FALSE],
      family = fold$refmodel$family,
      wdraws = fold$refmodel$wdraws_ref,
      mu = mu_test,
      dis = fold$refmodel$dis,
      cl_ref = seq_along(fold$refmodel$wdraws_ref)
    )

    return(nlist(predictor_ranking = search_path[["predictor_ranking"]],
                 summaries_sub = perf_eval_out[["sub_summaries"]],
                 summaries_ref, clust_used_eval = perf_eval_out[["clust_used"]],
                 nprjdraws_eval = perf_eval_out[["nprjdraws"]]))
  }
  if (!parallel) {
    # Sequential case. Actually, we could simply use ``%do_projpred%` <-
    # foreach::`%do%`` here and then proceed as in the parallel case, but that
    # would require adding more "hard" dependencies (because packages 'foreach'
    # and 'doRNG' would have to be moved from `Suggests:` to `Imports:`).
    if (verbose) {
      pb <- utils::txtProgressBar(min = 0, max = K, style = 3, initial = 0)
    }
    res_cv <- lapply(seq_along(list_cv), function(k) {
      if (verbose) {
        # Update the progress bar even if one_fold() throws an error:
        on.exit(utils::setTxtProgressBar(pb, k))
      }
      one_fold(fold = list_cv[[k]], rk = search_out_rks[[k]], ...)
    })
    if (verbose) {
      close(pb)
    }
  } else {
    # Parallel case.
    if (!requireNamespace("foreach", quietly = TRUE)) {
      stop("Please install the 'foreach' package.")
    }
    if (!requireNamespace("doRNG", quietly = TRUE)) {
      stop("Please install the 'doRNG' package.")
    }
    dot_args <- list(...)
    # doRNG's `%dorng%` is used (instead of a plain `%dopar%`) so that the
    # workers get reproducible PRNG streams:
    `%do_projpred%` <- doRNG::`%dorng%`
    res_cv <- foreach::foreach(
      list_cv_k = list_cv,
      search_out_rks_k = search_out_rks,
      .export = c("one_fold", "dot_args"),
      .noexport = c("list_cv", "search_out_rks")
    ) %do_projpred% {
      do_call(one_fold, c(list(fold = list_cv_k, rk = search_out_rks_k,
                               verbose_search = FALSE),
                          dot_args))
    }
  }
  verb_out("-----", verbose = verbose)
  # Fold-wise predictor rankings, one row per fold:
  predictor_ranking_cv <- do.call(rbind,
                                  lapply(res_cv, "[[", "predictor_ranking"))
  clust_used_eval <- element_unq(res_cv, nm = "clust_used_eval")
  nprjdraws_eval <- element_unq(res_cv, nm = "nprjdraws_eval")

  # Handle the submodels' performance evaluation results:
  sub_foldwise <- lapply(res_cv, "[[", "summaries_sub")
  if (getRversion() >= package_version("4.2.0")) {
    # The `except` argument of simplify2array() requires R >= 4.2.0:
    sub_foldwise <- simplify2array(sub_foldwise, higher = FALSE, except = NULL)
  } else {
    sub_foldwise <- simplify2array(sub_foldwise, higher = FALSE)
    if (is.null(dim(sub_foldwise))) {
      # Restore the matrix shape that simplify2array() dropped:
      sub_dim <- dim(predictor_ranking_cv)
      sub_dim[2] <- sub_dim[2] + 1L # +1 is for the empty model
      dim(sub_foldwise) <- rev(sub_dim)
    }
  }
  sub <- apply(sub_foldwise, 1, rbind2list)
  # Observation indices in the order in which the folds concatenate them; used
  # below (via order()) to restore the original observation order:
  idxs_sorted_by_fold <- unlist(lapply(list_cv, function(fold) {
    fold$omitted
  }))
  idxs_sorted_by_fold_aug <- idxs_sorted_by_fold
  if (!is.null(refmodel$family$cats)) {
    # Augmented-data case: offset the indices per response category (block of
    # `nobs` entries per category) — mirrors the index handling for `mu`:
    idxs_sorted_by_fold_aug <- idxs_sorted_by_fold_aug + rep(
      (seq_along(refmodel$family$cats) - 1L) * refmodel$nobs,
      each = length(idxs_sorted_by_fold_aug)
    )
  }
  idxs_sorted_by_fold_flx <- idxs_sorted_by_fold_aug
  if (refmodel$family$for_latent && !is.null(refmodel$family$cats)) {
    # Latent projection: `mu` is on the latent scale, so use the plain
    # (non-augmented) indices:
    idxs_sorted_by_fold_flx <- idxs_sorted_by_fold
  }
  sub <- lapply(sub, function(summ) {
    summ$mu <- summ$mu[order(idxs_sorted_by_fold_flx)]
    summ$lppd <- summ$lppd[order(idxs_sorted_by_fold)]

    # Add fold-specific weights (see the discussion at GitHub issue #94 for why
    # this might have to be changed):
    summ$wcv <- rep(1, length(summ$lppd))
    summ$wcv <- summ$wcv / sum(summ$wcv)

    if (!is.null(summ$oscale)) {
      # Original-scale (response-scale) summaries for the latent projection:
      summ$oscale$mu <- summ$oscale$mu[order(idxs_sorted_by_fold_aug)]
      summ$oscale$lppd <- summ$oscale$lppd[order(idxs_sorted_by_fold)]
      summ$oscale$wcv <- summ$wcv
    }
    return(summ)
  })

  # Handle the reference model's performance evaluation results:
  ref <- rbind2list(lapply(res_cv, "[[", "summaries_ref"))
  ref$mu <- ref$mu[order(idxs_sorted_by_fold_flx)]
  ref$lppd <- ref$lppd[order(idxs_sorted_by_fold)]
  if (!is.null(ref$oscale)) {
    ref$oscale$mu <- ref$oscale$mu[order(idxs_sorted_by_fold_aug)]
    ref$oscale$lppd <- ref$oscale$lppd[order(idxs_sorted_by_fold)]
  }

  if (!validate_search) {
    out_list <- list()
  } else {
    out_list <- nlist(predictor_ranking_cv)
  }
  out_list <- c(out_list,
                nlist(summaries = nlist(sub, ref), y_wobs_test, clust_used_eval,
                      nprjdraws_eval))
  return(out_list)
}

# Refit the reference model once per LFO fold (`cvfun` case) or fetch the fits
# if already computed (`cvfits` case). Returns a list with one element per
# fold, where each element is a list with elements `refmodel` (output of
# `refmodel$cvrefbuilder()`) and `omitted` (vector of indices of those
# observations which were left out for the corresponding fold, i.e.,
# `which(folds == k)` with `folds` as assigned by lfo_folds() or as attached
# to `cvfits`).
get_lfo <- function(refmodel, L, cvfits, verbose) {
  if (is.null(cvfits)) {
    if (!is.null(refmodel$cvfun)) {
      # In this case, cvfun() was provided (and `cvfits` was not), so run
      # cvfun() now. NOTE: lfo_folds() does not involve PRNG, so (in contrast
      # to the K-fold analog) no seed needs to be generated or passed here.
      folds <- lfo_folds(refmodel$nobs, L = L)
      if (verbose && !inherits(refmodel, "datafit")) {
        verb_out("-----\nRefitting the reference model K = ", max(folds),
                 " times (using the fold-wise training data) ...")
      }
      if (getOption("projpred.warn_kfold_refits", TRUE)) {
        cvfits <- refmodel$cvfun(folds)
      } else {
        cvfits <- suppressWarnings(refmodel$cvfun(folds))
      }
      verb_out("-----", verbose = verbose)
    } else {
      stop("For a reference model which is not of class `datafit`, either ",
           "`cvfits` or `cvfun` needs to be provided for LFO CV (see ",
           "`?init_refmodel`).")
    }
  } else {
    # Fold-wise fits already computed; the fold assignment is stored alongside
    # them (see run_cvfun()):
    folds <- attr(cvfits, "folds")
  }
  stopifnot(!is.null(folds))
  # Number of folds (fold indices are 1, ..., max(folds)):
  K <- max(folds)
  return(lapply(seq_len(K), function(k) {
    cvfit <- cvfits[[k]]
    # Add the omitted observation indices for this fold (and the fold index `k`
    # itself):
    omitted_idxs <- which(folds == k)
    if (is.list(cvfit)) {
      cvfit$omitted <- omitted_idxs
      cvfit$projpred_k <- k
    } else {
      attr(cvfit, "omitted") <- omitted_idxs
      attr(cvfit, "projpred_k") <- k
    }
    return(list(refmodel = refmodel$cvrefbuilder(cvfit),
                omitted = omitted_idxs))
  }))
}

# PSIS-LOO CV helpers -----------------------------------------------------

# ## decide which points to go through in the validation (i.e., which points
Expand Down
31 changes: 28 additions & 3 deletions R/methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -2466,7 +2466,8 @@ mat2drmat <- function(xmat) {
#'
#' These are helper functions to create cross-validation (CV) folds, i.e., to
#' split up the indices from 1 to `n` into `K` subsets ("folds") for
#' \eqn{K}-fold CV. These functions are potentially useful when creating the
#' \eqn{K}-fold CV (via [cv_folds()] or [cv_ids()]) or to assign each time
#' point of a time series to a leave-future-out (LFO) CV fold (via
#' input for arguments `cvfits` and `cvfun` of [init_refmodel()] (or argument
#' `cvfits` of [cv_varsel.refmodel()]). Function [cvfolds()] is deprecated;
#' please use [cv_folds()] instead (apart from the name, they are the same). The
Expand All @@ -2475,7 +2476,8 @@ mat2drmat <- function(xmat) {
#'
#' @name cv-indices
#'
#' @param n Number of observations.
#' @param n Number of observations (in case of a time-series model, this is the
#' number of time points).
#' @param K Number of folds. Must be at least 2 and not exceed `n`.
#' @param out Format of the output, either `"foldwise"` or `"indices"`. See
#' below for details.
Expand All @@ -2484,17 +2486,23 @@ mat2drmat <- function(xmat) {
#' [set.seed()], but can also be `NA` to not call [set.seed()] at all. If not
#' `NA`, then the PRNG state is reset (to the state before calling
#' [cv_folds()] or [cv_ids()]) upon exiting [cv_folds()] or [cv_ids()].
#' @param L Number of observations (time points) to fit the initial fold with.
#' Must be at least 1 and not exceed `n`.
#'
#' @return [cv_folds()] returns a vector of length `n` such that each element is
#' an integer between 1 and `K` denoting which fold the corresponding data
#' point belongs to. The return value of [cv_ids()] depends on the `out`
#' point belongs to.
#'
#' The return value of [cv_ids()] depends on the `out`
#' argument. If `out = "foldwise"`, the return value is a `list` with `K`
#' elements, each being a `list` with elements `tr` and `ts` giving the
#' training and test indices, respectively, for the corresponding fold. If
#' `out = "indices"`, the return value is a `list` with elements `tr` and `ts`
#' each being a `list` with `K` elements giving the training and test indices,
#' respectively, for each fold.
#'
#' The return value of [lfo_folds()] is a vector of length `n` such that the
#' first `L` elements are `1` (the initial fold) and each later element `t`
#' (for `t > L`) is `t - L + 1`, i.e., each time point after the first `L`
#' gets a fold of its own.
#'
#' @examples
#' n <- 100
#' set.seed(1234)
Expand All @@ -2503,6 +2511,8 @@ mat2drmat <- function(xmat) {
#' # Mean within the test set of each fold:
#' cvmeans <- sapply(cv, function(fold) mean(y[fold$ts]))
#'
#' # Leave-future-out folds with L = 25 initial time points:
#' lfo <- lfo_folds(n, L = 25)
#'
NULL

#' @rdname cv-indices
Expand Down Expand Up @@ -2576,6 +2586,21 @@ cv_ids <- function(n, K, out = c("foldwise", "indices"), seed = NA) {
return(cv)
}

#' @rdname cv-indices
#' @export
lfo_folds <- function(n, L) {
  validate_num_folds_lfo(L = L, n = n)

  ## create fold indices
  # These indices, unlike for K-fold CV, indicate the observations used to fit
  # the model at the i-th fold: the initial `L` time points are all assigned
  # to fold 1 and each later time point `t` gets a fold of its own, namely
  # `t - L + 1` (so there are `n - L + 1` folds in total).
  #
  # Vectorized; this also fixes a bug in the former loop
  # `for (t in (L + 1):n)`, which for `L == n` iterated over the *descending*
  # sequence `c(n + 1, n)` and thereby grew `folds` to length `n + 1`.
  return(c(rep(1, L), seq_len(n - L) + 1L))
}

#' Retrieve the full-data solution path from a [varsel()] or [cv_varsel()] run
#' or the predictor combination from a [project()] run
#'
Expand Down
12 changes: 12 additions & 0 deletions R/misc.R
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,18 @@ validate_num_folds <- function(k, n) {
}
}

# Validate the `L` argument for LFO CV (the number of initial observations,
# i.e., time points, used to fit the first fold): `L` must be a single whole
# number between 1 and `n` (the total number of observations). Throws an
# error if not. Analog of validate_num_folds() for K-fold CV.
validate_num_folds_lfo <- function(L, n) {
  if (!is.numeric(L) || length(L) != 1 || !is_wholenumber(L)) {
    # NOTE: this message previously said "Number of folds", which was a
    # copy-paste leftover from validate_num_folds(); `L` is the number of
    # initial observations, consistent with the messages below.
    stop("Number of initial observations must be a single integer value.")
  }
  if (L < 1) {
    stop("Number of initial observations must be at least 1.")
  }
  if (L > n) {
    stop("Number of initial observations cannot exceed total observations.")
  }
}

validate_vsel_object_stats <- function(object, stats, resp_oscale = TRUE) {
if (!inherits(object, c("vsel"))) {
stop("The object is not a variable selection object. Run variable ",
Expand Down
Loading