Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add LFO-CV to projpred #467

Open
wants to merge 20 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
468c9ae
add skeleton LFO varsel method
yannmclatchie Oct 6, 2023
99f728c
update comment
yannmclatchie Oct 6, 2023
63b0475
re-document
fweber144 Oct 26, 2023
2dcdc2b
minor cleaning
fweber144 Oct 26, 2023
5ba8e06
Merge remote-tracking branch 'upstream/master' into lfo
fweber144 Oct 26, 2023
7050b82
update `lfo_varsel()` to the most recent version of `kfold_varsel()`
fweber144 Oct 26, 2023
11ba4c8
Add alias `forced_search_terms()` for `force_search_terms()`.
fweber144 Oct 26, 2023
190d0bc
Add function `ordered_search_terms()`.
fweber144 Oct 26, 2023
186047b
Docs: Use a consistent location for arguments of functions documented…
fweber144 Oct 26, 2023
62276ef
In R, using `T` as an object name is bad practice (because `T` is ide…
fweber144 Oct 26, 2023
e852590
Docs: Minor enhancements; add `TODO`s.
fweber144 Oct 26, 2023
bdb6475
fixup! In R, using `T` as an object name is bad practice (because `T`…
fweber144 Oct 26, 2023
20a06a0
Use a consistent naming scheme for the `validate_num_folds[_lfo]()` f…
fweber144 Oct 26, 2023
94ab848
Use a consistent argument order for the `validate_num_folds[_lfo]()` …
fweber144 Oct 26, 2023
5c44bd9
`lfo_folds()` doesn't involve PRNG, so seed-related code is removed h…
fweber144 Oct 26, 2023
8900dd0
`lfo_folds()`: Fix a bug.
fweber144 Oct 26, 2023
5c19f7d
Fix typos in internal documentation.
fweber144 Oct 26, 2023
9d04ef3
Merge remote-tracking branch 'upstream/master' into lfo
fweber144 Nov 30, 2023
c8efad5
Update the LFO functions to their most recent K-fold analogs.
fweber144 Nov 30, 2023
bf11472
Merge remote-tracking branch 'upstream/master' into lfo
fweber144 Dec 16, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,11 @@ export(cvfolds)
export(do_call)
export(extend_family)
export(force_search_terms)
export(forced_search_terms)
export(get_refmodel)
export(init_refmodel)
export(lfo_folds)
export(ordered_search_terms)
export(performances)
export(predictor_terms)
export(proj_linpred)
Expand Down
243 changes: 243 additions & 0 deletions R/cv_varsel.R
Original file line number Diff line number Diff line change
Expand Up @@ -1584,6 +1584,249 @@ run_cvfun.refmodel <- function(object,
return(structure(cvfits, folds = folds))
}

# Exact LFO CV ------------------------------------------------------------

# Workhorse for exact leave-future-out (LFO) CV within cv_varsel(): for each
# of the K folds created by get_lfo(), run (or re-use) the search, run the
# performance evaluation for the submodels along the predictor ranking as well
# as for the fold's reference model, and finally re-assemble the fold-wise
# results in the original order of the observations.
#
# NOTE(review): this function was derived from kfold_varsel() (see the commit
# messages above, e.g. "update `lfo_varsel()` to the most recent version of
# `kfold_varsel()`"), so its structure deliberately parallels that function.
#
# Arguments (mirroring kfold_varsel(), except for `L`):
#   refmodel: the full-data reference model object (class `refmodel`).
#   L: number of initial observations (time points); passed on to get_lfo().
#   cvfits: pre-computed fold-wise reference model fits (or `NULL`); passed on
#     to get_lfo().
#   validate_search: if `FALSE`, the full-data search results
#     (`search_path_fulldata`) are re-used for every fold.
#   search_out_rks: fold-wise predictor rankings from a previous run (or
#     `NULL`); if non-`NULL`, the fold-wise searches are skipped.
#   ...: passed through to select() and perf_eval().
#
# Returns: a list with elements `predictor_ranking_cv` (only if
#   `validate_search` is `TRUE`), `summaries` (sub- and reference-model
#   performance summaries), `y_wobs_test`, `clust_used_eval`, and
#   `nprjdraws_eval`.
lfo_varsel <- function(refmodel, method, nterms_max, ndraws, nclusters,
                       ndraws_pred, nclusters_pred, refit_prj, penalty,
                       verbose, search_control, L, cvfits, validate_search,
                       search_path_fulldata, search_terms, search_out_rks,
                       parallel, ...) {
  # Fetch the K reference model fits (or fit them now if not already done) and
  # create objects of class `refmodel` from them (and also store the `omitted`
  # indices):
  list_cv <- get_lfo(refmodel, L = L, cvfits = cvfits, verbose = verbose)
  K <- length(list_cv)

  # If no pre-computed fold-wise predictor rankings were supplied, use a list
  # of `NULL`s so that `rk` is `NULL` inside one_fold() and the search is run
  # there:
  search_out_rks_was_null <- is.null(search_out_rks)
  if (search_out_rks_was_null) {
    search_out_rks <- replicate(K, NULL, simplify = FALSE)
  }

  if (refmodel$family$for_latent) {
    # Need to set the latent response values in `refmodel$y` to `NA`s because
    # `refmodel$y` resulted from applying `colMeans(posterior_linpred())` to the
    # original (full-data) reference model fit, so using the `fold$omitted`
    # subset of `refmodel$y` as (latent) response values in fold k of K would
    # induce a dependency between training and test data:
    refmodel$y <- rep(NA, refmodel$nobs)
  }
  # Test-data response values and observation weights (one row per
  # observation); subset per fold via `fold$omitted` below:
  y_wobs_test <- as.data.frame(refmodel[nms_y_wobs_test()])

  if (verbose) {
    verb_txt_start <- "-----\nRunning "
    if (!search_out_rks_was_null || !validate_search) {
      verb_txt_mid <- ""
    } else {
      verb_txt_mid <- "the search and "
    }
    verb_out(verb_txt_start, verb_txt_mid, "the performance evaluation with ",
             "`refit_prj = ", refit_prj, "` for each of the K = ", K, " CV ",
             "folds separately ...")
  }
  # Process a single fold: run (or re-use) the search, evaluate the submodels
  # along the predictor ranking, and evaluate the fold's reference model on
  # the fold's held-out (`omitted`) observations.
  one_fold <- function(fold,
                       rk,
                       verbose_search = verbose &&
                         getOption("projpred.extra_verbose", FALSE),
                       ...) {
    # Run the search for the current fold:
    if (!validate_search) {
      # Re-use the full-data search results:
      search_path <- search_path_fulldata
    } else if (!search_out_rks_was_null) {
      # Re-use the pre-computed predictor ranking for this fold:
      search_path <- list(predictor_ranking = rk)
    } else {
      search_path <- select(
        refmodel = fold$refmodel, ndraws = ndraws, nclusters = nclusters,
        method = method, nterms_max = nterms_max, penalty = penalty,
        verbose = verbose_search, search_control = search_control,
        search_terms = search_terms, est_runtime = FALSE, ...
      )
    }

    # Run the performance evaluation for the submodels along the predictor
    # ranking:
    perf_eval_out <- perf_eval(
      search_path = search_path, refmodel = fold$refmodel,
      refit_prj = refit_prj, ndraws = ndraws_pred, nclusters = nclusters_pred,
      refmodel_fulldata = refmodel, indices_test = fold$omitted, ...
    )

    # Performance evaluation for the reference model of the current fold:
    # predict on the link scale (including offsets) for the held-out
    # observations, then transform to the response scale:
    eta_test <- fold$refmodel$ref_predfun(
      fold$refmodel$fit,
      newdata = refmodel$fetch_data(obs = fold$omitted),
      excl_offs = FALSE
    )
    mu_test <- fold$refmodel$family$linkinv(eta_test)
    summaries_ref <- weighted_summary_means(
      y_wobs_test = y_wobs_test[fold$omitted, , drop = FALSE],
      family = fold$refmodel$family,
      wdraws = fold$refmodel$wdraws_ref,
      mu = mu_test,
      dis = fold$refmodel$dis,
      cl_ref = seq_along(fold$refmodel$wdraws_ref)
    )

    return(nlist(predictor_ranking = search_path[["predictor_ranking"]],
                 summaries_sub = perf_eval_out[["sub_summaries"]],
                 summaries_ref, clust_used_eval = perf_eval_out[["clust_used"]],
                 nprjdraws_eval = perf_eval_out[["nprjdraws"]]))
  }
  if (!parallel) {
    # Sequential case. Actually, we could simply use ``%do_projpred%` <-
    # foreach::`%do%`` here and then proceed as in the parallel case, but that
    # would require adding more "hard" dependencies (because packages 'foreach'
    # and 'doRNG' would have to be moved from `Suggests:` to `Imports:`).
    if (verbose) {
      pb <- utils::txtProgressBar(min = 0, max = K, style = 3, initial = 0)
    }
    res_cv <- lapply(seq_along(list_cv), function(k) {
      if (verbose) {
        # Update the progress bar even if one_fold() throws an error:
        on.exit(utils::setTxtProgressBar(pb, k))
      }
      one_fold(fold = list_cv[[k]], rk = search_out_rks[[k]], ...)
    })
    if (verbose) {
      close(pb)
    }
  } else {
    # Parallel case.
    if (!requireNamespace("foreach", quietly = TRUE)) {
      stop("Please install the 'foreach' package.")
    }
    if (!requireNamespace("doRNG", quietly = TRUE)) {
      stop("Please install the 'doRNG' package.")
    }
    dot_args <- list(...)
    # doRNG's `%dorng%` is used (instead of a plain `%dopar%`) so that the
    # workers get reproducible PRNG streams:
    `%do_projpred%` <- doRNG::`%dorng%`
    res_cv <- foreach::foreach(
      list_cv_k = list_cv,
      search_out_rks_k = search_out_rks,
      .export = c("one_fold", "dot_args"),
      .noexport = c("list_cv", "search_out_rks")
    ) %do_projpred% {
      do_call(one_fold, c(list(fold = list_cv_k, rk = search_out_rks_k,
                               verbose_search = FALSE),
                          dot_args))
    }
  }
  verb_out("-----", verbose = verbose)
  # Fold-wise predictor rankings, one row per fold:
  predictor_ranking_cv <- do.call(rbind,
                                  lapply(res_cv, "[[", "predictor_ranking"))
  clust_used_eval <- element_unq(res_cv, nm = "clust_used_eval")
  nprjdraws_eval <- element_unq(res_cv, nm = "nprjdraws_eval")

  # Handle the submodels' performance evaluation results:
  sub_foldwise <- lapply(res_cv, "[[", "summaries_sub")
  if (getRversion() >= package_version("4.2.0")) {
    # The `except` argument of simplify2array() requires R >= 4.2.0:
    sub_foldwise <- simplify2array(sub_foldwise, higher = FALSE, except = NULL)
  } else {
    sub_foldwise <- simplify2array(sub_foldwise, higher = FALSE)
    if (is.null(dim(sub_foldwise))) {
      # Restore the matrix shape that simplify2array() dropped:
      sub_dim <- dim(predictor_ranking_cv)
      sub_dim[2] <- sub_dim[2] + 1L # +1 is for the empty model
      dim(sub_foldwise) <- rev(sub_dim)
    }
  }
  sub <- apply(sub_foldwise, 1, rbind2list)
  # Observation indices in the order in which the folds concatenate them; used
  # below (via order()) to restore the original observation order:
  idxs_sorted_by_fold <- unlist(lapply(list_cv, function(fold) {
    fold$omitted
  }))
  idxs_sorted_by_fold_aug <- idxs_sorted_by_fold
  if (!is.null(refmodel$family$cats)) {
    # Augmented-data case: offset the indices per response category (block of
    # `nobs` entries per category) — mirrors the index handling for `mu`:
    idxs_sorted_by_fold_aug <- idxs_sorted_by_fold_aug + rep(
      (seq_along(refmodel$family$cats) - 1L) * refmodel$nobs,
      each = length(idxs_sorted_by_fold_aug)
    )
  }
  idxs_sorted_by_fold_flx <- idxs_sorted_by_fold_aug
  if (refmodel$family$for_latent && !is.null(refmodel$family$cats)) {
    # Latent projection: `mu` is on the latent scale, so use the plain
    # (non-augmented) indices:
    idxs_sorted_by_fold_flx <- idxs_sorted_by_fold
  }
  sub <- lapply(sub, function(summ) {
    summ$mu <- summ$mu[order(idxs_sorted_by_fold_flx)]
    summ$lppd <- summ$lppd[order(idxs_sorted_by_fold)]

    # Add fold-specific weights (see the discussion at GitHub issue #94 for why
    # this might have to be changed):
    summ$wcv <- rep(1, length(summ$lppd))
    summ$wcv <- summ$wcv / sum(summ$wcv)

    if (!is.null(summ$oscale)) {
      # Original-scale (response-scale) summaries for the latent projection:
      summ$oscale$mu <- summ$oscale$mu[order(idxs_sorted_by_fold_aug)]
      summ$oscale$lppd <- summ$oscale$lppd[order(idxs_sorted_by_fold)]
      summ$oscale$wcv <- summ$wcv
    }
    return(summ)
  })

  # Handle the reference model's performance evaluation results:
  ref <- rbind2list(lapply(res_cv, "[[", "summaries_ref"))
  ref$mu <- ref$mu[order(idxs_sorted_by_fold_flx)]
  ref$lppd <- ref$lppd[order(idxs_sorted_by_fold)]
  if (!is.null(ref$oscale)) {
    ref$oscale$mu <- ref$oscale$mu[order(idxs_sorted_by_fold_aug)]
    ref$oscale$lppd <- ref$oscale$lppd[order(idxs_sorted_by_fold)]
  }

  if (!validate_search) {
    out_list <- list()
  } else {
    out_list <- nlist(predictor_ranking_cv)
  }
  out_list <- c(out_list,
                nlist(summaries = nlist(sub, ref), y_wobs_test, clust_used_eval,
                      nprjdraws_eval))
  return(out_list)
}

# Refit the reference model once per LFO fold (`cvfun` case) or fetch the fits
# if already computed (`cvfits` case). Returns a list with one element per
# fold, where each element is a list with elements `refmodel` (output of
# `refmodel$cvrefbuilder()`) and `omitted` (vector of indices of those
# observations which were left out for the corresponding fold, i.e.,
# `which(folds == k)` with `folds` as assigned by lfo_folds() or as attached
# to `cvfits`).
get_lfo <- function(refmodel, L, cvfits, verbose) {
  if (is.null(cvfits)) {
    if (!is.null(refmodel$cvfun)) {
      # In this case, cvfun() was provided (and `cvfits` was not), so run
      # cvfun() now. NOTE: lfo_folds() does not involve PRNG, so (in contrast
      # to the K-fold analog) no seed needs to be generated or passed here.
      folds <- lfo_folds(refmodel$nobs, L = L)
      if (verbose && !inherits(refmodel, "datafit")) {
        verb_out("-----\nRefitting the reference model K = ", max(folds),
                 " times (using the fold-wise training data) ...")
      }
      if (getOption("projpred.warn_kfold_refits", TRUE)) {
        cvfits <- refmodel$cvfun(folds)
      } else {
        cvfits <- suppressWarnings(refmodel$cvfun(folds))
      }
      verb_out("-----", verbose = verbose)
    } else {
      stop("For a reference model which is not of class `datafit`, either ",
           "`cvfits` or `cvfun` needs to be provided for LFO CV (see ",
           "`?init_refmodel`).")
    }
  } else {
    # Fold-wise fits already computed; the fold assignment is stored alongside
    # them (see run_cvfun()):
    folds <- attr(cvfits, "folds")
  }
  stopifnot(!is.null(folds))
  # Number of folds (fold indices are 1, ..., max(folds)):
  K <- max(folds)
  return(lapply(seq_len(K), function(k) {
    cvfit <- cvfits[[k]]
    # Add the omitted observation indices for this fold (and the fold index `k`
    # itself):
    omitted_idxs <- which(folds == k)
    if (is.list(cvfit)) {
      cvfit$omitted <- omitted_idxs
      cvfit$projpred_k <- k
    } else {
      attr(cvfit, "omitted") <- omitted_idxs
      attr(cvfit, "projpred_k") <- k
    }
    return(list(refmodel = refmodel$cvrefbuilder(cvfit),
                omitted = omitted_idxs))
  }))
}

# PSIS-LOO CV helpers -----------------------------------------------------

# ## decide which points to go through in the validation (i.e., which points
Expand Down
31 changes: 28 additions & 3 deletions R/methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -2466,7 +2466,8 @@ mat2drmat <- function(xmat) {
#'
#' These are helper functions to create cross-validation (CV) folds, i.e., to
#' split up the indices from 1 to `n` into `K` subsets ("folds") for
#' \eqn{K}-fold CV. These functions are potentially useful when creating the
#' \eqn{K}-fold CV (via [cv_folds()] or [cv_ids()]) or to assign each time
#' point of a time series to a leave-future-out (LFO) CV fold (via
#' input for arguments `cvfits` and `cvfun` of [init_refmodel()] (or argument
#' `cvfits` of [cv_varsel.refmodel()]). Function [cvfolds()] is deprecated;
#' please use [cv_folds()] instead (apart from the name, they are the same). The
Expand All @@ -2475,7 +2476,8 @@ mat2drmat <- function(xmat) {
#'
#' @name cv-indices
#'
#' @param n Number of observations.
#' @param n Number of observations (in case of a time-series model, this is the
#' number of time points).
#' @param K Number of folds. Must be at least 2 and not exceed `n`.
#' @param out Format of the output, either `"foldwise"` or `"indices"`. See
#' below for details.
Expand All @@ -2484,17 +2486,23 @@ mat2drmat <- function(xmat) {
#' [set.seed()], but can also be `NA` to not call [set.seed()] at all. If not
#' `NA`, then the PRNG state is reset (to the state before calling
#' [cv_folds()] or [cv_ids()]) upon exiting [cv_folds()] or [cv_ids()].
#' @param L Number of observations (time points) to fit the initial fold with.
#' Must be at least 1 and not exceed `n`.
#'
#' @return [cv_folds()] returns a vector of length `n` such that each element is
#' an integer between 1 and `K` denoting which fold the corresponding data
#' point belongs to. The return value of [cv_ids()] depends on the `out`
#' point belongs to.
#'
#' The return value of [cv_ids()] depends on the `out`
#' argument. If `out = "foldwise"`, the return value is a `list` with `K`
#' elements, each being a `list` with elements `tr` and `ts` giving the
#' training and test indices, respectively, for the corresponding fold. If
#' `out = "indices"`, the return value is a `list` with elements `tr` and `ts`
#' each being a `list` with `K` elements giving the training and test indices,
#' respectively, for each fold.
#'
#' The return value of [lfo_folds()] is a vector of length `n` such that the
#' first `L` elements are `1` (the initial fold) and each later element `t`
#' (for `t > L`) is `t - L + 1`, i.e., each time point after the first `L`
#' gets a fold of its own.
#'
#' @examples
#' n <- 100
#' set.seed(1234)
Expand All @@ -2503,6 +2511,8 @@ mat2drmat <- function(xmat) {
#' # Mean within the test set of each fold:
#' cvmeans <- sapply(cv, function(fold) mean(y[fold$ts]))
#'
#' # Leave-future-out folds with L = 25 initial time points:
#' lfo <- lfo_folds(n, L = 25)
#'
NULL

#' @rdname cv-indices
Expand Down Expand Up @@ -2576,6 +2586,21 @@ cv_ids <- function(n, K, out = c("foldwise", "indices"), seed = NA) {
return(cv)
}

#' @rdname cv-indices
#' @export
lfo_folds <- function(n, L) {
  validate_num_folds_lfo(L = L, n = n)

  ## create fold indices
  # These indices, unlike for K-fold CV, indicate the observations used to fit
  # the model at the i-th fold: the initial `L` time points are all assigned
  # to fold 1 and each later time point `t` gets a fold of its own, namely
  # `t - L + 1` (so there are `n - L + 1` folds in total).
  #
  # Vectorized; this also fixes a bug in the former loop
  # `for (t in (L + 1):n)`, which for `L == n` iterated over the *descending*
  # sequence `c(n + 1, n)` and thereby grew `folds` to length `n + 1`.
  return(c(rep(1, L), seq_len(n - L) + 1L))
}

#' Retrieve the full-data solution path from a [varsel()] or [cv_varsel()] run
#' or the predictor combination from a [project()] run
#'
Expand Down
12 changes: 12 additions & 0 deletions R/misc.R
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,18 @@ validate_num_folds <- function(k, n) {
}
}

# Validate the `L` argument for LFO CV (the number of initial observations,
# i.e., time points, used to fit the first fold): `L` must be a single whole
# number between 1 and `n` (the total number of observations). Throws an
# error if not. Analog of validate_num_folds() for K-fold CV.
validate_num_folds_lfo <- function(L, n) {
  if (!is.numeric(L) || length(L) != 1 || !is_wholenumber(L)) {
    # NOTE: this message previously said "Number of folds", which was a
    # copy-paste leftover from validate_num_folds(); `L` is the number of
    # initial observations, consistent with the messages below.
    stop("Number of initial observations must be a single integer value.")
  }
  if (L < 1) {
    stop("Number of initial observations must be at least 1.")
  }
  if (L > n) {
    stop("Number of initial observations cannot exceed total observations.")
  }
}

validate_vsel_object_stats <- function(object, stats, resp_oscale = TRUE) {
if (!inherits(object, c("vsel"))) {
stop("The object is not a variable selection object. Run variable ",
Expand Down
Loading