From 404da436e9ccbdc1aba245badebc25991328d527 Mon Sep 17 00:00:00 2001 From: DanielaCorbetta Date: Wed, 22 May 2024 12:01:13 -0400 Subject: [PATCH] indent --- R/getPredictionSets.R | 193 +++++++++++++------------ R/old/ancestors.R | 11 +- R/old/children.R | 11 +- R/old/conformal.R | 138 +++++++++--------- R/old/getHierarchicalPredSets.R | 112 ++++++++------- R/old/predSets.R | 25 ++-- R/old/scores.R | 2 +- R/old/somegraphs.R | 126 ++++++++-------- R/old/utils_Hier.R | 160 +++++++++++---------- R/old/utils_Resample.R | 248 ++++++++++++++++---------------- R/utils_ConformalPS.R | 18 +-- R/utils_HierarchicalPS.R | 89 +++++++----- R/utils_Resample.R | 32 +++-- man/getPredictionSets.Rd | 32 ++--- vignettes/vignette.Rmd | 10 +- 15 files changed, 624 insertions(+), 583 deletions(-) diff --git a/R/getPredictionSets.R b/R/getPredictionSets.R index 7c1a408..e4385e2 100644 --- a/R/getPredictionSets.R +++ b/R/getPredictionSets.R @@ -109,7 +109,7 @@ #' @examples #' # random p matrix #' set.seed(1040) -#' p <- matrix(rnorm(2000*4), ncol=4) +#' p <- matrix(rnorm(2000 * 4), ncol = 4) #' # Normalize the matrix p to have all numbers between 0 and 1 that sum to 1 #' # by row #' p <- exp(p - apply(p, 1, max)) @@ -118,25 +118,25 @@ #' colnames(p) <- cell.types #' #' # Take 1000 rows as calibration and 1000 as test -#' p.cal <- p[1:1000,] -#' p.test <- p[1001:2000,] +#' p.cal <- p[1:1000, ] +#' p.test <- p[1001:2000, ] #' #' # Randomly create the vector of real cell types for p.cal and p.test -#' y.cal <- sample(cell.types, 1000, replace=TRUE) -#' y.test <- sample(cell.types, 1000, replace=TRUE) +#' y.cal <- sample(cell.types, 1000, replace = TRUE) +#' y.test <- sample(cell.types, 1000, replace = TRUE) #' #' # Obtain conformal prediction sets -#' conf.sets <- getPredictionSets(x.query=p.test, -#' x.cal=p.cal, -#' y.cal=y.cal, -#' onto=NULL, -#' alpha=0.1, -#' follow.ontology=FALSE, -#' resample.cal=FALSE, -#' labels=cell.types, -#' return.sc=FALSE -#' ) -#' +#' conf.sets <- getPredictionSets( +#' x.query = p.test, +#' x.cal = p.cal, +#' y.cal = y.cal, +#' onto = NULL, +#' alpha = 0.1, +#' follow.ontology = FALSE, +#' resample.cal = FALSE, +#' labels = cell.types, +#' return.sc = FALSE +#' ) #' #' @importFrom foreach %dopar% #' @importFrom foreach foreach @@ -147,52 +147,53 @@ #' @importFrom stats quantile #' @export -getPredictionSets <- function(x.query, x.cal, y.cal, onto=NULL, alpha = 0.1, - lambdas = seq(0.001,0.999,length.out=100), - follow.ontology=TRUE, - resample.cal=FALSE, - labels=NULL, - return.sc=NULL, - pr.name="pred.set"){ - +getPredictionSets <- function(x.query, x.cal, y.cal, onto = NULL, alpha = 0.1, + lambdas = seq(0.001, 0.999, length.out = 100), + follow.ontology = TRUE, + resample.cal = FALSE, + labels = NULL, + return.sc = NULL, + pr.name = "pred.set") { ## Sanity checks - if(follow.ontology & is.null(onto)){ + if (follow.ontology & is.null(onto)) { stop("An ontology is required for hierarchical prediction set. Please provide one or ask for conformal prediction set (follow.ontology=FALSE)") } - if(is.null(onto) & is.null(labels)){ + if (is.null(onto) & is.null(labels)) { stop("Please provide cell labels (labels parameter)") } - if(isa(x.query, "SpatialExperiment") | - isa(x.query, "SingleCellExperiment") | - isa(x.query, "SummarizedExperiment")) + if (isa(x.query, "SpatialExperiment") | + isa(x.query, "SingleCellExperiment") | + isa(x.query, "SummarizedExperiment")) { sc <- TRUE - else if(is.matrix(x.query)) + } else if (is.matrix(x.query)) { sc <- FALSE - else + } else { stop("Please provide as input in x.query a SpatialExperiment, SingleCellExperiment or a matrix") + } - if(!is.null(return.sc)){ - if(return.sc==TRUE & !sc){ - stop("If x.query is a matrix output has to be a list + if (!is.null(return.sc)) { + if (return.sc == TRUE & !sc) { + stop("If x.query is a matrix output has to be a list (return.sc=FALSE)") - } + } } # Retrieve labels from the ontology (need to add retrieval from y.cal/data # when follow.ontology=FALSE) - if(is.null(labels)) - labels <- V(onto)$name[degree(onto, mode="out")==0] + if (is.null(labels)) { + labels <- V(onto)$name[degree(onto, mode = "out") == 0] + } K <- length(labels) # If input is not a matrix, retrieve prediction matrix from colData # might turn this into a helper function - if(!is.matrix(x.query)){ + if (!is.matrix(x.query)) { # n.query <- ncol(x.query) # p.query <- matrix(NA, nrow=n.query, ncol=K) # colnames(p.query) <- labels @@ -200,10 +201,11 @@ getPredictionSets <- function(x.query, x.cal, y.cal, onto=NULL, alpha = 0.1, # p.query[,i] <- colData(x.query)[[i]] # } p.query <- .retrievePredMatrix(x.query) + } else { + p.query <- x.query } - else p.query <- x.query - if(!is.matrix(x.cal)){ + if (!is.matrix(x.cal)) { # n.cal <- ncol(x.cal) # p.cal <- matrix(NA, nrow=n.cal, ncol=K) # colnames(p.cal) <- labels @@ -211,48 +213,62 @@ getPredictionSets <- function(x.query, x.cal, y.cal, onto=NULL, alpha = 0.1, # p.cal[,i] <- colData(x.cal)[[i]] # } p.cal <- .retrievePredMatrix(x.cal) + } else { + p.cal <- x.cal } - else p.cal <- x.cal - if(!resample.cal){ - if (follow.ontology){ - pred.sets <- .getHierarchicalPredSets(p.cal=p.cal, p.test=p.query, - y.cal=y.cal, onto=onto, - alpha=alpha, - lambdas=lambdas) + if (!resample.cal) { + if (follow.ontology) { + pred.sets <- .getHierarchicalPredSets( + p.cal = p.cal, p.test = p.query, + y.cal = y.cal, onto = onto, + alpha = alpha, + lambdas = lambdas + ) + } else { + pred.sets <- .getConformalPredSets( + p.cal = p.cal, p.test = p.query, + y.cal = y.cal, alpha = alpha + ) } - else - pred.sets <- .getConformalPredSets(p.cal=p.cal, p.test=p.query, - y.cal=y.cal, alpha=alpha) } - if(resample.cal){ - data <- resample.two(p.cal=p.cal, p.test=p.query, y.cal=y.cal, - labels=labels) - if (follow.ontology){ - pred.sets1 <- .getHierarchicalPredSets(p.cal=data$p.cal2, - p.test=data$p.test1, - y.cal=data$y.cal2, - onto=onto, - alpha=alpha, - lambdas=lambdas) - pred.sets2 <- .getHierarchicalPredSets(p.cal=data$p.cal1, - p.test=data$p.test2, - y.cal=data$y.cal1, - onto=onto, - alpha=alpha, - lambdas=lambdas) + if (resample.cal) { + data <- resample.two( + p.cal = p.cal, p.test = p.query, y.cal = y.cal, + labels = labels + ) + if (follow.ontology) { + pred.sets1 <- .getHierarchicalPredSets( + p.cal = data$p.cal2, + p.test = data$p.test1, + y.cal = data$y.cal2, + onto = onto, + alpha = alpha, + lambdas = lambdas + ) + pred.sets2 <- .getHierarchicalPredSets( + p.cal = data$p.cal1, + p.test = data$p.test2, + y.cal = data$y.cal1, + onto = onto, + alpha = alpha, + lambdas = lambdas + ) pred.sets <- c(pred.sets1, pred.sets2) - } - else { - pred.sets1 <- .getConformalPredSets(p.cal=data$p.cal2, - p.test=data$p.test1, - y.cal=data$y.cal2, - alpha=alpha) - pred.sets2 <- .getConformalPredSets(p.cal=data$p.cal1, - p.test=data$p.test2, - y.cal=data$y.cal1, - alpha=alpha) + } else { + pred.sets1 <- .getConformalPredSets( + p.cal = data$p.cal2, + p.test = data$p.test1, + y.cal = data$y.cal2, + alpha = alpha + ) + pred.sets2 <- .getConformalPredSets( + p.cal = data$p.cal1, + p.test = data$p.test2, + y.cal = data$y.cal1, + alpha = alpha + ) pred.sets <- c(pred.sets1, pred.sets2) } # Order the prediction set @@ -261,35 +277,30 @@ getPredictionSets <- function(x.query, x.cal, y.cal, onto=NULL, alpha = 0.1, # if not specified, return a sc object if the input was a sc object, # a matrix if the input was a matrix - if(is.null(return.sc) & sc) + if (is.null(return.sc) & sc) { return.sc <- TRUE - if(is.null(return.sc) & !sc) + } + if (is.null(return.sc) & !sc) { return.sc <- FALSE - if(return.sc){ + } + if (return.sc) { colData(x.query)[[pr.name]] <- pred.sets return(x.query) } - if(!return.sc){ + if (!return.sc) { return(pred.sets) } - } ## function to retrieve prediction matrix from the colData of a ## SingleCellExperiment object -.retrievePredMatrix <- function(sc){ +.retrievePredMatrix <- function(sc) { n.sc <- ncol(sc) - p.sc <- matrix(NA, nrow=n.sc, ncol=K) + p.sc <- matrix(NA, nrow = n.sc, ncol = K) colnames(p.sc) <- labels - for(i in labels){ - p.sc[,i] <- colData(sc)[[i]] + for (i in labels) { + p.sc[, i] <- colData(sc)[[i]] } return(p.sc) } - - - - - - diff --git a/R/old/ancestors.R b/R/old/ancestors.R index 76af3a9..22312ab 100644 --- a/R/old/ancestors.R +++ b/R/old/ancestors.R @@ -8,9 +8,10 @@ #' @importFrom igraph degree #' @importFrom igraph distances -.ancestors <- function(node, onto, include_self=TRUE){ - if(include_self) - return(V(onto)$name[is.finite(distances(onto, node, mode="in"))]) - else - return(V(onto)$name[is.finite(distances(onto, node, mode="in")) & V(onto)$name!=node]) +.ancestors <- function(node, onto, include_self = TRUE) { + if (include_self) { + return(V(onto)$name[is.finite(distances(onto, node, mode = "in"))]) + } else { + return(V(onto)$name[is.finite(distances(onto, node, mode = "in")) & V(onto)$name != node]) + } } diff --git a/R/old/children.R b/R/old/children.R index e425946..5e69eee 100644 --- a/R/old/children.R +++ b/R/old/children.R @@ -10,9 +10,10 @@ #' @importFrom igraph distances -.children <- function(node, onto, leaf=TRUE){ - if(leaf) - return(V(onto)$name[is.finite(distances(onto, node, mode="out")) & degree(onto, mode="out")==0]) - else - return(V(onto)$name[is.finite(distances(onto, node, mode="out"))]) +.children <- function(node, onto, leaf = TRUE) { + if (leaf) { + return(V(onto)$name[is.finite(distances(onto, node, mode = "out")) & degree(onto, mode = "out") == 0]) + } else { + return(V(onto)$name[is.finite(distances(onto, node, mode = "out"))]) + } } diff --git a/R/old/conformal.R b/R/old/conformal.R index 58b553b..faf479a 100644 --- a/R/old/conformal.R +++ b/R/old/conformal.R @@ -1,80 +1,80 @@ #### Function to get the conformal quantile -getConfQuant <- function(p, label, alpha){ - #p prediction matrix for data in the calibration set (ncal x num labels) - #label true label for data in the calibration set - #alpha desired error rate - true <- rep(NA, dim(p)[1]) - for (i in 1:dim(p)[1]) - true[i] <- p[i, label[i]] - - cal.scores <- 1-true - - # get adjusted quantile - n <- dim(p)[1] - q_level = ceiling((n+1)*(1-alpha))/n - qhat = quantile(cal.scores, q_level) - return(list(scores=cal.scores, qhat=qhat)) +getConfQuant <- function(p, label, alpha) { + # p prediction matrix for data in the calibration set (ncal x num labels) + # label true label for data in the calibration set + # alpha desired error rate + true <- rep(NA, dim(p)[1]) + for (i in 1:dim(p)[1]) { + true[i] <- p[i, label[i]] + } + + cal.scores <- 1 - true + + # get adjusted quantile + n <- dim(p)[1] + q_level <- ceiling((n + 1) * (1 - alpha)) / n + qhat <- quantile(cal.scores, q_level) + return(list(scores = cal.scores, qhat = qhat)) } #### Function to get the prediction sets for test data -getPredSets <- function(pred, label, qhat, getClass=T, acc.p=NULL, summary=F){ - #pred prediction matrix for data in the test set (ntest x num labels) - #label true label for data in the test set (needed to evaluate accuracy and coverage) - #qhat conformal quantile - - #get predicted class from prediction matrix - if (getClass) - acc.p <- apply(pred, 1, function(row) colnames(pred)[which.max(row)]) - acc <- mean(acc.p==label) - prediction_sets <- pred >=(1-qhat) - rs <- rowSums(prediction_sets) # to have summary on size of prediction sets - - # Get prediction set colnames - pr.list <- lapply(1:nrow(prediction_sets), function(i) { - colnames(prediction_sets)[prediction_sets[i, ]] - }) - - # Get coverage - tf <- rep(NA, length(pr.list)) - for (i in 1:length(pr.list)) - tf[i] <- label[i] %in% pr.list[[i]] - coverage <- sum(tf)/length(pr.list) - - if (summary) - return(list(accuracy=acc, pred.sets=pr.list, coverage=coverage, summary=summary(rs))) - else - return(list(accuracy=acc, pred.sets=pr.list, coverage=coverage)) -} +getPredSets <- function(pred, label, qhat, getClass = T, acc.p = NULL, summary = F) { + # pred prediction matrix for data in the test set (ntest x num labels) + # label true label for data in the test set (needed to evaluate accuracy and coverage) + # qhat conformal quantile -#### Function to get class conditional coverage -class_sp_conf <- function(classes, pred.cal, pred.test, labels.cal, labels.test){ - s <- rep(NA, length(classes)) - names(s) <- classes - - for (i in 1:length(classes)){ - p <- as.matrix(pred.cal[labels.cal==classes[i],]) - s[i] <- getConfQuant(p, rep(classes[i], nrow(p)), 0.05)$qhat - } - - acc.p <- apply(pred.test, 1, function(row) colnames(pred.test)[which.max(row)]) - p.sets <- matrix(NA, nrow=nrow(pred.test), ncol=ncol(pred.test)) - colnames(p.sets) <- colnames(pred.test) - for(i in 1:length(classes)) - p.sets[,classes[i]] <- pred.test[,classes[i]] >= (1-s[classes[i]]) - - pr.list <- lapply(1:nrow(p.sets), function(i) { - colnames(p.sets)[p.sets[i, ]] - }) - - # Get coverage - tf <- rep(NA, length(pr.list)) - for (i in 1:length(pr.list)) - tf[i] <- labels.test[i] %in% pr.list[[i]] - return(list(pred.sets = pr.list, coverage=mean(tf))) -} + # get predicted class from prediction matrix + if (getClass) { + acc.p <- apply(pred, 1, function(row) colnames(pred)[which.max(row)]) + } + acc <- mean(acc.p == label) + prediction_sets <- pred >= (1 - qhat) + rs <- rowSums(prediction_sets) # to have summary on size of prediction sets + # Get prediction set colnames + pr.list <- lapply(1:nrow(prediction_sets), function(i) { + colnames(prediction_sets)[prediction_sets[i, ]] + }) + # Get coverage + tf <- rep(NA, length(pr.list)) + for (i in 1:length(pr.list)) { + tf[i] <- label[i] %in% pr.list[[i]] + } + coverage <- sum(tf) / length(pr.list) + if (summary) { + return(list(accuracy = acc, pred.sets = pr.list, coverage = coverage, summary = summary(rs))) + } else { + return(list(accuracy = acc, pred.sets = pr.list, coverage = coverage)) + } +} + +#### Function to get class conditional coverage +class_sp_conf <- function(classes, pred.cal, pred.test, labels.cal, labels.test) { + s <- rep(NA, length(classes)) + names(s) <- classes + for (i in 1:length(classes)) { + p <- as.matrix(pred.cal[labels.cal == classes[i], ]) + s[i] <- getConfQuant(p, rep(classes[i], nrow(p)), 0.05)$qhat + } + acc.p <- apply(pred.test, 1, function(row) colnames(pred.test)[which.max(row)]) + p.sets <- matrix(NA, nrow = nrow(pred.test), ncol = ncol(pred.test)) + colnames(p.sets) <- colnames(pred.test) + for (i in 1:length(classes)) { + p.sets[, classes[i]] <- pred.test[, classes[i]] >= (1 - s[classes[i]]) + } + pr.list <- lapply(1:nrow(p.sets), function(i) { + colnames(p.sets)[p.sets[i, ]] + }) + + # Get coverage + tf <- rep(NA, length(pr.list)) + for (i in 1:length(pr.list)) { + tf[i] <- labels.test[i] %in% pr.list[[i]] + } + return(list(pred.sets = pr.list, coverage = mean(tf))) +} diff --git a/R/old/getHierarchicalPredSets.R b/R/old/getHierarchicalPredSets.R index 1d316c3..0ba27e6 100644 --- a/R/old/getHierarchicalPredSets.R +++ b/R/old/getHierarchicalPredSets.R @@ -30,87 +30,93 @@ # Angelopoulus (2023), Conformal Risk Control. Finally, builds prediction sets # for p.test based on the selected lambda value. -.getHierarchicalPredSets <- function(p.cal, p.test, y.cal, onto, alpha, lambdas){ +.getHierarchicalPredSets <- function(p.cal, p.test, y.cal, onto, alpha, lambdas) { y.cal <- as.character(y.cal) # Get prediction sets for each value of lambda for all the calibration data j <- NULL - exportedFn = c(".predSets", ".scores", ".children", ".ancestors") - sets <- foreach(j = lambdas, .export=exportedFn) %dopar% { - lapply(1:nrow(p.cal), - function(i) .predSets(lambda=j, pred=p.cal[i, ], onto=onto)) - } + exportedFn <- c(".predSets", ".scores", ".children", ".ancestors") + sets <- foreach(j = lambdas, .export = exportedFn) %dopar% { + lapply( + 1:nrow(p.cal), + function(i) .predSets(lambda = j, pred = p.cal[i, ], onto = onto) + ) + } # Get the loss table (ncal x length(lambda) table with TRUE\FALSE) loss <- sapply(1:length(lambdas), function(lambda) { - sapply(seq_along(y.cal), function(i) { - !(y.cal[i] %in% sets[[lambda]][[i]]) - }) + sapply(seq_along(y.cal), function(i) { + !(y.cal[i] %in% sets[[lambda]][[i]]) + }) }) # Get lhat n <- nrow(loss) rhat <- colMeans(loss) - lhat_idx <- min(which(((n/(n+1)) * rhat + 1/(n+1) ) <= alpha)) + lhat_idx <- min(which(((n / (n + 1)) * rhat + 1 / (n + 1)) <= alpha)) lhat <- lambdas[lhat_idx] # Get prediction sets for test data - sets.test <- apply(p.test, 1, function(x) .predSets(lambda=lhat, pred=x, onto=onto)) + sets.test <- apply(p.test, 1, function(x) .predSets(lambda = lhat, pred = x, onto = onto)) - return(list(sets.test=sets.test, lhat=lhat)) + return(list(sets.test = sets.test, lhat = lhat)) } -.ancestors <- function(node, onto, include_self=TRUE){ - if(include_self) - return(V(onto)$name[is.finite(distances(onto, node, mode="in"))]) - else - return(V(onto)$name[is.finite(distances(onto, node, mode="in")) & V(onto)$name!=node]) +.ancestors <- function(node, onto, include_self = TRUE) { + if (include_self) { + return(V(onto)$name[is.finite(distances(onto, node, mode = "in"))]) + } else { + return(V(onto)$name[is.finite(distances(onto, node, mode = "in")) & V(onto)$name != node]) + } } -.children <- function(node, onto, leaf=TRUE){ - if(leaf) - return(V(onto)$name[is.finite(distances(onto, node, mode="out")) & degree(onto, mode="out")==0]) - else - return(V(onto)$name[is.finite(distances(onto, node, mode="out"))]) +.children <- function(node, onto, leaf = TRUE) { + if (leaf) { + return(V(onto)$name[is.finite(distances(onto, node, mode = "out")) & degree(onto, mode = "out") == 0]) + } else { + return(V(onto)$name[is.finite(distances(onto, node, mode = "out"))]) + } } -.scores <- function(pred, int_node, onto){ - c <- .children(node = int_node, onto = onto, leaf = TRUE) - return(sum(pred[c])) +.scores <- function(pred, int_node, onto) { + c <- .children(node = int_node, onto = onto, leaf = TRUE) + return(sum(pred[c])) } -.predSets <- function(lambda, pred, onto){ - # Get the predicted class and its ancestors - pred_class <- names(pred)[which.max(pred)] - anc <- .ancestors(node = pred_class, onto = onto, include_self = TRUE) - - # Compute scores for all the ancestor of the predicted class - s <- sapply(as.character(anc), function(i) .scores(pred=pred, int_node=i, - onto=onto)) - names(s) <- anc +.predSets <- function(lambda, pred, onto) { + # Get the predicted class and its ancestors + pred_class <- names(pred)[which.max(pred)] + anc <- .ancestors(node = pred_class, onto = onto, include_self = TRUE) + + # Compute scores for all the ancestor of the predicted class + s <- sapply(as.character(anc), function(i) { + .scores( + pred = pred, int_node = i, + onto = onto + ) + }) + names(s) <- anc - #Sort them by score and if there are ties by distance to the predicted class - ## compute distance from predicted class - pos <- distances(onto, v=anc, to=pred_class, mode="out") - tie_breaker <- as.vector(t(pos)) - names(tie_breaker) <- colnames(t(pos)) - sorted_indices <- order(s, tie_breaker, decreasing=FALSE) - sorted_scores <- s[sorted_indices] + # Sort them by score and if there are ties by distance to the predicted class + ## compute distance from predicted class + pos <- distances(onto, v = anc, to = pred_class, mode = "out") + tie_breaker <- as.vector(t(pos)) + names(tie_breaker) <- colnames(t(pos)) + sorted_indices <- order(s, tie_breaker, decreasing = FALSE) + sorted_scores <- s[sorted_indices] - # Select the first score that is geq than lambda - sel_node <- names(sorted_scores)[round(sorted_scores, 15) >= lambda][1] + # Select the first score that is geq than lambda + sel_node <- names(sorted_scores)[round(sorted_scores, 15) >= lambda][1] - # Add also the subgraphs we would have obtained with smaller lambda - selected <- c(lapply(anc[round(s, 15) < lambda], function(x) - .children(node = x, onto = onto)), - list(.children(sel_node, onto))) + # Add also the subgraphs we would have obtained with smaller lambda + selected <- c( + lapply(anc[round(s, 15) < lambda], function(x) { + .children(node = x, onto = onto) + }), + list(.children(sel_node, onto)) + ) - return(Reduce(union, selected)) + return(Reduce(union, selected)) } - - - - - diff --git a/R/old/predSets.R b/R/old/predSets.R index 3e577c4..b041d50 100644 --- a/R/old/predSets.R +++ b/R/old/predSets.R @@ -6,22 +6,26 @@ #' @author Daniela Corbetta #' @return vector with names of the selected leaf nodes -.predSets <- function(lambda, pred, onto){ +.predSets <- function(lambda, pred, onto) { # Get the predicted class and its ancestors pred_class <- names(pred)[which.max(pred)] anc <- .ancestors(node = pred_class, onto = onto, include_self = TRUE) # Compute scores for all the ancestor of the predicted class - s <- sapply(as.character(anc), function(i) .scores(pred=pred, int_node=i, - onto=onto)) + s <- sapply(as.character(anc), function(i) { + .scores( + pred = pred, int_node = i, + onto = onto + ) + }) names(s) <- anc - #Sort them by score and if there are ties by distance to the predicted class + # Sort them by score and if there are ties by distance to the predicted class ## compute distance from predicted class - pos <- distances(onto, v=anc, to=pred_class, mode="out") + pos <- distances(onto, v = anc, to = pred_class, mode = "out") tie_breaker <- as.vector(t(pos)) names(tie_breaker) <- colnames(t(pos)) - sorted_indices <- order(s, tie_breaker, decreasing=FALSE) + sorted_indices <- order(s, tie_breaker, decreasing = FALSE) sorted_scores <- s[sorted_indices] # Select the first score that is geq than lambda @@ -29,9 +33,12 @@ # Add also the subgraphs we would have obtained with smaller lambda - selected <- c(lapply(anc[round(s, 15) < lambda], function(x) - .children(node = x, onto = onto)), - list(.children(sel_node, onto))) + selected <- c( + lapply(anc[round(s, 15) < lambda], function(x) { + .children(node = x, onto = onto) + }), + list(.children(sel_node, onto)) + ) return(Reduce(union, selected)) } diff --git a/R/old/scores.R b/R/old/scores.R index 4f69d7d..7e8cc0b 100644 --- a/R/old/scores.R +++ b/R/old/scores.R @@ -8,7 +8,7 @@ #' @author Daniela Corbetta #' @return returns the score -.scores <- function(pred, int_node, onto){ +.scores <- function(pred, int_node, onto) { c <- .children(node = int_node, onto = onto, leaf = TRUE) return(sum(pred[c])) } diff --git a/R/old/somegraphs.R b/R/old/somegraphs.R index e9079a5..9219417 100644 --- a/R/old/somegraphs.R +++ b/R/old/somegraphs.R @@ -6,11 +6,13 @@ # Build example graph ##################### library(igraph) -t <- graph_from_literal(animale-+cane:gatto:topo, gatto-+british:persiano, - gatto-+retriever, - cane-+cocker:retriever, retriever-+golden:labrador) -plot(t, layout=layout_as_tree(t, root="animale")) -p <- c(0.2, 0.3, 0.2, 0.10, 0.15,0.05) +t <- graph_from_literal( + animale - +cane:gatto:topo, gatto - +british:persiano, + gatto - +retriever, + cane - +cocker:retriever, retriever - +golden:labrador +) +plot(t, layout = layout_as_tree(t, root = "animale")) +p <- c(0.2, 0.3, 0.2, 0.10, 0.15, 0.05) names(p) <- c("cocker", "golden", "labrador", "british", "persiano", "topo") p @@ -25,22 +27,22 @@ source("/Users/daniela/Documents/GitHub/ConfCell/R/old/utils_Hier.R") nam <- V(t)$name s <- sapply(nam, function(x) g(p, x, t)) s1 <- sapply(nam, function(x) .scores(p, x, t)) -sum(s!=s1) # ok +sum(s != s1) # ok # Check ancestors anc <- sapply(nam, function(x) ancestors(x, graph = t, include_self = T)) anc1 <- sapply(nam, function(x) .ancestors(x, onto = t, include_self = T)) tf <- rep(NA, length(anc)) -for(i in 1:length(anc)){ - tf[i] <- mean(anc[[i]]==anc1[[i]]) +for (i in 1:length(anc)) { + tf[i] <- mean(anc[[i]] == anc1[[i]]) } tf anc <- sapply(nam, function(x) ancestors(x, graph = t, include_self = F)) anc1 <- sapply(nam, function(x) .ancestors(x, onto = t, include_self = F)) tf <- rep(NA, length(anc)) -for(i in 1:length(anc)){ - tf[i] <- mean(anc[[i]]==anc1[[i]]) +for (i in 1:length(anc)) { + tf[i] <- mean(anc[[i]] == anc1[[i]]) } tf @@ -48,42 +50,42 @@ tf child <- sapply(nam, function(x) children(x, graph = t, leaf = T)) child1 <- sapply(nam, function(x) .children(x, onto = t, leaf = T)) tf <- rep(NA, length(child)) -for(i in 1:length(child)){ - tf[i] <- mean(child[[i]]==child1[[i]]) +for (i in 1:length(child)) { + tf[i] <- mean(child[[i]] == child1[[i]]) } tf child <- sapply(nam, function(x) children(x, graph = t, leaf = F)) child1 <- sapply(nam, function(x) .children(x, onto = t, leaf = F)) tf <- rep(NA, length(child)) -for(i in 1:length(child)){ - tf[i] <- mean(child[[i]]==child1[[i]]) +for (i in 1:length(child)) { + tf[i] <- mean(child[[i]] == child1[[i]]) } tf # .predSets pred_sets1(0.5, p, t) -.predSets(lambda=0.5, pred=p, onto=t) +.predSets(lambda = 0.5, pred = p, onto = t) pred_sets1(0.7, p, t) -.predSets(lambda=0.7, pred=p, onto=t) +.predSets(lambda = 0.7, pred = p, onto = t) pred_sets1(0.75, p, t) -.predSets(lambda=0.75, pred=p, onto=t) +.predSets(lambda = 0.75, pred = p, onto = t) -pred_sets1(0.8, p, t) #should be all -.predSets(lambda=0.8, pred=p, onto=t) +pred_sets1(0.8, p, t) # should be all +.predSets(lambda = 0.8, pred = p, onto = t) pred_sets1(1, p, t) -.predSets(lambda=1, pred=p, onto=t) +.predSets(lambda = 1, pred = p, onto = t) pred_sets1(0.6, p, t) # cane (golden, labrador, cocker) -.predSets(lambda=0.6, pred=p, onto=t) +.predSets(lambda = 0.6, pred = p, onto = t) p1 <- c(0.3, 0.4, 0.3, 0, 0, 0) names(p1) <- c("cocker", "golden", "labrador", "british", "persiano", "topo") pred_sets1(0.7, p1, t) -.predSets(lambda=0.7, pred=p1, onto=t) +.predSets(lambda = 0.7, pred = p1, onto = t) ######################################### ### Create a random dataset @@ -91,31 +93,33 @@ pred_sets1(0.7, p1, t) set.seed(1010) n <- 2100 -leaves <- c(4,6,7,8,9) +leaves <- c(4, 6, 7, 8, 9) nclasses <- length(leaves) -lab <- sample(leaves, n, replace=T) -x <- (runif(n,0,3.1) + lab) +lab <- sample(leaves, n, replace = T) +x <- (runif(n, 0, 3.1) + lab) y <- rep(NA, length(lab)) -y[lab==4] <- "cocker" -y[lab==6] <- "golden" -y[lab==7] <- "labrador" -y[lab==8] <- "british" -y[lab==9] <- "persiano" -xydf <- data.frame(y=as.factor(y), x=x) +y[lab == 4] <- "cocker" +y[lab == 6] <- "golden" +y[lab == 7] <- "labrador" +y[lab == 8] <- "british" +y[lab == 9] <- "persiano" +xydf <- data.frame(y = as.factor(y), x = x) head(xydf) # Divide in train, cal and test and fit model library(VGAM) -train <- xydf[1:1000,] -cal <- xydf[1001:1100,] -test <- xydf[1101:2100,] +train <- xydf[1:1000, ] +cal <- xydf[1001:1100, ] +test <- xydf[1101:2100, ] -fit <- vglm(y~x, family = multinomial(refLevel = "cocker"), - data = train) -lambdas <- seq(0.001,0.999,length.out=100) +fit <- vglm(y ~ x, + family = multinomial(refLevel = "cocker"), + data = train +) +lambdas <- seq(0.001, 0.999, length.out = 100) # Compute predicted probabilities for calibration points -p.cal <- predict(fit, newdata=cal, type="response") +p.cal <- predict(fit, newdata = cal, type = "response") library(doParallel) library(foreach) # @@ -123,16 +127,17 @@ num_cores <- 4 cl <- makeCluster(num_cores) registerDoParallel(cl) -t3 <- graph_from_literal(animale-+cane:gatto, gatto-+british:persiano, - gatto-+retriever, - cane-+cocker:retriever, retriever-+golden:labrador) -plot(t3, layout=layout_as_tree(t3, root="animale")) -exportedFn = c(".predSets", ".scores", ".children", ".ancestors") -system.time(sets1 <- foreach(lambda = lambdas, .packages = c("ConfCell", "igraph")) %dopar% { - library(igraph) - lapply(1:nrow(p.cal), function(i) pred_sets1(lambda, p.cal[i, ], t3)) -} +t3 <- graph_from_literal( + animale - +cane:gatto, gatto - +british:persiano, + gatto - +retriever, + cane - +cocker:retriever, retriever - +golden:labrador ) +plot(t3, layout = layout_as_tree(t3, root = "animale")) +exportedFn <- c(".predSets", ".scores", ".children", ".ancestors") +system.time(sets1 <- foreach(lambda = lambdas, .packages = c("ConfCell", "igraph")) %dopar% { + library(igraph) + lapply(1:nrow(p.cal), function(i) pred_sets1(lambda, p.cal[i, ], t3)) +}) # system.time(l1 <- get_loss_table_mis(lambdas, sets1, as.character(cal$y), t3)) @@ -140,9 +145,9 @@ Rhat <- colMeans(l1) plot(Rhat) all(diff(Rhat) <= 0) -lhat <- get_lhat(l1,lambdas,0.1) -p.test <- predict(fit, newdata=test, type="response") -sets.test <- apply(p.test, 1, function(x) pred_sets1(lambda=lhat, x, t3)) +lhat <- get_lhat(l1, lambdas, 0.1) +p.test <- predict(fit, newdata = test, type = "response") +sets.test <- apply(p.test, 1, function(x) pred_sets1(lambda = lhat, x, t3)) t <- .getHierarchicalPredSets(p.cal, p.test, cal$y, onto = t3, alpha = 0.1, lambdas = lambdas) t1 <- .getConformalPredSets(p.cal, p.test, as.character(cal$y), 0.1) @@ -154,11 +159,13 @@ l.crc <- sapply(t$sets.test, length) barplot(table(l.std)) barplot(table(l.crc)) cvg <- cvg1 <- rep(NA, length(t)) -for(i in 1:length(t1)) - cvg[i] <- test$y[i] %in% t1[[i]] +for (i in 1:length(t1)) { + cvg[i] <- test$y[i] %in% t1[[i]] +} -for(i in 1:length(t$sets.test)) - cvg1[i] <- test$y[i] %in% t$sets.test[[i]] +for (i in 1:length(t$sets.test)) { + cvg1[i] <- test$y[i] %in% t$sets.test[[i]] +} mean(cvg) mean(cvg1) @@ -166,9 +173,10 @@ mean(cvg1) -sets <- foreach(j = lambdas, .export=exportedFn) %dopar% { - library(igraph) - lapply(1:nrow(p.cal), - function(i) .predSets(lambda=j, pred=p.cal[i, ], onto=t3)) +sets <- foreach(j = lambdas, .export = exportedFn) %dopar% { + library(igraph) + lapply( + 1:nrow(p.cal), + function(i) .predSets(lambda = j, pred = p.cal[i, ], onto = t3) + ) } - diff --git a/R/old/utils_Hier.R b/R/old/utils_Hier.R index c02a84b..28be1e2 100644 --- a/R/old/utils_Hier.R +++ b/R/old/utils_Hier.R @@ -4,32 +4,34 @@ # Conformal risk control -# Function to retrieve children nodes of a given interior node. +# Function to retrieve children nodes of a given interior node. # If leaf=T gives leaf nodes, else gives all descendants -children <- function(node, graph, leaf=T){ - require(igraph) - if(leaf) - return(V(graph)$name[is.finite(distances(graph, node, mode="out")) & igraph::degree(graph, mode="out")==0]) - else - return(V(graph)$name[is.finite(distances(graph, node, mode="out"))]) +children <- function(node, graph, leaf = T) { + require(igraph) + if (leaf) { + return(V(graph)$name[is.finite(distances(graph, node, mode = "out")) & igraph::degree(graph, mode = "out") == 0]) + } else { + return(V(graph)$name[is.finite(distances(graph, node, mode = "out"))]) + } } # Function to retrieve ancestors of a given node -ancestors <- function(node, graph, include_self=T){ - require(igraph) - if(include_self) - return(V(graph)$name[is.finite(distances(graph, node, mode="in"))]) - else - return(V(graph)$name[is.finite(distances(graph, node, mode="in")) & V(graph)$name!=node]) +ancestors <- function(node, graph, include_self = T) { + require(igraph) + if (include_self) { + return(V(graph)$name[is.finite(distances(graph, node, mode = "in"))]) + } else { + return(V(graph)$name[is.finite(distances(graph, node, mode = "in")) & V(graph)$name != node]) + } } -# Define function to compute g(a,x), where a is a node +# Define function to compute g(a,x), where a is a node # (g(a,x)=sum_{y in P(a)}(pi(y))) # @param pred named vector with predicted probabilities for one observation to be in one of the leaves class -g <- function(pred, int_node, graph){ - c <- children(node = int_node, graph = graph, leaf = T) - return(sum(pred[c])) +g <- function(pred, int_node, graph) { + c <- children(node = int_node, graph = graph, leaf = T) + return(sum(pred[c])) } # Function to construct confidence sets (for one obs for now) @@ -37,84 +39,86 @@ g <- function(pred, int_node, graph){ # @param p predicted probabilities for leaf nodes # @param graph graph -pred_sets1 <- function(lambda, p, graph){ - pred_class <- names(p)[which.max(p)] - anc <- ancestors(node = pred_class, graph = graph, include_self = TRUE) - - # Obtain scores for ancestors of the predicted class - scores <- sapply(as.character(anc), function(i) g(p, i, graph)) - names(scores) <- anc - - # Sort them by score and if there are ties by distance to the predicted class - ## compute distance from predicted class - pos <- distances(graph, v=anc, to=pred_class, mode="out") - tie_breaker <- as.vector(t(pos)) - names(tie_breaker) <- colnames(t(pos)) - sorted_indices <- order(scores, tie_breaker, decreasing=F) - sorted_scores <- scores[sorted_indices] - - # Select the first score that is geq than lambda - sel_node <- names(sorted_scores)[round(sorted_scores, 15) >= lambda][1] - - # c(lapply(anc[round(scores, 15) <= lambda] - selected <- c(lapply(anc[round(scores, 15) < lambda], function(x) children(node = x, graph = graph)), - list(children(sel_node, graph))) - - return(Reduce(union, selected)) +pred_sets1 <- function(lambda, p, graph) { + pred_class <- names(p)[which.max(p)] + anc <- ancestors(node = pred_class, graph = graph, include_self = TRUE) + + # Obtain scores for ancestors of the predicted class + scores <- sapply(as.character(anc), function(i) g(p, i, graph)) + names(scores) <- anc + + # Sort them by score and if there are ties by distance to the predicted class + ## compute distance from predicted class + pos <- distances(graph, v = anc, to = pred_class, mode = "out") + tie_breaker <- as.vector(t(pos)) + names(tie_breaker) <- colnames(t(pos)) + sorted_indices <- order(scores, tie_breaker, decreasing = F) + sorted_scores <- scores[sorted_indices] + + # Select the first score that is geq than lambda + sel_node <- names(sorted_scores)[round(sorted_scores, 15) >= lambda][1] + + # c(lapply(anc[round(scores, 15) <= lambda] + selected <- c( + lapply(anc[round(scores, 15) < lambda], function(x) children(node = x, graph = graph)), + list(children(sel_node, graph)) + ) + + return(Reduce(union, selected)) } # Do not consider this -pred_sets2 <- function(lambda, p, graph){ - pred_class <- names(p)[which.max(p)] - anc <- ancestors(node = pred_class, graph = graph, include_self = TRUE) - - scores <- sapply(anc, function(i) g(p, i, graph)) - names(scores) <- anc - - selected <- lapply(anc[round(scores, 15) <= lambda], function(x) children(node = x, graph = graph)) - if(length(selected)==0) selected <- pred_class - - return(Reduce(union, selected)) +pred_sets2 <- function(lambda, p, graph) { + pred_class <- names(p)[which.max(p)] + anc <- ancestors(node = pred_class, graph = graph, include_self = TRUE) + + scores <- sapply(anc, function(i) g(p, i, graph)) + names(scores) <- anc + + selected <- lapply(anc[round(scores, 15) <= lambda], function(x) children(node = x, graph = graph)) + if (length(selected) == 0) selected <- pred_class + + return(Reduce(union, selected)) } # Loss table with miscoverage get_loss_table_mis <- function(lambdas, prediction, ycal, graph) { - loss <- sapply(1:length(lambdas), function(lambda) { - sapply(seq_along(ycal), function(i) { - !(ycal[i] %in% prediction[[lambda]][[i]]) + loss <- sapply(1:length(lambdas), function(lambda) { + sapply(seq_along(ycal), function(i) { + !(ycal[i] %in% prediction[[lambda]][[i]]) + }) }) - }) - - return(loss) + + return(loss) } # Hierarchical loss proposed by Angelopoulus and Bates -hier_loss <- function(set, true_class, graph){ - anc <- ancestors(true_class, graph) - set_distances <- distances(graph, v = anc, to = set) - - vroot <- V(graph)[degree(graph, mode = "in") == 0] - depth <- max(distances(graph, vroot, mode = "out")) - - return(min(set_distances) / depth) +hier_loss <- function(set, true_class, graph) { + anc <- ancestors(true_class, graph) + set_distances <- distances(graph, v = anc, to = set) + + vroot <- V(graph)[degree(graph, mode = "in") == 0] + depth <- max(distances(graph, vroot, mode = "out")) + + return(min(set_distances) / depth) } # Loss table with hierarchical loss get_loss_table <- function(lambdas, prediction, ycal, graph) { - loss <- sapply(1:length(lambdas), function(lambda) { - sapply(seq_along(ycal), function(i) { - hier_loss(prediction[[lambda]][[i]], true_class = ycal[i], graph = graph) + loss <- sapply(1:length(lambdas), function(lambda) { + sapply(seq_along(ycal), function(i) { + hier_loss(prediction[[lambda]][[i]], true_class = ycal[i], graph = graph) + }) }) - }) - - return(loss) + + return(loss) } # Find lambda hat on a grid of lambda values -get_lhat <- function(calib_loss_table, lambdas, alpha, B=1) { - n <- nrow(calib_loss_table) - rhat <- colMeans(calib_loss_table) - lhat_idx <- min(which(((n/(n+1)) * rhat + B/(n+1) ) <= alpha)) - # Return the corresponding lambda value at lhat_idx - return(lambdas[lhat_idx]) +get_lhat <- function(calib_loss_table, lambdas, alpha, B = 1) { + n <- nrow(calib_loss_table) + rhat <- colMeans(calib_loss_table) + lhat_idx <- min(which(((n / (n + 1)) * rhat + B / (n + 1)) <= alpha)) + # Return the corresponding lambda value at lhat_idx + return(lambdas[lhat_idx]) } diff --git a/R/old/utils_Resample.R b/R/old/utils_Resample.R index d975aa7..0736a4a 100644 --- a/R/old/utils_Resample.R +++ b/R/old/utils_Resample.R @@ -1,36 +1,38 @@ ########## Utils for resampling strategy -resample.two <- function(cal, test, cal.pred, test.pred, seed=NA){ - if(!is.na(seed)) set.seed(seed) - s <- sample(1:nrow(test), round(nrow(test)/2)) - test1 <- test[s,] - test2 <- test[-s,] - p.test1 <- test.pred[s,] - p.test2 <- test.pred[-s,] - cal_freq <- prop.table(table(cal$Y)) - pr.class1 <- apply(p.test1, 1, function(row) colnames(p.test1)[which.max(row)]) - pr.class2 <- apply(p.test2, 1, function(row) colnames(p.test2)[which.max(row)]) - test_freq1 <- prop.table(table(pr.class1)) - test_freq2 <- prop.table(table(pr.class2)) - des_freq1 <- round(test_freq1 * length(cal$Y)) - des_freq2 <- round(test_freq2 * length(cal$Y)) - - idx1 <- idx2 <- NULL - for (i in cl.types) { - cat <- which(cal$Y == i) - if(!is.na(des_freq1[i])){ - idx_cat1 <- sample(cat, size = des_freq1[i], replace = TRUE) - idx1 <- c(idx1, idx_cat1) +resample.two <- function(cal, test, cal.pred, test.pred, seed = NA) { + if (!is.na(seed)) set.seed(seed) + s <- sample(1:nrow(test), round(nrow(test) / 2)) + test1 <- test[s, ] + test2 <- test[-s, ] + p.test1 <- test.pred[s, ] + p.test2 <- test.pred[-s, ] + cal_freq <- prop.table(table(cal$Y)) + pr.class1 <- apply(p.test1, 1, function(row) colnames(p.test1)[which.max(row)]) + pr.class2 <- apply(p.test2, 1, function(row) colnames(p.test2)[which.max(row)]) + test_freq1 <- prop.table(table(pr.class1)) + test_freq2 <- prop.table(table(pr.class2)) + des_freq1 <- round(test_freq1 * length(cal$Y)) + des_freq2 <- round(test_freq2 * length(cal$Y)) + + idx1 <- idx2 <- NULL + for (i in cl.types) { + cat <- which(cal$Y == i) + if (!is.na(des_freq1[i])) { + idx_cat1 <- sample(cat, size = des_freq1[i], replace = TRUE) + idx1 <- c(idx1, idx_cat1) + } + if (!is.na(des_freq2[i])) { + idx_cat2 <- sample(cat, size = des_freq2[i], replace = TRUE) + idx2 <- c(idx2, idx_cat2) + } } - if(!is.na(des_freq2[i])){ - idx_cat2 <- sample(cat, size = des_freq2[i], replace = TRUE) - idx2 <- c(idx2, idx_cat2) - } - } - - return(list(cal1=cal[idx1,], cal2=cal[idx2,], p.cal1=cal.pred[idx1,], - p.cal2=cal.pred[idx2,], test1=test1, test2=test2, - p.test1=p.test1, p.test2=p.test2)) + + return(list( + cal1 = cal[idx1, ], cal2 = cal[idx2, ], p.cal1 = cal.pred[idx1, ], + p.cal2 = cal.pred[idx2, ], test1 = test1, test2 = test2, + p.test1 = p.test1, p.test2 = p.test2 + )) } # resample.oracle <- function(cal, test, cal.pred, test.pred, seed=NA){ @@ -45,7 +47,7 @@ resample.two <- function(cal, test, cal.pred, test.pred, seed=NA){ # test_freq2 <- prop.table(table(test2$Y)) # des_freq1 <- round(test_freq1 * length(cal$Y)) # des_freq2 <- round(test_freq2 * length(cal$Y)) -# +# # idx1 <- idx2 <- NULL # for (i in cl.types) { # cat <- which(cal$Y == i) @@ -58,120 +60,112 @@ resample.two <- function(cal, test, cal.pred, test.pred, seed=NA){ # idx2 <- c(idx2, idx_cat2) # } # } -# +# # return(list(cal1=cal[idx1,], cal2=cal[idx2,], p.cal1=cal.pred[idx1,], # p.cal2=cal.pred[idx2,], test1=test1, test2=test2, # p.test1=p.test1, p.test2=p.test2)) # } -resample.oracle <- function(cal, test, cal.pred, test.pred, seed=NA){ - if(!is.na(seed)) set.seed(seed) - - cal_freq <- prop.table(table(cal$Y)) - test_freq <- prop.table(table(test$Y)) - des_freq <- round(test_freq * length(cal$Y)) - - idx <- NULL - for (i in cl.types) { - cat <- which(cal$Y == i) - if(!is.na(des_freq[i])){ - idx_cat <- sample(cat, size = des_freq[i], replace = TRUE) - idx <- c(idx, idx_cat) - } - } - - return(list(cal=cal[idx,], p.cal=cal.pred[idx,])) -} - -resample.freq <- function(cal, test, cal.pred, test.pred, freqs, seed=NA){ - if(!is.na(seed)) set.seed(seed) - - cal_freq <- prop.table(table(cal$Y)) - test_freq <- freqs - des_freq <- round(test_freq * length(cal$Y)) - - idx <- NULL - for (i in cl.types) { - cat <- which(cal$Y == i) - if(!is.na(des_freq[i])){ - idx_cat <- sample(cat, size = des_freq[i], replace = TRUE) - idx <- c(idx, idx_cat) - } - } - - return(list(cal=cal[idx,], p.cal=cal.pred[idx,])) -} +resample.oracle <- function(cal, test, cal.pred, test.pred, seed = NA) { + if (!is.na(seed)) set.seed(seed) + cal_freq <- prop.table(table(cal$Y)) + test_freq <- prop.table(table(test$Y)) + des_freq <- round(test_freq * length(cal$Y)) - - - -exportedFn <- c("pred_sets1", "children", "ancestors", "g") -conformal <- function(cal, test, p.cal, p.test, alpha=0.1, graph=opi2, - lambdas=seq(0.001,0.999,length.out=100), - expFn=exportedFn){ - conf.mln <- getConfQuant(p.cal, cal$Y, alpha) - pred.mln <- getPredSets(p.test, test$Y, conf.mln$qhat, - summary=T) - tf.conf <- rep(NA, length(pred.mln$pred.sets)) - for (i in 1:length(pred.mln$pred.sets)) - tf.conf[i] <- test$Y[i] %in% pred.mln$pred.sets[[i]] - - sets <- foreach(lambda = lambdas, .export = expFn) %dopar% { - lapply(1:nrow(p.cal), - function(i) pred_sets1(lambda, p.cal[i, ], graph))} - l <- get_loss_table_mis(lambdas, sets, as.character(cal$Y), graph) - lhat <- get_lhat(l, lambdas, alpha) - sets.test <- apply(p.test, 1, function(x) pred_sets1(lhat, x, graph)) - tf.crc <- rep(NA, nrow(test)) - for (j in 1:nrow(test)) - tf.crc[j] <- as.character(test$Y)[j] %in% sets.test[[j]] - return(list(tf.conf=tf.conf, tf.crc=tf.crc, sets.conf=pred.mln$pred.sets, sets.crc=sets.test)) - -} - -conf.loo <- function(p.cal, p.test, Ycal, Ytest, alpha){ - ntest <- nrow(p.test) - emp.cov <- rep(NA, ntest) - test.sets <- list() - for(i in 1:ntest){ - p.test.loo <- p.test[-i,] - cal_freq <- prop.table(table(Ycal)) - test_prfreq <- apply(p.test.loo, 1, function(row) colnames(p.test.loo)[which.max(row)]) - test_freq <- prop.table(table(test_prfreq)) - des_freq <- round(test_freq * length(Ycal)) - - # Resample idx <- NULL - for (j in cl.types) { - cat <- which(Ycal == j) - if(!is.na(des_freq[j])){ - idx_cat <- sample(cat, size = des_freq[j], replace = TRUE) - idx <- c(idx, idx_cat) - } + for (i in cl.types) { + cat <- which(cal$Y == i) + if (!is.na(des_freq[i])) { + idx_cat <- sample(cat, size = des_freq[i], replace = TRUE) + idx <- c(idx, idx_cat) + } } - p.cal.loo <- p.cal[idx,] - Ycal.loo <- Ycal[idx] - - # Conformal prediction for i obs - q <- getConfQuant(p.cal.loo, Ycal.loo, alpha) - p <- getPredSets(t(as.matrix(p.test[i,])), Ytest[i], q$qhat) - emp.cov[i] <- p$coverage - test.sets[i] <- p$pred.sets - #if(i%%100 == 0) cat(i) - } - return(list(emp.cov=mean(emp.cov), sets=test.sets)) -} - + return(list(cal = cal[idx, ], p.cal = cal.pred[idx, ])) +} +resample.freq <- function(cal, test, cal.pred, test.pred, freqs, seed = NA) { + if (!is.na(seed)) set.seed(seed) + cal_freq <- prop.table(table(cal$Y)) + test_freq <- freqs + des_freq <- round(test_freq * length(cal$Y)) + idx <- NULL + for (i in cl.types) { + cat <- which(cal$Y == i) + if (!is.na(des_freq[i])) { + idx_cat <- sample(cat, size = des_freq[i], replace = TRUE) + idx <- c(idx, idx_cat) + } + } + return(list(cal = cal[idx, ], p.cal = cal.pred[idx, ])) +} +exportedFn <- c("pred_sets1", "children", "ancestors", "g") +conformal <- function(cal, test, p.cal, p.test, alpha = 0.1, graph = opi2, + lambdas = seq(0.001, 0.999, length.out = 100), + expFn = exportedFn) { + conf.mln <- getConfQuant(p.cal, cal$Y, alpha) + pred.mln <- getPredSets(p.test, test$Y, conf.mln$qhat, + summary = T + ) + tf.conf <- rep(NA, length(pred.mln$pred.sets)) + for (i in 1:length(pred.mln$pred.sets)) { + tf.conf[i] <- test$Y[i] %in% pred.mln$pred.sets[[i]] + } + sets <- foreach(lambda = lambdas, .export = expFn) %dopar% { + lapply( + 1:nrow(p.cal), + function(i) pred_sets1(lambda, p.cal[i, ], graph) + ) + } + l <- get_loss_table_mis(lambdas, sets, as.character(cal$Y), graph) + lhat <- get_lhat(l, lambdas, alpha) + sets.test <- apply(p.test, 1, function(x) pred_sets1(lhat, x, graph)) + tf.crc <- rep(NA, nrow(test)) + for (j in 1:nrow(test)) { + tf.crc[j] <- as.character(test$Y)[j] %in% sets.test[[j]] + } + return(list(tf.conf = tf.conf, tf.crc = tf.crc, sets.conf = pred.mln$pred.sets, sets.crc = sets.test)) +} +conf.loo <- function(p.cal, p.test, Ycal, Ytest, alpha) { + ntest <- nrow(p.test) + emp.cov <- rep(NA, ntest) + test.sets <- list() + for (i in 1:ntest) { + p.test.loo <- p.test[-i, ] + cal_freq <- prop.table(table(Ycal)) + test_prfreq <- apply(p.test.loo, 1, function(row) colnames(p.test.loo)[which.max(row)]) + test_freq <- prop.table(table(test_prfreq)) + des_freq <- round(test_freq * length(Ycal)) + + # Resample + idx <- NULL + for (j in cl.types) { + cat <- which(Ycal == j) + if (!is.na(des_freq[j])) { + idx_cat <- sample(cat, size = des_freq[j], replace = TRUE) + idx <- c(idx, idx_cat) + } + } + p.cal.loo <- p.cal[idx, ] + Ycal.loo <- Ycal[idx] + + # Conformal prediction for i obs + q <- getConfQuant(p.cal.loo, Ycal.loo, alpha) + p <- getPredSets(t(as.matrix(p.test[i, ])), Ytest[i], q$qhat) + emp.cov[i] <- p$coverage + test.sets[i] <- p$pred.sets + # if(i%%100 == 0) cat(i) + } + return(list(emp.cov = mean(emp.cov), sets = test.sets)) +} diff --git a/R/utils_ConformalPS.R b/R/utils_ConformalPS.R index d9828cf..7d6a605 100644 --- a/R/utils_ConformalPS.R +++ b/R/utils_ConformalPS.R @@ -1,26 +1,22 @@ # Function to get conformal prediction sets with split conformal inference -.getConformalPredSets <- function(p.cal, p.test, y.cal, alpha){ +.getConformalPredSets <- function(p.cal, p.test, y.cal, alpha) { # Get calibration scores (1-predicted probability for the true class) true <- rep(NA, dim(p.cal)[1]) - for (i in 1:dim(p.cal)[1]) - true[i] <- p.cal[i, y.cal[i]] - s <- 1-true + for (i in 1:dim(p.cal)[1]) { + true[i] <- p.cal[i, y.cal[i]] + } + s <- 1 - true # Get adjusted quantile n <- nrow(p.cal) - q_level <- ceiling((n+1)*(1-alpha))/n + q_level <- ceiling((n + 1) * (1 - alpha)) / n qhat <- quantile(s, q_level) # Get prediction sets - prediction_sets <- p.test >= 1-qhat + prediction_sets <- p.test >= 1 - qhat pr.list <- lapply(1:nrow(prediction_sets), function(i) { colnames(prediction_sets)[prediction_sets[i, ]] }) return(pr.list) } - - - - - diff --git a/R/utils_HierarchicalPS.R b/R/utils_HierarchicalPS.R index 6b7c84c..4dd5248 100644 --- a/R/utils_HierarchicalPS.R +++ b/R/utils_HierarchicalPS.R @@ -7,81 +7,94 @@ # for p.test based on the selected lambda value. .getHierarchicalPredSets <- function(p.cal, p.test, y.cal, onto, alpha, - lambdas){ + lambdas) { y.cal <- as.character(y.cal) # Get prediction sets for each value of lambda for all the calibration data j <- NULL - exportedFn = c(".predSets", ".scores", ".children", ".ancestors") - sets <- foreach(j = lambdas, .export=exportedFn) %dopar% { - lapply(1:nrow(p.cal), - function(i) .predSets(lambda=j, pred=p.cal[i, ], onto=onto)) - } + exportedFn <- c(".predSets", ".scores", ".children", ".ancestors") + sets <- foreach(j = lambdas, .export = exportedFn) %dopar% { + lapply( + 1:nrow(p.cal), + function(i) .predSets(lambda = j, pred = p.cal[i, ], onto = onto) + ) + } # Get the loss table (ncal x length(lambda) table with TRUE\FALSE) loss <- sapply(1:length(lambdas), function(lambda) { - sapply(seq_along(y.cal), function(i) { - !(y.cal[i] %in% sets[[lambda]][[i]]) - }) + sapply(seq_along(y.cal), function(i) { + !(y.cal[i] %in% sets[[lambda]][[i]]) + }) }) # Get lhat n <- nrow(loss) rhat <- colMeans(loss) - lhat_idx <- min(which(((n/(n+1)) * rhat + 1/(n+1) ) <= alpha)) + lhat_idx <- min(which(((n / (n + 1)) * rhat + 1 / (n + 1)) <= alpha)) lhat <- lambdas[lhat_idx] # Get prediction sets for test data - sets.test <- apply(p.test, 1, function(x) .predSets(lambda=lhat, pred=x, - onto=onto)) + sets.test <- apply(p.test, 1, function(x) { + .predSets( + lambda = lhat, pred = x, + onto = onto + ) + }) return(sets.test) } # Function to retrieve all ancestors of a given node, starting from an igraph # object. Default includes also the given node -.ancestors <- function(node, onto, include_self=TRUE){ - if(include_self) - return(V(onto)$name[is.finite(distances(onto, node, mode="in"))]) - else - return(V(onto)$name[is.finite(distances(onto, node, mode="in")) & - V(onto)$name!=node]) +.ancestors <- function(node, onto, include_self = TRUE) { + if (include_self) { + return(V(onto)$name[is.finite(distances(onto, node, mode = "in"))]) + } else { + return(V(onto)$name[is.finite(distances(onto, node, mode = "in")) & + V(onto)$name != node]) + } } # Function to retrieve children of a given node, starting from an igraph object. # By default, gives only the children that are leaves in the ontology. -.children <- function(node, onto, leaf=TRUE){ - if(leaf) - return(V(onto)$name[is.finite(distances(onto, node, mode="out")) & - degree(onto, mode="out")==0]) - else - return(V(onto)$name[is.finite(distances(onto, node, mode="out"))]) +.children <- function(node, onto, leaf = TRUE) { + if (leaf) { + return(V(onto)$name[is.finite(distances(onto, node, mode = "out")) & + degree(onto, mode = "out") == 0]) + } else { + return(V(onto)$name[is.finite(distances(onto, node, mode = "out"))]) + } } # Function to compute scores of a given node in an ontology (sum of estimated # probabilities for the leaf nodes that are children of the given one) -.scores <- function(pred, int_node, onto){ +.scores <- function(pred, int_node, onto) { c <- .children(node = int_node, onto = onto, leaf = TRUE) return(sum(pred[c])) } # Function to get prediction sets following the ontology -.predSets <- function(lambda, pred, onto){ +.predSets <- function(lambda, pred, onto) { # Get the predicted class and its ancestors pred_class <- names(pred)[which.max(pred)] anc <- .ancestors(node = pred_class, onto = onto, include_self = TRUE) # Compute scores for all the ancestor of the predicted class - s <- sapply(as.character(anc), function(i) .scores(pred=pred, int_node=i, - onto=onto)) + s <- sapply(as.character(anc), function(i) { + .scores( + pred = pred, int_node = i, + onto = onto + ) + }) names(s) <- anc - #Sort them by score and if there are ties by distance to the predicted class + ## Sort them by score and if there are ties by distance to the predicted + ## class ## compute distance from predicted class - pos <- distances(onto, v=anc, to=pred_class, mode="out") + pos <- distances(onto, v = anc, to = pred_class, mode = "out") tie_breaker <- as.vector(t(pos)) names(tie_breaker) <- colnames(t(pos)) - sorted_indices <- order(s, tie_breaker, decreasing=FALSE) + sorted_indices <- order(s, tie_breaker, decreasing = FALSE) sorted_scores <- s[sorted_indices] # Select the first score that is geq than lambda @@ -89,14 +102,12 @@ # Add also the subgraphs we would have obtained with smaller lambda - selected <- c(lapply(anc[round(s, 15) < lambda], function(x) - .children(node = x, onto = onto)), - list(.children(sel_node, onto))) + selected <- c( + lapply(anc[round(s, 15) < lambda], function(x) { + .children(node = x, onto = onto) + }), + list(.children(sel_node, onto)) + ) return(Reduce(union, selected)) } - - - - - diff --git a/R/utils_Resample.R b/R/utils_Resample.R index aab41ce..f97689f 100644 --- a/R/utils_Resample.R +++ b/R/utils_Resample.R @@ -3,10 +3,10 @@ # Right now it implements a two-fold strategy, dividing randomly # test data in two. -resample.two <- function(p.cal, p.test, y.cal, labels){ - s <- sample(1:nrow(p.test), round(nrow(p.test)/2)) - test1 <- p.test[s,] - test2 <- p.test[-s,] +resample.two <- function(p.cal, p.test, y.cal, labels) { + s <- sample(1:nrow(p.test), round(nrow(p.test) / 2)) + test1 <- p.test[s, ] + test2 <- p.test[-s, ] # Compute predicted class pr.class1 <- apply(test1, 1, function(row) colnames(test1)[which.max(row)]) @@ -20,23 +20,25 @@ resample.two <- function(p.cal, p.test, y.cal, labels){ idx1 <- idx2 <- NULL for (i in labels) { cat <- which(y.cal == i) - if(!is.na(des.freq1[i])){ + if (!is.na(des.freq1[i])) { idx.cat1 <- sample(cat, size = des.freq1[i], replace = TRUE) idx1 <- c(idx1, idx.cat1) } - if(!is.na(des.freq2[i])){ + if (!is.na(des.freq2[i])) { idx.cat2 <- sample(cat, size = des.freq2[i], replace = TRUE) idx2 <- c(idx2, idx.cat2) } } - return(list(p.cal1=p.cal[idx1,], - p.cal2=p.cal[idx2,], - p.test1=test1, - p.test2=test2, - y.cal1=y.cal[idx1], - y.cal2=y.cal[idx2], - idx=c(s, setdiff(1:nrow(p.test), s)))#index in the original data - ) + return( + list( + p.cal1 = p.cal[idx1, ], + p.cal2 = p.cal[idx2, ], + p.test1 = test1, + p.test2 = test2, + y.cal1 = y.cal[idx1], + y.cal2 = y.cal[idx2], + idx = c(s, setdiff(1:nrow(p.test), s)) + ) # index in the original data + ) } - diff --git a/man/getPredictionSets.Rd b/man/getPredictionSets.Rd index df15094..0bf4884 100644 --- a/man/getPredictionSets.Rd +++ b/man/getPredictionSets.Rd @@ -135,7 +135,7 @@ still guaranteed that \examples{ # random p matrix set.seed(1040) -p <- matrix(rnorm(2000*4), ncol=4) +p <- matrix(rnorm(2000 * 4), ncol = 4) # Normalize the matrix p to have all numbers between 0 and 1 that sum to 1 # by row p <- exp(p - apply(p, 1, max)) @@ -144,25 +144,25 @@ cell.types <- c("T (CD4+)", "T (CD8+)", "B", "NK") colnames(p) <- cell.types # Take 1000 rows as calibration and 1000 as test -p.cal <- p[1:1000,] -p.test <- p[1001:2000,] +p.cal <- p[1:1000, ] +p.test <- p[1001:2000, ] # Randomly create the vector of real cell types for p.cal and p.test -y.cal <- sample(cell.types, 1000, replace=TRUE) -y.test <- sample(cell.types, 1000, replace=TRUE) +y.cal <- sample(cell.types, 1000, replace = TRUE) +y.test <- sample(cell.types, 1000, replace = TRUE) # Obtain conformal prediction sets -conf.sets <- getPredictionSets(x.query=p.test, - x.cal=p.cal, - y.cal=y.cal, - onto=NULL, - alpha=0.1, - follow.ontology=FALSE, - resample.cal=FALSE, - labels=cell.types, - return.sc=FALSE - ) - +conf.sets <- getPredictionSets( + x.query = p.test, + x.cal = p.cal, + y.cal = y.cal, + onto = NULL, + alpha = 0.1, + follow.ontology = FALSE, + resample.cal = FALSE, + labels = cell.types, + return.sc = FALSE +) } \references{ diff --git a/vignettes/vignette.Rmd b/vignettes/vignette.Rmd index 08f2ead..08bed3b 100644 --- a/vignettes/vignette.Rmd +++ b/vignettes/vignette.Rmd @@ -17,8 +17,8 @@ vignette: > ```{r, include = FALSE} knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" + collapse = TRUE, + comment = "#>" ) ``` @@ -34,10 +34,10 @@ library(scConform) # library(doParallel) # library(igraph) # library(scuttle) -# +# # `%notin%` = Negate(`%in%`) - -# num_cores <- 8 + +# num_cores <- 8 # cl1 <- makeCluster(num_cores) # registerDoParallel(cl1) ```