diff --git a/.github/workflows/check-standard.yaml b/.github/workflows/check-standard.yaml new file mode 100644 index 0000000..1d99ae6 --- /dev/null +++ b/.github/workflows/check-standard.yaml @@ -0,0 +1,58 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + workflow_dispatch: + pull_request: + +name: check-standard + +permissions: read-all + +jobs: + check-standard: + runs-on: ${{ matrix.config.os }} + + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + - {os: macos-latest, r: 'release'} + - {os: windows-latest, r: 'release'} + - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} + - {os: ubuntu-latest, r: 'release'} + - {os: ubuntu-latest, r: 'oldrel-1'} + + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + R_KEEP_PKG_SOURCE: yes + + steps: + - uses: actions/checkout@v4 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.config.r }} + http-user-agent: ${{ matrix.config.http-user-agent }} + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + dependencies: '"hard"' + extra-packages: | + any::rcmdcheck + any::testthat + any::knitr + any::rmarkdown + local::. + needs: check + + - uses: r-lib/actions/check-r-package@v2 + with: + upload-snapshots: true + args: 'c("--no-manual","--no-build-vignettes","--no-examples")' + build_args: 'c("--no-manual","--no-build-vignettes", "--no-resave-data")' + error-on: '"error"' \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index a30d424..2026a3a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: nichenetr Type: Package -Title: NicheNet: Modeling Intercellular Communication by Linking Ligands to Target Genes -Version: 2.1.0 +Title: Modeling Intercellular Communication by Linking Ligands to Target Genes with NicheNet +Version: 2.2.0 Authors@R: c(person("Robin", "Browaeys", role = c("aut")), person("Chananchida", "Sang-aram", role = c("aut", "cre"), email = "chananchida.sangaram@ugent.be")) Description: This package allows you the investigate intercellular communication from a computational perspective. More specifically, it allows to investigate how interacting cells influence each other's gene expression. Functionalities of this package (e.g. including predicting extracellular upstream regulators and their affected target genes) build upon a probabilistic model of ligand-target links that was inferred by data-integration. @@ -10,8 +10,9 @@ Encoding: UTF-8 LazyData: true URL: https://github.com/saeyslab/nichenetr BugReports: https://github.com/saeyslab/nichenetr/issues -RoxygenNote: 7.1.2 +RoxygenNote: 7.3.1 Depends: R (>= 3.0.0) +biocViews: Imports: tidyverse, data.table, @@ -27,7 +28,6 @@ Imports: fdrtool, ROCR, caTools, - limma, Hmisc, caret, randomForest, @@ -52,6 +52,7 @@ Suggests: rmarkdown, testthat, doMC, + limma, mco, parallel, covr, diff --git a/NAMESPACE b/NAMESPACE index 4808f39..3bc51b8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -185,7 +185,6 @@ importFrom(igraph,get.data.frame) importFrom(igraph,graph_from_adjacency_matrix) importFrom(igraph,page_rank) importFrom(igraph,sample_degseq) -importFrom(limma,wilcoxGST) importFrom(magrittr,set_colnames) importFrom(magrittr,set_rownames) importFrom(purrr,reduce) diff --git a/R/application_prediction.R b/R/application_prediction.R index 9a8ce15..76981a8 100644 --- a/R/application_prediction.R +++ b/R/application_prediction.R @@ -129,10 +129,8 @@ get_weighted_ligand_target_links = function(ligand, geneset,ligand_target_matrix targets = intersect(ligand_target_matrix[,ligand] %>% .[. >= top_n_score ] %>% names(),geneset) if (length(targets) == 0){ ligand_target_weighted_df = tibble(ligand = ligand, target = NA, weight = NA) - } else if (length(targets) == 1) { - ligand_target_weighted_df = tibble(ligand = ligand, target = targets, weight = ligand_target_matrix[targets,ligand]) } else { - ligand_target_weighted_df = tibble(ligand = ligand, target = names(ligand_target_matrix[targets,ligand])) %>% inner_join(tibble(target = names(ligand_target_matrix[targets,ligand]), weight = ligand_target_matrix[targets,ligand]), by = "target") + ligand_target_weighted_df = tibble(ligand = ligand, target = targets, weight = ligand_target_matrix[targets,ligand]) } return(ligand_target_weighted_df) } @@ -2189,9 +2187,9 @@ get_lfc_celltype = function(celltype_oi, seurat_obj, condition_colname, conditio SeuratV4 = c("avg_log2FC") %in% colnames(DE_table_sender) if(SeuratV4 == TRUE){ - DE_table_sender = DE_table_sender %>% as_tibble() %>% select(-p_val) %>% select(gene, avg_log2FC) + DE_table_sender = DE_table_sender %>% as_tibble() %>% select(gene, avg_log2FC) } else { - DE_table_sender = DE_table_sender %>% as_tibble() %>% select(-p_val) %>% select(gene, avg_logFC) + DE_table_sender = DE_table_sender %>% as_tibble() %>% select(gene, avg_logFC) } colnames(DE_table_sender) = c("gene",celltype_oi) diff --git a/R/application_visualization.R b/R/application_visualization.R index 9028bf4..cdc9e01 100644 --- a/R/application_visualization.R +++ b/R/application_visualization.R @@ -552,19 +552,28 @@ make_heatmap_bidir_lt_ggplot = function(matrix, y_name, x_name, y_axis = TRUE, x #' #' @description \code{make_mushroom_plot} Make a plot in which each glyph consists of two semicircles corresponding to ligand- and receptor- information. The size of the semicircle is the percentage of cells that express the protein, while the saturation corresponds to the scaled average expression value. #' -#' @param prioritization_table A prioritization table as generated by \code{\link{generate_prioritization_tables}} +#' @param prioritization_table A prioritization table as generated by \code{\link{generate_prioritization_tables}}. #' @param top_n An integer indicating how many ligand-receptor pairs to show #' @param show_rankings A logical indicating whether to show the ranking of the ligand-receptor pairs (default: FALSE) #' @param show_all_datapoints A logical indicating whether to show all ligand-receptor pairs (default: FALSE, if true they will be grayed out) -#' @param true_color_range A logical indicating whether to use the true color range for the ligand-receptor pairs (default: FALSE; range 0-1 is used) +#' @param true_color_range A logical indicating whether to use the default color range as determined by ggplot (TRUE, default) or set the limits to a range of 0-1 (FALSE) +#' @param use_absolute_rank A logical indicating to whether use the absolute prioritization rank to filter the top_n ligand-receptor pairs (default: FALSE) #' @param size A string indicating which column to use for the size of the semicircles (default: "scaled_avg_exprs"; use column name without "_ligand" or "_receptor" suffix) -#' @param color A string indicating which column to use for the color of the semicircles (default: "scaled_lfc"; use column name without "_ligand" or "_receptor" suffix) +#' @param color A string indicating which column to use for the color of the semicircles (default: "scaled_p_val_adapted"; use column name without "_ligand" or "_receptor" suffix) #' @param ligand_fill_colors A vector of the low and high colors to use for the ligand semicircle fill gradient (default: c("#DEEBF7", "#08306B")) #' @param receptor_fill_colors A vector of the low and high colors to use for the receptor semicircle fill gradient (default: c("#FEE0D2", "#A50F15")) #' @param unranked_ligand_fill_colors A vector of the low and high colors to use for the unranked ligands when show_all_datapoints is TRUE (default: c(alpha("#FFFFFF", alpha=0.2), alpha("#252525", alpha=0.2))) -#' @param unranked_receptor_fill_colors A vector of the low and high colors to use for the unkraed receptors when show_all_datapoints is TRUE (default: c(alpha("#FFFFFF", alpha=0.2), alpha("#252525", alpha=0.2))) +#' @param unranked_receptor_fill_colors A vector of the low and high colors to use for the unranked receptors when show_all_datapoints is TRUE (default: c(alpha("#FFFFFF", alpha=0.2), alpha("#252525", alpha=0.2))) #' @param ... Additional arguments passed to \code{\link{ggplot2::theme}}. As there are often issues with the scales legend, it is recommended to change legend sizes and positions using this argument, i.e., \code{legend.key.height}, \code{legend.key.width}, \code{legend.title}, and \code{legend.text}. #' +#' @details +#' If the values range of the column used as the "size" parameter is not between 0 and 1.001, an error will be thrown. +#' +#' The sender cell types can be ordered by encoding the "sender" column as a factor. If the "sender" column is not a factor, the sender cell types will be ordered alphabetically. +#' +#' By default, the top_n ligand-receptor pairs are shown despite their absolute ranking. So, if a receiver cell type has LR pairs that are only ranked from 31-40 and the top_n is set to 20, the LR pairs will be shown. If use_absolute_rank is set to TRUE, only LR pairs with absolute ranking from 1-20 will be shown. +#' +#' #' @return A ggplot object #' #' @import ggplot2 @@ -589,14 +598,18 @@ make_heatmap_bidir_lt_ggplot = function(matrix, y_name, x_name, y_axis = TRUE, x #' #' # Change the size and color columns #' make_mushroom_plot(prior_table, size = "pct_expressed", color = "scaled_avg_exprs") -#' } #' #' +#' # For a prioritization table with multiple receiver cell types +#' make_mushroom_plot(prior_table_combined %>% filter(receiver == celltype_oi)) +#'} +#' #' @export #' make_mushroom_plot <- function(prioritization_table, top_n = 30, show_rankings = FALSE, - show_all_datapoints = FALSE, true_color_range = FALSE, - size = "scaled_avg_exprs", color = "scaled_lfc", + show_all_datapoints = FALSE, true_color_range = TRUE, + use_absolute_rank = FALSE, + size = "scaled_avg_exprs", color = "scaled_p_val_adapted", ligand_fill_colors = c("#DEEBF7", "#08306B"), receptor_fill_colors = c("#FEE0D2", "#A50F15"), unranked_ligand_fill_colors = c(alpha("#FFFFFF", alpha=0.2), alpha("#252525", alpha=0.2)), @@ -617,6 +630,8 @@ make_mushroom_plot <- function(prioritization_table, top_n = 30, show_rankings = stop("show_all_datapoints should be a TRUE or FALSE") if(!is.logical(true_color_range) | length(true_color_range) != 1) stop("true_color_range should be a TRUE or FALSE") + if(!is.logical(use_absolute_rank) | length(use_absolute_rank) != 1) + stop("use_absolute_rank should be a TRUE or FALSE") if(!is.numeric(top_n) | length(top_n) != 1) stop("top_n should be a numeric vector of length 1") if(length(ligand_fill_colors) != 2) @@ -635,27 +650,53 @@ make_mushroom_plot <- function(prioritization_table, top_n = 30, show_rankings = requireNamespace("shadowtext") requireNamespace("cowplot") - # Filter to top_n, create a new column of ligand-receptor interactions - filtered_table <- prioritization_table %>% dplyr::mutate(prioritization_rank = rank(desc(prioritization_score))) %>% - dplyr::mutate(lr_interaction = paste(ligand, receptor, sep = " - ")) - order_interactions <- unique(filtered_table %>% filter(prioritization_rank <= top_n) %>% pull(lr_interaction)) + if (!"prioritization_rank" %in% colnames(prioritization_table)){ + prioritization_table <- prioritization_table %>% dplyr::mutate(prioritization_rank = rank(desc(prioritization_score))) + } + # Add 'relative rank' column which is basically 1:n + prioritization_table <- prioritization_table %>% dplyr::mutate(relative_rank = rank(desc(prioritization_score))) + + # If use_absolute_rank, use 'prioritization_rank' column to filter top_n + rank_filter_col <- ifelse(use_absolute_rank, "prioritization_rank", "relative_rank") + + # Create a new column of ligand-receptor interactions, and filter table to + # only include LR interactions that appear in the top_n + filtered_table <- prioritization_table %>% dplyr::mutate(lr_interaction = paste(ligand, receptor, sep = " - ")) + order_interactions <- unique(filtered_table %>% filter(.data[[rank_filter_col]] <= top_n) %>% pull(lr_interaction)) filtered_table <- filtered_table %>% filter(lr_interaction %in% order_interactions) %>% mutate(lr_interaction = factor(lr_interaction, levels = rev(order_interactions))) - celltypes_vec <- 1:length(unique(filtered_table$sender)) %>% setNames(sort(unique(filtered_table$sender))) + # Check if filtered_table is empty + if (nrow(filtered_table) == 0){ + stop("No ligand-receptor interactions found in the top_n. Please try use_absolute_rank = FALSE or increase top_n.") + } + + # Keep order of senders, if present (if not, sort alphabetically) + if (!is.factor(filtered_table$sender)){ + filtered_table$sender <- as.factor(filtered_table$sender) + } else { + # Drop levels that are not present in the filtered table + filtered_table$sender <- droplevels(filtered_table$sender) + } + lr_interaction_vec <- 1:length(order_interactions) %>% setNames(order_interactions) # Make each ligand and receptor into separate rows (to draw 1 semicircle per row) - filtered_table <- filtered_table %>% select(c("lr_interaction", all_of(cols_to_use), "prioritization_rank")) %>% + filtered_table <- filtered_table %>% select(c("lr_interaction", all_of(cols_to_use), "prioritization_rank", "relative_rank")) %>% pivot_longer(c(ligand, receptor), names_to = "type", values_to = "protein") %>% mutate(size = ifelse(type == "ligand", get(paste0(size, "_", size_ext[1])), get(paste0(size, "_", size_ext[2]))), color = ifelse(type == "ligand", get(paste0(color, "_", color_ext[1])), get(paste0(color, "_", color_ext[2])))) %>% select(-contains(c("_ligand", "_receptor", "_sender", "_receiver"))) %>% mutate(start = rep(c(-pi, 0), nrow(filtered_table))) %>% - mutate(x = celltypes_vec[sender], y = lr_interaction_vec[lr_interaction]) + mutate(x = as.numeric(sender), y = lr_interaction_vec[lr_interaction]) + + # Warning if size column is not scaled between 0 and 1.001 + if (any(filtered_table$size < 0) | any(filtered_table$size > 1.001)){ + stop("Size column is not scaled between 0 and 1. Please use this column as the color instead.") + } # Rename size and color columns to be more human-readable - keywords_adj <- c("LFC", "p-val", "product", "mean", "adjusted", "expression") %>% setNames(c("lfc", "pval", "prod", "avg", "adj", "exprs")) + keywords_adj <- c("LFC", "pval", "", "product", "mean", "adjusted", "expression") %>% setNames(c("lfc", "p", "val", "prod", "avg", "adj", "exprs")) size_title <- sapply(stringr::str_split(size, "_")[[1]], function(k) ifelse(is.na(keywords_adj[k]), k, keywords_adj[k])) %>% paste0(., collapse = " ") %>% stringr::str_replace("^\\w{1}", toupper) color_title <- sapply(stringr::str_split(color, "_")[[1]], function(k) ifelse(is.na(keywords_adj[k]), k, keywords_adj[k])) %>% @@ -666,7 +707,7 @@ make_mushroom_plot <- function(prioritization_table, top_n = 30, show_rankings = scale <- 0.5 - ncelltypes <- length(celltypes_vec) + ncelltypes <- length(unique(filtered_table$sender)) n_interactions <- length(lr_interaction_vec) legend2_df <- data.frame(values = c(0.25, 0.5, 0.75, 1), x=(ncelltypes+2.5):(ncelltypes+5.5), y=rep(floor(n_interactions/3), 4), start=-pi) axis_rect <- data.frame(xmin=0, xmax=ncelltypes+1, ymin=0, ymax=n_interactions+1) @@ -699,7 +740,7 @@ make_mushroom_plot <- function(prioritization_table, top_n = 30, show_rankings = p1 <- ggplot() + # Draw ligand semicircle - geom_arc_bar(data = filtered_table %>% filter(type=="ligand", prioritization_rank <= top_n), + geom_arc_bar(data = filtered_table %>% filter(type=="ligand", .data[[rank_filter_col]] <= top_n), aes(x0 = x, y0 = y, r0 = 0, r = sqrt(size)*scale, start = start, end = start + pi, fill=color), color = "white") + @@ -707,10 +748,10 @@ make_mushroom_plot <- function(prioritization_table, top_n = 30, show_rankings = limits=color_lims, oob=scales::squish, n.breaks = 3, guide = guide_colorbar(order = 1), - name=paste0(color_title, " (", color_ext[1], ")") %>% str_wrap(width=15)) + + name=paste0(color_title, " (", color_ext[1], ")") %>% stringr::str_wrap(width=15)) + # Create new fill scale for receptor semicircles new_scale_fill() + - geom_arc_bar(data = filtered_table %>% filter(type=="receptor", prioritization_rank <= top_n), + geom_arc_bar(data = filtered_table %>% filter(type=="receptor", .data[[rank_filter_col]] <= top_n), aes(x0 = x, y0 = y, r0 = 0, r = sqrt(size)*scale, start = start, end = start + pi, fill=color), color = "white") + @@ -719,7 +760,7 @@ make_mushroom_plot <- function(prioritization_table, top_n = 30, show_rankings = geom_rect(data = legend2_df, aes(xmin=x-0.5, xmax=x+0.5, ymin=y-0.5, ymax=y+0.5), color="gray90", fill=NA) + geom_text(data = legend2_df, aes(label=values, x=x, y=y-0.6), vjust=1, size = scale_legend_text_size) + geom_text(data = data.frame(x = (ncelltypes+4), y = floor(n_interactions/3)+1, - label = size_title %>% str_wrap(width=15)), + label = size_title %>% stringr::str_wrap(width=15)), aes(x=x, y=y, label=label), size = scale_legend_title_size, vjust=0, lineheight = .75) + # Panel grid geom_line(data = panel_grid_y, aes(x=x, y=y, group=group), color = "gray90") + @@ -728,10 +769,10 @@ make_mushroom_plot <- function(prioritization_table, top_n = 30, show_rankings = geom_rect(data = axis_rect, aes(xmin = xmin, ymin = ymin, xmax = xmax, ymax = ymax), color = "black", fill = "transparent") + # Other plot information scale_fill_gradient(low = receptor_fill_colors[1], high=receptor_fill_colors[2] , limits=color_lims, oob=scales::squish, n.breaks = 3, - name=paste0(color_title, " (", color_ext[2], ")") %>% str_wrap(width=15), + name=paste0(color_title, " (", color_ext[2], ")") %>% stringr::str_wrap(width=15), guide = guide_colorbar(order = 2)) + scale_y_continuous(breaks=n_interactions:1, labels=names(lr_interaction_vec), expand = expansion(add=c(0,0))) + - scale_x_continuous(breaks=1:ncelltypes, labels=names(celltypes_vec), position="top", expand = expansion(add=c(0,0))) + + scale_x_continuous(breaks=1:ncelltypes, labels=levels(filtered_table$sender), position="top", expand = expansion(add=c(0,0))) + xlab("Sender cell types") + ylab("Ligand-receptor interaction") + coord_fixed() + do.call(theme, theme_args) @@ -743,14 +784,14 @@ make_mushroom_plot <- function(prioritization_table, top_n = 30, show_rankings = unranked_ligand_lims <- c(0,1); unranked_receptor_lims <- c(0,1) if (true_color_range){ # Follow limits of the top_n lr pairs - unranked_ligand_lims <- filtered_table %>% filter(type=="ligand", prioritization_rank <= top_n) %>% + unranked_ligand_lims <- filtered_table %>% filter(type=="ligand", .data[[rank_filter_col]] <= top_n) %>% select(color) %>% range - unranked_receptor_lims <- filtered_table %>% filter(type=="receptor", prioritization_rank <= top_n) %>% + unranked_receptor_lims <- filtered_table %>% filter(type=="receptor", .data[[rank_filter_col]] <= top_n) %>% select(color) %>% range } p1 <- p1 + new_scale_fill() + - geom_arc_bar(data = filtered_table %>% filter(type=="ligand", prioritization_rank > top_n), + geom_arc_bar(data = filtered_table %>% filter(type=="ligand", .data[[rank_filter_col]] > top_n), aes(x0 = x, y0 = y, r0 = 0, r = sqrt(size)*scale, start = start, end = start + pi, fill=color), color = "white") + @@ -758,7 +799,7 @@ make_mushroom_plot <- function(prioritization_table, top_n = 30, show_rankings = limits=unranked_ligand_lims, oob = scales::oob_squish, guide = "none") + new_scale_fill() + - geom_arc_bar(data = filtered_table %>% filter(type=="receptor", prioritization_rank > top_n), + geom_arc_bar(data = filtered_table %>% filter(type=="receptor", .data[[rank_filter_col]] > top_n), aes(x0 = x, y0 = y, r0 = 0, r = sqrt(size)*scale, start = start, end = start + pi, fill=color), color = "white") + @@ -769,7 +810,7 @@ make_mushroom_plot <- function(prioritization_table, top_n = 30, show_rankings = # Add ranking numbers if requested if (show_rankings){ - p1 <- p1 + geom_shadowtext(data = filtered_table %>% filter(prioritization_rank <= top_n), + p1 <- p1 + geom_shadowtext(data = filtered_table %>% filter(.data[[rank_filter_col]] <= top_n), aes(x=x, y=y, label=prioritization_rank)) } @@ -779,26 +820,26 @@ make_mushroom_plot <- function(prioritization_table, top_n = 30, show_rankings = ## Circos plot functions #' @title Assign ligands to cell types -#' @usage assign_ligands_to_celltype(seuratObj, ligands, celltype_col, func.agg=mean, func.assign=function(x) mean(x) + sd(x), slot="data", condition_oi=NULL, condition_col=NULL) #' @description Assign ligands to a sender cell type, based on the strongest expressing cell type of that ligand. Ligands are only assigned to a cell type if that cell type is the only one to show an expression that is higher than the average + SD. Otherwise, it is assigned to "General". #' @param seuratObj Seurat object #' @param ligands Vector of ligands to assign to cell types #' @param celltype_col Metadata column name in the Seurat object that contains the cell type information #' @param func.agg Function to use to aggregate the expression of a ligand across all cells in a cell type (default = mean) #' @param func.assign Function to use to assign a ligand to a cell type (default = mean + SD) -#' @param slot Slot in the Seurat object to use (default = "data"). If "data", the normalized counts are first exponentiated before aggregation is performed #' @param condition_oi Condition of interest to subset the Seurat object (default = NULL) #' @param condition_col Metadata column name in the Seurat object that contains the condition of interest (default = NULL) +#' @param ... Arguments passed to Seurat::GetAssayData, e.g., for the slot/layer to use (default: data) #' @return A data frame of two columns, the cell type the ligand has been assigned to (\code{ligand_type}) and the ligand name (\code{ligand}) +#' @details If the provided slot/layer is "data", the normalized counts are first exponentiated before aggregation is performed #' @export #' @examples \dontrun{ #' assign_ligands_to_celltype(seuratObj = seuratObj, ligands = best_upstream_ligands[1:20], #' celltype_col = "celltype", func.agg = mean, func.assign = function(x) {mean(x)+sd(x)}, -#' slot = "data", condition_oi = "LCMV", condition_col = "aggregate") +#' condition_oi = "LCMV", condition_col = "aggregate", slot = "data") #' } #' assign_ligands_to_celltype <- function(seuratObj, ligands, celltype_col, func.agg = mean, func.assign = function(x) {mean(x)+sd(x)}, - condition_oi = NULL, condition_col = NULL, slot = "data") { + condition_oi = NULL, condition_col = NULL, ...) { # Check that if condition_oi is given, then so is condition_oi, and vice versa if (any(!is.na(condition_col), !is.na(condition_oi)) & !all(!is.na(condition_col), !is.na(condition_oi))){ stop("Please input both condition_colname and condition_oi") @@ -809,26 +850,34 @@ assign_ligands_to_celltype <- function(seuratObj, ligands, celltype_col, func.ag stop("Not all ligands are in the Seurat object") } + slot <- "data" + # Check if slot or layer is provided + if (length(list(...)) > 0) { + if (any(grepl("slot|layer", names(list(...))))){ + slot <- list(...)[[which(grepl("slot|layer", names(list(...))))]] + } else { + warning("No slot/layer provided even though extra argument was provided, using default slot = 'data'") + } + } + seuratObj_subset <- subset(seuratObj, features = ligands) # Calculate average ligand expression in sender cells if (!is.null(condition_oi)){ seuratObj_subset <- seuratObj_subset[, seuratObj_subset[[condition_col]] == condition_oi ] } - - avg_expression_ligands <- lapply(unique(seuratObj_subset$celltype), function (celltype) { + avg_expression_ligands <- lapply(unique(seuratObj_subset[[celltype_col, drop=TRUE]]), function (celltype) { if (slot == "data"){ # Exponentiate-1 and calculate in non-log space - expm1(GetAssayData(seuratObj_subset[, seuratObj_subset[[celltype_col]] == celltype], slot = slot)) %>% + expm1(GetAssayData(seuratObj_subset[, seuratObj_subset[[celltype_col]] == celltype], ...)) %>% apply(1, func.agg) } else { - apply(GetAssayData(seuratObj_subset[, seuratObj_subset[[celltype_col]] == celltype], slot = slot), 1, func.agg) + apply(GetAssayData(seuratObj_subset[, seuratObj_subset[[celltype_col]] == celltype], ...), 1, func.agg) } - }) %>% setNames(unique(seuratObj_subset$celltype)) %>% - do.call(cbind, .) %>% - set_rownames(ligands) + }) %>% setNames(unique(seuratObj_subset[[celltype_col, drop=TRUE]])) %>% + do.call(cbind, .) sender_ligand_assignment <- avg_expression_ligands %>% apply(1, function(ligand_expression){ ligand_expression > func.assign(ligand_expression) @@ -841,11 +890,15 @@ assign_ligands_to_celltype <- function(seuratObj, ligands, celltype_col, func.ag ligand_type_indication_df <- lapply(names(sender_ligand_assignment), function(sender) { unique_ligands_sender <- names(sender_ligand_assignment[[sender]]) %>% setdiff(general_ligands) - data.frame(ligand_type = sender, ligand = unique_ligands_sender) + + if (length(unique_ligands_sender) > 0) { + return(data.frame(ligand_type = sender, ligand = unique_ligands_sender)) + } + }) %>% bind_rows() ligand_type_indication_df <- bind_rows(ligand_type_indication_df, - data.frame(ligand_type = "General", ligand = general_ligands)) + data.frame(ligand = general_ligands) %>% mutate(ligand_type = "General")) return(ligand_type_indication_df) } @@ -1223,7 +1276,8 @@ make_line_plot <- function(ligand_activities, potential_ligands, ranking_range = # Use equation of a line to find the x value at the cutoff # Different lines for the top and bottom cutoff mutate(m = (y2-y1)/(x2-x1), x0 = case_when(y2 > by_n ~ ((cutoff-y1)/m)+x1, - y2 <= by_n ~ (((n_ligands*margin)-y1)/m)+x1)) + y2 <= by_n ~ (((n_ligands*margin)-y1)/m)+x1)) %>% + filter(m != 0) # Highlight ties ties_df <- rankings_df %>% group_by(type, rank) %>% @@ -1235,12 +1289,18 @@ make_line_plot <- function(ligand_activities, potential_ligands, ranking_range = mutate(xstart = case_when(unique(group$type) == "agnostic" ~ agnostic_x, unique(group$type) == "focused" ~ focused_x), xend = xstart) - }) %>% bind_rows() %>% + }) %>% bind_rows() + + if (nrow(ties_df) > 0){ # Clip the lines to the cutoff - filter(ystart < cutoff, yend > by_n) %>% mutate(yend = case_when(yend > cutoff ~ cutoff, - TRUE ~ yend), - ystart = case_when(ystart <= (n_ligands*margin) ~ (n_ligands*margin)-(by_n*0.5), - TRUE ~ ystart)) + ties_df <- ties_df %>% filter(ystart < cutoff, yend > by_n) %>% + mutate(yend = case_when(yend > cutoff ~ cutoff, + TRUE ~ yend), + ystart = case_when(ystart <= (n_ligands*margin) ~ (n_ligands*margin)-(by_n*0.5), + TRUE ~ ystart)) + } else { + ties_df <- data.frame(ystart = numeric(), yend = numeric(), xstart = numeric(), xend = numeric()) + } # Subset the dataframe to the range of interest rankings_df_subset <- rankings_df %>% filter(new_rank <= end_n, new_rank >= start_n) diff --git a/R/characterization_data_sources.R b/R/characterization_data_sources.R index e4f2638..0537cb9 100644 --- a/R/characterization_data_sources.R +++ b/R/characterization_data_sources.R @@ -353,10 +353,15 @@ evaluate_model = function(parameters_setting, lr_network, sig_network, gr_networ ligand_importances$spearman[is.na(ligand_importances$spearman)] = 0 ligand_importances$pearson_log_pval[is.na(ligand_importances$pearson_log_pval)] = 0 ligand_importances$spearman_log_pval[is.na(ligand_importances$spearman_log_pval)] = 0 - ligand_importances$mean_rank_GST_log_pval[is.na(ligand_importances$mean_rank_GST_log_pval)] = 0 ligand_importances$pearson_log_pval[is.infinite(ligand_importances$pearson_log_pval)] = 10000 ligand_importances$spearman_log_pval[is.infinite(ligand_importances$spearman_log_pval)] = 10000 - ligand_importances$mean_rank_GST_log_pval[is.infinite(ligand_importances$mean_rank_GST_log_pval)] = 10000 + + if ("mean_rank_GST_log_pval" %in% colnames(ligand_importances)){ + ligand_importances$mean_rank_GST_log_pval[is.na(ligand_importances$mean_rank_GST_log_pval)] = 0 + ligand_importances$mean_rank_GST_log_pval[is.infinite(ligand_importances$mean_rank_GST_log_pval)] = 10000 + } else{ + warning("mean_rank_GST_log_pval not in ligand_importances; do you have limma installed?") + } all_importances = ligand_importances %>% select_if(.predicate = function(x){sum(is.na(x)) == 0}) # all_importances = full_join(ligand_importances, ligand_importances_glm, by = c("setting","test_ligand","ligand")) %>% full_join(ligand_importances_discrete, by = c("setting","test_ligand", "ligand")) diff --git a/R/evaluate_model_ligand_prediction.R b/R/evaluate_model_ligand_prediction.R index ad8e14e..120f810 100644 --- a/R/evaluate_model_ligand_prediction.R +++ b/R/evaluate_model_ligand_prediction.R @@ -1,828 +1,826 @@ -#' @title Convert settings to correct settings format for ligand prediction. -#' -#' @description \code{convert_settings_ligand_prediction} Converts settings to correct settings format for ligand activity prediction. In this prediction problem, ligands (out of a set of possibly active ligands) will be ranked based on feature importance scores. The format can be made suited for: 1) validation of ligand activity state prediction by calculating individual feature importane scores or 2) feature importance based on models with embedded feature importance determination; applications in which ligands need to be scores based on their possible upstream activity: 3) by calculating individual feature importane scores or 4) feature importance based on models with embedded feature importance determination. -#' -#' @usage -#' convert_settings_ligand_prediction(settings, all_ligands, validation = TRUE, single = TRUE) -#' -#' @param settings A list of lists. Eeach sublist contains the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. -#' @param all_ligands A character vector of possible ligands that will be considered for the ligand activity state prediction. -#' @param validation TRUE if seetings need to be prepared for validation of ligand activity state predictions (this implies that the true active ligand of a setting is known); FALSE for application purposes when the true active ligand(s) is/are not known. -#' @param single TRUE if feature importance scores for ligands will be calculated by looking at ligans individually. FALSE if the goal is to calculate the feature importance scores via sophisticated classification algorithms like random forest. - -#' @return A list with following elements: $name, $ligand: name of active ligand(s) (only if validation is TRUE), $from (ligand(s) that will be tested for activity prediction), $response -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation,convert_expression_settings_evaluation) -#' ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, ligands, validation = TRUE, single = TRUE) -#' } -#' @export -#' -#' -convert_settings_ligand_prediction = function(settings,all_ligands,validation = TRUE, single = TRUE){ - - # input check - if(!is.list(settings)) - stop("settings should be a list") - if(!is.character(all_ligands)) - stop("all_ligands should be a character vector") - if(!is.logical(validation) | length(validation) != 1) - stop("validation should be TRUE or FALSE") - if(!is.logical(single) | length(single) != 1) - stop("single should be TRUE or FALSE") - - requireNamespace("dplyr") - - new_settings = list() - if (validation == TRUE && single == TRUE){ - for (i in 1:length(settings)){ - setting = settings[[i]] - for (k in 1:length(all_ligands)){ - test_ligand = all_ligands[[k]] - new_settings[[length(new_settings) + 1]] = list(make_new_setting_ligand_prediction_single_validation(setting,test_ligand)) - } - } - } else if (validation == TRUE && single == FALSE){ - for (i in 1:length(settings)){ - setting = settings[[i]] - new_settings[[length(new_settings) + 1]] = list(make_new_setting_ligand_prediction_multi_validation(setting,all_ligands)) - } - } else if (validation == FALSE && single == TRUE){ - for (i in 1:length(settings)){ - setting = settings[[i]] - for (k in 1:length(all_ligands)){ - test_ligand = all_ligands[[k]] - new_settings[[length(new_settings) + 1]] = list(make_new_setting_ligand_prediction_single_application(setting,test_ligand)) - } - } - } else if (validation == FALSE && single == FALSE){ - for (i in 1:length(settings)){ - setting = settings[[i]] - new_settings[[length(new_settings) + 1]] = list(make_new_setting_ligand_prediction_multi_application(setting,all_ligands)) - } - } - return(new_settings %>% unlist(recursive = FALSE)) -} -#' @title Get ligand importances based on target gene prediction performance of single ligands. -#' -#' @description \code{get_single_ligand_importances} Get ligand importance measures for ligands based on how well a single, individual, ligand can predict an observed response. Assess how well every ligand of interest is able to predict the observed transcriptional response in a particular dataset, according to the ligand-target model. It can be assumed that the ligand that best predicts the observed response, is more likely to be the true ligand. -#' -#' @usage -#' get_single_ligand_importances(setting,ligand_target_matrix, ligands_position = "cols", known = TRUE) -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) of which the predictve performance need to be assessed; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. $ligand: NULL or the name of the ligand(s) that are known to be active in the setting of interest. -#' @param known Indicate whether the true active ligand for a particular dataset is known or not. Default: TRUE. The true ligand will be extracted from the $ligand slot of the setting. -#' @inheritParams evaluate_target_prediction -#' -#' @return A data.frame with for each ligand - data set combination, classification evaluation metrics indicating how well the query ligand predicts the response in the particular dataset. Evaluation metrics are the same as in \code{\link{evaluate_target_prediction}}. In addition to the metrics, the name of the particular setting ($setting), the name of the query ligand($test_ligand), the name of the true active ligand (if known: $ligand). -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) -#' -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix)) -#' print(head(ligand_importances)) -#' } -#' @export -#' -get_single_ligand_importances = function(setting,ligand_target_matrix, ligands_position = "cols", known = TRUE){ - - if(!is.logical(known) | length(known) > 1) - stop("known should be a logical vector: TRUE or FALSE") - - requireNamespace("dplyr") - - metrics = evaluate_target_prediction(setting, ligand_target_matrix, ligands_position) - metrics = metrics %>% rename(test_ligand = ligand) - if (known == TRUE){ - true_ligand = setting$ligand - metrics_meta = metrics %>% select(setting,test_ligand) %>% bind_cols(tibble(ligand = true_ligand)) - metrics = inner_join(metrics_meta, metrics, by = c("setting","test_ligand")) - } - return(metrics) -} -#' @title Get ligand importances from a multi-ligand classfication model. -#' -#' @description \code{get_multi_ligand_importances} A classificiation algorithm chosen by the user is trained to construct one model based on the target gene predictions of all ligands of interest (ligands are considered as features) in order to predict the observed response in a particular dataset. Variable importance scores that indicate for each ligand the importance for response prediction, are extracted. It can be assumed that ligands with higher variable importance scores are more likely to be a true active ligand. -#' -#' @usage -#' get_multi_ligand_importances(setting,ligand_target_matrix, ligands_position = "cols", algorithm, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE, continuous = TRUE, known = TRUE, filter_genes = FALSE) -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) of which the predictve performance need to be assessed; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. $ligand: NULL or the name of the ligand(s) that are known to be active in the setting of interest. -#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (recommended) or discrete target assignments (not-recommended). -#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" -#' @param algorithm The name of the classification algorithm to be applied. Should be supported by the caret package. Examples of algorithms we recommend: with embedded feature selection: "rf","glm","fda","glmnet","sdwd","gam","glmboost"; without: "lda","naive_bayes","pls"(because bug in current version of pls package), "pcaNNet". Please notice that not all these algorithms work when the features (i.e. ligand vectors) are categorical (i.e. discrete class assignments). -#' @param cv Indicate whether model training and hyperparameter optimization should be done via cross-validation. Default: TRUE. FALSE might be useful for applications only requiring variable importance, or when final model is not expected to be extremely overfit. -#' @param cv_number The number of folds for the cross-validation scheme: Default: 4; only relevant when cv == TRUE. -#' @param cv_repeats The number of repeats during cross-validation. Default: 2; only relevant when cv == TRUE. -#' @param parallel Indiciate whether the model training will occur parallelized. Default: FALSE. TRUE only possible for non-windows OS. -#' @param n_cores The number of cores used for parallelized model training via cross-validation. Default: 4. Only relevant on non-windows OS. -#' @param ignore_errors Indiciate whether errors during model training by caret should be ignored such that another model training try will be initiated until model is trained without raising errors. Default: FALSE. -#' @param continuous Indicate whether during training of the model, model training and evaluation should be done on class probabilities or discrete class labels. For huge class imbalance, we recommend setting this value to TRUE. Default: TRUE. -#' @param known Indicate whether the true active ligand for a particular dataset is known or not. Default: TRUE. The true ligand will be extracted from the $ligand slot of the setting. -#' @param filter_genes Indicate whether 50 per cent of the genes that are the least variable in ligand-target scores should be removed in order to reduce the training of the model. Default: FALSE. -#' -#' @return A data.frame with for each ligand - data set combination, feature importance scores indicating how important the query ligand is for the prediction of the response in the particular dataset, when prediction is done via a trained classification model with all possible ligands as input. In addition to the importance score(s), the name of the particular setting ($setting), the name of the query ligand($test_ligand), the name of the true active ligand (if known: $ligand). -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = FALSE) -#' -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances_glm = dplyr::bind_rows(lapply(settings_ligand_pred, get_multi_ligand_importances,ligand_target_matrix, algorithm = "glm")) -#' print(head(ligand_importances_glm)) -#' } -#' @export -#' -get_multi_ligand_importances = function(setting,ligand_target_matrix, ligands_position = "cols", algorithm, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE, continuous = TRUE, known = TRUE, filter_genes = FALSE){ - - if(!is.logical(known) | length(known) > 1) - stop("known should be a logical vector: TRUE or FALSE") - if(!is.logical(filter_genes) | length(filter_genes) > 1) - stop("filter_genes should be a logical vector: TRUE or FALSE") - - requireNamespace("dplyr") - - if (filter_genes == TRUE){ - ligand_target_matrix = filter_genes_ligand_target_matrix(ligand_target_matrix,ligands_position) - } - - setting_name = setting$name - output = evaluate_multi_ligand_target_prediction(setting, ligand_target_matrix, ligands_position,algorithm, var_imps = TRUE, cv, cv_number, cv_repeats, parallel, n_cores, ignore_errors, continuous) - metrics = output$var_imps - metrics = metrics %>% mutate(setting = setting_name) %>% rename(test_ligand = feature) - - if (known == TRUE){ - true_ligand = setting$ligand - metrics = metrics %>% mutate(ligand = true_ligand) - metrics = metrics %>% select(setting, test_ligand, ligand, importance) - return(metrics) - } - metrics = metrics %>% select(setting, test_ligand, importance) - return(metrics) - -} -#' @title Evaluation of ligand activity prediction based on ligand importance scores. -#' -#' @description \code{evaluate_importances_ligand_prediction} Evaluate how well a trained model of ligand importance scores is able to predict the true activity state of a ligand. For this it is assumed, that ligand importance measures for truely active ligands will be higher than for non-active ligands. A classificiation algorithm chosen by the user is trained to construct one model based on the ligand importance scores of all ligands of interest (ligands importance scores are considered as features). Several classification evaluation metrics for the prediction are calculated and variable importance scores can be extracted to rank the different importance measures in order of importance for ligand activity state prediction. -#' -#' @usage -#' evaluate_importances_ligand_prediction(importances, normalization, algorithm, var_imps = TRUE, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4,ignore_errors = FALSE) -#' -#' @param importances A data frame containing at least folowing variables: $setting, $test_ligand, $ligand and one or more feature importance scores. $test_ligand denotes the name of a possibly active ligand, $ligand the name of the truely active ligand. -#' @param normalization Way of normalization of the importance measures: "mean" (classifcal z-score) or "median" (modified z-score) -#' @param algorithm The name of the classification algorithm to be applied. Should be supported by the caret package. Examples of algorithms we recommend: with embedded feature selection: "rf","glm","fda","glmnet","sdwd","gam","glmboost"; without: "lda","naive_bayes","pls"(because bug in current version of pls package), "pcaNNet". Please notice that not all these algorithms work when the features (i.e. ligand vectors) are categorical (i.e. discrete class assignments). -#' @param var_imps Indicate whether in addition to classification evaluation performances, variable importances should be calculated. Default: TRUE. -#' @param cv Indicate whether model training and hyperparameter optimization should be done via cross-validation. Default: TRUE. FALSE might be useful for applications only requiring variable importance, or when final model is not expected to be extremely overfit. -#' @param cv_number The number of folds for the cross-validation scheme: Default: 4; only relevant when cv == TRUE. -#' @param cv_repeats The number of repeats during cross-validation. Default: 2; only relevant when cv == TRUE. -#' @param parallel Indiciate whether the model training will occur parallelized. Default: FALSE. TRUE only possible for non-windows OS. -#' @param n_cores The number of cores used for parallelized model training via cross-validation. Default: 4. Only relevant on non-windows OS. -#' @param ignore_errors Indiciate whether errors during model training by caret should be ignored such that another model training try will be initiated until model is trained without raising errors. Default: FALSE. -#' -#' @return A list with the following elements. $performances: data frame containing classification evaluation measure for classification on the test folds during training via cross-validation; $performances_training: data frame containing classification evaluation measures for classification of the final model (discrete class assignments) on the complete data set (performance can be severly optimistic due to overfitting!); $performance_training_continuous: data frame containing classification evaluation measures for classification of the final model (class probability scores) on the complete data set (performance can be severly optimistic due to overfitting!) $var_imps: data frame containing the variable importances of the different ligands (embbed importance score for some classification algorithms, otherwise just the auroc); $prediction_response_df: data frame containing for each ligand-setting combination the ligand importance scores for the individual importance scores, the complete model of importance scores and the ligand activity as well (TRUE or FALSE); $model: the caret model object that can be used on new importance scores to predict the ligand activity state. -#' -#' @importFrom ROCR prediction performance -#' @importFrom caTools trapz -#' @importFrom limma wilcoxGST -#' @import caret -#' @importFrom purrr safely -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) -#' -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix)) -#' evaluation = evaluate_importances_ligand_prediction(ligand_importances,"median","lda") -#' print(head(evaluation)) -#' } -#' @export -#' -evaluate_importances_ligand_prediction = function(importances, normalization, algorithm, var_imps = TRUE, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE){ - if (!is.data.frame(importances)) - stop("importances must be a data frame") - if(!is.character(importances$setting) | !is.character(importances$test_ligand) | !is.character(importances$ligand)) - stop("importances$setting, importances$test_ligand and importances$ligand should be character vectors") - if(normalization != "mean" & normalization != "median") - stop("normalization should be 'mean' or 'median'") - if(!is.character(algorithm)) - stop("algorithm should be a character vector") - if(!is.logical(var_imps) | length(var_imps) > 1) - stop("var_imps should be a logical vector: TRUE or FALSE") - if(!is.logical(cv) | length(cv) > 1) - stop("cv should be a logical vector: TRUE or FALSE") - if(!is.numeric(cv_number) | length(cv_number) > 1) - stop("cv_number should be a numeric vector of length 1") - if(!is.numeric(cv_repeats) | length(cv_repeats) > 1) - stop("cv_repeats should be a numeric vector of length 1") - if(!is.logical(parallel) | length(parallel) > 1) - stop("parallel should be a logical vector: TRUE or FALSE") - if(!is.numeric(n_cores) | length(n_cores) > 1) - stop("n_cores should be a numeric vector of length 1") - if(!is.logical(ignore_errors) | length(ignore_errors) > 1) - stop("ignore_errors should be a logical vector: TRUE or FALSE") - - requireNamespace("dplyr") - -# importances = importances %>% tidyr::drop_na() - added = is_ligand_active(importances) - importances = importances %>% mutate(class = added) - - if (normalization == "mean"){ - normalized_importances = importances %>% group_by(setting) %>% dplyr::select(-ligand,-test_ligand,-class) %>% mutate_all(funs(scaling_zscore)) %>% ungroup() %>% select(-setting) - } else if (normalization == "median"){ - normalized_importances = importances %>% group_by(setting) %>% dplyr::select(-ligand,-test_ligand,-class) %>% mutate_all(funs(scaling_modified_zscore)) %>% ungroup() %>% select(-setting) - } - - response_vector = importances$class %>% make.names() %>% as.factor() - train_data = normalized_importances %>% mutate(obs = response_vector) %>% data.frame() - - output = wrapper_caret_classification(train_data,algorithm,TRUE,var_imps,cv,cv_number,cv_repeats,parallel,n_cores,prediction_response_df = bind_cols(importances %>% select(setting,ligand,test_ligand,class), normalized_importances),ignore_errors,return_model = TRUE) - return(output) -} -#' @title Evaluation of ligand activity prediction performance of single ligand importance scores: aggregate all datasets. -#' -#' @description \code{evaluate_single_importances_ligand_prediction} Evaluate how well a single ligand importance score is able to predict the true activity state of a ligand. For this it is assumed, that ligand importance measures for truely active ligands will be higher than for non-active ligands. Several classification evaluation metrics for the prediction are calculated and variable importance scores can be extracted to rank the different importance measures in order of importance for ligand activity state prediction. -#' -#' @usage -#' evaluate_single_importances_ligand_prediction(importances,normalization) -#' -#' @param importances A data frame containing at least folowing variables: $setting, $test_ligand, $ligand and one or more feature importance scores. $test_ligand denotes the name of a possibly active ligand, $ligand the name of the truely active ligand. -#' @param normalization Way of normalization of the importance measures: "mean" (classifcal z-score) or "median" (modified z-score) or "no" (use unnormalized feature importance scores - only recommended when evaluating ligand activity prediction on individual datasets) -#' -#' @return A data frame containing classification evaluation measures for the ligand activity state prediction single, individual feature importance measures. -#' -#' @importFrom ROCR prediction performance -#' @importFrom caTools trapz -#' @importFrom limma wilcoxGST -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) -#' -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix)) -#' evaluation = evaluate_single_importances_ligand_prediction(ligand_importances,normalization = "median") -#' print(head(evaluation)) -#' } -#' @export -#' -evaluate_single_importances_ligand_prediction = function(importances,normalization){ - if (!is.data.frame(importances)) - stop("importances must be a data frame") - if(!is.character(importances$setting) | !is.character(importances$test_ligand) | !is.character(importances$ligand)) - stop("importances$setting, importances$test_ligand and importances$ligand should be character vectors") - if(normalization != "mean" & normalization != "median" & normalization != "no") - stop("normalization should be 'mean' or 'median' or 'no'") - - requireNamespace("dplyr") - importances0 = importances %>% select(-setting,-ligand,-test_ligand) -# importances = importances %>% tidyr::drop_na() - added = is_ligand_active(importances) - - if (nrow(importances) == 0){ - performances = lapply(importances, classification_evaluation_continuous_pred, added, iregulon = FALSE) - output = tibble(importance_measure = names(performances)) - performances = bind_rows(performances) - return(bind_cols(output,performances)) - } - - importances = importances %>% select_if(.predicate = function(x) { - sum(is.na(x)) == 0 - }) - - if (normalization == "mean"){ - normalized_importances = importances %>% group_by(setting) %>% dplyr::select(-ligand,-test_ligand) %>% mutate_all(funs(scaling_zscore)) %>% ungroup() %>% select(-setting) - } else if (normalization == "median"){ - normalized_importances = importances %>% group_by(setting) %>% dplyr::select(-ligand,-test_ligand) %>% mutate_all(funs(scaling_modified_zscore)) %>% ungroup() %>% select(-setting) - } else if (normalization == "no") { - normalized_importances = importances %>% select(-c(setting,test_ligand,ligand)) - } - - performances = lapply(normalized_importances, classification_evaluation_continuous_pred, added, iregulon = FALSE) - output = tibble(importance_measure = names(performances)) - performances = bind_rows(performances) - return(bind_cols(output,performances)) -} -#' @title Prediction of ligand activity prediction by a model trained on ligand importance scores. -#' -#' @description \code{model_based_ligand_activity_prediction} Predict the activity state of a ligand based on a classification model that was trained to predict ligand activity state based on ligand importance scores. -#' -#' @usage -#' model_based_ligand_activity_prediction(importances, model, normalization) -#' -#' @param model A model object of a classification object as e.g. generated via caret. -#' @param importances A data frame containing at least folowing variables: $setting, $test_ligand, $ligand and one or more feature importance scores. $test_ligand denotes the name of a possibly active ligand, $ligand the name of the truely active ligand. -#' @param normalization Way of normalization of the importance measures: "mean" (classifcal z-score) or "median" (modified z-score) -#' -#' @return A data frame containing the ligand importance scores and the probabilities that according to the trained model, the ligands are active based on their importance scores. -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) -#' -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix)) -#' evaluation = evaluate_importances_ligand_prediction(ligand_importances,"median","lda") -#' -#' settings = lapply(expression_settings_validation[5:10],convert_expression_settings_evaluation) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = FALSE, single = TRUE) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix, known = FALSE)) -#' activity_predictions = model_based_ligand_activity_prediction(ligand_importances, evaluation$model,"median") -#' print(head(activity_predictions)) -#' } -#' -#' @export -#' -model_based_ligand_activity_prediction = function(importances, model, normalization){ - if (!is.list(model)) - stop("model must be a list, derived as model object from model training (e.g. via the caret package)") - if(model$finalModel$problemType != "Classification" & model$finalModel$problemType != "Regression") - stop("model should be model object (derived from model training)") - if (!is.data.frame(importances)) - stop("importances must be a data frame") - if(!is.character(importances$setting) | !is.character(importances$test_ligand)) - stop("importances$setting and importances$test_ligand should be character vectors") - if(normalization != "mean" & normalization != "median") - stop("normalization should be 'mean' or 'median'") - - requireNamespace("dplyr") - -# importances = importances %>% tidyr::drop_na() - - if (normalization == "mean"){ - normalized_importances = importances %>% group_by(setting) %>% dplyr::select(-test_ligand) %>% mutate_all(funs(scaling_zscore)) %>% ungroup() %>% select(-setting) - } else if (normalization == "median"){ - normalized_importances = importances %>% group_by(setting) %>% dplyr::select(-test_ligand) %>% mutate_all(funs(scaling_modified_zscore)) %>% ungroup() %>% select(-setting) - } - - final_model_predictions = predict(model,newdata = normalized_importances, type = "prob") - final_model_predictions = final_model_predictions %>% as_tibble() %>% mutate(active = TRUE. > FALSE.) %>% select(-FALSE.) %>% rename(model = TRUE.) - return(bind_cols(importances,final_model_predictions) %>% as_tibble()) - -} -#' @title Get ligand importances from a multi-ligand trained random forest model. -#' -#' @description \code{get_multi_ligand_rf_importances} A random forest is trained to construct one model based on the target gene predictions of all ligands of interest (ligands are considered as features) in order to predict the observed response in a particular dataset. Variable importance scores that indicate for each ligand the importance for response prediction, are extracted. It can be assumed that ligands with higher variable importance scores are more likely to be a true active ligand. -#' -#' @usage -#' get_multi_ligand_rf_importances(setting,ligand_target_matrix, ligands_position = "cols", ntrees = 1000, mtry = 2, continuous = TRUE, known = TRUE, filter_genes = FALSE) -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) of which the predictve performance need to be assessed; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. $ligand: NULL or the name of the ligand(s) that are known to be active in the setting of interest. -#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (recommended) or discrete target assignments (not-recommended). -#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" -#' @param ntrees Indicate the number of trees used in the random forest algorithm. The more trees, the longer model training takes, but the more robust the extraced importance scores will be. Default: 1000. Recommended for robustness to have till 10000 trees. -#' @param mtry n**(1/mtry) features of the n features will be sampled at each split during the training of the random forest algorithm. Default: 2 (square root). -#' @param continuous Indicate whether during training of the model, model training and evaluation should be done on class probabilities or discrete class labels. For huge class imbalance, we recommend setting this value to TRUE. Default: TRUE. -#' @param known Indicate whether the true active ligand for a particular dataset is known or not. Default: TRUE. The true ligand will be extracted from the $ligand slot of the setting. -#' @param filter_genes Indicate whether 50 per cent of the genes that are the least variable in ligand-target scores should be removed in order to reduce the training of the model. Default: FALSE. -#' -#' @return A data.frame with for each ligand - data set combination, feature importance scores indicating how important the query ligand is for the prediction of the response in the particular dataset, when prediction is done via a trained classification model with all possible ligands as input. In addition to the importance score(s), the name of the particular setting ($setting), the name of the query ligand($test_ligand), the name of the true active ligand (if known: $ligand). -#' -#' @importFrom randomForest randomForest importance -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = FALSE) -#' -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances_rf = dplyr::bind_rows(lapply(settings_ligand_pred, get_multi_ligand_rf_importances,ligand_target_matrix, ntrees = 100, mtry = 2)) -#' print(head(ligand_importances_rf)) -#' } -#' -#' @export -#' -get_multi_ligand_rf_importances = function(setting,ligand_target_matrix, ligands_position = "cols", ntrees = 1000, mtry = 2, continuous = TRUE, known = TRUE, filter_genes = FALSE){ - - if(!is.logical(known) | length(known) > 1) - stop("known should be a logical vector: TRUE or FALSE") - if(!is.logical(filter_genes) | length(filter_genes) > 1) - stop("filter_genes should be a logical vector: TRUE or FALSE") - if(ntrees <= 1) - stop("ntrees should be higher than 1") - if(mtry <= 1) - stop("mtry should be higher than 1") - - requireNamespace("dplyr") - - if (filter_genes == TRUE){ - ligand_target_matrix = filter_genes_ligand_target_matrix(ligand_target_matrix,ligands_position) - } - - setting_name = setting$name - ligands_oi = setting$from - - if (ligands_position == "cols"){ - if(sum((ligands_oi %in% colnames(ligand_target_matrix)) == FALSE) > 0) - stop("ligands should be in ligand_target_matrix") - prediction_matrix = ligand_target_matrix[,ligands_oi] - target_genes = rownames(ligand_target_matrix) - } else if (ligands_position == "rows") { - if(sum((ligands_oi %in% rownames(ligand_target_matrix)) == FALSE) > 0) - stop("ligands should be in ligand_target_matrix") - prediction_matrix = ligand_target_matrix[ligands_oi,] %>% t() - target_genes = colnames(ligand_target_matrix) - } - - response_vector = setting$response - response_df = tibble(gene = names(response_vector), response = response_vector %>% make.names() %>% as.factor()) - - prediction_df = prediction_matrix %>% data.frame() %>% as_tibble() - - if(is.double(prediction_matrix) == FALSE){ - convert_categorical_factor = function(x){ - x = x %>% make.names() %>% as.factor() - } - prediction_df = prediction_df %>% mutate_all(funs(convert_categorical_factor)) - } - - prediction_df = tibble(gene = target_genes) %>% bind_cols(prediction_df) - combined = inner_join(response_df,prediction_df, by = "gene") - train_data = combined %>% select(-gene) %>% rename(obs = response) %>% data.frame() - - rf_model = randomForest::randomForest(y = train_data$obs, - x = train_data[,-(which(colnames(train_data) == "obs"))], - ntree = ntrees, - mtry = ncol(train_data[,-(which(colnames(train_data) == "obs"))])**(1/mtry) %>% ceiling(), - importance = TRUE - ) - - metrics = randomForest::importance(rf_model) %>% data.frame() %>% tibble::rownames_to_column("test_ligand") %>% as_tibble() %>% mutate(setting = setting_name) - - if (known == TRUE){ - true_ligand = setting$ligand - metrics = metrics %>% mutate(ligand = true_ligand) - metrics = metrics %>% select(setting, test_ligand, ligand, MeanDecreaseAccuracy, MeanDecreaseGini) - return(metrics) - } - metrics = metrics %>% select(setting, test_ligand, MeanDecreaseAccuracy, MeanDecreaseGini) - return(metrics) - - -} -#' @title Get ligand importances based on target gene value prediction performance of single ligands (regression). -#' -#' @description \code{get_single_ligand_importances_regression} Get ligand importance measures for ligands based on how well a single, individual, ligand can predict an observed response. Assess how well every ligand of interest is able to predict the observed transcriptional response in a particular dataset, according to the ligand-target model. It can be assumed that the ligand that best predicts the observed response, is more likely to be the true ligand. Response: continuous values associated to a gene, e.g. a log fold change value. -#' -#' @usage -#' get_single_ligand_importances_regression(setting,ligand_target_matrix, ligands_position = "cols", known = TRUE) -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) of which the predictve performance need to be assessed; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. $ligand: NULL or the name of the ligand(s) that are known to be active in the setting of interest. -#' @param known Indicate whether the true active ligand for a particular dataset is known or not. Default: TRUE. The true ligand will be extracted from the $ligand slot of the setting. -#' @inheritParams evaluate_target_prediction_regression -#' -#' @return A data.frame with for each ligand - data set combination, regression model fit metrics indicating how well the query ligand predicts the response in the particular dataset. Evaluation metrics are the same as in \code{\link{evaluate_target_prediction_regression}}. In addition to the metrics, the name of the particular setting ($setting), the name of the query ligand($test_ligand), the name of the true active ligand (if known: $ligand). -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation_regression) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) -#' -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances_regression,ligand_target_matrix)) -#' print(head(ligand_importances)) -#' } -#' @export -#' -get_single_ligand_importances_regression = function(setting,ligand_target_matrix, ligands_position = "cols", known = TRUE){ - - if(!is.logical(known) | length(known) > 1) - stop("known should be a logical vector: TRUE or FALSE") - - requireNamespace("dplyr") - - metrics = evaluate_target_prediction_regression(setting, ligand_target_matrix, ligands_position) - metrics = metrics %>% rename(test_ligand = ligand) - if (known == TRUE){ - true_ligand = setting$ligand - metrics_meta = metrics %>% select(setting,test_ligand) %>% bind_cols(tibble(ligand = true_ligand)) - metrics = inner_join(metrics_meta, metrics, by = c("setting","test_ligand")) - } - return(metrics) -} -#' @title Get ligand importances from a multi-ligand regression model. -#' -#' @description \code{get_multi_ligand_importances_regression} A regression algorithm chosen by the user is trained to construct one model based on the target gene predictions of all ligands of interest (ligands are considered as features) in order to predict the observed response in a particular dataset (respone: e.g. absolute value of log fold change). Variable importance scores that indicate for each ligand the importance for response prediction, are extracted. It can be assumed that ligands with higher variable importance scores are more likely to be a true active ligand. -#' -#' @usage -#' get_multi_ligand_importances_regression(setting,ligand_target_matrix, ligands_position = "cols", algorithm, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE, known = TRUE, filter_genes = FALSE) -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) of which the predictve performance need to be assessed; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. $ligand: NULL or the name of the ligand(s) that are known to be active in the setting of interest. -#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (recommended) or discrete target assignments (not-recommended). -#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" -#' @param algorithm The name of the classification algorithm to be applied. Should be supported by the caret package. Examples of algorithms we recommend: with embedded feature selection: "rf","glm","fda","glmnet","sdwd","gam","glmboost"; without: "lda","naive_bayes","pls"(because bug in current version of pls package), "pcaNNet". Please notice that not all these algorithms work when the features (i.e. ligand vectors) are categorical (i.e. discrete class assignments). -#' @param cv Indicate whether model training and hyperparameter optimization should be done via cross-validation. Default: TRUE. FALSE might be useful for applications only requiring variable importance, or when final model is not expected to be extremely overfit. -#' @param cv_number The number of folds for the cross-validation scheme: Default: 4; only relevant when cv == TRUE. -#' @param cv_repeats The number of repeats during cross-validation. Default: 2; only relevant when cv == TRUE. -#' @param parallel Indiciate whether the model training will occur parallelized. Default: FALSE. TRUE only possible for non-windows OS. -#' @param n_cores The number of cores used for parallelized model training via cross-validation. Default: 4. Only relevant on non-windows OS. -#' @param ignore_errors Indiciate whether errors during model training by caret should be ignored such that another model training try will be initiated until model is trained without raising errors. Default: FALSE. -#' @param known Indicate whether the true active ligand for a particular dataset is known or not. Default: TRUE. The true ligand will be extracted from the $ligand slot of the setting. -#' @param filter_genes Indicate whether 50 per cent of the genes that are the least variable in ligand-target scores should be removed in order to reduce the training of the model. Default: FALSE. -#' -#' @return A data.frame with for each ligand - data set combination, feature importance scores indicating how important the query ligand is for the prediction of the response in the particular dataset, when prediction is done via a trained regression model with all possible ligands as input. In addition to the importance score(s), the name of the particular setting ($setting), the name of the query ligand($test_ligand), the name of the true active ligand (if known: $ligand). -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation_regression) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = FALSE) -#' -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances_lm = dplyr::bind_rows(lapply(settings_ligand_pred, get_multi_ligand_importances_regression,ligand_target_matrix, algorithm = "lm")) -#' print(head(ligand_importances_lm)) -#' } -#' @export -#' -get_multi_ligand_importances_regression = function(setting,ligand_target_matrix, ligands_position = "cols", algorithm, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE, known = TRUE, filter_genes = FALSE){ - - if(!is.logical(known) | length(known) > 1) - stop("known should be a logical vector: TRUE or FALSE") - if(!is.logical(filter_genes) | length(filter_genes) > 1) - stop("filter_genes should be a logical vector: TRUE or FALSE") - - requireNamespace("dplyr") - - if (filter_genes == TRUE){ - ligand_target_matrix = filter_genes_ligand_target_matrix(ligand_target_matrix,ligands_position) - } - - setting_name = setting$name - output = evaluate_multi_ligand_target_prediction_regression(setting, ligand_target_matrix, ligands_position,algorithm, var_imps = TRUE, cv, cv_number, cv_repeats, parallel, n_cores, ignore_errors) - metrics = output$var_imps - metrics = metrics %>% mutate(setting = setting_name) %>% rename(test_ligand = feature) - - if (known == TRUE){ - true_ligand = setting$ligand - metrics = metrics %>% mutate(ligand = true_ligand) - metrics = metrics %>% select(setting, test_ligand, ligand, importance) - return(metrics) - } - metrics = metrics %>% select(setting, test_ligand, importance) - return(metrics) - -} -#' @title Get ligand importances from a multi-ligand trained random forest regression model. -#' -#' @description \code{get_multi_ligand_rf_importances_regression} A random forest is trained to construct one model based on the target gene predictions of all ligands of interest (ligands are considered as features) in order to predict the observed response in a particular dataset (response: e.g. absolute values of log fold change). Variable importance scores that indicate for each ligand the importance for response prediction, are extracted. It can be assumed that ligands with higher variable importance scores are more likely to be a true active ligand. -#' -#' @usage -#' get_multi_ligand_rf_importances_regression(setting,ligand_target_matrix, ligands_position = "cols", ntrees = 1000, mtry = 2, known = TRUE, filter_genes = FALSE) -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) of which the predictve performance need to be assessed; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. $ligand: NULL or the name of the ligand(s) that are known to be active in the setting of interest. -#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (recommended) or discrete target assignments (not-recommended). -#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" -#' @param ntrees Indicate the number of trees used in the random forest algorithm. The more trees, the longer model training takes, but the more robust the extraced importance scores will be. Default: 1000. Recommended for robustness to have till 10000 trees. -#' @param mtry n**(1/mtry) features of the n features will be sampled at each split during the training of the random forest algorithm. Default: 2 (square root). -#' @param known Indicate whether the true active ligand for a particular dataset is known or not. Default: TRUE. The true ligand will be extracted from the $ligand slot of the setting. -#' @param filter_genes Indicate whether 50 per cent of the genes that are the least variable in ligand-target scores should be removed in order to reduce the training of the model. Default: FALSE. -#' -#' @return A data.frame with for each ligand - data set combination, feature importance scores indicating how important the query ligand is for the prediction of the response in the particular dataset, when prediction is done via a trained regression model with all possible ligands as input. In addition to the importance score(s), the name of the particular setting ($setting), the name of the query ligand($test_ligand), the name of the true active ligand (if known: $ligand). -#' -#' @importFrom randomForest randomForest importance -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation_regression) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = FALSE) -#' -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances_rf = dplyr::bind_rows(lapply(settings_ligand_pred, get_multi_ligand_rf_importances_regression,ligand_target_matrix, ntrees = 100, mtry = 2)) -#' print(head(ligand_importances_rf)) -#' } -#' -#' @export -#' -get_multi_ligand_rf_importances_regression = function(setting,ligand_target_matrix, ligands_position = "cols", ntrees = 1000, mtry = 2, known = TRUE, filter_genes = FALSE){ - - if(!is.logical(known) | length(known) > 1) - stop("known should be a logical vector: TRUE or FALSE") - if(!is.logical(filter_genes) | length(filter_genes) > 1) - stop("filter_genes should be a logical vector: TRUE or FALSE") - if(ntrees <= 1) - stop("ntrees should be higher than 1") - if(mtry <= 1) - stop("mtry should be higher than 1") - - requireNamespace("dplyr") - - if (filter_genes == TRUE){ - ligand_target_matrix = filter_genes_ligand_target_matrix(ligand_target_matrix,ligands_position) - } - - setting_name = setting$name - ligands_oi = setting$from - - if (ligands_position == "cols"){ - if(sum((ligands_oi %in% colnames(ligand_target_matrix)) == FALSE) > 0) - stop("ligands should be in ligand_target_matrix") - prediction_matrix = ligand_target_matrix[,ligands_oi] - target_genes = rownames(ligand_target_matrix) - } else if (ligands_position == "rows") { - if(sum((ligands_oi %in% rownames(ligand_target_matrix)) == FALSE) > 0) - stop("ligands should be in ligand_target_matrix") - prediction_matrix = ligand_target_matrix[ligands_oi,] %>% t() - target_genes = colnames(ligand_target_matrix) - } - - response_vector = setting$response - response_df = tibble(gene = names(response_vector), response = response_vector) - - prediction_df = prediction_matrix %>% data.frame() %>% as_tibble() - - prediction_df = tibble(gene = target_genes) %>% bind_cols(prediction_df) - combined = inner_join(response_df,prediction_df, by = "gene") - train_data = combined %>% select(-gene) %>% rename(obs = response) %>% data.frame() - - rf_model = randomForest::randomForest(y = train_data$obs, - x = train_data[,-(which(colnames(train_data) == "obs"))], - ntree = ntrees, - mtry = ncol(train_data[,-(which(colnames(train_data) == "obs"))])**(1/mtry) %>% ceiling(), - importance = TRUE - ) - - metrics = randomForest::importance(rf_model) %>% data.frame() %>% tibble::rownames_to_column("test_ligand") %>% as_tibble() %>% mutate(setting = setting_name) - - if (known == TRUE){ - true_ligand = setting$ligand - metrics = metrics %>% mutate(ligand = true_ligand) - metrics = metrics %>% select(setting, test_ligand, ligand, X.IncMSE, IncNodePurity) %>% rename(IncMSE = X.IncMSE) - return(metrics) - } - metrics = metrics %>% select(setting, test_ligand, X.IncMSE, IncNodePurity) %>% rename(IncMSE = X.IncMSE) - return(metrics) -} -#' @title Convert settings to correct settings format for TF prediction. -#' -#' @description \code{convert_settings_tf_prediction} Converts settings to correct settings format for TF activity prediction. In this prediction problem, TFs (out of a set of possibly active TFs) will be ranked based on feature importance scores. The format can be made suited for applications in which TFs need to be scored based on their possible upstream activity: 3) by calculating individual feature importane scores or 4) feature importance based on models with embedded feature importance determination. Remark that upstream regulator analysis for TFs here is experimental and was not thoroughly validated in the study accompanying this package. -#' -#' @usage -#' convert_settings_tf_prediction(settings, all_tfs, single = TRUE) -#' -#' @param settings A list of lists. Eeach sublist contains the following elements: .$name: name of the setting; .$from: name(s) of the tf(s) active in the setting of interest; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. -#' @param all_tfs A character vector of possible tfs that will be considered for the tf activity state prediction. -#' @param single TRUE if feature importance scores for tfs will be calculated by looking at ligans individually. FALSE if the goal is to calculate the feature importance scores via sophisticated classification algorithms like random forest. - -#' @return A list with following elements: $name, $tf: name of active tf(s) (only if validation is TRUE), $from (tf(s) that will be tested for activity prediction), $response -#' -#' @examples -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) -#' settings_tf_pred = convert_settings_tf_prediction(settings, all_tfs = c("SMAD1","STAT1","RELA"), single = TRUE) -#' # show how this function can be used to predict activities of TFs -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' tf_target = construct_tf_target_matrix(weighted_networks, tfs_as_cols = TRUE, standalone_output = TRUE) -#' tf_importances = dplyr::bind_rows(lapply(settings_tf_pred,get_single_ligand_importances,tf_target,known = FALSE)) -#' print(head(tf_importances)) -#' -#' @export -#' -#' -convert_settings_tf_prediction = function(settings,all_tfs, single = TRUE){ - - # input check - if(!is.list(settings)) - stop("settings should be a list") - if(!is.character(all_tfs)) - stop("all_tfs should be a character vector") - if(!is.logical(single) | length(single) != 1) - stop("single should be TRUE or FALSE") - - requireNamespace("dplyr") - - new_settings = list() - if (single == TRUE){ - for (i in 1:length(settings)){ - setting = settings[[i]] - for (k in 1:length(all_tfs)){ - test_tf = all_tfs[[k]] - new_settings[[length(new_settings) + 1]] = list(make_new_setting_ligand_prediction_single_application(setting,test_tf)) - } - } - } else if (single == FALSE){ - for (i in 1:length(settings)){ - setting = settings[[i]] - new_settings[[length(new_settings) + 1]] = list(make_new_setting_ligand_prediction_multi_application(setting,all_tfs)) - } - } - return(new_settings %>% unlist(recursive = FALSE)) -} -#' @title Converts expression settings to format in which the total number of potential ligands is reduced up to n top-predicted active ligands. -#' -#' @description \code{convert_expression_settings_evaluation} Converts expression settings to format in which the total number of potential ligands is reduced up to n top-predicted active ligands.(useful for applications when a lot of ligands are potentially active, a lot of settings need to be predicted and a multi-ligand model is trained). -#' -#' @usage -#' convert_settings_topn_ligand_prediction(setting, importances, model, n, normalization) -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$diffexp: data frame or tibble containing at least 3 variables= $gene, $lfc (log fold change treated vs untreated) and $qval (fdr-corrected p-value) -#' @param n The top n number of ligands according to the ligand activity state prediction model will be considered as potential ligand for the generation of a new setting. -#' @inheritParams model_based_ligand_activity_prediction - -#' @return A list with following elements: $name, $from, $response. $response will be a gene-named logical vector indicating whether the gene's transcription was influenced by the active ligand(s) in the setting of interest. -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) -#' -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix)) -#' evaluation = evaluate_importances_ligand_prediction(ligand_importances,"median","lda") -#' -#' settings = lapply(expression_settings_validation[5:10],convert_expression_settings_evaluation) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = FALSE, single = TRUE) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix, known = FALSE)) -#' settings = lapply(settings,convert_settings_topn_ligand_prediction, importances = ligand_importances, model = evaluation$model, n = 3, normalization = "median" ) -#' } -#' -#' @export -#' -convert_settings_topn_ligand_prediction = function(setting, importances, model, n, normalization){ - # input check - if(!is.list(setting)) - stop("setting should be a list") - if(!is.character(setting$from) | !is.character(setting$name)) - stop("setting$from and setting$name should be character vectors") - if(!is.logical(setting$response)) - stop("setting$response should be a logical vector") - - requireNamespace("dplyr") - setting_name = setting$name - importances_oi = importances %>% filter(setting == setting_name) - output = model_based_ligand_activity_prediction(importances_oi, model,normalization) - top_ligands = output %>% top_n(n,model) %>% .$test_ligand - - new_setting = list() - new_setting$name = setting$name - new_setting$from = top_ligands - new_setting$response = setting$response - - return(new_setting) -} - -#' @title Evaluation of ligand activity prediction performance of single ligand importance scores: each dataset individually. -#' -#' @description \code{wrapper_evaluate_single_importances_ligand_prediction} Evaluate how well a single ligand importance score is able to predict the true activity state of a ligand. For this it is assumed, that ligand importance measures for truely active ligands will be higher than for non-active ligands. Several classification evaluation metrics for the prediction are calculated and variable importance scores can be extracted to rank the different importance measures in order of importance for ligand activity state prediction. -#' -#' @usage -#' wrapper_evaluate_single_importances_ligand_prediction(group,ligand_importances) -#' -#' @param group Name of the dataset (setting) you want to calculate ligand activity performance for. -#' @param ligand_importances A data frame containing at least folowing variables: $setting, $test_ligand, $ligand and one or more feature importance scores. $test_ligand denotes the name of a possibly active ligand, $ligand the name of the truely active ligand. -#' -#' @return A data frame containing classification evaluation measures for the ligand activity state prediction single, individual feature importance measures. -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) -#' -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix)) -#' evaluation = ligand_importances$setting %>% unique() %>% lapply(function(x){x}) %>% lapply(wrapper_evaluate_single_importances_ligand_prediction,ligand_importances) %>% bind_rows() %>% inner_join(ligand_importances %>% distinct(setting,ligand)) -#' print(head(evaluation)) -#' } -#' @export -#' -wrapper_evaluate_single_importances_ligand_prediction = function(group,ligand_importances){ - if (!is.data.frame(ligand_importances)) - stop("ligand_importances must be a data frame") - if (!is.character(group)) - stop("group must be a character") - ligand_importances %>% filter(setting %in% group) %>% evaluate_single_importances_ligand_prediction(normalization = "no") %>% mutate(setting = group) -} - +#' @title Convert settings to correct settings format for ligand prediction. +#' +#' @description \code{convert_settings_ligand_prediction} Converts settings to correct settings format for ligand activity prediction. In this prediction problem, ligands (out of a set of possibly active ligands) will be ranked based on feature importance scores. The format can be made suited for: 1) validation of ligand activity state prediction by calculating individual feature importane scores or 2) feature importance based on models with embedded feature importance determination; applications in which ligands need to be scores based on their possible upstream activity: 3) by calculating individual feature importane scores or 4) feature importance based on models with embedded feature importance determination. +#' +#' @usage +#' convert_settings_ligand_prediction(settings, all_ligands, validation = TRUE, single = TRUE) +#' +#' @param settings A list of lists. Eeach sublist contains the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. +#' @param all_ligands A character vector of possible ligands that will be considered for the ligand activity state prediction. +#' @param validation TRUE if seetings need to be prepared for validation of ligand activity state predictions (this implies that the true active ligand of a setting is known); FALSE for application purposes when the true active ligand(s) is/are not known. +#' @param single TRUE if feature importance scores for ligands will be calculated by looking at ligans individually. FALSE if the goal is to calculate the feature importance scores via sophisticated classification algorithms like random forest. + +#' @return A list with following elements: $name, $ligand: name of active ligand(s) (only if validation is TRUE), $from (ligand(s) that will be tested for activity prediction), $response +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation,convert_expression_settings_evaluation) +#' ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, ligands, validation = TRUE, single = TRUE) +#' } +#' @export +#' +#' +convert_settings_ligand_prediction = function(settings,all_ligands,validation = TRUE, single = TRUE){ + + # input check + if(!is.list(settings)) + stop("settings should be a list") + if(!is.character(all_ligands)) + stop("all_ligands should be a character vector") + if(!is.logical(validation) | length(validation) != 1) + stop("validation should be TRUE or FALSE") + if(!is.logical(single) | length(single) != 1) + stop("single should be TRUE or FALSE") + + requireNamespace("dplyr") + + new_settings = list() + if (validation == TRUE && single == TRUE){ + for (i in 1:length(settings)){ + setting = settings[[i]] + for (k in 1:length(all_ligands)){ + test_ligand = all_ligands[[k]] + new_settings[[length(new_settings) + 1]] = list(make_new_setting_ligand_prediction_single_validation(setting,test_ligand)) + } + } + } else if (validation == TRUE && single == FALSE){ + for (i in 1:length(settings)){ + setting = settings[[i]] + new_settings[[length(new_settings) + 1]] = list(make_new_setting_ligand_prediction_multi_validation(setting,all_ligands)) + } + } else if (validation == FALSE && single == TRUE){ + for (i in 1:length(settings)){ + setting = settings[[i]] + for (k in 1:length(all_ligands)){ + test_ligand = all_ligands[[k]] + new_settings[[length(new_settings) + 1]] = list(make_new_setting_ligand_prediction_single_application(setting,test_ligand)) + } + } + } else if (validation == FALSE && single == FALSE){ + for (i in 1:length(settings)){ + setting = settings[[i]] + new_settings[[length(new_settings) + 1]] = list(make_new_setting_ligand_prediction_multi_application(setting,all_ligands)) + } + } + return(new_settings %>% unlist(recursive = FALSE)) +} +#' @title Get ligand importances based on target gene prediction performance of single ligands. +#' +#' @description \code{get_single_ligand_importances} Get ligand importance measures for ligands based on how well a single, individual, ligand can predict an observed response. Assess how well every ligand of interest is able to predict the observed transcriptional response in a particular dataset, according to the ligand-target model. It can be assumed that the ligand that best predicts the observed response, is more likely to be the true ligand. +#' +#' @usage +#' get_single_ligand_importances(setting,ligand_target_matrix, ligands_position = "cols", known = TRUE) +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) of which the predictve performance need to be assessed; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. $ligand: NULL or the name of the ligand(s) that are known to be active in the setting of interest. +#' @param known Indicate whether the true active ligand for a particular dataset is known or not. Default: TRUE. The true ligand will be extracted from the $ligand slot of the setting. +#' @inheritParams evaluate_target_prediction +#' +#' @return A data.frame with for each ligand - data set combination, classification evaluation metrics indicating how well the query ligand predicts the response in the particular dataset. Evaluation metrics are the same as in \code{\link{evaluate_target_prediction}}. In addition to the metrics, the name of the particular setting ($setting), the name of the query ligand($test_ligand), the name of the true active ligand (if known: $ligand). +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) +#' +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix)) +#' print(head(ligand_importances)) +#' } +#' @export +#' +get_single_ligand_importances = function(setting,ligand_target_matrix, ligands_position = "cols", known = TRUE){ + + if(!is.logical(known) | length(known) > 1) + stop("known should be a logical vector: TRUE or FALSE") + + requireNamespace("dplyr") + + metrics = evaluate_target_prediction(setting, ligand_target_matrix, ligands_position) + metrics = metrics %>% rename(test_ligand = ligand) + if (known == TRUE){ + true_ligand = setting$ligand + metrics_meta = metrics %>% select(setting,test_ligand) %>% bind_cols(tibble(ligand = true_ligand)) + metrics = inner_join(metrics_meta, metrics, by = c("setting","test_ligand")) + } + return(metrics) +} +#' @title Get ligand importances from a multi-ligand classfication model. +#' +#' @description \code{get_multi_ligand_importances} A classificiation algorithm chosen by the user is trained to construct one model based on the target gene predictions of all ligands of interest (ligands are considered as features) in order to predict the observed response in a particular dataset. Variable importance scores that indicate for each ligand the importance for response prediction, are extracted. It can be assumed that ligands with higher variable importance scores are more likely to be a true active ligand. +#' +#' @usage +#' get_multi_ligand_importances(setting,ligand_target_matrix, ligands_position = "cols", algorithm, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE, continuous = TRUE, known = TRUE, filter_genes = FALSE) +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) of which the predictve performance need to be assessed; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. $ligand: NULL or the name of the ligand(s) that are known to be active in the setting of interest. +#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (recommended) or discrete target assignments (not-recommended). +#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" +#' @param algorithm The name of the classification algorithm to be applied. Should be supported by the caret package. Examples of algorithms we recommend: with embedded feature selection: "rf","glm","fda","glmnet","sdwd","gam","glmboost"; without: "lda","naive_bayes","pls"(because bug in current version of pls package), "pcaNNet". Please notice that not all these algorithms work when the features (i.e. ligand vectors) are categorical (i.e. discrete class assignments). +#' @param cv Indicate whether model training and hyperparameter optimization should be done via cross-validation. Default: TRUE. FALSE might be useful for applications only requiring variable importance, or when final model is not expected to be extremely overfit. +#' @param cv_number The number of folds for the cross-validation scheme: Default: 4; only relevant when cv == TRUE. +#' @param cv_repeats The number of repeats during cross-validation. Default: 2; only relevant when cv == TRUE. +#' @param parallel Indiciate whether the model training will occur parallelized. Default: FALSE. TRUE only possible for non-windows OS. +#' @param n_cores The number of cores used for parallelized model training via cross-validation. Default: 4. Only relevant on non-windows OS. +#' @param ignore_errors Indiciate whether errors during model training by caret should be ignored such that another model training try will be initiated until model is trained without raising errors. Default: FALSE. +#' @param continuous Indicate whether during training of the model, model training and evaluation should be done on class probabilities or discrete class labels. For huge class imbalance, we recommend setting this value to TRUE. Default: TRUE. +#' @param known Indicate whether the true active ligand for a particular dataset is known or not. Default: TRUE. The true ligand will be extracted from the $ligand slot of the setting. +#' @param filter_genes Indicate whether 50 per cent of the genes that are the least variable in ligand-target scores should be removed in order to reduce the training of the model. Default: FALSE. +#' +#' @return A data.frame with for each ligand - data set combination, feature importance scores indicating how important the query ligand is for the prediction of the response in the particular dataset, when prediction is done via a trained classification model with all possible ligands as input. In addition to the importance score(s), the name of the particular setting ($setting), the name of the query ligand($test_ligand), the name of the true active ligand (if known: $ligand). +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = FALSE) +#' +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances_glm = dplyr::bind_rows(lapply(settings_ligand_pred, get_multi_ligand_importances,ligand_target_matrix, algorithm = "glm")) +#' print(head(ligand_importances_glm)) +#' } +#' @export +#' +get_multi_ligand_importances = function(setting,ligand_target_matrix, ligands_position = "cols", algorithm, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE, continuous = TRUE, known = TRUE, filter_genes = FALSE){ + + if(!is.logical(known) | length(known) > 1) + stop("known should be a logical vector: TRUE or FALSE") + if(!is.logical(filter_genes) | length(filter_genes) > 1) + stop("filter_genes should be a logical vector: TRUE or FALSE") + + requireNamespace("dplyr") + + if (filter_genes == TRUE){ + ligand_target_matrix = filter_genes_ligand_target_matrix(ligand_target_matrix,ligands_position) + } + + setting_name = setting$name + output = evaluate_multi_ligand_target_prediction(setting, ligand_target_matrix, ligands_position,algorithm, var_imps = TRUE, cv, cv_number, cv_repeats, parallel, n_cores, ignore_errors, continuous) + metrics = output$var_imps + metrics = metrics %>% mutate(setting = setting_name) %>% rename(test_ligand = feature) + + if (known == TRUE){ + true_ligand = setting$ligand + metrics = metrics %>% mutate(ligand = true_ligand) + metrics = metrics %>% select(setting, test_ligand, ligand, importance) + return(metrics) + } + metrics = metrics %>% select(setting, test_ligand, importance) + return(metrics) + +} +#' @title Evaluation of ligand activity prediction based on ligand importance scores. +#' +#' @description \code{evaluate_importances_ligand_prediction} Evaluate how well a trained model of ligand importance scores is able to predict the true activity state of a ligand. For this it is assumed, that ligand importance measures for truely active ligands will be higher than for non-active ligands. A classificiation algorithm chosen by the user is trained to construct one model based on the ligand importance scores of all ligands of interest (ligands importance scores are considered as features). Several classification evaluation metrics for the prediction are calculated and variable importance scores can be extracted to rank the different importance measures in order of importance for ligand activity state prediction. +#' +#' @usage +#' evaluate_importances_ligand_prediction(importances, normalization, algorithm, var_imps = TRUE, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4,ignore_errors = FALSE) +#' +#' @param importances A data frame containing at least folowing variables: $setting, $test_ligand, $ligand and one or more feature importance scores. $test_ligand denotes the name of a possibly active ligand, $ligand the name of the truely active ligand. +#' @param normalization Way of normalization of the importance measures: "mean" (classifcal z-score) or "median" (modified z-score) +#' @param algorithm The name of the classification algorithm to be applied. Should be supported by the caret package. Examples of algorithms we recommend: with embedded feature selection: "rf","glm","fda","glmnet","sdwd","gam","glmboost"; without: "lda","naive_bayes","pls"(because bug in current version of pls package), "pcaNNet". Please notice that not all these algorithms work when the features (i.e. ligand vectors) are categorical (i.e. discrete class assignments). +#' @param var_imps Indicate whether in addition to classification evaluation performances, variable importances should be calculated. Default: TRUE. +#' @param cv Indicate whether model training and hyperparameter optimization should be done via cross-validation. Default: TRUE. FALSE might be useful for applications only requiring variable importance, or when final model is not expected to be extremely overfit. +#' @param cv_number The number of folds for the cross-validation scheme: Default: 4; only relevant when cv == TRUE. +#' @param cv_repeats The number of repeats during cross-validation. Default: 2; only relevant when cv == TRUE. +#' @param parallel Indiciate whether the model training will occur parallelized. Default: FALSE. TRUE only possible for non-windows OS. +#' @param n_cores The number of cores used for parallelized model training via cross-validation. Default: 4. Only relevant on non-windows OS. +#' @param ignore_errors Indiciate whether errors during model training by caret should be ignored such that another model training try will be initiated until model is trained without raising errors. Default: FALSE. +#' +#' @return A list with the following elements. $performances: data frame containing classification evaluation measure for classification on the test folds during training via cross-validation; $performances_training: data frame containing classification evaluation measures for classification of the final model (discrete class assignments) on the complete data set (performance can be severly optimistic due to overfitting!); $performance_training_continuous: data frame containing classification evaluation measures for classification of the final model (class probability scores) on the complete data set (performance can be severly optimistic due to overfitting!) $var_imps: data frame containing the variable importances of the different ligands (embbed importance score for some classification algorithms, otherwise just the auroc); $prediction_response_df: data frame containing for each ligand-setting combination the ligand importance scores for the individual importance scores, the complete model of importance scores and the ligand activity as well (TRUE or FALSE); $model: the caret model object that can be used on new importance scores to predict the ligand activity state. +#' +#' @importFrom ROCR prediction performance +#' @importFrom caTools trapz +#' @import caret +#' @importFrom purrr safely +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) +#' +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix)) +#' evaluation = evaluate_importances_ligand_prediction(ligand_importances,"median","lda") +#' print(head(evaluation)) +#' } +#' @export +#' +evaluate_importances_ligand_prediction = function(importances, normalization, algorithm, var_imps = TRUE, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE){ + if (!is.data.frame(importances)) + stop("importances must be a data frame") + if(!is.character(importances$setting) | !is.character(importances$test_ligand) | !is.character(importances$ligand)) + stop("importances$setting, importances$test_ligand and importances$ligand should be character vectors") + if(normalization != "mean" & normalization != "median") + stop("normalization should be 'mean' or 'median'") + if(!is.character(algorithm)) + stop("algorithm should be a character vector") + if(!is.logical(var_imps) | length(var_imps) > 1) + stop("var_imps should be a logical vector: TRUE or FALSE") + if(!is.logical(cv) | length(cv) > 1) + stop("cv should be a logical vector: TRUE or FALSE") + if(!is.numeric(cv_number) | length(cv_number) > 1) + stop("cv_number should be a numeric vector of length 1") + if(!is.numeric(cv_repeats) | length(cv_repeats) > 1) + stop("cv_repeats should be a numeric vector of length 1") + if(!is.logical(parallel) | length(parallel) > 1) + stop("parallel should be a logical vector: TRUE or FALSE") + if(!is.numeric(n_cores) | length(n_cores) > 1) + stop("n_cores should be a numeric vector of length 1") + if(!is.logical(ignore_errors) | length(ignore_errors) > 1) + stop("ignore_errors should be a logical vector: TRUE or FALSE") + + requireNamespace("dplyr") + +# importances = importances %>% tidyr::drop_na() + added = is_ligand_active(importances) + importances = importances %>% mutate(class = added) + + if (normalization == "mean"){ + normalized_importances = importances %>% group_by(setting) %>% dplyr::select(-ligand,-test_ligand,-class) %>% mutate_all(funs(scaling_zscore)) %>% ungroup() %>% select(-setting) + } else if (normalization == "median"){ + normalized_importances = importances %>% group_by(setting) %>% dplyr::select(-ligand,-test_ligand,-class) %>% mutate_all(funs(scaling_modified_zscore)) %>% ungroup() %>% select(-setting) + } + + response_vector = importances$class %>% make.names() %>% as.factor() + train_data = normalized_importances %>% mutate(obs = response_vector) %>% data.frame() + + output = wrapper_caret_classification(train_data,algorithm,TRUE,var_imps,cv,cv_number,cv_repeats,parallel,n_cores,prediction_response_df = bind_cols(importances %>% select(setting,ligand,test_ligand,class), normalized_importances),ignore_errors,return_model = TRUE) + return(output) +} +#' @title Evaluation of ligand activity prediction performance of single ligand importance scores: aggregate all datasets. +#' +#' @description \code{evaluate_single_importances_ligand_prediction} Evaluate how well a single ligand importance score is able to predict the true activity state of a ligand. For this it is assumed, that ligand importance measures for truely active ligands will be higher than for non-active ligands. Several classification evaluation metrics for the prediction are calculated and variable importance scores can be extracted to rank the different importance measures in order of importance for ligand activity state prediction. +#' +#' @usage +#' evaluate_single_importances_ligand_prediction(importances,normalization) +#' +#' @param importances A data frame containing at least folowing variables: $setting, $test_ligand, $ligand and one or more feature importance scores. $test_ligand denotes the name of a possibly active ligand, $ligand the name of the truely active ligand. +#' @param normalization Way of normalization of the importance measures: "mean" (classifcal z-score) or "median" (modified z-score) or "no" (use unnormalized feature importance scores - only recommended when evaluating ligand activity prediction on individual datasets) +#' +#' @return A data frame containing classification evaluation measures for the ligand activity state prediction single, individual feature importance measures. +#' +#' @importFrom ROCR prediction performance +#' @importFrom caTools trapz +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) +#' +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix)) +#' evaluation = evaluate_single_importances_ligand_prediction(ligand_importances,normalization = "median") +#' print(head(evaluation)) +#' } +#' @export +#' +evaluate_single_importances_ligand_prediction = function(importances,normalization){ + if (!is.data.frame(importances)) + stop("importances must be a data frame") + if(!is.character(importances$setting) | !is.character(importances$test_ligand) | !is.character(importances$ligand)) + stop("importances$setting, importances$test_ligand and importances$ligand should be character vectors") + if(normalization != "mean" & normalization != "median" & normalization != "no") + stop("normalization should be 'mean' or 'median' or 'no'") + + requireNamespace("dplyr") + importances0 = importances %>% select(-setting,-ligand,-test_ligand) +# importances = importances %>% tidyr::drop_na() + added = is_ligand_active(importances) + + if (nrow(importances) == 0){ + performances = lapply(importances, classification_evaluation_continuous_pred, added, iregulon = FALSE) + output = tibble(importance_measure = names(performances)) + performances = bind_rows(performances) + return(bind_cols(output,performances)) + } + + importances = importances %>% select_if(.predicate = function(x) { + sum(is.na(x)) == 0 + }) + + if (normalization == "mean"){ + normalized_importances = importances %>% group_by(setting) %>% dplyr::select(-ligand,-test_ligand) %>% mutate_all(funs(scaling_zscore)) %>% ungroup() %>% select(-setting) + } else if (normalization == "median"){ + normalized_importances = importances %>% group_by(setting) %>% dplyr::select(-ligand,-test_ligand) %>% mutate_all(funs(scaling_modified_zscore)) %>% ungroup() %>% select(-setting) + } else if (normalization == "no") { + normalized_importances = importances %>% select(-c(setting,test_ligand,ligand)) + } + + performances = lapply(normalized_importances, classification_evaluation_continuous_pred, added, iregulon = FALSE) + output = tibble(importance_measure = names(performances)) + performances = bind_rows(performances) + return(bind_cols(output,performances)) +} +#' @title Prediction of ligand activity prediction by a model trained on ligand importance scores. +#' +#' @description \code{model_based_ligand_activity_prediction} Predict the activity state of a ligand based on a classification model that was trained to predict ligand activity state based on ligand importance scores. +#' +#' @usage +#' model_based_ligand_activity_prediction(importances, model, normalization) +#' +#' @param model A model object of a classification object as e.g. generated via caret. +#' @param importances A data frame containing at least folowing variables: $setting, $test_ligand, $ligand and one or more feature importance scores. $test_ligand denotes the name of a possibly active ligand, $ligand the name of the truely active ligand. +#' @param normalization Way of normalization of the importance measures: "mean" (classifcal z-score) or "median" (modified z-score) +#' +#' @return A data frame containing the ligand importance scores and the probabilities that according to the trained model, the ligands are active based on their importance scores. +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) +#' +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix)) +#' evaluation = evaluate_importances_ligand_prediction(ligand_importances,"median","lda") +#' +#' settings = lapply(expression_settings_validation[5:10],convert_expression_settings_evaluation) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = FALSE, single = TRUE) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix, known = FALSE)) +#' activity_predictions = model_based_ligand_activity_prediction(ligand_importances, evaluation$model,"median") +#' print(head(activity_predictions)) +#' } +#' +#' @export +#' +model_based_ligand_activity_prediction = function(importances, model, normalization){ + if (!is.list(model)) + stop("model must be a list, derived as model object from model training (e.g. via the caret package)") + if(model$finalModel$problemType != "Classification" & model$finalModel$problemType != "Regression") + stop("model should be model object (derived from model training)") + if (!is.data.frame(importances)) + stop("importances must be a data frame") + if(!is.character(importances$setting) | !is.character(importances$test_ligand)) + stop("importances$setting and importances$test_ligand should be character vectors") + if(normalization != "mean" & normalization != "median") + stop("normalization should be 'mean' or 'median'") + + requireNamespace("dplyr") + +# importances = importances %>% tidyr::drop_na() + + if (normalization == "mean"){ + normalized_importances = importances %>% group_by(setting) %>% dplyr::select(-test_ligand) %>% mutate_all(funs(scaling_zscore)) %>% ungroup() %>% select(-setting) + } else if (normalization == "median"){ + normalized_importances = importances %>% group_by(setting) %>% dplyr::select(-test_ligand) %>% mutate_all(funs(scaling_modified_zscore)) %>% ungroup() %>% select(-setting) + } + + final_model_predictions = predict(model,newdata = normalized_importances, type = "prob") + final_model_predictions = final_model_predictions %>% as_tibble() %>% mutate(active = TRUE. > FALSE.) %>% select(-FALSE.) %>% rename(model = TRUE.) + return(bind_cols(importances,final_model_predictions) %>% as_tibble()) + +} +#' @title Get ligand importances from a multi-ligand trained random forest model. +#' +#' @description \code{get_multi_ligand_rf_importances} A random forest is trained to construct one model based on the target gene predictions of all ligands of interest (ligands are considered as features) in order to predict the observed response in a particular dataset. Variable importance scores that indicate for each ligand the importance for response prediction, are extracted. It can be assumed that ligands with higher variable importance scores are more likely to be a true active ligand. +#' +#' @usage +#' get_multi_ligand_rf_importances(setting,ligand_target_matrix, ligands_position = "cols", ntrees = 1000, mtry = 2, continuous = TRUE, known = TRUE, filter_genes = FALSE) +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) of which the predictve performance need to be assessed; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. $ligand: NULL or the name of the ligand(s) that are known to be active in the setting of interest. +#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (recommended) or discrete target assignments (not-recommended). +#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" +#' @param ntrees Indicate the number of trees used in the random forest algorithm. The more trees, the longer model training takes, but the more robust the extraced importance scores will be. Default: 1000. Recommended for robustness to have till 10000 trees. +#' @param mtry n**(1/mtry) features of the n features will be sampled at each split during the training of the random forest algorithm. Default: 2 (square root). +#' @param continuous Indicate whether during training of the model, model training and evaluation should be done on class probabilities or discrete class labels. For huge class imbalance, we recommend setting this value to TRUE. Default: TRUE. +#' @param known Indicate whether the true active ligand for a particular dataset is known or not. Default: TRUE. The true ligand will be extracted from the $ligand slot of the setting. +#' @param filter_genes Indicate whether 50 per cent of the genes that are the least variable in ligand-target scores should be removed in order to reduce the training of the model. Default: FALSE. +#' +#' @return A data.frame with for each ligand - data set combination, feature importance scores indicating how important the query ligand is for the prediction of the response in the particular dataset, when prediction is done via a trained classification model with all possible ligands as input. In addition to the importance score(s), the name of the particular setting ($setting), the name of the query ligand($test_ligand), the name of the true active ligand (if known: $ligand). +#' +#' @importFrom randomForest randomForest importance +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = FALSE) +#' +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances_rf = dplyr::bind_rows(lapply(settings_ligand_pred, get_multi_ligand_rf_importances,ligand_target_matrix, ntrees = 100, mtry = 2)) +#' print(head(ligand_importances_rf)) +#' } +#' +#' @export +#' +get_multi_ligand_rf_importances = function(setting,ligand_target_matrix, ligands_position = "cols", ntrees = 1000, mtry = 2, continuous = TRUE, known = TRUE, filter_genes = FALSE){ + + if(!is.logical(known) | length(known) > 1) + stop("known should be a logical vector: TRUE or FALSE") + if(!is.logical(filter_genes) | length(filter_genes) > 1) + stop("filter_genes should be a logical vector: TRUE or FALSE") + if(ntrees <= 1) + stop("ntrees should be higher than 1") + if(mtry <= 1) + stop("mtry should be higher than 1") + + requireNamespace("dplyr") + + if (filter_genes == TRUE){ + ligand_target_matrix = filter_genes_ligand_target_matrix(ligand_target_matrix,ligands_position) + } + + setting_name = setting$name + ligands_oi = setting$from + + if (ligands_position == "cols"){ + if(sum((ligands_oi %in% colnames(ligand_target_matrix)) == FALSE) > 0) + stop("ligands should be in ligand_target_matrix") + prediction_matrix = ligand_target_matrix[,ligands_oi] + target_genes = rownames(ligand_target_matrix) + } else if (ligands_position == "rows") { + if(sum((ligands_oi %in% rownames(ligand_target_matrix)) == FALSE) > 0) + stop("ligands should be in ligand_target_matrix") + prediction_matrix = ligand_target_matrix[ligands_oi,] %>% t() + target_genes = colnames(ligand_target_matrix) + } + + response_vector = setting$response + response_df = tibble(gene = names(response_vector), response = response_vector %>% make.names() %>% as.factor()) + + prediction_df = prediction_matrix %>% data.frame() %>% as_tibble() + + if(is.double(prediction_matrix) == FALSE){ + convert_categorical_factor = function(x){ + x = x %>% make.names() %>% as.factor() + } + prediction_df = prediction_df %>% mutate_all(funs(convert_categorical_factor)) + } + + prediction_df = tibble(gene = target_genes) %>% bind_cols(prediction_df) + combined = inner_join(response_df,prediction_df, by = "gene") + train_data = combined %>% select(-gene) %>% rename(obs = response) %>% data.frame() + + rf_model = randomForest::randomForest(y = train_data$obs, + x = train_data[,-(which(colnames(train_data) == "obs"))], + ntree = ntrees, + mtry = ncol(train_data[,-(which(colnames(train_data) == "obs"))])**(1/mtry) %>% ceiling(), + importance = TRUE + ) + + metrics = randomForest::importance(rf_model) %>% data.frame() %>% tibble::rownames_to_column("test_ligand") %>% as_tibble() %>% mutate(setting = setting_name) + + if (known == TRUE){ + true_ligand = setting$ligand + metrics = metrics %>% mutate(ligand = true_ligand) + metrics = metrics %>% select(setting, test_ligand, ligand, MeanDecreaseAccuracy, MeanDecreaseGini) + return(metrics) + } + metrics = metrics %>% select(setting, test_ligand, MeanDecreaseAccuracy, MeanDecreaseGini) + return(metrics) + + +} +#' @title Get ligand importances based on target gene value prediction performance of single ligands (regression). +#' +#' @description \code{get_single_ligand_importances_regression} Get ligand importance measures for ligands based on how well a single, individual, ligand can predict an observed response. Assess how well every ligand of interest is able to predict the observed transcriptional response in a particular dataset, according to the ligand-target model. It can be assumed that the ligand that best predicts the observed response, is more likely to be the true ligand. Response: continuous values associated to a gene, e.g. a log fold change value. +#' +#' @usage +#' get_single_ligand_importances_regression(setting,ligand_target_matrix, ligands_position = "cols", known = TRUE) +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) of which the predictve performance need to be assessed; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. $ligand: NULL or the name of the ligand(s) that are known to be active in the setting of interest. +#' @param known Indicate whether the true active ligand for a particular dataset is known or not. Default: TRUE. The true ligand will be extracted from the $ligand slot of the setting. +#' @inheritParams evaluate_target_prediction_regression +#' +#' @return A data.frame with for each ligand - data set combination, regression model fit metrics indicating how well the query ligand predicts the response in the particular dataset. Evaluation metrics are the same as in \code{\link{evaluate_target_prediction_regression}}. In addition to the metrics, the name of the particular setting ($setting), the name of the query ligand($test_ligand), the name of the true active ligand (if known: $ligand). +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation_regression) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) +#' +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances_regression,ligand_target_matrix)) +#' print(head(ligand_importances)) +#' } +#' @export +#' +get_single_ligand_importances_regression = function(setting,ligand_target_matrix, ligands_position = "cols", known = TRUE){ + + if(!is.logical(known) | length(known) > 1) + stop("known should be a logical vector: TRUE or FALSE") + + requireNamespace("dplyr") + + metrics = evaluate_target_prediction_regression(setting, ligand_target_matrix, ligands_position) + metrics = metrics %>% rename(test_ligand = ligand) + if (known == TRUE){ + true_ligand = setting$ligand + metrics_meta = metrics %>% select(setting,test_ligand) %>% bind_cols(tibble(ligand = true_ligand)) + metrics = inner_join(metrics_meta, metrics, by = c("setting","test_ligand")) + } + return(metrics) +} +#' @title Get ligand importances from a multi-ligand regression model. +#' +#' @description \code{get_multi_ligand_importances_regression} A regression algorithm chosen by the user is trained to construct one model based on the target gene predictions of all ligands of interest (ligands are considered as features) in order to predict the observed response in a particular dataset (respone: e.g. absolute value of log fold change). Variable importance scores that indicate for each ligand the importance for response prediction, are extracted. It can be assumed that ligands with higher variable importance scores are more likely to be a true active ligand. +#' +#' @usage +#' get_multi_ligand_importances_regression(setting,ligand_target_matrix, ligands_position = "cols", algorithm, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE, known = TRUE, filter_genes = FALSE) +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) of which the predictve performance need to be assessed; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. $ligand: NULL or the name of the ligand(s) that are known to be active in the setting of interest. +#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (recommended) or discrete target assignments (not-recommended). +#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" +#' @param algorithm The name of the classification algorithm to be applied. Should be supported by the caret package. Examples of algorithms we recommend: with embedded feature selection: "rf","glm","fda","glmnet","sdwd","gam","glmboost"; without: "lda","naive_bayes","pls"(because bug in current version of pls package), "pcaNNet". Please notice that not all these algorithms work when the features (i.e. ligand vectors) are categorical (i.e. discrete class assignments). +#' @param cv Indicate whether model training and hyperparameter optimization should be done via cross-validation. Default: TRUE. FALSE might be useful for applications only requiring variable importance, or when final model is not expected to be extremely overfit. +#' @param cv_number The number of folds for the cross-validation scheme: Default: 4; only relevant when cv == TRUE. +#' @param cv_repeats The number of repeats during cross-validation. Default: 2; only relevant when cv == TRUE. +#' @param parallel Indiciate whether the model training will occur parallelized. Default: FALSE. TRUE only possible for non-windows OS. +#' @param n_cores The number of cores used for parallelized model training via cross-validation. Default: 4. Only relevant on non-windows OS. +#' @param ignore_errors Indiciate whether errors during model training by caret should be ignored such that another model training try will be initiated until model is trained without raising errors. Default: FALSE. +#' @param known Indicate whether the true active ligand for a particular dataset is known or not. Default: TRUE. The true ligand will be extracted from the $ligand slot of the setting. +#' @param filter_genes Indicate whether 50 per cent of the genes that are the least variable in ligand-target scores should be removed in order to reduce the training of the model. Default: FALSE. +#' +#' @return A data.frame with for each ligand - data set combination, feature importance scores indicating how important the query ligand is for the prediction of the response in the particular dataset, when prediction is done via a trained regression model with all possible ligands as input. In addition to the importance score(s), the name of the particular setting ($setting), the name of the query ligand($test_ligand), the name of the true active ligand (if known: $ligand). +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation_regression) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = FALSE) +#' +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances_lm = dplyr::bind_rows(lapply(settings_ligand_pred, get_multi_ligand_importances_regression,ligand_target_matrix, algorithm = "lm")) +#' print(head(ligand_importances_lm)) +#' } +#' @export +#' +get_multi_ligand_importances_regression = function(setting,ligand_target_matrix, ligands_position = "cols", algorithm, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE, known = TRUE, filter_genes = FALSE){ + + if(!is.logical(known) | length(known) > 1) + stop("known should be a logical vector: TRUE or FALSE") + if(!is.logical(filter_genes) | length(filter_genes) > 1) + stop("filter_genes should be a logical vector: TRUE or FALSE") + + requireNamespace("dplyr") + + if (filter_genes == TRUE){ + ligand_target_matrix = filter_genes_ligand_target_matrix(ligand_target_matrix,ligands_position) + } + + setting_name = setting$name + output = evaluate_multi_ligand_target_prediction_regression(setting, ligand_target_matrix, ligands_position,algorithm, var_imps = TRUE, cv, cv_number, cv_repeats, parallel, n_cores, ignore_errors) + metrics = output$var_imps + metrics = metrics %>% mutate(setting = setting_name) %>% rename(test_ligand = feature) + + if (known == TRUE){ + true_ligand = setting$ligand + metrics = metrics %>% mutate(ligand = true_ligand) + metrics = metrics %>% select(setting, test_ligand, ligand, importance) + return(metrics) + } + metrics = metrics %>% select(setting, test_ligand, importance) + return(metrics) + +} +#' @title Get ligand importances from a multi-ligand trained random forest regression model. +#' +#' @description \code{get_multi_ligand_rf_importances_regression} A random forest is trained to construct one model based on the target gene predictions of all ligands of interest (ligands are considered as features) in order to predict the observed response in a particular dataset (response: e.g. absolute values of log fold change). Variable importance scores that indicate for each ligand the importance for response prediction, are extracted. It can be assumed that ligands with higher variable importance scores are more likely to be a true active ligand. +#' +#' @usage +#' get_multi_ligand_rf_importances_regression(setting,ligand_target_matrix, ligands_position = "cols", ntrees = 1000, mtry = 2, known = TRUE, filter_genes = FALSE) +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) of which the predictve performance need to be assessed; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. $ligand: NULL or the name of the ligand(s) that are known to be active in the setting of interest. +#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (recommended) or discrete target assignments (not-recommended). +#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" +#' @param ntrees Indicate the number of trees used in the random forest algorithm. The more trees, the longer model training takes, but the more robust the extraced importance scores will be. Default: 1000. Recommended for robustness to have till 10000 trees. +#' @param mtry n**(1/mtry) features of the n features will be sampled at each split during the training of the random forest algorithm. Default: 2 (square root). +#' @param known Indicate whether the true active ligand for a particular dataset is known or not. Default: TRUE. The true ligand will be extracted from the $ligand slot of the setting. +#' @param filter_genes Indicate whether 50 per cent of the genes that are the least variable in ligand-target scores should be removed in order to reduce the training of the model. Default: FALSE. +#' +#' @return A data.frame with for each ligand - data set combination, feature importance scores indicating how important the query ligand is for the prediction of the response in the particular dataset, when prediction is done via a trained regression model with all possible ligands as input. In addition to the importance score(s), the name of the particular setting ($setting), the name of the query ligand($test_ligand), the name of the true active ligand (if known: $ligand). +#' +#' @importFrom randomForest randomForest importance +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation_regression) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = FALSE) +#' +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances_rf = dplyr::bind_rows(lapply(settings_ligand_pred, get_multi_ligand_rf_importances_regression,ligand_target_matrix, ntrees = 100, mtry = 2)) +#' print(head(ligand_importances_rf)) +#' } +#' +#' @export +#' +get_multi_ligand_rf_importances_regression = function(setting,ligand_target_matrix, ligands_position = "cols", ntrees = 1000, mtry = 2, known = TRUE, filter_genes = FALSE){ + + if(!is.logical(known) | length(known) > 1) + stop("known should be a logical vector: TRUE or FALSE") + if(!is.logical(filter_genes) | length(filter_genes) > 1) + stop("filter_genes should be a logical vector: TRUE or FALSE") + if(ntrees <= 1) + stop("ntrees should be higher than 1") + if(mtry <= 1) + stop("mtry should be higher than 1") + + requireNamespace("dplyr") + + if (filter_genes == TRUE){ + ligand_target_matrix = filter_genes_ligand_target_matrix(ligand_target_matrix,ligands_position) + } + + setting_name = setting$name + ligands_oi = setting$from + + if (ligands_position == "cols"){ + if(sum((ligands_oi %in% colnames(ligand_target_matrix)) == FALSE) > 0) + stop("ligands should be in ligand_target_matrix") + prediction_matrix = ligand_target_matrix[,ligands_oi] + target_genes = rownames(ligand_target_matrix) + } else if (ligands_position == "rows") { + if(sum((ligands_oi %in% rownames(ligand_target_matrix)) == FALSE) > 0) + stop("ligands should be in ligand_target_matrix") + prediction_matrix = ligand_target_matrix[ligands_oi,] %>% t() + target_genes = colnames(ligand_target_matrix) + } + + response_vector = setting$response + response_df = tibble(gene = names(response_vector), response = response_vector) + + prediction_df = prediction_matrix %>% data.frame() %>% as_tibble() + + prediction_df = tibble(gene = target_genes) %>% bind_cols(prediction_df) + combined = inner_join(response_df,prediction_df, by = "gene") + train_data = combined %>% select(-gene) %>% rename(obs = response) %>% data.frame() + + rf_model = randomForest::randomForest(y = train_data$obs, + x = train_data[,-(which(colnames(train_data) == "obs"))], + ntree = ntrees, + mtry = ncol(train_data[,-(which(colnames(train_data) == "obs"))])**(1/mtry) %>% ceiling(), + importance = TRUE + ) + + metrics = randomForest::importance(rf_model) %>% data.frame() %>% tibble::rownames_to_column("test_ligand") %>% as_tibble() %>% mutate(setting = setting_name) + + if (known == TRUE){ + true_ligand = setting$ligand + metrics = metrics %>% mutate(ligand = true_ligand) + metrics = metrics %>% select(setting, test_ligand, ligand, X.IncMSE, IncNodePurity) %>% rename(IncMSE = X.IncMSE) + return(metrics) + } + metrics = metrics %>% select(setting, test_ligand, X.IncMSE, IncNodePurity) %>% rename(IncMSE = X.IncMSE) + return(metrics) +} +#' @title Convert settings to correct settings format for TF prediction. +#' +#' @description \code{convert_settings_tf_prediction} Converts settings to correct settings format for TF activity prediction. In this prediction problem, TFs (out of a set of possibly active TFs) will be ranked based on feature importance scores. The format can be made suited for applications in which TFs need to be scored based on their possible upstream activity: 3) by calculating individual feature importane scores or 4) feature importance based on models with embedded feature importance determination. Remark that upstream regulator analysis for TFs here is experimental and was not thoroughly validated in the study accompanying this package. +#' +#' @usage +#' convert_settings_tf_prediction(settings, all_tfs, single = TRUE) +#' +#' @param settings A list of lists. Eeach sublist contains the following elements: .$name: name of the setting; .$from: name(s) of the tf(s) active in the setting of interest; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. +#' @param all_tfs A character vector of possible tfs that will be considered for the tf activity state prediction. +#' @param single TRUE if feature importance scores for tfs will be calculated by looking at ligans individually. FALSE if the goal is to calculate the feature importance scores via sophisticated classification algorithms like random forest. + +#' @return A list with following elements: $name, $tf: name of active tf(s) (only if validation is TRUE), $from (tf(s) that will be tested for activity prediction), $response +#' +#' @examples +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) +#' settings_tf_pred = convert_settings_tf_prediction(settings, all_tfs = c("SMAD1","STAT1","RELA"), single = TRUE) +#' # show how this function can be used to predict activities of TFs +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' tf_target = construct_tf_target_matrix(weighted_networks, tfs_as_cols = TRUE, standalone_output = TRUE) +#' tf_importances = dplyr::bind_rows(lapply(settings_tf_pred,get_single_ligand_importances,tf_target,known = FALSE)) +#' print(head(tf_importances)) +#' +#' @export +#' +#' +convert_settings_tf_prediction = function(settings,all_tfs, single = TRUE){ + + # input check + if(!is.list(settings)) + stop("settings should be a list") + if(!is.character(all_tfs)) + stop("all_tfs should be a character vector") + if(!is.logical(single) | length(single) != 1) + stop("single should be TRUE or FALSE") + + requireNamespace("dplyr") + + new_settings = list() + if (single == TRUE){ + for (i in 1:length(settings)){ + setting = settings[[i]] + for (k in 1:length(all_tfs)){ + test_tf = all_tfs[[k]] + new_settings[[length(new_settings) + 1]] = list(make_new_setting_ligand_prediction_single_application(setting,test_tf)) + } + } + } else if (single == FALSE){ + for (i in 1:length(settings)){ + setting = settings[[i]] + new_settings[[length(new_settings) + 1]] = list(make_new_setting_ligand_prediction_multi_application(setting,all_tfs)) + } + } + return(new_settings %>% unlist(recursive = FALSE)) +} +#' @title Converts expression settings to format in which the total number of potential ligands is reduced up to n top-predicted active ligands. +#' +#' @description \code{convert_expression_settings_evaluation} Converts expression settings to format in which the total number of potential ligands is reduced up to n top-predicted active ligands.(useful for applications when a lot of ligands are potentially active, a lot of settings need to be predicted and a multi-ligand model is trained). +#' +#' @usage +#' convert_settings_topn_ligand_prediction(setting, importances, model, n, normalization) +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$diffexp: data frame or tibble containing at least 3 variables= $gene, $lfc (log fold change treated vs untreated) and $qval (fdr-corrected p-value) +#' @param n The top n number of ligands according to the ligand activity state prediction model will be considered as potential ligand for the generation of a new setting. +#' @inheritParams model_based_ligand_activity_prediction + +#' @return A list with following elements: $name, $from, $response. $response will be a gene-named logical vector indicating whether the gene's transcription was influenced by the active ligand(s) in the setting of interest. +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) +#' +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix)) +#' evaluation = evaluate_importances_ligand_prediction(ligand_importances,"median","lda") +#' +#' settings = lapply(expression_settings_validation[5:10],convert_expression_settings_evaluation) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = FALSE, single = TRUE) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix, known = FALSE)) +#' settings = lapply(settings,convert_settings_topn_ligand_prediction, importances = ligand_importances, model = evaluation$model, n = 3, normalization = "median" ) +#' } +#' +#' @export +#' +convert_settings_topn_ligand_prediction = function(setting, importances, model, n, normalization){ + # input check + if(!is.list(setting)) + stop("setting should be a list") + if(!is.character(setting$from) | !is.character(setting$name)) + stop("setting$from and setting$name should be character vectors") + if(!is.logical(setting$response)) + stop("setting$response should be a logical vector") + + requireNamespace("dplyr") + setting_name = setting$name + importances_oi = importances %>% filter(setting == setting_name) + output = model_based_ligand_activity_prediction(importances_oi, model,normalization) + top_ligands = output %>% top_n(n,model) %>% .$test_ligand + + new_setting = list() + new_setting$name = setting$name + new_setting$from = top_ligands + new_setting$response = setting$response + + return(new_setting) +} + +#' @title Evaluation of ligand activity prediction performance of single ligand importance scores: each dataset individually. +#' +#' @description \code{wrapper_evaluate_single_importances_ligand_prediction} Evaluate how well a single ligand importance score is able to predict the true activity state of a ligand. For this it is assumed, that ligand importance measures for truely active ligands will be higher than for non-active ligands. Several classification evaluation metrics for the prediction are calculated and variable importance scores can be extracted to rank the different importance measures in order of importance for ligand activity state prediction. +#' +#' @usage +#' wrapper_evaluate_single_importances_ligand_prediction(group,ligand_importances) +#' +#' @param group Name of the dataset (setting) you want to calculate ligand activity performance for. +#' @param ligand_importances A data frame containing at least folowing variables: $setting, $test_ligand, $ligand and one or more feature importance scores. $test_ligand denotes the name of a possibly active ligand, $ligand the name of the truely active ligand. +#' +#' @return A data frame containing classification evaluation measures for the ligand activity state prediction single, individual feature importance measures. +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) +#' +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix)) +#' evaluation = ligand_importances$setting %>% unique() %>% lapply(function(x){x}) %>% lapply(wrapper_evaluate_single_importances_ligand_prediction,ligand_importances) %>% bind_rows() %>% inner_join(ligand_importances %>% distinct(setting,ligand)) +#' print(head(evaluation)) +#' } +#' @export +#' +wrapper_evaluate_single_importances_ligand_prediction = function(group,ligand_importances){ + if (!is.data.frame(ligand_importances)) + stop("ligand_importances must be a data frame") + if (!is.character(group)) + stop("group must be a character") + ligand_importances %>% filter(setting %in% group) %>% evaluate_single_importances_ligand_prediction(normalization = "no") %>% mutate(setting = group) +} + diff --git a/R/evaluate_model_target_prediction.R b/R/evaluate_model_target_prediction.R index 939e831..462ccab 100644 --- a/R/evaluate_model_target_prediction.R +++ b/R/evaluate_model_target_prediction.R @@ -1,683 +1,682 @@ -#' @title Extract ligands of interest from settings -#' -#' @description \code{extract_ligands_from_settings} Extract ligands of interest from (expression) settings in correct to construct the ligand-target matrix. -#' -#' @usage -#' extract_ligands_from_settings(settings,combination = TRUE) -#' -#' @param settings A list of lists for which each sub-list contains the information about (expression) datasets; with minimally the following elements: name of the setting ($name), ligands (possibly) active in the setting of interest ($from). -#' @param combination Indicate whether in case multiple ligands are possibly active ligand combinations should be extracted or only individual ligands. Default: TRUE. -#' -#' @return A list containing the ligands and ligands combinations for which a ligand-target matrix should be constructed. When for a particular dataset multiple ligands are possibly active (i.e. more than ligand in .$from slot of sublist of settings), then both the combination of these multiple ligands and each of these multiple ligands individually will be select for model construction. -#' -#' @examples -#' \dontrun{ -#' ligands = extract_ligands_from_settings(expression_settings_validation) -#' } -#' @export -#' -extract_ligands_from_settings = function(settings, combination = TRUE){ - - # input check - if (!is.list(settings)) - stop("settings must be a list") - if(sum(sapply(settings,function(x){is.character(x$from)})) != length(settings)) - stop("settings$.$from must be a character vector containing ligands") - - ligands_oi = list() - if (combination == TRUE){ - for (i in 1:length(settings)){ - setting = settings[[i]] - ligand = setting$from - if (length(ligand) == 1) { - ligands_oi[length(ligands_oi) + 1] = ligand - } else {# if multiple ligands added - ligands_oi[[length(ligands_oi) + 1]] = ligand # ligands together - for (l in ligand) { - ligands_oi[length(ligands_oi) + 1] = l # ligands separate - } - } - } - } else { - for (i in 1:length(settings)){ - setting = settings[[i]] - ligand = setting$from - if (length(ligand) == 1) { - ligands_oi[length(ligands_oi) + 1] = ligand - } else {# if multiple ligands added - for (l in ligand) { - ligands_oi[length(ligands_oi) + 1] = l # ligands separate - } - } - } - } - - ligands_oi = unique(ligands_oi) - return(ligands_oi) -} -#' @title Convert expression settings to correct settings format for evaluation of target gene prediction. -#' -#' @description \code{convert_expression_settings_evaluation} Converts expression settings to correct settings format for evaluation of target gene prediction. -#' -#' @usage -#' convert_expression_settings_evaluation(setting) -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$diffexp: data frame or tibble containing at least 3 variables= $gene, $lfc (log fold change treated vs untreated) and $qval (fdr-corrected p-value) - -#' @return A list with following elements: $name, $from, $response. $response will be a gene-named logical vector indicating whether the gene's transcription was influenced by the active ligand(s) in the setting of interest. -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation,convert_expression_settings_evaluation) -#' } -#' @export -#' -#' -convert_expression_settings_evaluation = function(setting) { - # input check - if(!is.character(setting$from) | !is.character(setting$name)) - stop("setting$from and setting$name should be character vectors") - if(!is.data.frame(setting$diffexp)) - stop("setting$diffexp should be data frame") - if(is.null(setting$diffexp$lfc) | is.null(setting$diffexp$gene) | is.null(setting$diffexp$qval)) - stop("setting$diffexp should contain the variables 'lfc', 'qval' and 'diffexp'") - - requireNamespace("dplyr") - - diffexp_df = setting$diffexp %>% mutate(diffexp = (abs(lfc) >= 1) & (qval <= 0.1)) - diffexp_vector = diffexp_df$diffexp - names(diffexp_vector) = diffexp_df$gene - diffexp_vector = diffexp_vector[unique(names(diffexp_vector))] - if((diffexp_vector %>% sum) == 0) { - print(setting$name) - warning("No differentially expressed genes, remove this expression dataset") - } - return(list(name = setting$name, from = setting$from, response = diffexp_vector)) -} -#' @title Evaluation of target gene prediction. -#' -#' @description \code{evaluate_target_prediction} Evaluate how well the model (i.e. the inferred ligand-target probability scores) is able to predict the observed response to a ligand (e.g. the set of DE genes after treatment of cells by a ligand). It shows several classification evaluation metrics for the prediction. Different classification metrics are calculated depending on whether the input ligand-target matrix contains probability scores for targets or discrete target assignments. -#' -#' @usage -#' evaluate_target_prediction(setting,ligand_target_matrix, ligands_position = "cols") -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$response: named logical vector indicating whether a target is a TRUE target of the possibly active ligand(s) or a FALSE. -#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (or discrete target assignments). -#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" - -#' @return A data.frame with following variables: setting, ligand and for probabilistic predictions: auroc, aupr, aupr_corrected (aupr - aupr for random prediction), sensitivity_roc (proxy measure, inferred from ROC), specificity_roc (proxy measure, inferred from ROC), mean_rank_GST_log_pval (-log10 of p-value of mean-rank gene set test), pearson (correlation coefficient), spearman (correlation coefficient); whereas for categorical predictions: accuracy, recall, specificity, precision, F1, F0.5, F2, mcc, informedness, markedness, fisher_pval_log (which is -log10 of p-value fisher exact test), fisher odds. -#' -#' @importFrom ROCR prediction performance -#' @importFrom caTools trapz -#' @importFrom data.table data.table -#' @importFrom limma wilcoxGST -#' -#' @examples -#' \dontrun{ -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' setting = lapply(expression_settings_validation[1],convert_expression_settings_evaluation) -#' ligands = extract_ligands_from_settings(setting) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' perf1 = lapply(setting,evaluate_target_prediction,ligand_target_matrix) -#' print(head(perf1)) -#' perf2 = lapply(setting,evaluate_target_prediction,make_discrete_ligand_target_matrix(ligand_target_matrix)) -#' } -#' @export -#' -evaluate_target_prediction = function(setting,ligand_target_matrix, ligands_position = "cols"){ - ## still make evaluation multiple ligands possible - # input check - if (!is.list(setting)) - stop("setting must be a list") - if(!is.character(setting$from) | !is.character(setting$name)) - stop("setting$from and setting$name should be character vectors") - if(!is.logical(setting$response) | is.null(names(setting$response))) - stop("setting$response should be named logical vector containing class labels of the response that needs to be predicted ") - if(!is.matrix(ligand_target_matrix)) - stop("ligand_target_matrix should be a matrix") - if(!is.double(ligand_target_matrix) & !is.logical(ligand_target_matrix)) - stop("ligand_target matrix should be of type double if it contains numeric probabilities as predictions; or of type logical when it contains categorical target predictions (TRUE or FALSE)") - if (ligands_position != "cols" & ligands_position != "rows") - stop("ligands_position must be 'cols' or 'rows'") - - requireNamespace("dplyr") - - if (length(setting$from) == 1){ - ligand_oi = setting$from - } else { - ligand_oi = paste0(setting$from,collapse = "-") - } - if (ligands_position == "cols"){ - if((ligand_oi %in% colnames(ligand_target_matrix)) == FALSE) - stop("ligand should be in ligand_target_matrix") - prediction_vector = ligand_target_matrix[,ligand_oi] - names(prediction_vector) = rownames(ligand_target_matrix) - } else if (ligands_position == "rows") { - if((ligand_oi %in% rownames(ligand_target_matrix)) == FALSE) - stop("ligand should be in ligand_target_matrix") - prediction_vector = ligand_target_matrix[ligand_oi,] - names(prediction_vector) = colnames(ligand_target_matrix) - } - response_vector = setting$response - - if(sd(prediction_vector) == 0) - warning("all target gene probability score predictions have same value") - if(sd(response_vector) == 0) - stop("all genes have same response") - performance = evaluate_target_prediction_strict(response_vector,prediction_vector,is.double(prediction_vector)) - output = bind_cols(tibble(setting = setting$name, ligand = ligand_oi), performance) - - return(output) -} -#' @title Evaluation of target gene prediction for multiple ligands. -#' -#' @description \code{evaluate_multi_ligand_target_prediction} Evaluate how well a trained model is able to predict the observed response to a combination of ligands (e.g. the set of DE genes after treatment of cells by multiple ligands). A classificiation algorithm chosen by the user is trained to construct one model based on the target gene predictions of all ligands of interest (ligands are considered as features). Several classification evaluation metrics for the prediction are calculated depending on whether the input ligand-target matrix contains probability scores for targets or discrete target assignments. In addition, variable importance scores can be extracted to rank the possible active ligands in order of importance for response prediction. -#' -#' @usage -#' evaluate_multi_ligand_target_prediction(setting,ligand_target_matrix, ligands_position = "cols", algorithm, var_imps = TRUE, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4,ignore_errors = FALSE,continuous = TRUE) -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$response: named logical vector indicating whether a target is a TRUE target of the possibly active ligand(s) or a FALSE. -#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (recommended) or discrete target assignments (not-recommended). -#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" -#' @param algorithm The name of the classification algorithm to be applied. Should be supported by the caret package. Examples of algorithms we recommend: with embedded feature selection: "rf","glm","fda","glmnet","sdwd","gam","glmboost", "pls" (load "pls" package before!); without: "lda","naive_bayes", "pcaNNet". Please notice that not all these algorithms work when the features (i.e. ligand vectors) are categorical (i.e. discrete class assignments). -#' @param var_imps Indicate whether in addition to classification evaluation performances, variable importances should be calculated. Default: TRUE. -#' @param cv Indicate whether model training and hyperparameter optimization should be done via cross-validation. Default: TRUE. FALSE might be useful for applications only requiring variable importance, or when final model is not expected to be extremely overfit. -#' @param cv_number The number of folds for the cross-validation scheme: Default: 4; only relevant when cv == TRUE. -#' @param cv_repeats The number of repeats during cross-validation. Default: 2; only relevant when cv == TRUE. -#' @param parallel Indiciate whether the model training will occur parallelized. Default: FALSE. TRUE only possible for non-windows OS. -#' @param n_cores The number of cores used for parallelized model training via cross-validation. Default: 4. Only relevant on non-windows OS. -#' @param ignore_errors Indiciate whether errors during model training by caret should be ignored such that another model training try will be initiated until model is trained without raising errors. Default: FALSE. -#' @param continuous Indicate whether during training of the model, model training and evaluation should be done on class probabilities or discrete class labels. For huge class imbalance, we recommend setting this value to TRUE. Default: TRUE. -#' -#' @return A list with the following elements. $performances: data frame containing classification evaluation measure for classification on the test folds during training via cross-validation; $performances_training: data frame containing classification evaluation measures for classification of the final model (discrete class assignments) on the complete data set (performance can be severly optimistic due to overfitting!); $performance_training_continuous: data frame containing classification evaluation measures for classification of the final model (class probability scores) on the complete data set (performance can be severly optimistic due to overfitting!) $var_imps: data frame containing the variable importances of the different ligands (embbed importance score for some classification algorithms, otherwise just the auroc); $prediction_response_df: data frame containing for each gene the ligand-target predictions of the individual ligands, the complete model and the response as well; $setting: name of the specific setting that needed to be evaluated; $ligands: ligands of interest. -#' -#' @importFrom ROCR prediction performance -#' @importFrom caTools trapz -#' @importFrom limma wilcoxGST -#' @import caret -#' @importFrom purrr safely -#' -#' @examples -#' \dontrun{ -#' library(dplyr) -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' setting = convert_expression_settings_evaluation(expression_settings_validation$TGFB_IL6_timeseries) %>% list() -#' ligands = extract_ligands_from_settings(setting) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' output = lapply(setting,evaluate_multi_ligand_target_prediction,ligand_target_matrix,ligands_position = "cols",algorithm = "glm") -#' output = lapply(setting,evaluate_multi_ligand_target_prediction,make_discrete_ligand_target_matrix(ligand_target_matrix),ligands_position = "cols",algorithm = "glm" ) -#' } -#' @export -#' -evaluate_multi_ligand_target_prediction = function(setting,ligand_target_matrix, ligands_position = "cols", algorithm, var_imps = TRUE, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE, continuous = TRUE){ - if (!is.list(setting)) - stop("setting must be a list") - if(!is.character(setting$from) | !is.character(setting$name)) - stop("setting$from and setting$name should be character vectors") - if(!is.logical(setting$response) | is.null(names(setting$response))) - stop("setting$response should be named logical vector containing class labels of the response that needs to be predicted ") - if(!is.matrix(ligand_target_matrix)) - stop("ligand_target_matrix should be a matrix") - if(!is.double(ligand_target_matrix) & !is.logical(ligand_target_matrix)) - stop("ligand_target matrix should be of type double if it contains numeric probabilities as predictions; or of type logical when it contains categorical target predictions (TRUE or FALSE)") - if (ligands_position != "cols" & ligands_position != "rows") - stop("ligands_position must be 'cols' or 'rows'") - if(!is.character(algorithm)) - stop("algorithm should be a character vector") - if(!is.logical(var_imps) | length(var_imps) > 1) - stop("var_imps should be a logical vector: TRUE or FALSE") - if(!is.logical(cv) | length(cv) > 1) - stop("cv should be a logical vector: TRUE or FALSE") - if(!is.numeric(cv_number) | length(cv_number) > 1) - stop("cv_number should be a numeric vector of length 1") - if(!is.numeric(cv_repeats) | length(cv_repeats) > 1) - stop("cv_repeats should be a numeric vector of length 1") - if(!is.logical(parallel) | length(parallel) > 1) - stop("parallel should be a logical vector: TRUE or FALSE") - if(!is.numeric(n_cores) | length(n_cores) > 1) - stop("n_cores should be a numeric vector of length 1") - if(!is.logical(ignore_errors) | length(ignore_errors) > 1) - stop("ignore_errors should be a logical vector: TRUE or FALSE") - if(!is.logical(continuous) | length(continuous) > 1) - stop("continuous should be a logical vector: TRUE or FALSE") - requireNamespace("dplyr") - - ligands_oi = setting$from - - - if (ligands_position == "cols"){ - if(sum((ligands_oi %in% colnames(ligand_target_matrix)) == FALSE) > 0) - stop("ligands should be in ligand_target_matrix") - prediction_matrix = ligand_target_matrix[,ligands_oi] - target_genes = rownames(ligand_target_matrix) - } else if (ligands_position == "rows") { - if(sum((ligands_oi %in% rownames(ligand_target_matrix)) == FALSE) > 0) - stop("ligands should be in ligand_target_matrix") - prediction_matrix = ligand_target_matrix[ligands_oi,] %>% t() - target_genes = colnames(ligand_target_matrix) - } - - response_vector = setting$response - if(sd(response_vector) == 0) - stop("all genes have same response") - response_df = tibble(gene = names(response_vector), response = response_vector %>% make.names() %>% as.factor()) - - prediction_df = prediction_matrix %>% data.frame() %>% as_tibble() - - if(is.double(prediction_matrix) == FALSE){ - convert_categorical_factor = function(x){ - x = x %>% make.names() %>% as.factor() - } - prediction_df = prediction_df %>% mutate_all(funs(convert_categorical_factor)) - } - - prediction_df = tibble(gene = target_genes) %>% bind_cols(prediction_df) - combined = inner_join(response_df,prediction_df, by = "gene") - if (nrow(combined) == 0) - stop("Gene names in response don't accord to gene names in ligand-target matrix (did you consider differences human-mouse namings?)") - - train_data = combined %>% select(-gene) %>% rename(obs = response) %>% data.frame() - - output = wrapper_caret_classification(train_data,algorithm,continuous = continuous,var_imps,cv,cv_number,cv_repeats,parallel,n_cores,ignore_errors, prediction_response_df = combined) - output$setting = setting$name - output$ligands = ligands_oi - # output$prediction_response_df = combined - return(output) -} -#' @title Evaluation of target gene prediction. -#' -#' @description \code{evaluate_target_prediction_interprete} Evaluate how well the model (i.e. the inferred ligand-target probability scores) is able to predict the observed response to a ligand (e.g. the set of DE genes after treatment of cells by a ligand; or the log fold change values). It shows several classification evaluation metrics for the prediction when response is categorical, or several regression model fit metrics when the response is continuous. -#' -#' @usage -#' evaluate_target_prediction_interprete(setting,ligand_target_matrix, ligands_position = "cols") -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$response: named logical vector indicating whether a target is a TRUE target of the possibly active ligand(s) or a FALSE. -#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (or discrete target assignments). -#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" - -#' @return A list with the elements $performances and $prediction_response_df. $performance is a data.frame with classification evaluation metrics if response is categorical, or regression model fit metrics if response is continuous. $prediction_response_df shows for each gene, the model prediction and the response value of the gene (e.g. whether the gene the gene is a target or not according to the observed response, or the absolute value of the log fold change of a gene). -#' -#' @importFrom ROCR prediction performance -#' @importFrom caTools trapz -#' @importFrom data.table data.table -#' @importFrom limma wilcoxGST -#' -#' @examples -#' \dontrun{ -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' setting = lapply(expression_settings_validation[1],convert_expression_settings_evaluation) -#' ligands = extract_ligands_from_settings(setting) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' perf1 = lapply(setting,evaluate_target_prediction_interprete,ligand_target_matrix) -#' setting = lapply(expression_settings_validation[1],convert_expression_settings_evaluation_regression) -#' perf2 = lapply(setting,evaluate_target_prediction_interprete,ligand_target_matrix) -#' } -#' @export -#' -evaluate_target_prediction_interprete = function(setting,ligand_target_matrix, ligands_position = "cols"){ - ## still make evaluation multiple ligands possible - # input check - if (!is.list(setting)) - stop("setting must be a list") - if(!is.character(setting$from) | !is.character(setting$name)) - stop("setting$from and setting$name should be character vectors") - if((!is.logical(setting$response) & !is.numeric(setting$response)) | is.null(names(setting$response))) - stop("setting$response should be named logical vector containing class labels of the response that needs to be predicted or a numeric vector containing response values (e.g. log fold change).") - if(!is.matrix(ligand_target_matrix)) - stop("ligand_target_matrix should be a matrix") - if(!is.double(ligand_target_matrix) & !is.logical(ligand_target_matrix)) - stop("ligand_target matrix should be of type double if it contains numeric probabilities as predictions; or of type logical when it contains categorical target predictions (TRUE or FALSE)") - if (ligands_position != "cols" & ligands_position != "rows") - stop("ligands_position must be 'cols' or 'rows'") - - requireNamespace("dplyr") - - if (length(setting$from) == 1){ - ligand_oi = setting$from - } else { - ligand_oi = paste0(setting$from,collapse = "-") - } - if (ligands_position == "cols"){ - if((ligand_oi %in% colnames(ligand_target_matrix)) == FALSE) - stop("ligand should be in ligand_target_matrix") - prediction_vector = ligand_target_matrix[,ligand_oi] - names(prediction_vector) = rownames(ligand_target_matrix) - } else if (ligands_position == "rows") { - if((ligand_oi %in% rownames(ligand_target_matrix)) == FALSE) - stop("ligand should be in ligand_target_matrix") - prediction_vector = ligand_target_matrix[ligand_oi,] - names(prediction_vector) = colnames(ligand_target_matrix) - } - response_vector = setting$response - - if(sd(prediction_vector) == 0) - warning("all target gene probability score predictions have same value") - if(sd(response_vector) == 0) - stop("all genes have same response") - - if (is.logical(response_vector)){ - output = evaluate_target_prediction_strict(response_vector,prediction_vector,is.double(prediction_vector), prediction_response_df = TRUE) - } else { - output = evaluate_target_prediction_regression_strict(response_vector,prediction_vector, prediction_response_df = TRUE) - } - - output$setting = setting$name - output$ligand = ligand_oi - colnames(output$prediction_response_df) = c("gene","response",ligand_oi) - - return(output) -} -#' @title Convert gene list to correct settings format for evaluation of target gene prediction. -#' -#' @description \code{convert_gene_list_settings_evaluation} Converts a gene list to correct settings format for evaluation of target gene prediction. -#' -#' @usage -#' convert_gene_list_settings_evaluation(gene_list, name, ligands_oi, background) -#' -#' @param gene_list A character vector of target gene names -#' @param name The name that will be given to the setting -#' @param ligands_oi The possibly active ligands -#' @param background A character vector of names of genes that are not target genes. If genes present in the gene list are in this vector of background gene names, these genes will be removed from the background. - -#' @return A list with following elements: $name, $from, $response -#' -#' @examples -#' \dontrun{ -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' all_genes = unique(c(weighted_networks$gr$from,weighted_networks$gr$to,weighted_networks$lr_sig$from, weighted_networks$lr_sig$to)) -#' gene_list = c("ID1","ID2","ID3") -#' setting = list(convert_gene_list_settings_evaluation(gene_list = c("ID1","ID2","ID3"), name = "test",ligands_oi = "TGFB1", background = all_genes)) -#' } -#' @export -#' -#' -convert_gene_list_settings_evaluation = function(gene_list, name, ligands_oi, background) { - # input check - if(!is.character(gene_list)) - stop("gene_list should be character vector") - if(!is.character(name) | length(name) > 1) - stop("name should be character vector of length 1") - if(!is.character(ligands_oi)) - stop("ligands_oi should be character vector") - if(!is.character(background)) - stop("background should be character vector") - - requireNamespace("dplyr") - - background = background[(background %in% gene_list) == FALSE] - - background_logical = rep(FALSE,times = length(background)) - names(background_logical) = background - gene_list_logical = rep(TRUE,times = length(gene_list)) - names(gene_list_logical) = gene_list - response = c(background_logical,gene_list_logical) - - return(list(name = name, from = ligands_oi, response = response)) -} -#' @title Convert expression settings to correct settings format for evaluation of target gene log fold change prediction (regression). -#' -#' @description \code{convert_expression_settings_evaluation_regression} Converts expression settings to correct settings format for evaluation of target gene log fold change prediction (regression). -#' -#' @usage -#' convert_expression_settings_evaluation_regression(setting) -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$diffexp: data frame or tibble containing at least 3 variables= $gene, $lfc (log fold change treated vs untreated) and $qval (fdr-corrected p-value) - -#' @return A list with following elements: $name, $from, $response. $response will be a gene-named numeric vector of log fold change values. -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation,convert_expression_settings_evaluation_regression) -#' } -#' @export -#' -#' -convert_expression_settings_evaluation_regression = function(setting) { - # input check - if(!is.character(setting$from) | !is.character(setting$name)) - stop("setting$from and setting$name should be character vectors") - if(!is.data.frame(setting$diffexp)) - stop("setting$diffexp should be data frame") - if(is.null(setting$diffexp$lfc) | is.null(setting$diffexp$gene) | is.null(setting$diffexp$qval)) - stop("setting$diffexp should contain the variables 'lfc', 'qval' and 'diffexp'") - - requireNamespace("dplyr") - - diffexp_vector = setting$diffexp$lfc %>% abs() - names(diffexp_vector) = setting$diffexp$gene - diffexp_vector = diffexp_vector[unique(names(diffexp_vector))] - - return(list(name = setting$name, from = setting$from, response = diffexp_vector)) -} -#' @title Evaluation of target gene value prediction (regression). -#' -#' @description \code{evaluate_target_prediction_regression} Evaluate how well the model (i.e. the inferred ligand-target probability scores) is able to predict the observed response to a ligand (e.g. the absolute log fold change value of genes after treatment of cells by a ligand). It shows several regression model fit metrics for the prediction. -#' -#' @usage -#' evaluate_target_prediction_regression(setting,ligand_target_matrix, ligands_position = "cols") -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$response: named logical vector indicating whether a target is a TRUE target of the possibly active ligand(s) or a FALSE. -#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (or discrete target assignments). -#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" - -#' @return A data.frame with following variables: setting, ligand and as regression model fit metrics: r_squared: R squared, adj_r_squared: adjusted R squared, f_statistic: estimate of F-statistic, lm_coefficient_abs_t: absolute value of t-value of coefficient, inverse_rse: 1 divided by estimated standard deviation of the errors (inversed to become that higher values indicate better fit), reverse_aic: reverse value of Akaike information criterion (-AIC, to become that higher values indicate better fit), reverse_bic: the reverse value of the bayesian information criterion, inverse_mae: mean absolute error, pearson: pearson correlation coefficient, spearman: spearman correlation coefficient. -#' -#' @examples -#' \dontrun{ -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' settings = lapply(expression_settings_validation[1:2],convert_expression_settings_evaluation_regression) -#' ligands = extract_ligands_from_settings(settings) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' perf1 = lapply(settings,evaluate_target_prediction_regression,ligand_target_matrix) -#' } -#' @export -#' -evaluate_target_prediction_regression = function(setting,ligand_target_matrix, ligands_position = "cols"){ - # input check - if (!is.list(setting)) - stop("setting must be a list") - if(!is.character(setting$from) | !is.character(setting$name)) - stop("setting$from and setting$name should be character vectors") - if(!is.double(setting$response) | is.null(names(setting$response))) - stop("setting$response should be named numeric vector containing response values that needs to be predicted (e.g. log fold change values) ") - if(!is.matrix(ligand_target_matrix)) - stop("ligand_target_matrix should be a matrix") - if(!is.double(ligand_target_matrix)) - stop("ligand_target matrix should be of type double and containing numeric probabilities as predictions") - if (ligands_position != "cols" & ligands_position != "rows") - stop("ligands_position must be 'cols' or 'rows'") - - requireNamespace("dplyr") - - if (length(setting$from) == 1){ - ligand_oi = setting$from - } else { - ligand_oi = paste0(setting$from,collapse = "-") - } - if (ligands_position == "cols"){ - if((ligand_oi %in% colnames(ligand_target_matrix)) == FALSE) - stop("ligand should be in ligand_target_matrix") - prediction_vector = ligand_target_matrix[,ligand_oi] - names(prediction_vector) = rownames(ligand_target_matrix) - } else if (ligands_position == "rows") { - if((ligand_oi %in% rownames(ligand_target_matrix)) == FALSE) - stop("ligand should be in ligand_target_matrix") - prediction_vector = ligand_target_matrix[ligand_oi,] - names(prediction_vector) = colnames(ligand_target_matrix) - } - response_vector = setting$response - - if(sd(prediction_vector) == 0) - warning("all target gene probability score predictions have same value") - if(sd(response_vector) == 0) - stop("all genes have same response") - - performance = evaluate_target_prediction_regression_strict(response_vector,prediction_vector) - output = bind_cols(tibble(setting = setting$name, ligand = ligand_oi), performance) - - return(output) -} -#' @title Evaluation of target gene value prediction for multiple ligands (regression). -#' -#' @description \code{evaluate_multi_ligand_target_prediction_regression} Evaluate how well a trained model is able to predict the observed response to a combination of ligands (e.g. the absolute log fold change value of genes after treatment of cells by a ligand). A regression algorithm chosen by the user is trained to construct one model based on the target gene predictions of all ligands of interest (ligands are considered as features). It shows several regression model fit metrics for the prediction. In addition, variable importance scores can be extracted to rank the possible active ligands in order of importance for response prediction. -#' -#' @usage -#' evaluate_multi_ligand_target_prediction_regression(setting,ligand_target_matrix, ligands_position = "cols", algorithm, var_imps = TRUE, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4,ignore_errors = FALSE) -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$response: named logical vector indicating whether a target is a TRUE target of the possibly active ligand(s) or a FALSE. -#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (recommended) or discrete target assignments (not-recommended). -#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" -#' @param algorithm The name of the classification algorithm to be applied. Should be supported by the caret package. Examples of algorithms we recommend: "lm","glmnet", "rf". -#' @param var_imps Indicate whether in addition to classification evaluation performances, variable importances should be calculated. Default: TRUE. -#' @param cv Indicate whether model training and hyperparameter optimization should be done via cross-validation. Default: TRUE. FALSE might be useful for applications only requiring variable importance, or when final model is not expected to be extremely overfit. -#' @param cv_number The number of folds for the cross-validation scheme: Default: 4; only relevant when cv == TRUE. -#' @param cv_repeats The number of repeats during cross-validation. Default: 2; only relevant when cv == TRUE. -#' @param parallel Indiciate whether the model training will occur parallelized. Default: FALSE. TRUE only possible for non-windows OS. -#' @param n_cores The number of cores used for parallelized model training via cross-validation. Default: 4. Only relevant on non-windows OS. -#' @param ignore_errors Indiciate whether errors during model training by caret should be ignored such that another model training try will be initiated until model is trained without raising errors. Default: FALSE. -#' -#' @return A list with the following elements. $performances: data frame containing regression model fit metrics for regression on the test folds during training via cross-validation; $performances_training: data frame containing model fit metrics for regression of the final model on the complete data set (performance can be severly optimistic due to overfitting!); $var_imps: data frame containing the variable importances of the different ligands (embbed importance score for some classification algorithms, otherwise just the auroc); $prediction_response_df: data frame containing for each gene the ligand-target predictions of the individual ligands, the complete model and the response as well; $setting: name of the specific setting that needed to be evaluated; $ligands: ligands of interest. -#' -#' @import caret -#' @importFrom purrr safely -#' -#' @examples -#' \dontrun{ -#' library(dplyr) -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' setting = convert_expression_settings_evaluation_regression(expression_settings_validation$TGFB_IL6_timeseries) %>% list() -#' ligands = extract_ligands_from_settings(setting) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' output = lapply(setting,evaluate_multi_ligand_target_prediction_regression,ligand_target_matrix,ligands_position = "cols",algorithm = "lm" ) -#' } -#' @export -#' -evaluate_multi_ligand_target_prediction_regression = function(setting, ligand_target_matrix, ligands_position = "cols", algorithm, var_imps = TRUE, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE){ - if (!is.list(setting)) - stop("setting must be a list") - if(!is.character(setting$from) | !is.character(setting$name)) - stop("setting$from and setting$name should be character vectors") - if(!is.double(setting$response) | is.null(names(setting$response))) - stop("setting$response should be named numeric vector containing response values that needs to be predicted (e.g. log fold change values) ") - if(!is.matrix(ligand_target_matrix)) - stop("ligand_target_matrix should be a matrix") - if(!is.double(ligand_target_matrix)) - stop("ligand_target matrix should be of type double if it contains numeric probabilities as predictions") - if (ligands_position != "cols" & ligands_position != "rows") - stop("ligands_position must be 'cols' or 'rows'") - if(!is.character(algorithm)) - stop("algorithm should be a character vector") - if(!is.logical(var_imps) | length(var_imps) > 1) - stop("var_imps should be a logical vector: TRUE or FALSE") - if(!is.logical(cv) | length(cv) > 1) - stop("cv should be a logical vector: TRUE or FALSE") - if(!is.numeric(cv_number) | length(cv_number) > 1) - stop("cv_number should be a numeric vector of length 1") - if(!is.numeric(cv_repeats) | length(cv_repeats) > 1) - stop("cv_repeats should be a numeric vector of length 1") - if(!is.logical(parallel) | length(parallel) > 1) - stop("parallel should be a logical vector: TRUE or FALSE") - if(!is.numeric(n_cores) | length(n_cores) > 1) - stop("n_cores should be a numeric vector of length 1") - if(!is.logical(ignore_errors) | length(ignore_errors) > 1) - stop("ignore_errors should be a logical vector: TRUE or FALSE") - - requireNamespace("dplyr") - - ligands_oi = setting$from - - - if (ligands_position == "cols"){ - if(sum((ligands_oi %in% colnames(ligand_target_matrix)) == FALSE) > 0) - stop("ligands should be in ligand_target_matrix") - prediction_matrix = ligand_target_matrix[,ligands_oi] - target_genes = rownames(ligand_target_matrix) - } else if (ligands_position == "rows") { - if(sum((ligands_oi %in% rownames(ligand_target_matrix)) == FALSE) > 0) - stop("ligands should be in ligand_target_matrix") - prediction_matrix = ligand_target_matrix[ligands_oi,] %>% t() - target_genes = colnames(ligand_target_matrix) - } - - response_vector = setting$response - - if(sd(response_vector) == 0) - stop("all genes have same response") - response_df = tibble(gene = names(response_vector), response = response_vector) - - prediction_df = prediction_matrix %>% data.frame() %>% as_tibble() - - prediction_df = tibble(gene = target_genes) %>% bind_cols(prediction_df) - combined = inner_join(response_df,prediction_df, by = "gene") - if (nrow(combined) == 0) - stop("Gene names in response don't accord to gene names in ligand-target matrix (did you consider differences human-mouse namings?)") - train_data = combined %>% select(-gene) %>% rename(obs = response) %>% data.frame() - - output = wrapper_caret_regression(train_data,algorithm,var_imps,cv,cv_number,cv_repeats,parallel,n_cores,prediction_response_df = combined,ignore_errors) - output$setting = setting$name - output$ligands = ligands_oi - return(output) -} -#' @title Calculate average performance of datasets of a specific ligand. -#' -#' @description \code{wrapper_average_performances} Calculate average performance of datasets of a specific ligand. Datasets profiling more than one ligand (and thus ligands other than the ligand of interest), will be included as well. -#' -#' @usage -#' wrapper_average_performances(ligand_oi,performances, averaging = "median") -#' -#' @param ligand_oi Name of the ligand for which datasets should be averaged. -#' @param performances A data frame with performance values, containing at least folowing variables: $setting, $ligand and one or more metrics. -#' @param averaging How should performances of datasets of the same ligand be averaged? 'median' or 'mean'. -#' -#' @return A data frame containing classification evaluation measures for the ligand activity state prediction single, individual feature importance measures. -#' -#' @examples -#' \dontrun{ -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) -#' ligands = extract_ligands_from_settings(settings) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' perf1 = lapply(settings,evaluate_target_prediction,ligand_target_matrix) -#' performances_target_prediction_averaged = ligands %>% lapply(wrapper_average_performances, perf1,"median") %>% bind_rows() %>% drop_na() -#' } -#' -#' @export -#' -wrapper_average_performances = function(ligand_oi,performances, averaging = "median"){ - if (!is.character(ligand_oi)) - stop("ligand_oi must be a character") - if (!is.data.frame(performances)) - stop("performances must be a data frame") - if(averaging != "median" & averaging != "mean") - stop("averaging should be 'median' or 'mean' ") - - all_settings = performances$setting %>% unique() - settings_oi_indicators = strsplit(performances$ligand,"[-]") %>% lapply(function(real_ligand){sum(ligand_oi %in% real_ligand) > 0}) %>% unlist() - settings_oi = all_settings[settings_oi_indicators] - performances_oi = performances %>% filter(setting %in% settings_oi) - if(averaging == "median"){ - performances_oi_averaged = performances_oi %>% select(-setting,-ligand) %>% summarise_all(median, na.rm = TRUE) - } else if (averaging == "mean"){ - performances_oi_averaged = performances_oi %>% select(-setting,-ligand) %>% summarise_all(mean, na.rm = TRUE) - } - return(performances_oi_averaged %>% mutate(ligand = ligand_oi)) -} - - - - - - - - - - - - - - - - - - - +#' @title Extract ligands of interest from settings +#' +#' @description \code{extract_ligands_from_settings} Extract ligands of interest from (expression) settings in correct to construct the ligand-target matrix. +#' +#' @usage +#' extract_ligands_from_settings(settings,combination = TRUE) +#' +#' @param settings A list of lists for which each sub-list contains the information about (expression) datasets; with minimally the following elements: name of the setting ($name), ligands (possibly) active in the setting of interest ($from). +#' @param combination Indicate whether in case multiple ligands are possibly active ligand combinations should be extracted or only individual ligands. Default: TRUE. +#' +#' @return A list containing the ligands and ligands combinations for which a ligand-target matrix should be constructed. When for a particular dataset multiple ligands are possibly active (i.e. more than ligand in .$from slot of sublist of settings), then both the combination of these multiple ligands and each of these multiple ligands individually will be select for model construction. +#' +#' @examples +#' \dontrun{ +#' ligands = extract_ligands_from_settings(expression_settings_validation) +#' } +#' @export +#' +extract_ligands_from_settings = function(settings, combination = TRUE){ + + # input check + if (!is.list(settings)) + stop("settings must be a list") + if(sum(sapply(settings,function(x){is.character(x$from)})) != length(settings)) + stop("settings$.$from must be a character vector containing ligands") + + ligands_oi = list() + if (combination == TRUE){ + for (i in 1:length(settings)){ + setting = settings[[i]] + ligand = setting$from + if (length(ligand) == 1) { + ligands_oi[length(ligands_oi) + 1] = ligand + } else {# if multiple ligands added + ligands_oi[[length(ligands_oi) + 1]] = ligand # ligands together + for (l in ligand) { + ligands_oi[length(ligands_oi) + 1] = l # ligands separate + } + } + } + } else { + for (i in 1:length(settings)){ + setting = settings[[i]] + ligand = setting$from + if (length(ligand) == 1) { + ligands_oi[length(ligands_oi) + 1] = ligand + } else {# if multiple ligands added + for (l in ligand) { + ligands_oi[length(ligands_oi) + 1] = l # ligands separate + } + } + } + } + + ligands_oi = unique(ligands_oi) + return(ligands_oi) +} +#' @title Convert expression settings to correct settings format for evaluation of target gene prediction. +#' +#' @description \code{convert_expression_settings_evaluation} Converts expression settings to correct settings format for evaluation of target gene prediction. +#' +#' @usage +#' convert_expression_settings_evaluation(setting) +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$diffexp: data frame or tibble containing at least 3 variables= $gene, $lfc (log fold change treated vs untreated) and $qval (fdr-corrected p-value) + +#' @return A list with following elements: $name, $from, $response. $response will be a gene-named logical vector indicating whether the gene's transcription was influenced by the active ligand(s) in the setting of interest. +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation,convert_expression_settings_evaluation) +#' } +#' @export +#' +#' +convert_expression_settings_evaluation = function(setting) { + # input check + if(!is.character(setting$from) | !is.character(setting$name)) + stop("setting$from and setting$name should be character vectors") + if(!is.data.frame(setting$diffexp)) + stop("setting$diffexp should be data frame") + if(is.null(setting$diffexp$lfc) | is.null(setting$diffexp$gene) | is.null(setting$diffexp$qval)) + stop("setting$diffexp should contain the variables 'lfc', 'qval' and 'diffexp'") + + requireNamespace("dplyr") + + diffexp_df = setting$diffexp %>% mutate(diffexp = (abs(lfc) >= 1) & (qval <= 0.1)) + diffexp_vector = diffexp_df$diffexp + names(diffexp_vector) = diffexp_df$gene + diffexp_vector = diffexp_vector[unique(names(diffexp_vector))] + if((diffexp_vector %>% sum) == 0) { + print(setting$name) + warning("No differentially expressed genes, remove this expression dataset") + } + return(list(name = setting$name, from = setting$from, response = diffexp_vector)) +} +#' @title Evaluation of target gene prediction. +#' +#' @description \code{evaluate_target_prediction} Evaluate how well the model (i.e. the inferred ligand-target probability scores) is able to predict the observed response to a ligand (e.g. the set of DE genes after treatment of cells by a ligand). It shows several classification evaluation metrics for the prediction. Different classification metrics are calculated depending on whether the input ligand-target matrix contains probability scores for targets or discrete target assignments. +#' +#' @usage +#' evaluate_target_prediction(setting,ligand_target_matrix, ligands_position = "cols") +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$response: named logical vector indicating whether a target is a TRUE target of the possibly active ligand(s) or a FALSE. +#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (or discrete target assignments). +#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" + +#' @return A data.frame with following variables: setting, ligand nd for probabilistic predictions: auroc, aupr, aupr_corrected (aupr - aupr for random prediction), sensitivity_roc (proxy measure, inferred from ROC), specificity_roc (proxy measure, inferred from ROC), mean_rank_GST_log_pval (-log10 of p-value of mean-rank gene set test), pearson (correlation coefficient), spearman (correlation coefficient); whereas for categorical predictions: accuracy, recall, specificity, precision, F1, F0.5, F2, mcc, informedness, markedness, fisher_pval_log (which is -log10 of p-value fisher exact test), fisher odds.\cr +#' "mean_rank_GST_log_pval" will only be included in the dataframe if limma is installed. From NicheNet v2.1.7 onwards, limma is no longer a hard dependency of NicheNet. +#' +#' +#' @importFrom ROCR prediction performance +#' @importFrom caTools trapz +#' @importFrom data.table data.table +#' +#' @examples +#' \dontrun{ +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' setting = lapply(expression_settings_validation[1],convert_expression_settings_evaluation) +#' ligands = extract_ligands_from_settings(setting) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' perf1 = lapply(setting,evaluate_target_prediction,ligand_target_matrix) +#' print(head(perf1)) +#' perf2 = lapply(setting,evaluate_target_prediction,make_discrete_ligand_target_matrix(ligand_target_matrix)) +#' } +#' @export +#' +evaluate_target_prediction = function(setting,ligand_target_matrix, ligands_position = "cols"){ + ## still make evaluation multiple ligands possible + # input check + if (!is.list(setting)) + stop("setting must be a list") + if(!is.character(setting$from) | !is.character(setting$name)) + stop("setting$from and setting$name should be character vectors") + if(!is.logical(setting$response) | is.null(names(setting$response))) + stop("setting$response should be named logical vector containing class labels of the response that needs to be predicted ") + if(!is.matrix(ligand_target_matrix)) + stop("ligand_target_matrix should be a matrix") + if(!is.double(ligand_target_matrix) & !is.logical(ligand_target_matrix)) + stop("ligand_target matrix should be of type double if it contains numeric probabilities as predictions; or of type logical when it contains categorical target predictions (TRUE or FALSE)") + if (ligands_position != "cols" & ligands_position != "rows") + stop("ligands_position must be 'cols' or 'rows'") + + requireNamespace("dplyr") + + if (length(setting$from) == 1){ + ligand_oi = setting$from + } else { + ligand_oi = paste0(setting$from,collapse = "-") + } + if (ligands_position == "cols"){ + if((ligand_oi %in% colnames(ligand_target_matrix)) == FALSE) + stop("ligand should be in ligand_target_matrix") + prediction_vector = ligand_target_matrix[,ligand_oi] + names(prediction_vector) = rownames(ligand_target_matrix) + } else if (ligands_position == "rows") { + if((ligand_oi %in% rownames(ligand_target_matrix)) == FALSE) + stop("ligand should be in ligand_target_matrix") + prediction_vector = ligand_target_matrix[ligand_oi,] + names(prediction_vector) = colnames(ligand_target_matrix) + } + response_vector = setting$response + + if(sd(prediction_vector) == 0) + warning("all target gene probability score predictions have same value") + if(sd(response_vector) == 0) + stop("all genes have same response") + performance = evaluate_target_prediction_strict(response_vector,prediction_vector,is.double(prediction_vector)) + output = bind_cols(tibble(setting = setting$name, ligand = ligand_oi), performance) + + return(output) +} +#' @title Evaluation of target gene prediction for multiple ligands. +#' +#' @description \code{evaluate_multi_ligand_target_prediction} Evaluate how well a trained model is able to predict the observed response to a combination of ligands (e.g. the set of DE genes after treatment of cells by multiple ligands). A classificiation algorithm chosen by the user is trained to construct one model based on the target gene predictions of all ligands of interest (ligands are considered as features). Several classification evaluation metrics for the prediction are calculated depending on whether the input ligand-target matrix contains probability scores for targets or discrete target assignments. In addition, variable importance scores can be extracted to rank the possible active ligands in order of importance for response prediction. +#' +#' @usage +#' evaluate_multi_ligand_target_prediction(setting,ligand_target_matrix, ligands_position = "cols", algorithm, var_imps = TRUE, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4,ignore_errors = FALSE,continuous = TRUE) +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$response: named logical vector indicating whether a target is a TRUE target of the possibly active ligand(s) or a FALSE. +#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (recommended) or discrete target assignments (not-recommended). +#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" +#' @param algorithm The name of the classification algorithm to be applied. Should be supported by the caret package. Examples of algorithms we recommend: with embedded feature selection: "rf","glm","fda","glmnet","sdwd","gam","glmboost", "pls" (load "pls" package before!); without: "lda","naive_bayes", "pcaNNet". Please notice that not all these algorithms work when the features (i.e. ligand vectors) are categorical (i.e. discrete class assignments). +#' @param var_imps Indicate whether in addition to classification evaluation performances, variable importances should be calculated. Default: TRUE. +#' @param cv Indicate whether model training and hyperparameter optimization should be done via cross-validation. Default: TRUE. FALSE might be useful for applications only requiring variable importance, or when final model is not expected to be extremely overfit. +#' @param cv_number The number of folds for the cross-validation scheme: Default: 4; only relevant when cv == TRUE. +#' @param cv_repeats The number of repeats during cross-validation. Default: 2; only relevant when cv == TRUE. +#' @param parallel Indiciate whether the model training will occur parallelized. Default: FALSE. TRUE only possible for non-windows OS. +#' @param n_cores The number of cores used for parallelized model training via cross-validation. Default: 4. Only relevant on non-windows OS. +#' @param ignore_errors Indiciate whether errors during model training by caret should be ignored such that another model training try will be initiated until model is trained without raising errors. Default: FALSE. +#' @param continuous Indicate whether during training of the model, model training and evaluation should be done on class probabilities or discrete class labels. For huge class imbalance, we recommend setting this value to TRUE. Default: TRUE. +#' +#' @return A list with the following elements. $performances: data frame containing classification evaluation measure for classification on the test folds during training via cross-validation; $performances_training: data frame containing classification evaluation measures for classification of the final model (discrete class assignments) on the complete data set (performance can be severly optimistic due to overfitting!); $performance_training_continuous: data frame containing classification evaluation measures for classification of the final model (class probability scores) on the complete data set (performance can be severly optimistic due to overfitting!) $var_imps: data frame containing the variable importances of the different ligands (embbed importance score for some classification algorithms, otherwise just the auroc); $prediction_response_df: data frame containing for each gene the ligand-target predictions of the individual ligands, the complete model and the response as well; $setting: name of the specific setting that needed to be evaluated; $ligands: ligands of interest. +#' +#' @importFrom ROCR prediction performance +#' @importFrom caTools trapz +#' @import caret +#' @importFrom purrr safely +#' +#' @examples +#' \dontrun{ +#' library(dplyr) +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' setting = convert_expression_settings_evaluation(expression_settings_validation$TGFB_IL6_timeseries) %>% list() +#' ligands = extract_ligands_from_settings(setting) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' output = lapply(setting,evaluate_multi_ligand_target_prediction,ligand_target_matrix,ligands_position = "cols",algorithm = "glm") +#' output = lapply(setting,evaluate_multi_ligand_target_prediction,make_discrete_ligand_target_matrix(ligand_target_matrix),ligands_position = "cols",algorithm = "glm" ) +#' } +#' @export +#' +evaluate_multi_ligand_target_prediction = function(setting,ligand_target_matrix, ligands_position = "cols", algorithm, var_imps = TRUE, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE, continuous = TRUE){ + if (!is.list(setting)) + stop("setting must be a list") + if(!is.character(setting$from) | !is.character(setting$name)) + stop("setting$from and setting$name should be character vectors") + if(!is.logical(setting$response) | is.null(names(setting$response))) + stop("setting$response should be named logical vector containing class labels of the response that needs to be predicted ") + if(!is.matrix(ligand_target_matrix)) + stop("ligand_target_matrix should be a matrix") + if(!is.double(ligand_target_matrix) & !is.logical(ligand_target_matrix)) + stop("ligand_target matrix should be of type double if it contains numeric probabilities as predictions; or of type logical when it contains categorical target predictions (TRUE or FALSE)") + if (ligands_position != "cols" & ligands_position != "rows") + stop("ligands_position must be 'cols' or 'rows'") + if(!is.character(algorithm)) + stop("algorithm should be a character vector") + if(!is.logical(var_imps) | length(var_imps) > 1) + stop("var_imps should be a logical vector: TRUE or FALSE") + if(!is.logical(cv) | length(cv) > 1) + stop("cv should be a logical vector: TRUE or FALSE") + if(!is.numeric(cv_number) | length(cv_number) > 1) + stop("cv_number should be a numeric vector of length 1") + if(!is.numeric(cv_repeats) | length(cv_repeats) > 1) + stop("cv_repeats should be a numeric vector of length 1") + if(!is.logical(parallel) | length(parallel) > 1) + stop("parallel should be a logical vector: TRUE or FALSE") + if(!is.numeric(n_cores) | length(n_cores) > 1) + stop("n_cores should be a numeric vector of length 1") + if(!is.logical(ignore_errors) | length(ignore_errors) > 1) + stop("ignore_errors should be a logical vector: TRUE or FALSE") + if(!is.logical(continuous) | length(continuous) > 1) + stop("continuous should be a logical vector: TRUE or FALSE") + requireNamespace("dplyr") + + ligands_oi = setting$from + + + if (ligands_position == "cols"){ + if(sum((ligands_oi %in% colnames(ligand_target_matrix)) == FALSE) > 0) + stop("ligands should be in ligand_target_matrix") + prediction_matrix = ligand_target_matrix[,ligands_oi] + target_genes = rownames(ligand_target_matrix) + } else if (ligands_position == "rows") { + if(sum((ligands_oi %in% rownames(ligand_target_matrix)) == FALSE) > 0) + stop("ligands should be in ligand_target_matrix") + prediction_matrix = ligand_target_matrix[ligands_oi,] %>% t() + target_genes = colnames(ligand_target_matrix) + } + + response_vector = setting$response + if(sd(response_vector) == 0) + stop("all genes have same response") + response_df = tibble(gene = names(response_vector), response = response_vector %>% make.names() %>% as.factor()) + + prediction_df = prediction_matrix %>% data.frame() %>% as_tibble() + + if(is.double(prediction_matrix) == FALSE){ + convert_categorical_factor = function(x){ + x = x %>% make.names() %>% as.factor() + } + prediction_df = prediction_df %>% mutate_all(funs(convert_categorical_factor)) + } + + prediction_df = tibble(gene = target_genes) %>% bind_cols(prediction_df) + combined = inner_join(response_df,prediction_df, by = "gene") + if (nrow(combined) == 0) + stop("Gene names in response don't accord to gene names in ligand-target matrix (did you consider differences human-mouse namings?)") + + train_data = combined %>% select(-gene) %>% rename(obs = response) %>% data.frame() + + output = wrapper_caret_classification(train_data,algorithm,continuous = continuous,var_imps,cv,cv_number,cv_repeats,parallel,n_cores,ignore_errors, prediction_response_df = combined) + output$setting = setting$name + output$ligands = ligands_oi + # output$prediction_response_df = combined + return(output) +} +#' @title Evaluation of target gene prediction. +#' +#' @description \code{evaluate_target_prediction_interprete} Evaluate how well the model (i.e. the inferred ligand-target probability scores) is able to predict the observed response to a ligand (e.g. the set of DE genes after treatment of cells by a ligand; or the log fold change values). It shows several classification evaluation metrics for the prediction when response is categorical, or several regression model fit metrics when the response is continuous. +#' +#' @usage +#' evaluate_target_prediction_interprete(setting,ligand_target_matrix, ligands_position = "cols") +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$response: named logical vector indicating whether a target is a TRUE target of the possibly active ligand(s) or a FALSE. +#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (or discrete target assignments). +#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" + +#' @return A list with the elements $performances and $prediction_response_df. $performance is a data.frame with classification evaluation metrics if response is categorical, or regression model fit metrics if response is continuous. $prediction_response_df shows for each gene, the model prediction and the response value of the gene (e.g. whether the gene the gene is a target or not according to the observed response, or the absolute value of the log fold change of a gene). +#' +#' @importFrom ROCR prediction performance +#' @importFrom caTools trapz +#' @importFrom data.table data.table +#' +#' @examples +#' \dontrun{ +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' setting = lapply(expression_settings_validation[1],convert_expression_settings_evaluation) +#' ligands = extract_ligands_from_settings(setting) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' perf1 = lapply(setting,evaluate_target_prediction_interprete,ligand_target_matrix) +#' setting = lapply(expression_settings_validation[1],convert_expression_settings_evaluation_regression) +#' perf2 = lapply(setting,evaluate_target_prediction_interprete,ligand_target_matrix) +#' } +#' @export +#' +evaluate_target_prediction_interprete = function(setting,ligand_target_matrix, ligands_position = "cols"){ + ## still make evaluation multiple ligands possible + # input check + if (!is.list(setting)) + stop("setting must be a list") + if(!is.character(setting$from) | !is.character(setting$name)) + stop("setting$from and setting$name should be character vectors") + if((!is.logical(setting$response) & !is.numeric(setting$response)) | is.null(names(setting$response))) + stop("setting$response should be named logical vector containing class labels of the response that needs to be predicted or a numeric vector containing response values (e.g. log fold change).") + if(!is.matrix(ligand_target_matrix)) + stop("ligand_target_matrix should be a matrix") + if(!is.double(ligand_target_matrix) & !is.logical(ligand_target_matrix)) + stop("ligand_target matrix should be of type double if it contains numeric probabilities as predictions; or of type logical when it contains categorical target predictions (TRUE or FALSE)") + if (ligands_position != "cols" & ligands_position != "rows") + stop("ligands_position must be 'cols' or 'rows'") + + requireNamespace("dplyr") + + if (length(setting$from) == 1){ + ligand_oi = setting$from + } else { + ligand_oi = paste0(setting$from,collapse = "-") + } + if (ligands_position == "cols"){ + if((ligand_oi %in% colnames(ligand_target_matrix)) == FALSE) + stop("ligand should be in ligand_target_matrix") + prediction_vector = ligand_target_matrix[,ligand_oi] + names(prediction_vector) = rownames(ligand_target_matrix) + } else if (ligands_position == "rows") { + if((ligand_oi %in% rownames(ligand_target_matrix)) == FALSE) + stop("ligand should be in ligand_target_matrix") + prediction_vector = ligand_target_matrix[ligand_oi,] + names(prediction_vector) = colnames(ligand_target_matrix) + } + response_vector = setting$response + + if(sd(prediction_vector) == 0) + warning("all target gene probability score predictions have same value") + if(sd(response_vector) == 0) + stop("all genes have same response") + + if (is.logical(response_vector)){ + output = evaluate_target_prediction_strict(response_vector,prediction_vector,is.double(prediction_vector), prediction_response_df = TRUE) + } else { + output = evaluate_target_prediction_regression_strict(response_vector,prediction_vector, prediction_response_df = TRUE) + } + + output$setting = setting$name + output$ligand = ligand_oi + colnames(output$prediction_response_df) = c("gene","response",ligand_oi) + + return(output) +} +#' @title Convert gene list to correct settings format for evaluation of target gene prediction. +#' +#' @description \code{convert_gene_list_settings_evaluation} Converts a gene list to correct settings format for evaluation of target gene prediction. +#' +#' @usage +#' convert_gene_list_settings_evaluation(gene_list, name, ligands_oi, background) +#' +#' @param gene_list A character vector of target gene names +#' @param name The name that will be given to the setting +#' @param ligands_oi The possibly active ligands +#' @param background A character vector of names of genes that are not target genes. If genes present in the gene list are in this vector of background gene names, these genes will be removed from the background. + +#' @return A list with following elements: $name, $from, $response +#' +#' @examples +#' \dontrun{ +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' all_genes = unique(c(weighted_networks$gr$from,weighted_networks$gr$to,weighted_networks$lr_sig$from, weighted_networks$lr_sig$to)) +#' gene_list = c("ID1","ID2","ID3") +#' setting = list(convert_gene_list_settings_evaluation(gene_list = c("ID1","ID2","ID3"), name = "test",ligands_oi = "TGFB1", background = all_genes)) +#' } +#' @export +#' +#' +convert_gene_list_settings_evaluation = function(gene_list, name, ligands_oi, background) { + # input check + if(!is.character(gene_list)) + stop("gene_list should be character vector") + if(!is.character(name) | length(name) > 1) + stop("name should be character vector of length 1") + if(!is.character(ligands_oi)) + stop("ligands_oi should be character vector") + if(!is.character(background)) + stop("background should be character vector") + + requireNamespace("dplyr") + + background = background[(background %in% gene_list) == FALSE] + + background_logical = rep(FALSE,times = length(background)) + names(background_logical) = background + gene_list_logical = rep(TRUE,times = length(gene_list)) + names(gene_list_logical) = gene_list + response = c(background_logical,gene_list_logical) + + return(list(name = name, from = ligands_oi, response = response)) +} +#' @title Convert expression settings to correct settings format for evaluation of target gene log fold change prediction (regression). +#' +#' @description \code{convert_expression_settings_evaluation_regression} Converts expression settings to correct settings format for evaluation of target gene log fold change prediction (regression). +#' +#' @usage +#' convert_expression_settings_evaluation_regression(setting) +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$diffexp: data frame or tibble containing at least 3 variables= $gene, $lfc (log fold change treated vs untreated) and $qval (fdr-corrected p-value) + +#' @return A list with following elements: $name, $from, $response. $response will be a gene-named numeric vector of log fold change values. +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation,convert_expression_settings_evaluation_regression) +#' } +#' @export +#' +#' +convert_expression_settings_evaluation_regression = function(setting) { + # input check + if(!is.character(setting$from) | !is.character(setting$name)) + stop("setting$from and setting$name should be character vectors") + if(!is.data.frame(setting$diffexp)) + stop("setting$diffexp should be data frame") + if(is.null(setting$diffexp$lfc) | is.null(setting$diffexp$gene) | is.null(setting$diffexp$qval)) + stop("setting$diffexp should contain the variables 'lfc', 'qval' and 'diffexp'") + + requireNamespace("dplyr") + + diffexp_vector = setting$diffexp$lfc %>% abs() + names(diffexp_vector) = setting$diffexp$gene + diffexp_vector = diffexp_vector[unique(names(diffexp_vector))] + + return(list(name = setting$name, from = setting$from, response = diffexp_vector)) +} +#' @title Evaluation of target gene value prediction (regression). +#' +#' @description \code{evaluate_target_prediction_regression} Evaluate how well the model (i.e. the inferred ligand-target probability scores) is able to predict the observed response to a ligand (e.g. the absolute log fold change value of genes after treatment of cells by a ligand). It shows several regression model fit metrics for the prediction. +#' +#' @usage +#' evaluate_target_prediction_regression(setting,ligand_target_matrix, ligands_position = "cols") +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$response: named logical vector indicating whether a target is a TRUE target of the possibly active ligand(s) or a FALSE. +#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (or discrete target assignments). +#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" + +#' @return A data.frame with following variables: setting, ligand and as regression model fit metrics: r_squared: R squared, adj_r_squared: adjusted R squared, f_statistic: estimate of F-statistic, lm_coefficient_abs_t: absolute value of t-value of coefficient, inverse_rse: 1 divided by estimated standard deviation of the errors (inversed to become that higher values indicate better fit), reverse_aic: reverse value of Akaike information criterion (-AIC, to become that higher values indicate better fit), reverse_bic: the reverse value of the bayesian information criterion, inverse_mae: mean absolute error, pearson: pearson correlation coefficient, spearman: spearman correlation coefficient. +#' +#' @examples +#' \dontrun{ +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' settings = lapply(expression_settings_validation[1:2],convert_expression_settings_evaluation_regression) +#' ligands = extract_ligands_from_settings(settings) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' perf1 = lapply(settings,evaluate_target_prediction_regression,ligand_target_matrix) +#' } +#' @export +#' +evaluate_target_prediction_regression = function(setting,ligand_target_matrix, ligands_position = "cols"){ + # input check + if (!is.list(setting)) + stop("setting must be a list") + if(!is.character(setting$from) | !is.character(setting$name)) + stop("setting$from and setting$name should be character vectors") + if(!is.double(setting$response) | is.null(names(setting$response))) + stop("setting$response should be named numeric vector containing response values that needs to be predicted (e.g. log fold change values) ") + if(!is.matrix(ligand_target_matrix)) + stop("ligand_target_matrix should be a matrix") + if(!is.double(ligand_target_matrix)) + stop("ligand_target matrix should be of type double and containing numeric probabilities as predictions") + if (ligands_position != "cols" & ligands_position != "rows") + stop("ligands_position must be 'cols' or 'rows'") + + requireNamespace("dplyr") + + if (length(setting$from) == 1){ + ligand_oi = setting$from + } else { + ligand_oi = paste0(setting$from,collapse = "-") + } + if (ligands_position == "cols"){ + if((ligand_oi %in% colnames(ligand_target_matrix)) == FALSE) + stop("ligand should be in ligand_target_matrix") + prediction_vector = ligand_target_matrix[,ligand_oi] + names(prediction_vector) = rownames(ligand_target_matrix) + } else if (ligands_position == "rows") { + if((ligand_oi %in% rownames(ligand_target_matrix)) == FALSE) + stop("ligand should be in ligand_target_matrix") + prediction_vector = ligand_target_matrix[ligand_oi,] + names(prediction_vector) = colnames(ligand_target_matrix) + } + response_vector = setting$response + + if(sd(prediction_vector) == 0) + warning("all target gene probability score predictions have same value") + if(sd(response_vector) == 0) + stop("all genes have same response") + + performance = evaluate_target_prediction_regression_strict(response_vector,prediction_vector) + output = bind_cols(tibble(setting = setting$name, ligand = ligand_oi), performance) + + return(output) +} +#' @title Evaluation of target gene value prediction for multiple ligands (regression). +#' +#' @description \code{evaluate_multi_ligand_target_prediction_regression} Evaluate how well a trained model is able to predict the observed response to a combination of ligands (e.g. the absolute log fold change value of genes after treatment of cells by a ligand). A regression algorithm chosen by the user is trained to construct one model based on the target gene predictions of all ligands of interest (ligands are considered as features). It shows several regression model fit metrics for the prediction. In addition, variable importance scores can be extracted to rank the possible active ligands in order of importance for response prediction. +#' +#' @usage +#' evaluate_multi_ligand_target_prediction_regression(setting,ligand_target_matrix, ligands_position = "cols", algorithm, var_imps = TRUE, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4,ignore_errors = FALSE) +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$response: named logical vector indicating whether a target is a TRUE target of the possibly active ligand(s) or a FALSE. +#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (recommended) or discrete target assignments (not-recommended). +#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" +#' @param algorithm The name of the classification algorithm to be applied. Should be supported by the caret package. Examples of algorithms we recommend: "lm","glmnet", "rf". +#' @param var_imps Indicate whether in addition to classification evaluation performances, variable importances should be calculated. Default: TRUE. +#' @param cv Indicate whether model training and hyperparameter optimization should be done via cross-validation. Default: TRUE. FALSE might be useful for applications only requiring variable importance, or when final model is not expected to be extremely overfit. +#' @param cv_number The number of folds for the cross-validation scheme: Default: 4; only relevant when cv == TRUE. +#' @param cv_repeats The number of repeats during cross-validation. Default: 2; only relevant when cv == TRUE. +#' @param parallel Indiciate whether the model training will occur parallelized. Default: FALSE. TRUE only possible for non-windows OS. +#' @param n_cores The number of cores used for parallelized model training via cross-validation. Default: 4. Only relevant on non-windows OS. +#' @param ignore_errors Indiciate whether errors during model training by caret should be ignored such that another model training try will be initiated until model is trained without raising errors. Default: FALSE. +#' +#' @return A list with the following elements. $performances: data frame containing regression model fit metrics for regression on the test folds during training via cross-validation; $performances_training: data frame containing model fit metrics for regression of the final model on the complete data set (performance can be severly optimistic due to overfitting!); $var_imps: data frame containing the variable importances of the different ligands (embbed importance score for some classification algorithms, otherwise just the auroc); $prediction_response_df: data frame containing for each gene the ligand-target predictions of the individual ligands, the complete model and the response as well; $setting: name of the specific setting that needed to be evaluated; $ligands: ligands of interest. +#' +#' @import caret +#' @importFrom purrr safely +#' +#' @examples +#' \dontrun{ +#' library(dplyr) +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' setting = convert_expression_settings_evaluation_regression(expression_settings_validation$TGFB_IL6_timeseries) %>% list() +#' ligands = extract_ligands_from_settings(setting) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' output = lapply(setting,evaluate_multi_ligand_target_prediction_regression,ligand_target_matrix,ligands_position = "cols",algorithm = "lm" ) +#' } +#' @export +#' +evaluate_multi_ligand_target_prediction_regression = function(setting, ligand_target_matrix, ligands_position = "cols", algorithm, var_imps = TRUE, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE){ + if (!is.list(setting)) + stop("setting must be a list") + if(!is.character(setting$from) | !is.character(setting$name)) + stop("setting$from and setting$name should be character vectors") + if(!is.double(setting$response) | is.null(names(setting$response))) + stop("setting$response should be named numeric vector containing response values that needs to be predicted (e.g. log fold change values) ") + if(!is.matrix(ligand_target_matrix)) + stop("ligand_target_matrix should be a matrix") + if(!is.double(ligand_target_matrix)) + stop("ligand_target matrix should be of type double if it contains numeric probabilities as predictions") + if (ligands_position != "cols" & ligands_position != "rows") + stop("ligands_position must be 'cols' or 'rows'") + if(!is.character(algorithm)) + stop("algorithm should be a character vector") + if(!is.logical(var_imps) | length(var_imps) > 1) + stop("var_imps should be a logical vector: TRUE or FALSE") + if(!is.logical(cv) | length(cv) > 1) + stop("cv should be a logical vector: TRUE or FALSE") + if(!is.numeric(cv_number) | length(cv_number) > 1) + stop("cv_number should be a numeric vector of length 1") + if(!is.numeric(cv_repeats) | length(cv_repeats) > 1) + stop("cv_repeats should be a numeric vector of length 1") + if(!is.logical(parallel) | length(parallel) > 1) + stop("parallel should be a logical vector: TRUE or FALSE") + if(!is.numeric(n_cores) | length(n_cores) > 1) + stop("n_cores should be a numeric vector of length 1") + if(!is.logical(ignore_errors) | length(ignore_errors) > 1) + stop("ignore_errors should be a logical vector: TRUE or FALSE") + + requireNamespace("dplyr") + + ligands_oi = setting$from + + + if (ligands_position == "cols"){ + if(sum((ligands_oi %in% colnames(ligand_target_matrix)) == FALSE) > 0) + stop("ligands should be in ligand_target_matrix") + prediction_matrix = ligand_target_matrix[,ligands_oi] + target_genes = rownames(ligand_target_matrix) + } else if (ligands_position == "rows") { + if(sum((ligands_oi %in% rownames(ligand_target_matrix)) == FALSE) > 0) + stop("ligands should be in ligand_target_matrix") + prediction_matrix = ligand_target_matrix[ligands_oi,] %>% t() + target_genes = colnames(ligand_target_matrix) + } + + response_vector = setting$response + + if(sd(response_vector) == 0) + stop("all genes have same response") + response_df = tibble(gene = names(response_vector), response = response_vector) + + prediction_df = prediction_matrix %>% data.frame() %>% as_tibble() + + prediction_df = tibble(gene = target_genes) %>% bind_cols(prediction_df) + combined = inner_join(response_df,prediction_df, by = "gene") + if (nrow(combined) == 0) + stop("Gene names in response don't accord to gene names in ligand-target matrix (did you consider differences human-mouse namings?)") + train_data = combined %>% select(-gene) %>% rename(obs = response) %>% data.frame() + + output = wrapper_caret_regression(train_data,algorithm,var_imps,cv,cv_number,cv_repeats,parallel,n_cores,prediction_response_df = combined,ignore_errors) + output$setting = setting$name + output$ligands = ligands_oi + return(output) +} +#' @title Calculate average performance of datasets of a specific ligand. +#' +#' @description \code{wrapper_average_performances} Calculate average performance of datasets of a specific ligand. Datasets profiling more than one ligand (and thus ligands other than the ligand of interest), will be included as well. +#' +#' @usage +#' wrapper_average_performances(ligand_oi,performances, averaging = "median") +#' +#' @param ligand_oi Name of the ligand for which datasets should be averaged. +#' @param performances A data frame with performance values, containing at least folowing variables: $setting, $ligand and one or more metrics. +#' @param averaging How should performances of datasets of the same ligand be averaged? 'median' or 'mean'. +#' +#' @return A data frame containing classification evaluation measures for the ligand activity state prediction single, individual feature importance measures. +#' +#' @examples +#' \dontrun{ +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) +#' ligands = extract_ligands_from_settings(settings) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' perf1 = lapply(settings,evaluate_target_prediction,ligand_target_matrix) +#' performances_target_prediction_averaged = ligands %>% lapply(wrapper_average_performances, perf1,"median") %>% bind_rows() %>% drop_na() +#' } +#' +#' @export +#' +wrapper_average_performances = function(ligand_oi,performances, averaging = "median"){ + if (!is.character(ligand_oi)) + stop("ligand_oi must be a character") + if (!is.data.frame(performances)) + stop("performances must be a data frame") + if(averaging != "median" & averaging != "mean") + stop("averaging should be 'median' or 'mean' ") + + all_settings = performances$setting %>% unique() + settings_oi_indicators = strsplit(performances$ligand,"[-]") %>% lapply(function(real_ligand){sum(ligand_oi %in% real_ligand) > 0}) %>% unlist() + settings_oi = all_settings[settings_oi_indicators] + performances_oi = performances %>% filter(setting %in% settings_oi) + if(averaging == "median"){ + performances_oi_averaged = performances_oi %>% select(-setting,-ligand) %>% summarise_all(median, na.rm = TRUE) + } else if (averaging == "mean"){ + performances_oi_averaged = performances_oi %>% select(-setting,-ligand) %>% summarise_all(mean, na.rm = TRUE) + } + return(performances_oi_averaged %>% mutate(ligand = ligand_oi)) +} + + + + + + + + + + + + + + + + + + + diff --git a/R/nichenetr-package.R b/R/nichenetr-package.R new file mode 100644 index 0000000..a65cf64 --- /dev/null +++ b/R/nichenetr-package.R @@ -0,0 +1,6 @@ +#' @keywords internal +"_PACKAGE" + +## usethis namespace: start +## usethis namespace: end +NULL diff --git a/R/nichenetr.R b/R/nichenetr.R deleted file mode 100644 index 6b76a1d..0000000 --- a/R/nichenetr.R +++ /dev/null @@ -1,18 +0,0 @@ -## introduction to package, which functions to import,... -#' nichenetr: Linking Extracellular Protein Signals to Target Genes by data-integration. -#' -#' This package allows you the investigate intercellular communication from a computational perspective. Functionalities of this package (e.g. including predicting extracellular upstream regulators) build upon a probabilistic model of ligand-target links that was inferred by data-integration. -#' -#' @section Construction of the probabilistic model: -#' \code{\link{construct_weighted_networks}}, \code{\link{construct_ligand_target_matrix}} -#' -#' @section Evaluation functions: -#' \code{\link{evaluate_target_prediction}} -#' -#' @docType package -#' @name nichenetr -#' -#' @import dplyr -#' @import tidyr -#' @import tibble -NULL diff --git a/R/parameter_optimization.R b/R/parameter_optimization.R index 3832338..a891ffc 100644 --- a/R/parameter_optimization.R +++ b/R/parameter_optimization.R @@ -383,12 +383,12 @@ process_mlrmbo_nichenet_optimization = function(optimization_results,source_name #' @description \code{model_evaluation_optimization_nsga2} will take as input a vector of data source weights and hyperparameters to construct a ligand-target matrix and evaluate its performance on input validation settings. #' @usage #' model_evaluation_optimization_nsga2(y, source_names, algorithm, correct_topology, lr_network, sig_network, gr_network, settings, secondary_targets = FALSE, remove_direct_links = "no",damping_factor = NULL) -#' +#' #' @param y A numeric vector containing the data source weights as the first elements, and hyperparameters as the last elements. The order of the data source weights accords to the order of source_names. #' @inheritParams model_evaluation_optimization_mlrmbo -#' +#' #' @return A numeric vector of length 4 containing the average auroc for target gene prediction, average aupr (corrected for TP fraction) for target gene prediction, average auroc for ligand activity prediction and average aupr for ligand activity prediction. -#' +#' #' @examples #' \dontrun{ #' nr_datasources = source_weights_df$source %>% unique() %>% length() @@ -397,7 +397,7 @@ process_mlrmbo_nichenet_optimization = function(optimization_results,source_name #' test_evaluation_optimization = model_evaluation_optimization_nsga2(test_input, source_weights_df$source %>% unique(), "PPR", TRUE, lr_network, sig_network, gr_network, #' lapply(expression_settings_validation, convert_expression_settings_evaluation), secondary_targets = FALSE, remove_direct_links = "no") #' } -#' +#' #' @export model_evaluation_optimization_nsga2r = function(y, source_names, algorithm, correct_topology, lr_network, sig_network, gr_network, settings, secondary_targets = FALSE, remove_direct_links = "no", damping_factor = NULL) { # change numeric vector y input to list x @@ -416,123 +416,123 @@ model_evaluation_optimization_nsga2r = function(y, source_names, algorithm, corr x$gr_hub = other_params[2] x$ltf_cutoff = other_params[3] x$damping_factor = other_params[4] - + if (!is.null(damping_factor) & is.null(x$damping_factor)) { x$damping_factor = damping_factor } - if (!is.list(x)) + if (!is.list(x)) stop("x should be a list!") - if (!is.numeric(x$source_weights)) + if (!is.numeric(x$source_weights)) stop("x$source_weights should be a numeric vector") - if (x$lr_sig_hub < 0 | x$lr_sig_hub > 1) + if (x$lr_sig_hub < 0 | x$lr_sig_hub > 1) stop("x$lr_sig_hub must be a number between 0 and 1 (0 and 1 included)") - if (x$gr_hub < 0 | x$gr_hub > 1) + if (x$gr_hub < 0 | x$gr_hub > 1) stop("x$gr_hub must be a number between 0 and 1 (0 and 1 included)") if (is.null(x$ltf_cutoff)) { - if ((algorithm == "PPR" | algorithm == "SPL") & correct_topology == - FALSE) + if ((algorithm == "PPR" | algorithm == "SPL") & correct_topology == + FALSE) warning("Did you not forget to give a value to x$ltf_cutoff?") } else { - if (x$ltf_cutoff < 0 | x$ltf_cutoff > 1) + if (x$ltf_cutoff < 0 | x$ltf_cutoff > 1) stop("x$ltf_cutoff must be a number between 0 and 1 (0 and 1 included)") } if (algorithm == "PPR") { - if (x$damping_factor < 0 | x$damping_factor >= 1) + if (x$damping_factor < 0 | x$damping_factor >= 1) stop("x$damping_factor must be a number between 0 and 1 (0 included, 1 not)") } - if (algorithm != "PPR" & algorithm != "SPL" & algorithm != - "direct") + if (algorithm != "PPR" & algorithm != "SPL" & algorithm != + "direct") stop("algorithm must be 'PPR' or 'SPL' or 'direct'") - if (correct_topology != TRUE & correct_topology != FALSE) + if (correct_topology != TRUE & correct_topology != FALSE) stop("correct_topology must be TRUE or FALSE") - if (!is.data.frame(lr_network)) + if (!is.data.frame(lr_network)) stop("lr_network must be a data frame or tibble object") - if (!is.data.frame(sig_network)) + if (!is.data.frame(sig_network)) stop("sig_network must be a data frame or tibble object") - if (!is.data.frame(gr_network)) + if (!is.data.frame(gr_network)) stop("gr_network must be a data frame or tibble object") - if (!is.list(settings)) + if (!is.list(settings)) stop("settings should be a list!") - if (!is.character(settings[[1]]$from) | !is.character(settings[[1]]$name)) + if (!is.character(settings[[1]]$from) | !is.character(settings[[1]]$name)) stop("setting$from and setting$name should be character vectors") - if (!is.logical(settings[[1]]$response) | is.null(names(settings[[1]]$response))) + if (!is.logical(settings[[1]]$response) | is.null(names(settings[[1]]$response))) stop("setting$response should be named logical vector containing class labels of the response that needs to be predicted ") - if (secondary_targets != TRUE & secondary_targets != FALSE) + if (secondary_targets != TRUE & secondary_targets != FALSE) stop("secondary_targets must be TRUE or FALSE") - if (remove_direct_links != "no" & remove_direct_links != - "ligand" & remove_direct_links != "ligand-receptor") + if (remove_direct_links != "no" & remove_direct_links != + "ligand" & remove_direct_links != "ligand-receptor") stop("remove_direct_links must be 'no' or 'ligand' or 'ligand-receptor'") - if (!is.character(source_names)) + if (!is.character(source_names)) stop("source_names should be a character vector") - if (length(source_names) != length(x$source_weights)) + if (length(source_names) != length(x$source_weights)) stop("Length of source_names should be the same as length of x$source_weights") - if (correct_topology == TRUE && !is.null(x$ltf_cutoff)) + if (correct_topology == TRUE && !is.null(x$ltf_cutoff)) warning("Because PPR-ligand-target matrix will be corrected for topology, the proposed cutoff on the ligand-tf matrix will be ignored (x$ltf_cutoff") - if (correct_topology == TRUE && algorithm != "PPR") + if (correct_topology == TRUE && algorithm != "PPR") warning("Topology correction is PPR-specific and makes no sense when the algorithm is not PPR") # names(x$source_weights) = source_names - + parameters_setting = list(model_name = "query_design", source_weights = x$source_weights) if (algorithm == "PPR") { if (correct_topology == TRUE) { - parameters_setting = add_hyperparameters_parameter_settings(parameters_setting, - lr_sig_hub = x$lr_sig_hub, gr_hub = x$gr_hub, - ltf_cutoff = 0, algorithm = algorithm, damping_factor = x$damping_factor, + parameters_setting = add_hyperparameters_parameter_settings(parameters_setting, + lr_sig_hub = x$lr_sig_hub, gr_hub = x$gr_hub, + ltf_cutoff = 0, algorithm = algorithm, damping_factor = x$damping_factor, correct_topology = TRUE) } else { - parameters_setting = add_hyperparameters_parameter_settings(parameters_setting, - lr_sig_hub = x$lr_sig_hub, gr_hub = x$gr_hub, - ltf_cutoff = x$ltf_cutoff, algorithm = algorithm, + parameters_setting = add_hyperparameters_parameter_settings(parameters_setting, + lr_sig_hub = x$lr_sig_hub, gr_hub = x$gr_hub, + ltf_cutoff = x$ltf_cutoff, algorithm = algorithm, damping_factor = x$damping_factor, correct_topology = FALSE) } } if (algorithm == "SPL" | algorithm == "direct") { - parameters_setting = add_hyperparameters_parameter_settings(parameters_setting, - lr_sig_hub = x$lr_sig_hub, gr_hub = x$gr_hub, ltf_cutoff = x$ltf_cutoff, + parameters_setting = add_hyperparameters_parameter_settings(parameters_setting, + lr_sig_hub = x$lr_sig_hub, gr_hub = x$gr_hub, ltf_cutoff = x$ltf_cutoff, algorithm = algorithm, damping_factor = NULL, correct_topology = FALSE) } - output_evaluation = evaluate_model(parameters_setting, lr_network, - sig_network, gr_network, settings, calculate_popularity_bias_target_prediction = FALSE, - calculate_popularity_bias_ligand_prediction = FALSE, - ncitations = ncitations, secondary_targets = secondary_targets, + output_evaluation = evaluate_model(parameters_setting, lr_network, + sig_network, gr_network, settings, calculate_popularity_bias_target_prediction = FALSE, + calculate_popularity_bias_ligand_prediction = FALSE, + ncitations = ncitations, secondary_targets = secondary_targets, remove_direct_links = remove_direct_links, n_target_bins = 3) - + ligands_evaluation = settings %>% sapply(function(x){x$from}) %>% unlist() %>% unique() - + ligand_activity_performance_setting_summary = output_evaluation$performances_ligand_prediction_single %>% select(-setting, -ligand) %>% group_by(importance_measure) %>% summarise_all(mean) %>% group_by(importance_measure) %>% mutate(geom_average = exp(mean(log(c(auroc,aupr_corrected))))) best_metric = ligand_activity_performance_setting_summary %>% ungroup() %>% filter(geom_average == max(geom_average)) %>% pull(importance_measure) %>% .[1] performances_ligand_prediction_single_summary = output_evaluation$performances_ligand_prediction_single %>% filter(importance_measure == best_metric) - + performances_target_prediction_averaged = ligands_evaluation %>% lapply(function(x){x}) %>% lapply(wrapper_average_performances, output_evaluation$performances_target_prediction,"median") %>% bind_rows() %>% drop_na() performances_ligand_prediction_single_summary_averaged = ligands_evaluation %>% lapply(function(x){x}) %>% lapply(wrapper_average_performances, performances_ligand_prediction_single_summary %>% select(-importance_measure),"median") %>% bind_rows() %>% drop_na() - + mean_auroc_target_prediction = performances_target_prediction_averaged$auroc %>% mean(na.rm = TRUE) %>% unique() mean_aupr_target_prediction = performances_target_prediction_averaged$aupr_corrected %>% mean(na.rm = TRUE) %>% unique() - + # we want also to look at median ligand prediction, but also the mean: why? median focuses on improving half of the datasets, but can lead to ignorance of a few bad datasets -- try to semi-avoid this with the mean ## combine both mean and median median_auroc_ligand_prediction = performances_ligand_prediction_single_summary_averaged$auroc %>% median(na.rm = TRUE) %>% unique() median_aupr_ligand_prediction = performances_ligand_prediction_single_summary_averaged$aupr_corrected %>% median(na.rm = TRUE) %>% unique() - + mean_auroc_ligand_prediction = performances_ligand_prediction_single_summary_averaged$auroc %>% mean(na.rm = TRUE) %>% unique() mean_aupr_ligand_prediction = performances_ligand_prediction_single_summary_averaged$aupr_corrected %>% mean(na.rm = TRUE) %>% unique() - + score_auroc_ligand_prediction = (median_auroc_ligand_prediction + mean_auroc_ligand_prediction) * 0.5 score_aupr_ligand_prediction = (median_aupr_ligand_prediction + mean_aupr_ligand_prediction) * 0.5 - - return(c(mean_auroc_target_prediction*-1, mean_aupr_target_prediction*-1, + + return(c(mean_auroc_target_prediction*-1, mean_aupr_target_prediction*-1, score_auroc_ligand_prediction*-1, score_aupr_ligand_prediction*-1)) } #' @title Run NSGA-II for parameter optimization. #' @description \code{run_nsga2R_cluster} runs the NSGA-II algorithm for parameter optimization and allows for parallelization. The core of this function is adapted from \code{nsga2R::nsga2R}. -#' @usage +#' @usage #' run_nsga2R_cluster(run_id, fn, varNo, objDim, lowerBounds = rep(-Inf, varNo), upperBounds = rep(Inf, varNo), popSize = 100, tourSize = 2, generations = 20, cprob = 0.7, XoverDistIdx = 5, mprob = 0.2, MuDistIdx = 10, ncores = 24) -#' +#' #' @param fn The function to be optimized, usually \code{model_evaluation_optimization_nsga2}. #' @param varNo The number of variables to be optimized, usually the number of data sources + 4 hyperparameters (lr_sig_hub, gr_hub, ltf_cutoff, damping_factor). #' @param objDim Number of objective functions @@ -547,7 +547,7 @@ model_evaluation_optimization_nsga2r = function(y, source_names, algorithm, corr #' @param MuDistIdx The mutation distribution index, it can be any nonnegative real number (default: 10) #' @param ncores The number of cores to be used for parallelization (default: 24) #' @param ... Additional arguments to \code{fn}. -#' +#' #' @return An 'nsga2R' object containing input settings and the following elements: #' \itemize{ #' \item intermed_output_list_params: a list with intermediate values of parameters for each generation (each element has dimensions popSize x varNo) @@ -557,7 +557,7 @@ model_evaluation_optimization_nsga2r = function(y, source_names, algorithm, corr #' \item paretoFrontRank: nondomination ranks (or levels) that each non-dominated solution belongs to #' \item crowdingDistance: crowding distances of each non-dominated solution #' } -#' +#' #' @examples #' \dontrun{ #' source_names <- c(lr_network$source, sig_network$source, gr_network$source) %>% unique() @@ -568,37 +568,37 @@ model_evaluation_optimization_nsga2r = function(y, source_names, algorithm, corr #' results <- run_nsga2R_cluster(model_evaluation_optimization_nsga2r, varNo=n_param, objDim=n_obj, #' lowerBounds=lower_bounds, upperBounds=upper_bounds, popSize = 360, tourSize = 2, generations = 15, ncores = 8) #' } -#' +#' #' @export -run_nsga2R_cluster = function (fn, varNo, objDim, lowerBounds = rep(-Inf, varNo), - upperBounds = rep(Inf, varNo), popSize = 100, tourSize = 2, - generations = 20, ncores = 24, cprob = 0.7, XoverDistIdx = 5, mprob = 0.2, +run_nsga2R_cluster = function (fn, varNo, objDim, lowerBounds = rep(-Inf, varNo), + upperBounds = rep(Inf, varNo), popSize = 100, tourSize = 2, + generations = 20, ncores = 24, cprob = 0.7, XoverDistIdx = 5, mprob = 0.2, MuDistIdx = 10, ...) { library(nsga2R) library(dplyr) library(tidyr) - + intermed_output_list_params = list() intermed_output_list_obj = list() - + doMC::registerDoMC(ncores) - + cat("********** R based Nondominated Sorting Genetic Algorithm II *********") cat("\n") cat("initializing the population") cat("\n") - + print(Sys.time()) - - parent <- t(sapply(1:popSize, function(u) array(runif(length(lowerBounds), + + parent <- t(sapply(1:popSize, function(u) array(runif(length(lowerBounds), lowerBounds, upperBounds)))) # parent_old = parent # parent_classic <- cbind(parent, t(apply(parent, 1, fn))) parent <- cbind(parent, t(parallel::mclapply(split(parent, 1:nrow(parent)), fn, mc.cores = ncores, ...) %>% unlist() %>% matrix(nrow = objDim))) - + cat("ranking the initial population") cat("\n") - ranking <- fastNonDominatedSorting(parent[, (varNo + 1):(varNo + + ranking <- fastNonDominatedSorting(parent[, (varNo + 1):(varNo + objDim)]) rnkIndex <- integer(popSize) i <- 1 @@ -609,15 +609,15 @@ run_nsga2R_cluster = function (fn, varNo, objDim, lowerBounds = rep(-Inf, varNo) parent <- cbind(parent, rnkIndex) cat("crowding distance calculation") cat("\n") - objRange <- apply(parent[, (varNo + 1):(varNo + objDim)], - 2, max) - apply(parent[, (varNo + 1):(varNo + objDim)], + objRange <- apply(parent[, (varNo + 1):(varNo + objDim)], + 2, max) - apply(parent[, (varNo + 1):(varNo + objDim)], 2, min) cd <- crowdingDist4frnt(parent, ranking, objRange) parent <- cbind(parent, apply(cd, 1, sum)) for (iter in 1:generations) { print(iter) print(Sys.time()) - cat("---------------generation---------------", + cat("---------------generation---------------", iter, "starts") cat("\n") cat("tournament selection") @@ -625,25 +625,25 @@ run_nsga2R_cluster = function (fn, varNo, objDim, lowerBounds = rep(-Inf, varNo) matingPool <- tournamentSelection(parent, popSize, tourSize) cat("crossover operator") cat("\n") - childAfterX <- boundedSBXover(matingPool[, 1:varNo], + childAfterX <- boundedSBXover(matingPool[, 1:varNo], lowerBounds, upperBounds, cprob, XoverDistIdx) cat("mutation operator") cat("\n") - childAfterM <- boundedPolyMutation(childAfterX, lowerBounds, + childAfterM <- boundedPolyMutation(childAfterX, lowerBounds, upperBounds, mprob, MuDistIdx) cat("evaluate the objective fns of childAfterM") cat("\n") # childAfterM_old = childAfterM - # childAfterM_classic <- cbind(childAfterM, t(apply(childAfterM, + # childAfterM_classic <- cbind(childAfterM, t(apply(childAfterM, # 1, fn))) childAfterM <- cbind(childAfterM, t(parallel::mclapply(split(childAfterM, 1:nrow(childAfterM)), fn, mc.cores = ncores) %>% unlist() %>% matrix(nrow = objDim))) - + cat("Rt = Pt + Qt") cat("\n") parentNext <- rbind(parent[, 1:(varNo + objDim)], childAfterM) cat("ranking again") cat("\n") - ranking <- fastNonDominatedSorting(parentNext[, (varNo + + ranking <- fastNonDominatedSorting(parentNext[, (varNo + 1):(varNo + objDim)]) i <- 1 while (i <= length(ranking)) { @@ -653,51 +653,51 @@ run_nsga2R_cluster = function (fn, varNo, objDim, lowerBounds = rep(-Inf, varNo) parentNext <- cbind(parentNext, rnkIndex) cat("crowded comparison again") cat("\n") - objRange <- apply(parentNext[, (varNo + 1):(varNo + objDim)], - 2, max) - apply(parentNext[, (varNo + 1):(varNo + + objRange <- apply(parentNext[, (varNo + 1):(varNo + objDim)], + 2, max) - apply(parentNext[, (varNo + 1):(varNo + objDim)], 2, min) cd <- crowdingDist4frnt(parentNext, ranking, objRange) parentNext <- cbind(parentNext, apply(cd, 1, sum)) - parentNext.sort <- parentNext[order(parentNext[, varNo + - objDim + 1], -parentNext[, varNo + objDim + 2]), + parentNext.sort <- parentNext[order(parentNext[, varNo + + objDim + 1], -parentNext[, varNo + objDim + 2]), ] cat("environmental selection") cat("\n") parent <- parentNext.sort[1:popSize, ] - cat("---------------generation---------------", + cat("---------------generation---------------", iter, "ends") cat("\n") if (iter != generations) { - # intermed_output_list_params[[iter]] = parent[, 1:varNo] + # intermed_output_list_params[[iter]] = parent[, 1:varNo] # intermed_output_list_obj[[iter]] = parent[, (varNo + 1):(varNo + objDim)] cat("\n") cat("********** new iteration *********") cat("\n") - + } else { cat("********** stop the evolution *********") cat("\n") } - intermed_output_list_params[[iter]] = parent[, 1:varNo] + intermed_output_list_params[[iter]] = parent[, 1:varNo] intermed_output_list_obj[[iter]] = parent[, (varNo + 1):(varNo + objDim)] } - result = list(intermed_output_list_params = intermed_output_list_params, - intermed_output_list_obj = intermed_output_list_obj, - functions = fn, - parameterDim = varNo, - objectiveDim = objDim, - lowerBounds = lowerBounds, - upperBounds = upperBounds, - popSize = popSize, - tournamentSize = tourSize, - generations = generations, - XoverProb = cprob, - XoverDistIndex = XoverDistIdx, - mutationProb = mprob, - mutationDistIndex = MuDistIdx, - parameters = parent[,1:varNo], - objectives = parent[, (varNo + 1):(varNo + objDim)], + result = list(intermed_output_list_params = intermed_output_list_params, + intermed_output_list_obj = intermed_output_list_obj, + functions = fn, + parameterDim = varNo, + objectiveDim = objDim, + lowerBounds = lowerBounds, + upperBounds = upperBounds, + popSize = popSize, + tournamentSize = tourSize, + generations = generations, + XoverProb = cprob, + XoverDistIndex = XoverDistIdx, + mutationProb = mprob, + mutationDistIndex = MuDistIdx, + parameters = parent[,1:varNo], + objectives = parent[, (varNo + 1):(varNo + objDim)], paretoFrontRank = parent[, varNo + objDim + 1], crowdingDistance = parent[, varNo + objDim + 2]) class(result) = "nsga2R" @@ -708,31 +708,31 @@ run_nsga2R_cluster = function (fn, varNo, objDim, lowerBounds = rep(-Inf, varNo) #' @description \code{get_optimized_parameters_nsga2} will take as input the output of \code{run_nsga2R_cluster} and extract the optimal parameter values, either from the best solution at the end of the generations or the best solution across all generations. #' @usage #' get_optimized_parameters_nsga2(result_nsga2r, source_names, search_all_iterations = FALSE, top_n = NULL, summarise_weights = TRUE) -#' +#' #' @param result_nsga2r The output of \code{run_nsga2R_cluster}. #' @param source_names Character vector containing the names of the data sources. #' @param search_all_iterations Logical indicating whether the best solution across all generations should be considered (TRUE) or only the best solution at the end of the generations (FALSE). #' @param top_n If search_all_iterations=TRUE, this indicates how many of the best solutions should be considered. #' @param summarise_weights If search_all_iterations=TRUE, a logical indicating whether the weights should be summarised by taking the mean and median. -#' +#' #' @return A list containing two dataframes, the optimal data source weights and the optimal hyperparameters. -#' +#' #' @examples #' \dontrun{ #' results <- run_nsga2R_cluster(model_evaluation_optimization_nsga2r, varNo=n_param, objDim=n_obj, #' lowerBounds=lower_bounds, upperBounds=upper_bounds, popSize = 360, tourSize = 2, generations = 15, ncores = 8) -#' +#' #' # Get the best solution at the end of the generations #' optimized_parameters <- get_optimized_parameters_nsga2(results, source_names, search_all_iterations = FALSE, top_n = NULL, summarise_weights = TRUE) -#' +#' #' # Get the best solution across all generations, consider top 25 solutions and summarise weights #' optimized_parameters <- get_optimized_parameters_nsga2(results, source_names, search_all_iterations = TRUE, top_n = 25, summarise_weights = TRUE) #' } -#' +#' #' @export -#' +#' get_optimized_parameters_nsga2r = function(result_nsga2r, source_names, search_all_iterations = FALSE, top_n = NULL, summarise_weights = TRUE){ - + if (!search_all_iterations & is.numeric(top_n)){ message("search_all_iterations is FALSE, so top_n will be ignored") } @@ -763,7 +763,7 @@ get_optimized_parameters_nsga2r = function(result_nsga2r, source_names, search_a if (!search_all_iterations){ # take the best parameter setting considering the geometric mean of the objective function results - parameter_set_index = processed_optimization %>% filter(average == max(average)) %>% .$index + parameter_set_index = processed_optimization %>% filter(average == max(average)) %>% .$index params = optimization_results$parameters[parameter_set_index,] # Data source weights @@ -780,16 +780,16 @@ get_optimized_parameters_nsga2r = function(result_nsga2r, source_names, search_a lapply(function(index) { params = optimization_results$intermed_output_list_params %>% lapply(data.frame) %>% bind_rows() %>% .[index,] %>% as.double() source_weights = tibble(source = source_names, weight = params[1:length(source_names)], index = index) - + other_params = params[(length(source_names)+1): length(params)] other_params_df = tibble(parameter = hyperparam_names, weight = c(other_params[1], other_params[2], other_params[3], other_params[4]), index = index) - list(source_weights = source_weights, + list(source_weights = source_weights, hyperparams = other_params_df) }) - + # Extract data source and hyperparameter weights - source_weight_df <- purrr::map(all_weights, "source_weights") %>% bind_rows() %>% + source_weight_df <- purrr::map(all_weights, "source_weights") %>% bind_rows() %>% mutate_cond(weight <= 0.025 & source %in% source_names_zero_possible, weight = 0) %>% mutate_cond(weight >= 0.975 & source %in% source_names, weight = 1) %>% # If summarise_weights is TRUE, summarise weights by taking the mean and median @@ -1091,10 +1091,16 @@ evaluate_model_cv = function(parameters_setting, lr_network, sig_network, gr_net ligand_importances$spearman[is.na(ligand_importances$spearman)] = 0 ligand_importances$pearson_log_pval[is.na(ligand_importances$pearson_log_pval)] = 0 ligand_importances$spearman_log_pval[is.na(ligand_importances$spearman_log_pval)] = 0 - ligand_importances$mean_rank_GST_log_pval[is.na(ligand_importances$mean_rank_GST_log_pval)] = 0 ligand_importances$pearson_log_pval[is.infinite(ligand_importances$pearson_log_pval)] = 10000 ligand_importances$spearman_log_pval[is.infinite(ligand_importances$spearman_log_pval)] = 10000 - ligand_importances$mean_rank_GST_log_pval[is.infinite(ligand_importances$mean_rank_GST_log_pval)] = 10000 + + + if ("mean_rank_GST_log_pval" %in% colnames(ligand_importances)){ + ligand_importances$mean_rank_GST_log_pval[is.na(ligand_importances$mean_rank_GST_log_pval)] = 0 + ligand_importances$mean_rank_GST_log_pval[is.infinite(ligand_importances$mean_rank_GST_log_pval)] = 10000 + } else { + warning("mean_rank_GST_log_pval not in ligand_importances; do you have limma installed?") + } # all_importances = ligand_importances %>% select_if(.predicate = function(x){sum(is.na(x)) == 0}) @@ -1111,24 +1117,24 @@ evaluate_model_cv = function(parameters_setting, lr_network, sig_network, gr_net #' @description \code{visualize_parameter_values} will take as input the output of \code{run_nsga2R_cluster} and visualize the data source weights and hyperparameters of the best and worst solutions #' @usage #' visualize_parameter_values(result_nsga2r, source_names, top_ns = c(5, 10, -10, -25)) -#' +#' #' @param result_nsga2r The output of \code{run_nsga2R_cluster}. #' @param source_names Character vector containing the names of the data sources. #' @param top_ns Numeric vector indicating how many of the best and worst solutions should be considered (negative values indicate the worst solutions; default: c(5, 10, -10, -25)). -#' +#' #' @return A list containing two ggplot objects, one for the data source weights and one for the hyperparameters. -#' +#' #' @examples #' \dontrun{ #' results <- run_nsga2R_cluster(model_evaluation_optimization_nsga2r, varNo=n_param, objDim=n_obj, #' lowerBounds=lower_bounds, upperBounds=upper_bounds, popSize = 360, tourSize = 2, generations = 15, ncores = 8) -#' +#' #' # Visualize the best and worst 5 solutions #' visualize_parameter_values(results, source_names, top_ns = c(5, -5)) #' } -#' +#' #' @export -#' +#' visualize_parameter_values = function(result_nsga2r, source_names, top_ns = c(5, 10, -10, -25)){ optimized_params <- lapply(top_ns, function(n) { @@ -1148,16 +1154,16 @@ visualize_parameter_values = function(result_nsga2r, source_names, top_ns = c(5, #' @title Visualize parameter values from the output of \code{run_nsga2R_cluster} across cross-validation folds. #' @description \code{visualize_parameter_values_across_folds} will take as input the output of \code{run_nsga2R_cluster} and visualize the data source weights and hyperparameters of the best solutions across all folds. -#' +#' #' @usage #' visualize_parameter_values_across_folds(result_nsga2r_list, source_names, top_n = 25) -#' +#' #' @param result_nsga2r_list A list containing the outputs of \code{run_nsga2R_cluster} for each cross-validation fold. #' @param source_names Character vector containing the names of the data sources. #' @param top_n Numeric indicating how many of the best solutions should be considered. -#' +#' #' @return A list containing two ggplot objects, one for the data source weights and one for the hyperparameters. -#' +#' #' @examples #' \dontrun{ #' results_list <- lapply(cv_folds, function(fold){ @@ -1171,14 +1177,14 @@ visualize_parameter_values = function(result_nsga2r, source_names, top_ns = c(5, #' source_names = source_names, algorithm = "PPR", correct_topology = FALSE, lr_network = lr_network, sig_network = lr_network, gr_network = gr_network_subset, #' settings = settings, secondary_targets = FALSE, remove_direct_links = "no", damping_factor = NULL) #' }) -#' +#' #' # Visualize the best 25 solutions across all folds #' visualize_parameter_values_across_folds(results_list, source_names, top_n = 25) #' } -#' +#' #' @export visualize_parameter_values_across_folds = function(result_nsga2r_list, source_names, top_n = 25){ - + # Get best weights for each fold optimized_params <- lapply(1:length(result_nsga2r_list), function(i){ get_optimized_parameters_nsga2r(result_nsga2r_list[[i]], source_names, search_all_iterations = TRUE, top_n = top_n, summarise_weights = FALSE) %>% @@ -1191,7 +1197,7 @@ visualize_parameter_values_across_folds = function(result_nsga2r_list, source_na hyperparameters_boxplot = purrr::map(optimized_params, "hyperparams_df") %>% bind_rows() %>% ggplot(aes(x = fold, y = weight, group = fold, color = fold)) + geom_boxplot() + geom_point() + facet_wrap(.~parameter) + theme_bw() - - + + return(list(source_weights_boxplot = source_weights_boxplot, hyperparameters_boxplot = hyperparameters_boxplot)) -} \ No newline at end of file +} diff --git a/R/prioritization.R b/R/prioritization.R index 89baf42..79b2933 100644 --- a/R/prioritization.R +++ b/R/prioritization.R @@ -125,14 +125,26 @@ get_exprs_avg = function(seurat_obj, celltype_colname, # Subset seurat object if (!is.null(condition_oi)) { - seurat_obj = seurat_obj[,seurat_obj[[condition_colname]] == condition_oi] + seurat_obj <- seurat_obj[,seurat_obj[[condition_colname]] == condition_oi] } - seurat_obj <- NormalizeData(seurat_obj, verbose = FALSE) + celltypes <- unique(seurat_obj[[celltype_colname, drop=TRUE]]) + avg_celltype <- AverageExpression(seurat_obj, group.by = celltype_colname, assays = assay_oi, ...) %>% .[[assay_oi]] %>% data.frame(check.names=FALSE) %>% rownames_to_column("gene") %>% pivot_longer(!gene, names_to = "cluster_id", values_to = "avg_expr") + # If any celltypes had an underscore in their name + if (any(grepl("_", celltypes))){ + # Map the new names and the original names + # This is so it works in the case the original name also already has a hyphen in in + mapping <- data.frame(orig_name = sort(celltypes), cluster_id = sort(unique(avg_celltype$cluster_id))) + avg_celltype <- avg_celltype %>% left_join(mapping, by = "cluster_id") %>% + dplyr::select(gene, cluster_id = orig_name, avg_expr) + + + } + return (avg_celltype) } @@ -339,7 +351,7 @@ generate_info_tables <- function(seuratObj, FindMarkers_args <- list(object = seuratObj, ident.1 = condition_oi, ident.2 = condition_reference, - group.by = "aggregate", + group.by = condition_colname, assay = assay_oi, features = unique(unlist(lr_network_filtered)), min.pct = 0, @@ -389,11 +401,11 @@ generate_info_tables <- function(seuratObj, #' Additionally, the following columns are added: #' \itemize{ #' \item \code{lfc_pval_*}: product of -log10(pval) and the LFC of the ligand/receptor -#' \item \code{p_val_*_adapted}: p-value adapted to the sign of the LFC to only consider interactions where the ligand/receptor is upregulated in the sender/receiver +#' \item \code{p_val_adapted_*}: p-value adapted to the sign of the LFC to only consider interactions where the ligand/receptor is upregulated in the sender/receiver #' \item \code{activity_zscore}: z-score of the ligand activity #' \item \code{prioritization_score}: The prioritization score for each interaction, calculated as a weighted sum of the prioritization criteria. #' } -#' Moreover, \code{scaled_*} columns are scaled using the corresponding column's ranking or the \code{scale_quantile_adapted} function. The columns used for prioritization are scaled_p_val_ligand_adapted, scaled_p_val_receptor_adapted, scaled_activity, scaled_avg_exprs_ligand, scaled_avg_exprs_receptor, scaled_p_val_ligand_adapted_group, scaled_p_val_receptor_adapted_group +#' Moreover, \code{scaled_*} columns are scaled using the corresponding column's ranking or the \code{scale_quantile_adapted} function. The columns used for prioritization are scaled_p_val_adapted_ligand, scaled_p_val_adapted_receptor, scaled_activity, scaled_avg_exprs_ligand, scaled_avg_exprs_receptor, scaled_p_val_adapted_ligand_group, scaled_p_val_adapted_receptor_group #' #' @import dplyr #' @@ -499,21 +511,21 @@ generate_prioritization_tables = function(sender_receiver_info, sender_receiver_ sender_ligand_prioritization = sender_receiver_de %>% dplyr::ungroup() %>% dplyr::select(sender, ligand, lfc_ligand, p_val_ligand) %>% dplyr::distinct() %>% dplyr::mutate(lfc_pval_ligand = -log10(p_val_ligand)*lfc_ligand, - p_val_ligand_adapted = -log10(p_val_ligand)*sign(lfc_ligand)) + p_val_adapted_ligand = -log10(p_val_ligand)*sign(lfc_ligand)) sender_ligand_prioritization = sender_ligand_prioritization %>% dplyr::mutate(scaled_lfc_ligand = rank(lfc_ligand, ties.method = "average", na.last = FALSE)/max(rank(lfc_ligand, ties.method = "average", na.last = FALSE)), scaled_p_val_ligand = rank(desc(p_val_ligand), ties.method = "average", na.last = FALSE)/max(rank(desc(p_val_ligand), ties.method = "average", na.last = FALSE)), scaled_lfc_pval_ligand = rank(lfc_pval_ligand, ties.method = "average", na.last = FALSE)/max(rank(lfc_pval_ligand, ties.method = "average", na.last = FALSE)), - scaled_p_val_ligand_adapted = rank(p_val_ligand_adapted, ties.method = "average", na.last = FALSE)/max(rank(p_val_ligand_adapted, ties.method = "average", na.last = FALSE))) %>% + scaled_p_val_adapted_ligand = rank(p_val_adapted_ligand, ties.method = "average", na.last = FALSE)/max(rank(p_val_adapted_ligand, ties.method = "average", na.last = FALSE))) %>% dplyr::arrange(-lfc_pval_ligand) # Receptor DE prioritization receiver_receptor_prioritization = sender_receiver_de %>% dplyr::ungroup() %>% dplyr::select(receiver, receptor, lfc_receptor, p_val_receptor) %>% dplyr::distinct() %>% dplyr::mutate(lfc_pval_receptor = -log10(p_val_receptor)*lfc_receptor, - p_val_receptor_adapted = -log10(p_val_receptor)*sign(lfc_receptor) ) + p_val_adapted_receptor = -log10(p_val_receptor)*sign(lfc_receptor) ) receiver_receptor_prioritization = receiver_receptor_prioritization %>% dplyr::mutate(scaled_lfc_receptor = rank(lfc_receptor, ties.method = "average", na.last = FALSE)/max(rank(lfc_receptor, ties.method = "average", na.last = FALSE)), scaled_p_val_receptor = rank(desc(p_val_receptor), ties.method = "average", na.last = FALSE)/max(rank(desc(p_val_receptor), ties.method = "average", na.last = FALSE)), scaled_lfc_pval_receptor = rank(lfc_pval_receptor, ties.method = "average", na.last = FALSE)/max(rank(lfc_pval_receptor, ties.method = "average", na.last = FALSE)), - scaled_p_val_receptor_adapted = rank(p_val_receptor_adapted, ties.method = "average", na.last = FALSE)/max(rank(p_val_receptor_adapted, ties.method = "average", na.last = FALSE))) %>% dplyr::arrange(-lfc_pval_receptor) + scaled_p_val_adapted_receptor = rank(p_val_adapted_receptor, ties.method = "average", na.last = FALSE)/max(rank(p_val_adapted_receptor, ties.method = "average", na.last = FALSE))) %>% dplyr::arrange(-lfc_pval_receptor) # Ligand activity prioritization ligand_activity_prioritization = ligand_activities %>% @@ -535,21 +547,21 @@ generate_prioritization_tables = function(sender_receiver_info, sender_receiver_ # Condition specificity of ligand (upregulation) ligand_condition_prioritization = lr_condition_de %>% dplyr::ungroup() %>% dplyr::select(ligand, lfc_ligand, p_val_ligand) %>% dplyr::distinct() %>% dplyr::mutate(lfc_pval_ligand = -log10(p_val_ligand)*lfc_ligand, - p_val_ligand_adapted = -log10(p_val_ligand)*sign(lfc_ligand)) + p_val_adapted_ligand = -log10(p_val_ligand)*sign(lfc_ligand)) ligand_condition_prioritization = ligand_condition_prioritization %>% dplyr::mutate(scaled_lfc_ligand = rank(lfc_ligand, ties.method = "average", na.last = FALSE)/max(rank(lfc_ligand, ties.method = "average", na.last = FALSE)), scaled_p_val_ligand = rank(desc(p_val_ligand), ties.method = "average", na.last = FALSE)/max(rank(desc(p_val_ligand), ties.method = "average", na.last = FALSE)), scaled_lfc_pval_ligand = rank(lfc_pval_ligand, ties.method = "average", na.last = FALSE)/max(rank(lfc_pval_ligand, ties.method = "average", na.last = FALSE)), - scaled_p_val_ligand_adapted = rank(p_val_ligand_adapted, ties.method = "average", na.last = FALSE)/max(rank(p_val_ligand_adapted, ties.method = "average", na.last = FALSE))) %>% + scaled_p_val_adapted_ligand = rank(p_val_adapted_ligand, ties.method = "average", na.last = FALSE)/max(rank(p_val_adapted_ligand, ties.method = "average", na.last = FALSE))) %>% dplyr::arrange(-lfc_pval_ligand) %>% rename_with(.fn = function(column_name) paste0(column_name, "_group"), .cols = -ligand) # Condition specificity of receptor (upregulation) receptor_condition_prioritization = lr_condition_de %>% dplyr::ungroup() %>% dplyr::select(receptor, lfc_receptor, p_val_receptor) %>% dplyr::distinct() %>% dplyr::mutate(lfc_pval_receptor = -log10(p_val_receptor)*lfc_receptor, - p_val_receptor_adapted = -log10(p_val_receptor)*sign(lfc_receptor)) + p_val_adapted_receptor = -log10(p_val_receptor)*sign(lfc_receptor)) receptor_condition_prioritization = receptor_condition_prioritization %>% dplyr::mutate(scaled_lfc_receptor = rank(lfc_receptor, ties.method = "average", na.last = FALSE)/max(rank(lfc_receptor, ties.method = "average", na.last = FALSE)), scaled_p_val_receptor = rank(desc(p_val_receptor), ties.method = "average", na.last = FALSE)/max(rank(desc(p_val_receptor), ties.method = "average", na.last = FALSE)), scaled_lfc_pval_receptor = rank(lfc_pval_receptor, ties.method = "average", na.last = FALSE)/max(rank(lfc_pval_receptor, ties.method = "average", na.last = FALSE)), - scaled_p_val_receptor_adapted = rank(p_val_receptor_adapted, ties.method = "average", na.last = FALSE)/max(rank(p_val_receptor_adapted, ties.method = "average", na.last = FALSE))) %>% + scaled_p_val_adapted_receptor = rank(p_val_adapted_receptor, ties.method = "average", na.last = FALSE)/max(rank(p_val_adapted_receptor, ties.method = "average", na.last = FALSE))) %>% dplyr::arrange(-lfc_pval_receptor) %>% rename_with(.fn = function(column_name) paste0(column_name, "_group"), .cols = -receptor) } else { @@ -572,20 +584,23 @@ generate_prioritization_tables = function(sender_receiver_info, sender_receiver_ # have a weighted average the final score (no product!!) - sum_prioritization_weights = weights["de_ligand"] + weights["de_receptor"] + weights["activity_scaled"] + weights["exprs_ligand"] + weights["exprs_receptor"] + weights["ligand_condition_specificity"] + weights["receptor_condition_specificity"] + sum_prioritization_weights = 0.5*weights["de_ligand"] + 0.5*weights["de_receptor"] + weights["activity_scaled"] + 0.5*weights["exprs_ligand"] + 0.5*weights["exprs_receptor"] + weights["ligand_condition_specificity"] + weights["receptor_condition_specificity"] group_prioritization_tbl = group_prioritization_tbl %>% rowwise() %>% dplyr::mutate(prioritization_score = ( - (prioritizing_weights["de_ligand"] * ifelse("scaled_p_val_ligand_adapted" %in% names(group_prioritization_tbl), scaled_p_val_ligand_adapted, 0)) + - (prioritizing_weights["de_receptor"] * ifelse("scaled_p_val_receptor_adapted" %in% names(group_prioritization_tbl), scaled_p_val_receptor_adapted, 0)) + + (0.5*prioritizing_weights["de_ligand"] * ifelse("scaled_p_val_adapted_ligand" %in% names(group_prioritization_tbl), scaled_p_val_adapted_ligand, 0)) + + (0.5*prioritizing_weights["de_receptor"] * ifelse("scaled_p_val_adapted_receptor" %in% names(group_prioritization_tbl), scaled_p_val_adapted_receptor, 0)) + (prioritizing_weights["activity_scaled"] * ifelse("scaled_activity" %in% names(group_prioritization_tbl), scaled_activity, 0)) + - (prioritizing_weights["exprs_ligand"] * ifelse("scaled_avg_exprs_ligand" %in% names(group_prioritization_tbl), scaled_avg_exprs_ligand, 0)) + - (prioritizing_weights["exprs_receptor"] * ifelse("scaled_avg_exprs_receptor" %in% names(group_prioritization_tbl), scaled_avg_exprs_receptor, 0)) + - (prioritizing_weights["ligand_condition_specificity"] * ifelse("scaled_p_val_ligand_adapted_group" %in% names(group_prioritization_tbl), scaled_p_val_ligand_adapted_group, 0)) + - (prioritizing_weights["receptor_condition_specificity"] * ifelse("scaled_p_val_receptor_adapted_group" %in% names(group_prioritization_tbl), scaled_p_val_receptor_adapted_group, 0)) + (0.5*prioritizing_weights["exprs_ligand"] * ifelse("scaled_avg_exprs_ligand" %in% names(group_prioritization_tbl), scaled_avg_exprs_ligand, 0)) + + (0.5*prioritizing_weights["exprs_receptor"] * ifelse("scaled_avg_exprs_receptor" %in% names(group_prioritization_tbl), scaled_avg_exprs_receptor, 0)) + + (prioritizing_weights["ligand_condition_specificity"] * ifelse("scaled_p_val_adapted_ligand_group" %in% names(group_prioritization_tbl), scaled_p_val_adapted_ligand_group, 0)) + + (prioritizing_weights["receptor_condition_specificity"] * ifelse("scaled_p_val_adapted_receptor_group" %in% names(group_prioritization_tbl), scaled_p_val_adapted_receptor_group, 0)) )* (1/sum_prioritization_weights)) %>% dplyr::arrange(-prioritization_score) %>% ungroup() + # Add rank + group_prioritization_tbl = group_prioritization_tbl %>% dplyr::mutate(prioritization_rank = rank(desc(prioritization_score))) + return (group_prioritization_tbl) } diff --git a/R/supporting_functions.R b/R/supporting_functions.R index 90752c7..dbfb2b2 100644 --- a/R/supporting_functions.R +++ b/R/supporting_functions.R @@ -197,7 +197,7 @@ convert_alias_to_symbols = function(aliases, organism, verbose = TRUE){ #' @description \code{alias_to_symbol_seurat} Convert aliases to official gene symbols in a Seurat Object. Makes use of `convert_alias_to_symbols` #' @usage alias_to_symbol_seurat(seurat_obj, organism) #' -#' @param seurat_obj Seurat object +#' @param seurat_obj Seurat object, v4 or below. For Seurat v5, a warning is thrown and the same object will be returned. #' @param organism Is Seurat object data from "mouse" or "human" #' #' @return Seurat object @@ -219,13 +219,14 @@ alias_to_symbol_seurat = function(seurat_obj, organism) { # Stop if obj_version is seurat v5 if (obj_version >= 5){ - stop("This function is not supported for Seurat v5 objects. Consider using `convert_alias_to_symbols` on your original expression matrix and creating a new Seurat object instead. + warning("This function is not supported for Seurat v5 objects, so the same object will be returned. Consider using `convert_alias_to_symbols` on your original expression matrix and creating a new Seurat object instead. If this is not feasible, consider checking out Seurat.utils::RenameGenesSeurat.") + + return(seurat_obj) } assays <- Assays(seurat_obj) - convert_newnames <- function(feature_names, organism, verbose = FALSE) { newnames <- convert_alias_to_symbols(feature_names, organism = organism, verbose = verbose) @@ -529,33 +530,33 @@ classification_evaluation_continuous_pred = function(prediction,response, iregul cor_p_pval = suppressWarnings(cor.test(as.numeric(prediction), as.numeric(response))) %>% .$p.value cor_s_pval = suppressWarnings(cor.test(as.numeric(prediction), as.numeric(response), method = "s")) %>% .$p.value - mean_rank_GST = limma::wilcoxGST(response, prediction) - #### now start calculating the AUC-iRegulon + # Mean rank GST calculated if limma is installed + mean_rank_GST = ifelse(rlang::is_installed("limma"), limma::wilcoxGST(response, prediction), NA) + + # Calculate the AUC-iRegulon + output_iregulon = list() + if (iregulon){ + output_iregulon = calculate_auc_iregulon(prediction,response) + } + tbl_perf = tibble(auroc = auroc, aupr = aupr, aupr_corrected = aupr - aupr_random, sensitivity_roc = sensitivity, specificity_roc = specificity, mean_rank_GST_log_pval = -log(mean_rank_GST), + auc_iregulon = output_iregulon$auc_iregulon, + auc_iregulon_corrected = output_iregulon$auc_iregulon_corrected, pearson_log_pval = -log10(cor_p_pval), spearman_log_pval = -log10(cor_s_pval), pearson = cor_p, spearman = cor_s) - if (iregulon == TRUE){ - output_iregulon = calculate_auc_iregulon(prediction,response) - tbl_perf = tibble(auroc = auroc, - aupr = aupr, - aupr_corrected = aupr - aupr_random, - sensitivity_roc = sensitivity, - specificity_roc = specificity, - mean_rank_GST_log_pval = -log(mean_rank_GST), - auc_iregulon = output_iregulon$auc_iregulon, - auc_iregulon_corrected = output_iregulon$auc_iregulon_corrected, - pearson_log_pval = -log10(cor_p_pval), - spearman_log_pval = -log10(cor_s_pval), - pearson = cor_p, - spearman = cor_s) + + # Remove mean_rank_GST if limma is not installed + if (!rlang::is_installed("limma")) { + tbl_perf = tbl_perf %>% select(-mean_rank_GST_log_pval) } + return(tbl_perf) } classification_evaluation_categorical_pred = function(predictions, response) { diff --git a/man/alias_to_symbol_seurat.Rd b/man/alias_to_symbol_seurat.Rd index ec19f7e..70eda3d 100644 --- a/man/alias_to_symbol_seurat.Rd +++ b/man/alias_to_symbol_seurat.Rd @@ -7,7 +7,7 @@ alias_to_symbol_seurat(seurat_obj, organism) } \arguments{ -\item{seurat_obj}{Seurat object} +\item{seurat_obj}{Seurat object, v4 or below. For Seurat v5, a warning is thrown and the same object will be returned.} \item{organism}{Is Seurat object data from "mouse" or "human"} } diff --git a/man/assign_ligands_to_celltype.Rd b/man/assign_ligands_to_celltype.Rd index c92543c..f63d285 100644 --- a/man/assign_ligands_to_celltype.Rd +++ b/man/assign_ligands_to_celltype.Rd @@ -4,7 +4,18 @@ \alias{assign_ligands_to_celltype} \title{Assign ligands to cell types} \usage{ -assign_ligands_to_celltype(seuratObj, ligands, celltype_col, func.agg=mean, func.assign=function(x) mean(x) + sd(x), slot="data", condition_oi=NULL, condition_col=NULL) +assign_ligands_to_celltype( + seuratObj, + ligands, + celltype_col, + func.agg = mean, + func.assign = function(x) { + mean(x) + sd(x) + }, + condition_oi = NULL, + condition_col = NULL, + ... +) } \arguments{ \item{seuratObj}{Seurat object} @@ -21,7 +32,7 @@ assign_ligands_to_celltype(seuratObj, ligands, celltype_col, func.agg=mean, func \item{condition_col}{Metadata column name in the Seurat object that contains the condition of interest (default = NULL)} -\item{slot}{Slot in the Seurat object to use (default = "data"). If "data", the normalized counts are first exponentiated before aggregation is performed} +\item{...}{Arguments passed to Seurat::GetAssayData, e.g., for the slot/layer to use (default: data)} } \value{ A data frame of two columns, the cell type the ligand has been assigned to (\code{ligand_type}) and the ligand name (\code{ligand}) @@ -29,11 +40,14 @@ A data frame of two columns, the cell type the ligand has been assigned to (\cod \description{ Assign ligands to a sender cell type, based on the strongest expressing cell type of that ligand. Ligands are only assigned to a cell type if that cell type is the only one to show an expression that is higher than the average + SD. Otherwise, it is assigned to "General". } +\details{ +If the provided slot/layer is "data", the normalized counts are first exponentiated before aggregation is performed +} \examples{ \dontrun{ assign_ligands_to_celltype(seuratObj = seuratObj, ligands = best_upstream_ligands[1:20], celltype_col = "celltype", func.agg = mean, func.assign = function(x) {mean(x)+sd(x)}, - slot = "data", condition_oi = "LCMV", condition_col = "aggregate") + condition_oi = "LCMV", condition_col = "aggregate", slot = "data") } } diff --git a/man/evaluate_target_prediction.Rd b/man/evaluate_target_prediction.Rd index e2af602..89f2849 100644 --- a/man/evaluate_target_prediction.Rd +++ b/man/evaluate_target_prediction.Rd @@ -14,7 +14,8 @@ evaluate_target_prediction(setting,ligand_target_matrix, ligands_position = "col \item{ligands_position}{Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols"} } \value{ -A data.frame with following variables: setting, ligand and for probabilistic predictions: auroc, aupr, aupr_corrected (aupr - aupr for random prediction), sensitivity_roc (proxy measure, inferred from ROC), specificity_roc (proxy measure, inferred from ROC), mean_rank_GST_log_pval (-log10 of p-value of mean-rank gene set test), pearson (correlation coefficient), spearman (correlation coefficient); whereas for categorical predictions: accuracy, recall, specificity, precision, F1, F0.5, F2, mcc, informedness, markedness, fisher_pval_log (which is -log10 of p-value fisher exact test), fisher odds. +A data.frame with following variables: setting, ligand nd for probabilistic predictions: auroc, aupr, aupr_corrected (aupr - aupr for random prediction), sensitivity_roc (proxy measure, inferred from ROC), specificity_roc (proxy measure, inferred from ROC), mean_rank_GST_log_pval (-log10 of p-value of mean-rank gene set test), pearson (correlation coefficient), spearman (correlation coefficient); whereas for categorical predictions: accuracy, recall, specificity, precision, F1, F0.5, F2, mcc, informedness, markedness, fisher_pval_log (which is -log10 of p-value fisher exact test), fisher odds.\cr +"mean_rank_GST_log_pval" will only be included in the dataframe if limma is installed. From NicheNet v2.1.7 onwards, limma is no longer a hard dependency of NicheNet. } \description{ \code{evaluate_target_prediction} Evaluate how well the model (i.e. the inferred ligand-target probability scores) is able to predict the observed response to a ligand (e.g. the set of DE genes after treatment of cells by a ligand). It shows several classification evaluation metrics for the prediction. Different classification metrics are calculated depending on whether the input ligand-target matrix contains probability scores for targets or discrete target assignments. diff --git a/man/generate_prioritization_tables.Rd b/man/generate_prioritization_tables.Rd index 33aa3ee..719a002 100644 --- a/man/generate_prioritization_tables.Rd +++ b/man/generate_prioritization_tables.Rd @@ -32,11 +32,11 @@ The resulting dataframe contains columns from the input dataframes, but columns Additionally, the following columns are added: \itemize{ \item \code{lfc_pval_*}: product of -log10(pval) and the LFC of the ligand/receptor -\item \code{p_val_*_adapted}: p-value adapted to the sign of the LFC to only consider interactions where the ligand/receptor is upregulated in the sender/receiver +\item \code{p_val_adapted_*}: p-value adapted to the sign of the LFC to only consider interactions where the ligand/receptor is upregulated in the sender/receiver \item \code{activity_zscore}: z-score of the ligand activity \item \code{prioritization_score}: The prioritization score for each interaction, calculated as a weighted sum of the prioritization criteria. } -Moreover, \code{scaled_*} columns are scaled using the corresponding column's ranking or the \code{scale_quantile_adapted} function. The columns used for prioritization are scaled_p_val_ligand_adapted, scaled_p_val_receptor_adapted, scaled_activity, scaled_avg_exprs_ligand, scaled_avg_exprs_receptor, scaled_p_val_ligand_adapted_group, scaled_p_val_receptor_adapted_group +Moreover, \code{scaled_*} columns are scaled using the corresponding column's ranking or the \code{scale_quantile_adapted} function. The columns used for prioritization are scaled_p_val_adapted_ligand, scaled_p_val_adapted_receptor, scaled_activity, scaled_avg_exprs_ligand, scaled_avg_exprs_receptor, scaled_p_val_adapted_ligand_group, scaled_p_val_adapted_receptor_group } \description{ User can choose the importance attached to each of the following prioritization criteria: differential expression of ligand and receptor, cell-type specificity of expression of ligand and receptor, NicheNet ligand activity diff --git a/man/make_mushroom_plot.Rd b/man/make_mushroom_plot.Rd index 2d02d29..e3d5a82 100644 --- a/man/make_mushroom_plot.Rd +++ b/man/make_mushroom_plot.Rd @@ -9,20 +9,21 @@ make_mushroom_plot( top_n = 30, show_rankings = FALSE, show_all_datapoints = FALSE, - true_color_range = FALSE, + true_color_range = TRUE, + use_absolute_rank = FALSE, size = "scaled_avg_exprs", - color = "scaled_lfc", + color = "scaled_p_val_adapted", ligand_fill_colors = c("#DEEBF7", "#08306B"), receptor_fill_colors = c("#FEE0D2", "#A50F15"), - unranked_ligand_fill_colors = c(alpha("#FFFFFF", alpha = 0.2), alpha("#252525", alpha + unranked_ligand_fill_colors = c(alpha("#FFFFFF", alpha = 0.2), alpha("#252525", alpha = + 0.2)), + unranked_receptor_fill_colors = c(alpha("#FFFFFF", alpha = 0.2), alpha("#252525", alpha = 0.2)), - unranked_receptor_fill_colors = c(alpha("#FFFFFF", alpha = 0.2), alpha("#252525", - alpha = 0.2)), ... ) } \arguments{ -\item{prioritization_table}{A prioritization table as generated by \code{\link{generate_prioritization_tables}}} +\item{prioritization_table}{A prioritization table as generated by \code{\link{generate_prioritization_tables}}.} \item{top_n}{An integer indicating how many ligand-receptor pairs to show} @@ -30,11 +31,13 @@ make_mushroom_plot( \item{show_all_datapoints}{A logical indicating whether to show all ligand-receptor pairs (default: FALSE, if true they will be grayed out)} -\item{true_color_range}{A logical indicating whether to use the true color range for the ligand-receptor pairs (default: FALSE; range 0-1 is used)} +\item{true_color_range}{A logical indicating whether to use the default color range as determined by ggplot (TRUE, default) or set the limits to a range of 0-1 (FALSE)} + +\item{use_absolute_rank}{A logical indicating to whether use the absolute prioritization rank to filter the top_n ligand-receptor pairs (default: FALSE)} \item{size}{A string indicating which column to use for the size of the semicircles (default: "scaled_avg_exprs"; use column name without "_ligand" or "_receptor" suffix)} -\item{color}{A string indicating which column to use for the color of the semicircles (default: "scaled_lfc"; use column name without "_ligand" or "_receptor" suffix)} +\item{color}{A string indicating which column to use for the color of the semicircles (default: "scaled_p_val_adapted"; use column name without "_ligand" or "_receptor" suffix)} \item{ligand_fill_colors}{A vector of the low and high colors to use for the ligand semicircle fill gradient (default: c("#DEEBF7", "#08306B"))} @@ -42,7 +45,7 @@ make_mushroom_plot( \item{unranked_ligand_fill_colors}{A vector of the low and high colors to use for the unranked ligands when show_all_datapoints is TRUE (default: c(alpha("#FFFFFF", alpha=0.2), alpha("#252525", alpha=0.2)))} -\item{unranked_receptor_fill_colors}{A vector of the low and high colors to use for the unkraed receptors when show_all_datapoints is TRUE (default: c(alpha("#FFFFFF", alpha=0.2), alpha("#252525", alpha=0.2)))} +\item{unranked_receptor_fill_colors}{A vector of the low and high colors to use for the unranked receptors when show_all_datapoints is TRUE (default: c(alpha("#FFFFFF", alpha=0.2), alpha("#252525", alpha=0.2)))} \item{...}{Additional arguments passed to \code{\link{ggplot2::theme}}. As there are often issues with the scales legend, it is recommended to change legend sizes and positions using this argument, i.e., \code{legend.key.height}, \code{legend.key.width}, \code{legend.title}, and \code{legend.text}.} } @@ -52,6 +55,13 @@ A ggplot object \description{ \code{make_mushroom_plot} Make a plot in which each glyph consists of two semicircles corresponding to ligand- and receptor- information. The size of the semicircle is the percentage of cells that express the protein, while the saturation corresponds to the scaled average expression value. } +\details{ +If the values range of the column used as the "size" parameter is not between 0 and 1.001, an error will be thrown. + +The sender cell types can be ordered by encoding the "sender" column as a factor. If the "sender" column is not a factor, the sender cell types will be ordered alphabetically. + +By default, the top_n ligand-receptor pairs are shown despite their absolute ranking. So, if a receiver cell type has LR pairs that are only ranked from 31-40 and the top_n is set to 20, the LR pairs will be shown. If use_absolute_rank is set to TRUE, only LR pairs with absolute ranking from 1-20 will be shown. +} \examples{ \dontrun{ # Create a prioritization table @@ -68,7 +78,10 @@ make_mushroom_plot(prior_table, show_all_datapoints = TRUE, true_color_range = T # Change the size and color columns make_mushroom_plot(prior_table, size = "pct_expressed", color = "scaled_avg_exprs") -} +# For a prioritization table with multiple receiver cell types +make_mushroom_plot(prior_table_combined \%>\% filter(receiver == celltype_oi)) +} + } diff --git a/man/nichenetr-package.Rd b/man/nichenetr-package.Rd new file mode 100644 index 0000000..bab2180 --- /dev/null +++ b/man/nichenetr-package.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/nichenetr-package.R +\docType{package} +\name{nichenetr-package} +\alias{nichenetr} +\alias{nichenetr-package} +\title{nichenetr: Modeling Intercellular Communication by Linking Ligands to Target Genes with NicheNet} +\description{ +This package allows you the investigate intercellular communication from a computational perspective. More specifically, it allows to investigate how interacting cells influence each other's gene expression. Functionalities of this package (e.g. including predicting extracellular upstream regulators and their affected target genes) build upon a probabilistic model of ligand-target links that was inferred by data-integration. +} +\seealso{ +Useful links: +\itemize{ + \item \url{https://github.com/saeyslab/nichenetr} + \item Report bugs at \url{https://github.com/saeyslab/nichenetr/issues} +} + +} +\author{ +\strong{Maintainer}: Chananchida Sang-aram \email{chananchida.sangaram@ugent.be} + +Authors: +\itemize{ + \item Robin Browaeys +} + +} +\keyword{internal} diff --git a/man/nichenetr.Rd b/man/nichenetr.Rd deleted file mode 100644 index d6af330..0000000 --- a/man/nichenetr.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/nichenetr.R -\docType{package} -\name{nichenetr} -\alias{nichenetr} -\title{nichenetr: Linking Extracellular Protein Signals to Target Genes by data-integration.} -\description{ -This package allows you the investigate intercellular communication from a computational perspective. Functionalities of this package (e.g. including predicting extracellular upstream regulators) build upon a probabilistic model of ligand-target links that was inferred by data-integration. -} -\section{Construction of the probabilistic model}{ - -\code{\link{construct_weighted_networks}}, \code{\link{construct_ligand_target_matrix}} -} - -\section{Evaluation functions}{ - -\code{\link{evaluate_target_prediction}} -} - diff --git a/nichenetr.Rproj b/nichenetr.Rproj index 64f40d9..e1826f5 100644 --- a/nichenetr.Rproj +++ b/nichenetr.Rproj @@ -18,4 +18,5 @@ StripTrailingWhitespace: Yes BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source +PackageCheckArgs: --no-build-vignettes PackageRoxygenize: rd,collate,namespace diff --git a/tests/testthat/test-prioritization.R b/tests/testthat/test-prioritization.R index 7dd7c3c..afd0c0f 100644 --- a/tests/testthat/test-prioritization.R +++ b/tests/testthat/test-prioritization.R @@ -11,7 +11,7 @@ test_that("Wrapper function for seurat", { if (as.numeric(substr(packageVersion("Seurat"), 1, 1)) < 5){ seurat_obj_test = alias_to_symbol_seurat(seurat_obj_test, "mouse") } else if (grepl("^5", packageVersion("Seurat")) & grepl("^5", seurat_obj_test@version)){ - expect_error(alias_to_symbol_seurat(seurat_obj_test, "mouse")) + expect_warning(alias_to_symbol_seurat(seurat_obj_test, "mouse")) } lr_network_filtered <- lr_network %>% filter(from %in% rownames(seurat_obj_test), to %in% rownames(seurat_obj_test)) @@ -138,7 +138,7 @@ test_that("Prioritization scheme works", { pct <- pcts[i] if (grepl("^5", packageVersion("Seurat"))){ - expect_error(alias_to_symbol_seurat(seurat_obj_test, "mouse")) + expect_warning(alias_to_symbol_seurat(seurat_obj_test, "mouse")) } else { seurat_obj_test <- alias_to_symbol_seurat(seurat_obj_test, "mouse") } @@ -172,13 +172,22 @@ test_that("Prioritization scheme works", { condition_colname = "aggregate", condition_oi = condition_oi, features = feature_list) - # Calculate condition specificity - only for datasets with two conditions! - condition_markers <- FindMarkers(object = seurat_obj_test, ident.1 = condition_oi, ident.2 = condition_reference, + # Test cell type names conversion for Seurat object + # Replace space with underscore + seurat_obj_test$celltype2 <- gsub(" ", "_", seurat_obj_test$celltype) + new_celltypes <- suppressWarnings(get_exprs_avg(seurat_obj_test, "celltype2") %>% pull(cluster_id) %>% unique()) + expect_equal(new_celltypes, sort(unique(seurat_obj_test$celltype2))) + + # Replace CD8 T with CD8_T-test & replace Mono with Mono-test + seurat_obj_test$celltype2 <- gsub("CD8 T", "CD8_T-test", seurat_obj_test$celltype) %>% gsub("Mono", "Mono-test", .) + new_celltypes <- suppressWarnings(get_exprs_avg(seurat_obj_test, "celltype2") %>% pull(cluster_id) %>% unique()) + expect_equal(new_celltypes, sort(unique(seurat_obj_test$celltype2))) + + # Calculate condition specificity - only for datasets with two conditions! + condition_markers <- FindMarkers(object = seurat_obj_test, ident.1 = condition_oi, ident.2 = condition_reference, group.by = "aggregate", min.pct = 0, logfc.threshold = 0, features = feature_list) %>% rownames_to_column("gene") - # TODO: TESTS FOR PROCESS_TABLE_TO_IC - # Combine DE of senders and receivers -> used for prioritization processed_DE_table <- process_table_to_ic(DE_table, table_type = "celltype_DE", lr_network, senders_oi = sender_celltypes, receivers_oi = receiver) @@ -197,41 +206,41 @@ test_that("Prioritization scheme works", { expect_identical(info_tables$sender_receiver_de, processed_DE_table) expect_identical(info_tables$lr_condition_de, processed_condition_markers) - # Check processed tables - expect_type(processed_DE_table,"list") - expect_type(processed_expr_table,"list") - expect_type(processed_condition_markers,"list") - - # Check that processed_DE_table has been processed correctly - expect_equal(length(unique(processed_DE_table$sender)), length(sender_celltypes)) - expect_equal(length(unique(processed_DE_table$receiver)), length(receiver)) - - expect_equal(processed_DE_table %>% filter(ligand == "Il1rn", sender == "Mono") %>% select(p_val_ligand, lfc_ligand, pct_expressed_sender, p_adj_ligand, sender, ligand) %>% distinct(across(everything())), - DE_table %>% filter(gene == "Il1rn", cluster_id == "Mono") %>% select(-pct.2), - check.attributes = FALSE) - expect_equal(processed_DE_table %>% filter(ligand == "Il1rn", sender == "Mono", receptor == "Il1r2") %>% select(p_val_receptor, lfc_receptor, pct_expressed_receiver, p_adj_receptor, receiver, receptor) , - DE_table %>% filter(gene == "Il1r2", cluster_id == "CD8 T") %>% select(-pct.2), - check.attributes = FALSE) - temp_row <- processed_DE_table %>% filter(ligand == "Il1rn", sender == "Mono", receiver == "CD8 T", receptor == "Il1r2") - expect_equal((temp_row$lfc_ligand + temp_row$lfc_receptor)/2, temp_row$ligand_receptor_lfc_avg) - - - # Check that processed_expr_table has been processed correctly - expect_equal(length(unique(processed_expr_table$sender)), length(celltypes)) - expect_equal(length(unique(processed_expr_table$receiver)), length(celltypes)) - - temp_row <- processed_expr_table %>% filter(ligand == "Il1rn", sender == "Mono", receiver == "CD8 T", receptor == "Il1r2") - expect_equal(temp_row$avg_ligand * temp_row$avg_receptor, temp_row$ligand_receptor_prod) - - # Check that processed_condition_markers has been processed correctly - expect_equal(processed_condition_markers %>% filter(ligand == "Il1rn") %>% select(ligand, p_val_ligand, lfc_ligand, p_adj_ligand) %>% distinct(across(everything())), - condition_markers %>% filter(gene == "Il1rn") %>% select(-pct.1, -pct.2), - check.attributes = FALSE) - expect_equal(processed_condition_markers %>% filter(receptor == "Il1r2") %>% select(receptor, p_val_receptor, lfc_receptor, p_adj_receptor) %>% distinct(across(everything())), - condition_markers %>% filter(gene == "Il1r2") %>% select(-pct.1, -pct.2), - check.attributes = FALSE) - temp_row <- processed_condition_markers %>% filter(ligand == "Il1rn", receptor == "Il1r2") - expect_equal((temp_row$lfc_ligand + temp_row$lfc_receptor)/2, temp_row$ligand_receptor_lfc_avg) + # Check processed tables + expect_type(processed_DE_table,"list") + expect_type(processed_expr_table,"list") + expect_type(processed_condition_markers,"list") + + # Check that processed_DE_table has been processed correctly + expect_equal(length(unique(processed_DE_table$sender)), length(sender_celltypes)) + expect_equal(length(unique(processed_DE_table$receiver)), length(receiver)) + + expect_equal(processed_DE_table %>% filter(ligand == "Il1rn", sender == "Mono") %>% select(p_val_ligand, lfc_ligand, pct_expressed_sender, p_adj_ligand, sender, ligand) %>% distinct(across(everything())), + DE_table %>% filter(gene == "Il1rn", cluster_id == "Mono") %>% select(-pct.2), + check.attributes = FALSE) + expect_equal(processed_DE_table %>% filter(ligand == "Il1rn", sender == "Mono", receptor == "Il1r2") %>% select(p_val_receptor, lfc_receptor, pct_expressed_receiver, p_adj_receptor, receiver, receptor) , + DE_table %>% filter(gene == "Il1r2", cluster_id == "CD8 T") %>% select(-pct.2), + check.attributes = FALSE) + temp_row <- processed_DE_table %>% filter(ligand == "Il1rn", sender == "Mono", receiver == "CD8 T", receptor == "Il1r2") + expect_equal((temp_row$lfc_ligand + temp_row$lfc_receptor)/2, temp_row$ligand_receptor_lfc_avg) + + + # Check that processed_expr_table has been processed correctly + expect_equal(length(unique(processed_expr_table$sender)), length(celltypes)) + expect_equal(length(unique(processed_expr_table$receiver)), length(celltypes)) + + temp_row <- processed_expr_table %>% filter(ligand == "Il1rn", sender == "Mono", receiver == "CD8 T", receptor == "Il1r2") + expect_equal(temp_row$avg_ligand * temp_row$avg_receptor, temp_row$ligand_receptor_prod) + + # Check that processed_condition_markers has been processed correctly + expect_equal(processed_condition_markers %>% filter(ligand == "Il1rn") %>% select(ligand, p_val_ligand, lfc_ligand, p_adj_ligand) %>% distinct(across(everything())), + condition_markers %>% filter(gene == "Il1rn") %>% select(-pct.1, -pct.2), + check.attributes = FALSE) + expect_equal(processed_condition_markers %>% filter(receptor == "Il1r2") %>% select(receptor, p_val_receptor, lfc_receptor, p_adj_receptor) %>% distinct(across(everything())), + condition_markers %>% filter(gene == "Il1r2") %>% select(-pct.1, -pct.2), + check.attributes = FALSE) + temp_row <- processed_condition_markers %>% filter(ligand == "Il1rn", receptor == "Il1r2") + expect_equal((temp_row$lfc_ligand + temp_row$lfc_receptor)/2, temp_row$ligand_receptor_lfc_avg) # Check errors and warnings in case of improper usage expect_error(process_table_to_ic(condition_markers, table_type = "group_DE", lr_network = lr_network, receivers_oi = receiver)) @@ -242,109 +251,175 @@ test_that("Prioritization scheme works", { expect_warning(process_table_to_ic(expression_info, lr_network = lr_network, receivers_oi = receiver)) expect_warning(process_table_to_ic(expression_info, lr_network = lr_network, senders_oi = sender_celltypes)) - # Default weights - prioritizing_weights = c("de_ligand" = 1, - "de_receptor" = 1, - "activity_scaled" = 2, - "exprs_ligand" = 1, - "exprs_receptor" = 1, - "ligand_condition_specificity" = 0.5, - "receptor_condition_specificity" = 0.5) - + # Default weights prior_table <- generate_prioritization_tables(processed_expr_table, - processed_DE_table, - ligand_activities, - processed_condition_markers, - prioritizing_weights = prioritizing_weights) - - expect_type(prior_table,"list") - - # Check that columns contain columns from processed_DE_table, processed_expr_table, ligand_activities, and processed_condition_markers - expect_equal(prior_table %>% filter(ligand == "Lgals1", sender == "Mono", receptor == "Cd69") %>% select(colnames(processed_DE_table)), - processed_DE_table %>% filter(ligand == "Lgals1", sender == "Mono", receptor == "Cd69") %>% mutate(sender = as.character(sender), receiver = as.character(receiver)), - check.attributes = FALSE) - expect_equal(prior_table %>% filter(ligand == "Lgals1", sender == "Mono", receptor == "Cd69") %>% select(colnames(processed_expr_table)), - processed_expr_table %>% filter(ligand == "Lgals1", sender == "Mono", receptor == "Cd69", receiver == "CD8 T"), - check.attributes = FALSE) - expect_equal(prior_table %>% filter(ligand == "Lgals1", sender == "Mono", receptor == "Cd69") %>% pull(activity), - ligand_activities %>% filter(test_ligand == "Lgals1") %>% pull(aupr_corrected)) - temp_cols <- c("lfc_ligand", "lfc_receptor", "p_val_ligand", "p_val_receptor") - expect_equal(prior_table %>% filter(ligand == "Lgals1", sender == "Mono", receptor == "Cd69") %>% select(all_of(paste0(temp_cols, "_group"))), - processed_condition_markers %>% filter(ligand == "Lgals1", receptor == "Cd69") %>% select(all_of(temp_cols)), - check.attributes = FALSE) + processed_DE_table, + ligand_activities, + processed_condition_markers) + default_prior_table <- prior_table + expect_type(prior_table,"list") + + # Check that columns contain columns from processed_DE_table, processed_expr_table, ligand_activities, and processed_condition_markers + expect_equal(prior_table %>% filter(ligand == "Lgals1", sender == "Mono", receptor == "Cd69") %>% select(colnames(processed_DE_table)), + processed_DE_table %>% filter(ligand == "Lgals1", sender == "Mono", receptor == "Cd69") %>% mutate(sender = as.character(sender), receiver = as.character(receiver)), + check.attributes = FALSE) + expect_equal(prior_table %>% filter(ligand == "Lgals1", sender == "Mono", receptor == "Cd69") %>% select(colnames(processed_expr_table)), + processed_expr_table %>% filter(ligand == "Lgals1", sender == "Mono", receptor == "Cd69", receiver == "CD8 T"), + check.attributes = FALSE) + expect_equal(prior_table %>% filter(ligand == "Lgals1", sender == "Mono", receptor == "Cd69") %>% pull(activity), + ligand_activities %>% filter(test_ligand == "Lgals1") %>% pull(aupr_corrected)) + temp_cols <- c("lfc_ligand", "lfc_receptor", "p_val_ligand", "p_val_receptor") + expect_equal(prior_table %>% filter(ligand == "Lgals1", sender == "Mono", receptor == "Cd69") %>% select(all_of(paste0(temp_cols, "_group"))), + processed_condition_markers %>% filter(ligand == "Lgals1", receptor == "Cd69") %>% select(all_of(temp_cols)), + check.attributes = FALSE) # Check that prioritization score is the same as the sum of the weighted scores - temp_weights <- prioritizing_weights + temp_weights = c("de_ligand" = 0.5, + "de_receptor" = 0.5, + "activity_scaled" = 1, + "exprs_ligand" = 0.5, + "exprs_receptor" = 0.5, + "ligand_condition_specificity" = 1, + "receptor_condition_specificity" = 1) expect_equal(prior_table %>% filter(ligand == "Lgals1", sender == "Mono", receptor == "Cd69") %>% - mutate(prioritization_score = rowSums(across(c(scaled_p_val_ligand_adapted, scaled_p_val_receptor_adapted, scaled_activity, scaled_avg_exprs_ligand, scaled_avg_exprs_receptor, scaled_p_val_ligand_adapted_group, scaled_p_val_receptor_adapted_group)) * temp_weights) / sum(temp_weights)) %>% pull(prioritization_score), + mutate(prioritization_score = rowSums(across(c(scaled_p_val_adapted_ligand, scaled_p_val_adapted_receptor, scaled_activity, scaled_avg_exprs_ligand, scaled_avg_exprs_receptor, scaled_p_val_adapted_ligand_group, scaled_p_val_adapted_receptor_group)) * temp_weights) / sum(temp_weights)) %>% pull(prioritization_score), prior_table %>% filter(ligand == "Lgals1", sender == "Mono", receptor == "Cd69") %>% pull(prioritization_score), check.attributes=FALSE) - # Do not pass condition markers - expect_error(generate_prioritization_tables(processed_expr_table, + # Custom weights + prioritizing_weights <- c("de_ligand" = 1, + "de_receptor" = 1, + "activity_scaled" = 2, + "exprs_ligand" = 1, + "exprs_receptor" = 1, + "ligand_condition_specificity" = 0.5, + "receptor_condition_specificity" = 0.5) + prior_table <- generate_prioritization_tables(processed_expr_table, processed_DE_table, ligand_activities, - lr_condition_de = NULL, - prioritizing_weights)) - - # Define priorization weights to 0 except for activity scaled - prioritizing_weights = c("de_ligand" = 0, - "de_receptor" = 0, - "activity_scaled" = 1, - "exprs_ligand" = 0, - "exprs_receptor" = 0, - "ligand_condition_specificity" = 0, - "receptor_condition_specificity" = 0) - - prior_table <- generate_prioritization_tables(processed_expr_table, - processed_DE_table, - ligand_activities, - processed_condition_markers, - prioritizing_weights) - - # Check colnames - expect_equal(colnames(prior_table), - unique(c(colnames(processed_DE_table), colnames(processed_expr_table), "activity", "rank", "activity_zscore", "scaled_activity", "prioritization_score"))) - - - prior_table_top10 <- prior_table %>% distinct(ligand, prioritization_score) %>% mutate(rank = rank(desc(prioritization_score), ties.method = "average")) %>% arrange(rank, ligand) %>% pull(ligand) %>% .[1:10] - ligands_top10 <- ligand_activities %>% arrange(rank, test_ligand) %>% pull(test_ligand) %>% .[1:10] - - # When using only activity scaled, the top 10 ligands should be the same as the top 10 ligands from the ligand activity table - expect_equal(prior_table_top10, ligands_top10) - - # All weights zero - prioritizing_weights = c("de_ligand" = 0, - "de_receptor" = 0, - "activity_scaled" = 0, - "exprs_ligand" = 0, - "exprs_receptor" = 0, - "ligand_condition_specificity" = 0, - "receptor_condition_specificity" = 0) - - prior_table <- generate_prioritization_tables(processed_expr_table, - processed_DE_table, - ligand_activities, - processed_condition_markers, - prioritizing_weights) - - # Check colnames - expect_equal(colnames(prior_table), - unique(c(colnames(processed_DE_table), colnames(processed_expr_table), "prioritization_score"))) - - prioritizing_weights["de_ligand"] = 1 - prior_table <- generate_prioritization_tables(processed_expr_table, - processed_DE_table, - ligand_activities, - processed_condition_markers, - prioritizing_weights) - - expect_equal(colnames(prior_table), - unique(c(colnames(processed_DE_table), colnames(processed_expr_table), - "lfc_pval_ligand", "p_val_ligand_adapted", "scaled_lfc_ligand", "scaled_p_val_ligand", "scaled_lfc_pval_ligand", "scaled_p_val_ligand_adapted", "prioritization_score"))) + processed_condition_markers, + prioritizing_weights = prioritizing_weights) + temp_weights <- prioritizing_weights + temp_weights[c("de_ligand", "de_receptor", "exprs_ligand", "exprs_receptor")] <- 0.5 + expect_equal(prior_table %>% filter(ligand == "Lgals1", sender == "Mono", receptor == "Cd69") %>% + mutate(prioritization_score = rowSums(across(c(scaled_p_val_adapted_ligand, scaled_p_val_adapted_receptor, scaled_activity, scaled_avg_exprs_ligand, scaled_avg_exprs_receptor, scaled_p_val_adapted_ligand_group, scaled_p_val_adapted_receptor_group)) * temp_weights) / sum(temp_weights)) %>% pull(prioritization_score), + prior_table %>% filter(ligand == "Lgals1", sender == "Mono", receptor == "Cd69") %>% pull(prioritization_score), + check.attributes=FALSE) + + # Check "one_condition" scenario + prior_table <- generate_prioritization_tables(processed_expr_table, + processed_DE_table, + ligand_activities, + scenario = "one_condition") + # Check that colnames don't have "_group + expect_false(any(grepl("_group", colnames(prior_table)))) + + # Try giving lr_condition_de + expect_warning(generate_prioritization_tables(processed_expr_table, + processed_DE_table, + ligand_activities, + lr_condition_de = processed_condition_markers, + scenario = "one_condition")) + + # Do not pass condition markers + expect_error(generate_prioritization_tables(processed_expr_table, + processed_DE_table, + ligand_activities, + lr_condition_de = NULL, + scenario = "case_control")) + + # Define priorization weights to 0 except for activity scaled + prioritizing_weights = c("de_ligand" = 0, + "de_receptor" = 0, + "activity_scaled" = 1, + "exprs_ligand" = 0, + "exprs_receptor" = 0, + "ligand_condition_specificity" = 0, + "receptor_condition_specificity" = 0) + + prior_table <- generate_prioritization_tables(processed_expr_table, + processed_DE_table, + ligand_activities, + processed_condition_markers, + prioritizing_weights) + + # Check colnames + expect_equal(colnames(prior_table), + unique(c(colnames(processed_DE_table), colnames(processed_expr_table), + "activity", "rank", "activity_zscore", "scaled_activity", "prioritization_score", "prioritization_rank"))) + + + prior_table_top10 <- prior_table %>% distinct(ligand, prioritization_score) %>% mutate(rank = rank(desc(prioritization_score), ties.method = "average")) %>% arrange(rank, ligand) %>% pull(ligand) %>% .[1:10] + ligands_top10 <- ligand_activities %>% arrange(rank, test_ligand) %>% pull(test_ligand) %>% .[1:10] + + # When using only activity scaled, the top 10 ligands should be the same as the top 10 ligands from the ligand activity table + expect_equal(prior_table_top10, ligands_top10) + + # All weights zero + prioritizing_weights = c("de_ligand" = 0, + "de_receptor" = 0, + "activity_scaled" = 0, + "exprs_ligand" = 0, + "exprs_receptor" = 0, + "ligand_condition_specificity" = 0, + "receptor_condition_specificity" = 0) + prior_table <- generate_prioritization_tables(processed_expr_table, + processed_DE_table, + ligand_activities, + processed_condition_markers, + prioritizing_weights) + + # Check colnames + expect_equal(colnames(prior_table), + unique(c(colnames(processed_DE_table), colnames(processed_expr_table), "prioritization_score", "prioritization_rank"))) + + prioritizing_weights["de_ligand"] = 1 + prior_table <- generate_prioritization_tables(processed_expr_table, + processed_DE_table, + ligand_activities, + processed_condition_markers, + prioritizing_weights) + + + expect_equal(colnames(prior_table), + unique(c(colnames(processed_DE_table), colnames(processed_expr_table), + "lfc_pval_ligand", "p_val_adapted_ligand", "scaled_lfc_ligand", "scaled_p_val_ligand", "scaled_lfc_pval_ligand", "scaled_p_val_adapted_ligand", "prioritization_score", "prioritization_rank"))) + + + # Check mushroom plot + usable_colnames <- default_prior_table %>% select(ends_with(c("_ligand", "_receptor", "_sender", "_receiver"))) %>% colnames() %>% + stringr::str_remove("_ligand|_receptor|_sender|_receiver") %>% unique + + mushroom_plots <- list() + for (colname in usable_colnames){ + # If range of values is between 0 and 0.001 + ext <- c("_ligand", "_receptor") + if (colname == "pct_expressed") ext <- c("_sender", "_receiver") + values_range <- range(default_prior_table[, c(paste0(colname, ext[1]), paste0(colname, ext[2]))]) + + # Only scaled values can occupy both size and color + if (values_range[1] >= 0 & values_range[2] <= 1.001){ + mushroom_plots[[colname]] <- make_mushroom_plot(default_prior_table, top_n = 30, size = colname, color = colname) + } else { + # If range of values is not between 0 and 0.001, it should throw an error when used as size + expect_error(make_mushroom_plot(default_prior_table, top_n = 30, size = colname, color = colname)) + + # But ok when used as color + mushroom_plots[[colname]] <- make_mushroom_plot(default_prior_table, top_n = 30, size = "scaled_lfc", color = colname, true_color_range = TRUE) + } } + # Expect all to be ggplot objects + expect_true(all(sapply(mushroom_plots, inherits, "gg"))) + + # If a column name that doesn't exist is passed, it should throw an error + expect_error(make_mushroom_plot(default_prior_table, top_n = 30, size = "non_existent_colname", color = usable_colnames[1])) + expect_error(make_mushroom_plot(default_prior_table, top_n = 30, size = usable_colnames[1], color = "non_existent_colname")) + } + + + }) diff --git a/tests/testthat/test-symbol_conversion.R b/tests/testthat/test-symbol_conversion.R index 216e3b0..6d0346c 100644 --- a/tests/testthat/test-symbol_conversion.R +++ b/tests/testthat/test-symbol_conversion.R @@ -31,15 +31,17 @@ test_that("Seurat alias conversion works", { seurat_object_lite = readRDS(url("https://zenodo.org/record/3531889/files/seuratObj_test.rds")) seurat_object_lite2 = seurat_object_lite %>% alias_to_symbol_seurat(organism = "mouse") - testthat::expect_equal(typeof(seurat_object_lite2), "S4") + expect_equal(typeof(seurat_object_lite2), "S4") seurat_object_lite <- UpdateSeuratObject(seurat_object_lite) if (grepl("^5", packageVersion("Seurat")) & grepl("^5", seurat_object_lite@version)){ - expect_error(alias_to_symbol_seurat(seurat_object_lite, "mouse")) + expect_warning(alias_to_symbol_seurat(seurat_object_lite, "mouse")) + same_seurat_obj <- suppressWarnings(alias_to_symbol_seurat(seurat_object_lite, "mouse")) + expect_true(all(rownames(seurat_object_lite) == rownames(same_seurat_obj))) } else { seurat_object_lite2 = seurat_object_lite %>% alias_to_symbol_seurat(organism = "mouse") - testthat::expect_equal(typeof(seurat_object_lite2), "S4") + expect_equal(typeof(seurat_object_lite2), "S4") } }) diff --git a/vignettes/model_evaluation.Rmd b/vignettes/model_evaluation.Rmd index 7224c70..c3cd82e 100644 --- a/vignettes/model_evaluation.Rmd +++ b/vignettes/model_evaluation.Rmd @@ -66,6 +66,7 @@ performances = settings %>% lapply(evaluate_target_prediction, ligand_target_mat ``` Step 3: visualize the results: show here different classification evaluation metrics +(Note: Mean-rank gene-set enrichment will only be calculated if limma is installed) ```{r target-prediction-v2-results, fig.width=8, fig.height=8} # Visualize some classification evaluation metrics showing the target gene prediction performance diff --git a/vignettes/model_evaluation.md b/vignettes/model_evaluation.md index fbd4070..2bad270 100644 --- a/vignettes/model_evaluation.md +++ b/vignettes/model_evaluation.md @@ -88,6 +88,7 @@ performances = settings %>% lapply(evaluate_target_prediction, ligand_target_mat Step 3: visualize the results: show here different classification evaluation metrics +(Note: Mean-rank gene-set enrichment will only be calculated if limma is installed) ``` r # Visualize some classification evaluation metrics showing the target gene prediction performance diff --git a/vignettes/seurat_steps.Rmd b/vignettes/seurat_steps.Rmd index dea3737..7e3530e 100644 --- a/vignettes/seurat_steps.Rmd +++ b/vignettes/seurat_steps.Rmd @@ -416,7 +416,7 @@ DE_table_top_ligands <- lapply( DE_table_top_ligands <- DE_table_top_ligands %>% reduce(., full_join) %>% column_to_rownames("gene") -vis_ligand_lfc <- as.matrix(DE_table_top_ligands[rev(best_upstream_ligands), ]) +vis_ligand_lfc <- as.matrix(DE_table_top_ligands[rev(best_upstream_ligands), , drop = FALSE]) p_lfc <- make_threecolor_heatmap_ggplot(vis_ligand_lfc, "Prioritized ligands", "LFC in Sender", diff --git a/vignettes/seurat_steps.md b/vignettes/seurat_steps.md index 9ba4e8d..32c4fab 100644 --- a/vignettes/seurat_steps.md +++ b/vignettes/seurat_steps.md @@ -663,7 +663,7 @@ DE_table_top_ligands <- lapply( DE_table_top_ligands <- DE_table_top_ligands %>% reduce(., full_join) %>% column_to_rownames("gene") -vis_ligand_lfc <- as.matrix(DE_table_top_ligands[rev(best_upstream_ligands), ]) +vis_ligand_lfc <- as.matrix(DE_table_top_ligands[rev(best_upstream_ligands), , drop = FALSE]) p_lfc <- make_threecolor_heatmap_ggplot(vis_ligand_lfc, "Prioritized ligands", "LFC in Sender", diff --git a/vignettes/seurat_steps_prioritization.Rmd b/vignettes/seurat_steps_prioritization.Rmd index cf091ab..3b2e77b 100644 --- a/vignettes/seurat_steps_prioritization.Rmd +++ b/vignettes/seurat_steps_prioritization.Rmd @@ -38,7 +38,6 @@ library(nichenetr) # Please update to v2.0.6 library(Seurat) library(SeuratObject) library(tidyverse) - ``` ```{r} @@ -170,7 +169,7 @@ As you can see, the resulting table now show the rankings for *ligand-receptor i We included all columns here, but if you just want relevant columns that were used to calculate the ranking: ```{r} -prior_table %>% select(c('sender', 'receiver', 'ligand', 'receptor', 'scaled_p_val_ligand_adapted', 'scaled_p_val_receptor_adapted', 'scaled_avg_exprs_ligand', 'scaled_avg_exprs_receptor', 'scaled_p_val_ligand_adapted_group', 'scaled_p_val_receptor_adapted_group', 'scaled_activity')) +prior_table %>% select(c('sender', 'receiver', 'ligand', 'receptor', 'scaled_p_val_adapted_ligand', 'scaled_p_val_adapted_receptor', 'scaled_avg_exprs_ligand', 'scaled_avg_exprs_receptor', 'scaled_p_val_adapted_ligand_group', 'scaled_p_val_adapted_receptor_group', 'scaled_activity')) ``` Note that we appended the suffix '_group' to columns that refer to differential expression between conditions, e.g., `lfc_ligand_group` and `lfc_receptor_group.` @@ -231,8 +230,8 @@ prior_table %>% head As NicheNet is a receiver-based pipeline, to prioritize ligand-receptor pairs across multiple receivers, we need to perform the NicheNet analysis for each receiver separately. Let's suppose we want to prioritize ligand-receptor pairs across all T cells (CD4, CD8, and Tregs). The CD8 T analysis has already been performed above. We will use the wrapper function to perform a basic NicheNet analysis on the other two: ```{r} -nichenet_output <- lapply(c("CD4 T", "Treg"), function(receiver_ct){ - nichenet_seuratobj_aggregate(receiver = receiver_ct, +nichenet_outputs <- lapply(c("CD8 T", "CD4 T", "Treg"), function(receiver_ct){ + output <- nichenet_seuratobj_aggregate(receiver = receiver_ct, seurat_obj = seuratObj, condition_colname = "aggregate", condition_oi = condition_oi, @@ -243,42 +242,41 @@ nichenet_output <- lapply(c("CD4 T", "Treg"), function(receiver_ct){ weighted_networks = weighted_networks, expression_pct = 0.05) -}) %>% setNames(c("CD4 T", "Treg")) + # Add receiver cell type in ligand activity table + output$ligand_activities$receiver <- receiver_ct + return(output) +}) ``` To generate the dataframes used for prioritization, we will simply change the `lr_network_filtered` argument to only calculate DE and expression values for ligand-receptor pairs of interest. ```{r} -info_tables2 <- lapply(names(nichenet_output), function(receiver_ct) { - generate_info_tables(seuratObj, - celltype_colname = "celltype", - senders_oi = sender_celltypes, - receivers_oi = receiver_ct, - lr_network_filtered = lr_network %>% - filter(from %in% nichenet_output[[receiver_ct]]$ligand_activities$test_ligand & - to %in% nichenet_output[[receiver_ct]]$background_expressed_genes), - condition_colname = "aggregate", - condition_oi = condition_oi, - condition_reference = condition_reference, - scenario = "case_control") +# Calculate prioritization criteria for each receiver cell type +info_tables <- lapply(nichenet_outputs, function(output) { + lr_network_filtered <- lr_network %>% select(from, to) %>% + filter(from %in% output$ligand_activities$test_ligand & + to %in% output$background_expressed_genes) + + generate_info_tables(seuratObj, + celltype_colname = "celltype", + senders_oi = sender_celltypes, + receivers_oi = unique(output$ligand_activities$receiver), + lr_network_filtered = lr_network_filtered, + condition_colname = "aggregate", + condition_oi = condition_oi, + condition_reference = condition_reference, + scenario = "case_control") }) ``` -We can then combine the results from `generate_info_tables` using `bind_rows`, which will concatenate the rows together. For the ligand activities, we will also add an additional column containing the receiver cell type. Note that for the average expression table (`sender_receiver_info`) and condition specificity (`lr_condition_de`), we need to remove duplicate rows. +We can then combine the results from `generate_info_tables` using `bind_rows`, which will concatenate the rows together. Note that for the average expression table (`sender_receiver_info`) and condition specificity (`lr_condition_de`), we need to remove duplicate rows. ```{r} -# Add CD8 T to list -info_tables2[[3]] <- info_tables - # bind rows of each element of info_tables using pmap -info_tables_combined <- purrr::pmap(info_tables2, bind_rows) - -# Combine ligand activities and add receiver information -ligand_activities_combined <- bind_rows(nichenet_output$`CD4 T`$ligand_activities %>% mutate(receiver = "CD4 T"), - nichenet_output$Treg$ligand_activities %>% mutate(receiver = "Treg"), - ligand_activities %>% mutate(receiver = "CD8 T")) +info_tables_combined <- purrr::pmap(info_tables, bind_rows) +ligand_activities_combined <- purrr::map_dfr(nichenet_outputs, "ligand_activities") prior_table_combined <- generate_prioritization_tables( sender_receiver_info = info_tables_combined$sender_receiver_info %>% distinct, @@ -313,7 +311,8 @@ circos_plot <- make_circos_lr(prior_table_oi, circos_plot ``` -Furthermore, we provide the function `make_mushroom_plot` which allows you to display expression of ligand-receptor pairs in a specific receiver. By default, the fill gradient shows the LFC between cell types, while the size of the semicircle corresponds to the scaled mean expression. You can also choose to show the rankings of each ligand-receptor-sender pair with `show_rankings`, as well as show all data points for context (`show_all_datapoints`). `true_color_range = TRUE` will adjust the limits of the color gradient to the min-max of the values, instead of the limit being from 0 to 1. Note that the numbers displayed here are the rankings within the chosen cell type and not across all receiver cell types (in case of multiple receivers). +Furthermore, we provide the function `make_mushroom_plot` which allows you to display expression of ligand-receptor pairs in a specific receiver. By default, the fill gradient shows the LFC between cell types, while the size of the semicircle corresponds to the scaled mean expression. You can also choose to show the rankings of each ligand-receptor-sender pair with `show_rankings`, as well as show all data points for context (`show_all_datapoints`). `true_color_range = TRUE` will adjust the limits of the color gradient to the min-max of the values, instead of the limit being from 0 to 1. Note that the numbers displayed here are the rankings across all receiver cell types (in case of multiple receivers), and by default the `top_n` ligand-receptor pairs are shown despite the absolute ranking. To show only pairs that have an absolute ranking within top_n across all receivers, set `use_absolute_rank = TRUE`. + ```{r mushroom-plot-1, fig.height=8, fig.width=8} receiver_oi <- "CD8 T" @@ -339,6 +338,7 @@ make_mushroom_plot(prior_table, top_n = 30, size = "pct_expressed", color = "sca axis.title.x = element_text(hjust = 0.25)) ``` + ```{r} sessionInfo() ``` diff --git a/vignettes/seurat_steps_prioritization.md b/vignettes/seurat_steps_prioritization.md index 7a51449..516a814 100644 --- a/vignettes/seurat_steps_prioritization.md +++ b/vignettes/seurat_steps_prioritization.md @@ -168,21 +168,20 @@ info_tables <- generate_info_tables(seuratObj, names(info_tables) ## [1] "sender_receiver_de" "sender_receiver_info" "lr_condition_de" +``` + +``` r info_tables$sender_receiver_de %>% head() -## sender receiver ligand receptor lfc_ligand lfc_receptor ligand_receptor_lfc_avg p_val_ligand p_adj_ligand p_val_receptor p_adj_receptor pct_expressed_sender -## 1 DC CD8 T Ccl5 Cxcr3 6.432043 0.16714791 3.299595 1.893317e-25 2.563740e-21 7.758812e-05 1.000000e+00 1.000 -## 2 Mono CD8 T Lyz2 Itgal 5.493265 -0.01687003 2.738198 1.728697e-160 2.340828e-156 4.973381e-02 1.000000e+00 0.933 -## 3 DC CD8 T H2-M2 Cd8a 3.416479 1.94059972 2.678539 1.017174e-272 1.377355e-268 5.250531e-206 7.109745e-202 0.429 -## 4 DC CD8 T Cxcl16 Cxcr6 4.182085 0.54826454 2.365175 1.138617e-243 1.541801e-239 5.987787e-21 8.108063e-17 0.929 -## 5 Mono CD8 T Cxcl9 Cxcr3 4.328801 0.16714791 2.247975 3.834954e-124 5.192911e-120 7.758812e-05 1.000000e+00 0.547 -## 6 Mono CD8 T Cxcl9 Dpp4 4.328801 0.16416445 2.246483 3.834954e-124 5.192911e-120 6.628900e-04 1.000000e+00 0.547 -## pct_expressed_receiver -## 1 0.042 -## 2 0.188 -## 3 0.659 -## 4 0.089 -## 5 0.042 -## 6 0.148 +## sender receiver ligand receptor lfc_ligand lfc_receptor ligand_receptor_lfc_avg p_val_ligand p_adj_ligand p_val_receptor p_adj_receptor pct_expressed_sender pct_expressed_receiver +## 1 DC CD8 T H2-M2 Cd8a 11.002412 2.3838066 6.693109 1.017174e-272 1.377355e-268 5.250531e-206 7.109745e-202 0.429 0.659 +## 2 DC CD8 T H2-M2 Klrd1 11.002412 0.9199196 5.961166 1.017174e-272 1.377355e-268 6.104465e-17 8.266056e-13 0.429 0.185 +## 3 DC CD8 T Ccl22 Dpp4 9.920608 0.2991720 5.109890 1.590801e-296 2.154103e-292 6.628900e-04 1.000000e+00 0.500 0.148 +## 4 DC CD8 T Vsig10 Il6st 10.070530 0.1411494 5.105840 2.637179e-194 3.571005e-190 1.470347e-02 1.000000e+00 0.286 0.090 +## 5 DC CD8 T Ccl22 Ccr7 9.920608 0.1468652 5.033737 1.590801e-296 2.154103e-292 5.070025e-05 6.865321e-01 0.500 0.320 +## 6 DC CD8 T Cxcl16 Cxcr6 8.101436 1.8384579 4.969947 1.138617e-243 1.541801e-239 5.987787e-21 8.108063e-17 0.929 0.089 +``` + +``` r info_tables$sender_receiver_info %>% head() ## # A tibble: 6 × 7 ## sender receiver ligand receptor avg_ligand avg_receptor ligand_receptor_prod @@ -193,14 +192,17 @@ info_tables$sender_receiver_info %>% head() ## 4 DC Treg B2m Tap1 216. 7.18 1552. ## 5 Mono Mono B2m Tap1 158. 8.59 1353. ## 6 DC DC B2m Tap1 216. 5.91 1277. +``` + +``` r info_tables$lr_condition_de %>% head() -## ligand receptor lfc_ligand lfc_receptor ligand_receptor_lfc_avg p_val_ligand p_adj_ligand p_val_receptor p_adj_receptor -## 1 H2-Ab1 Cd4 2.4021254 0.11569357 1.2589095 4.424390e-06 5.991066e-02 5.634068e-02 1.000000e+00 -## 2 Cxcl10 Dpp4 1.6066163 0.35175421 0.9791853 6.700636e-29 9.073332e-25 1.170731e-06 1.585287e-02 -## 3 B2m Tap1 0.7071427 1.13931050 0.9232266 6.936359e-174 9.392524e-170 3.585450e-52 4.855057e-48 -## 4 H2-T22 Klrd1 1.5223370 -0.05659737 0.7328698 1.006291e-111 1.362618e-107 6.202530e-01 1.000000e+00 -## 5 H2-T23 Klrd1 1.4651999 -0.05659737 0.7043013 1.789643e-114 2.423356e-110 6.202530e-01 1.000000e+00 -## 6 Cxcl10 Cxcr3 1.6066163 -0.25400642 0.6763049 6.700636e-29 9.073332e-25 1.918372e-06 2.597667e-02 +## ligand receptor lfc_ligand lfc_receptor ligand_receptor_lfc_avg p_val_ligand p_adj_ligand p_val_receptor p_adj_receptor +## 1 Cxcl11 Dpp4 7.197344 0.7345098 3.965927 0.0001621364 1 1.170731e-06 1.585287e-02 +## 2 Sirpb1c Cd47 6.236414 0.7474147 3.491914 0.0006820290 1 8.720485e-23 1.180841e-18 +## 3 Cxcl11 Cxcr3 7.197344 -1.1317386 3.032803 0.0001621364 1 1.918372e-06 2.597667e-02 +## 4 Ccl22 Dpp4 5.075469 0.7345098 2.904989 0.0863610523 1 1.170731e-06 1.585287e-02 +## 5 F13a1 Itga4 5.436884 0.1228459 2.779865 0.0299628836 1 6.837926e-02 1.000000e+00 +## 6 Vcan Sell 5.234169 0.3254999 2.779835 0.0423593686 1 7.148719e-07 9.680080e-03 ``` Next, we generate the prioritization table. This table contains the @@ -219,22 +221,22 @@ prior_table <- generate_prioritization_tables(info_tables$sender_receiver_info, scenario = "case_control") prior_table %>% head -## # A tibble: 6 × 51 -## sender receiver ligand receptor lfc_ligand lfc_receptor ligand_receptor_lfc_avg p_val_ligand p_adj_ligand p_val_receptor p_adj_receptor pct_expressed_sender -## -## 1 NK CD8 T Ptprc Dpp4 0.596 0.164 0.380 2.18e- 7 2.96e- 3 0.000663 1 0.894 -## 2 Mono CD8 T Ptprc Dpp4 0.438 0.164 0.301 3.52e- 5 4.77e- 1 0.000663 1 0.867 -## 3 Mono CD8 T Cxcl10 Dpp4 4.27 0.164 2.22 2.53e- 79 3.43e- 75 0.000663 1 0.867 -## 4 Mono CD8 T Cxcl9 Dpp4 4.33 0.164 2.25 3.83e-124 5.19e-120 0.000663 1 0.547 -## 5 Treg CD8 T Ptprc Dpp4 0.282 0.164 0.223 1.44e- 2 1 e+ 0 0.000663 1 0.685 -## 6 Mono CD8 T Cxcl11 Dpp4 2.36 0.164 1.26 9.28e-121 1.26e-116 0.000663 1 0.307 -## # ℹ 39 more variables: pct_expressed_receiver , avg_ligand , avg_receptor , ligand_receptor_prod , lfc_pval_ligand , -## # p_val_ligand_adapted , scaled_lfc_ligand , scaled_p_val_ligand , scaled_lfc_pval_ligand , scaled_p_val_ligand_adapted , activity , -## # rank , activity_zscore , scaled_activity , lfc_pval_receptor , p_val_receptor_adapted , scaled_lfc_receptor , -## # scaled_p_val_receptor , scaled_lfc_pval_receptor , scaled_p_val_receptor_adapted , scaled_avg_exprs_ligand , -## # scaled_avg_exprs_receptor , lfc_ligand_group , p_val_ligand_group , lfc_pval_ligand_group , p_val_ligand_adapted_group , -## # scaled_lfc_ligand_group , scaled_p_val_ligand_group , scaled_lfc_pval_ligand_group , scaled_p_val_ligand_adapted_group , -## # lfc_receptor_group , p_val_receptor_group , lfc_pval_receptor_group , p_val_receptor_adapted_group , scaled_lfc_receptor_group , … +## # A tibble: 6 × 52 +## sender receiver ligand receptor lfc_ligand lfc_receptor ligand_receptor_lfc_avg p_val_ligand p_adj_ligand p_val_receptor p_adj_receptor pct_expressed_sender pct_expressed_receiver avg_ligand +## +## 1 NK CD8 T Ptprc Dpp4 0.642 0.299 0.471 2.18e- 7 2.96e- 3 0.000663 1 0.894 0.148 16.6 +## 2 Mono CD8 T Ptprc Dpp4 0.474 0.299 0.386 3.52e- 5 4.77e- 1 0.000663 1 0.867 0.148 14.9 +## 3 Treg CD8 T Ptprc Dpp4 0.307 0.299 0.303 1.44e- 2 1 e+ 0 0.000663 1 0.685 0.148 13.2 +## 4 Mono CD8 T Cxcl10 Dpp4 4.86 0.299 2.58 2.53e-79 3.43e-75 0.000663 1 0.867 0.148 54.8 +## 5 B CD8 T Ptprc Dpp4 0.201 0.299 0.250 2.08e- 2 1 e+ 0 0.000663 1 0.669 0.148 12.3 +## 6 Mono CD8 T Ebi3 Il6st 4.00 0.141 2.07 9.77e-49 1.32e-44 0.0147 1 0.147 0.09 0.546 +## # ℹ 38 more variables: avg_receptor , ligand_receptor_prod , lfc_pval_ligand , p_val_adapted_ligand , scaled_lfc_ligand , scaled_p_val_ligand , +## # scaled_lfc_pval_ligand , scaled_p_val_adapted_ligand , activity , rank , activity_zscore , scaled_activity , lfc_pval_receptor , +## # p_val_adapted_receptor , scaled_lfc_receptor , scaled_p_val_receptor , scaled_lfc_pval_receptor , scaled_p_val_adapted_receptor , scaled_avg_exprs_ligand , +## # scaled_avg_exprs_receptor , lfc_ligand_group , p_val_ligand_group , lfc_pval_ligand_group , p_val_adapted_ligand_group , scaled_lfc_ligand_group , +## # scaled_p_val_ligand_group , scaled_lfc_pval_ligand_group , scaled_p_val_adapted_ligand_group , lfc_receptor_group , p_val_receptor_group , +## # lfc_pval_receptor_group , p_val_adapted_receptor_group , scaled_lfc_receptor_group , scaled_p_val_receptor_group , scaled_lfc_pval_receptor_group , +## # scaled_p_val_adapted_receptor_group , prioritization_score , prioritization_rank ``` As you can see, the resulting table now show the rankings for @@ -248,23 +250,22 @@ We included all columns here, but if you just want relevant columns that were used to calculate the ranking: ``` r -prior_table %>% select(c('sender', 'receiver', 'ligand', 'receptor', 'scaled_p_val_ligand_adapted', 'scaled_p_val_receptor_adapted', 'scaled_avg_exprs_ligand', 'scaled_avg_exprs_receptor', 'scaled_p_val_ligand_adapted_group', 'scaled_p_val_receptor_adapted_group', 'scaled_activity')) -## # A tibble: 1,272 × 11 -## sender receiver ligand receptor scaled_p_val_ligand_adapted scaled_p_val_receptor_adapted scaled_avg_exprs_ligand scaled_avg_exprs_receptor scaled_p_val_ligand_…¹ -## -## 1 NK CD8 T Ptprc Dpp4 0.869 0.829 1.00 1.00 0.850 -## 2 Mono CD8 T Ptprc Dpp4 0.841 0.829 0.867 1.00 0.850 -## 3 Mono CD8 T Cxcl10 Dpp4 0.960 0.829 1.00 1.00 0.929 -## 4 Mono CD8 T Cxcl9 Dpp4 0.975 0.829 1.00 1.00 0.787 -## 5 Treg CD8 T Ptprc Dpp4 0.756 0.829 0.741 1.00 0.850 -## 6 Mono CD8 T Cxcl11 Dpp4 0.973 0.829 1.00 1.00 0.732 -## 7 B CD8 T Ptprc Dpp4 0.748 0.829 0.666 1.00 0.850 -## 8 DC CD8 T Icam1 Il2rg 0.876 0.714 1.00 0.995 0.717 -## 9 DC CD8 T Ccl22 Dpp4 0.997 0.829 1.00 1.00 0.539 -## 10 NK CD8 T Cd320 Jaml 0.889 0.943 0.905 1.00 0.472 -## # ℹ 1,262 more rows -## # ℹ abbreviated name: ¹​scaled_p_val_ligand_adapted_group -## # ℹ 2 more variables: scaled_p_val_receptor_adapted_group , scaled_activity +prior_table %>% select(c('sender', 'receiver', 'ligand', 'receptor', 'scaled_p_val_adapted_ligand', 'scaled_p_val_adapted_receptor', 'scaled_avg_exprs_ligand', 'scaled_avg_exprs_receptor', 'scaled_p_val_adapted_ligand_group', 'scaled_p_val_adapted_receptor_group', 'scaled_activity')) +## # A tibble: 1,212 × 11 +## sender receiver ligand receptor scaled_p_val_adapted_ligand scaled_p_val_adapted_re…¹ scaled_avg_exprs_lig…² scaled_avg_exprs_rec…³ scaled_p_val_adapted…⁴ scaled_p_val_adapted…⁵ scaled_activity +## +## 1 NK CD8 T Ptprc Dpp4 0.871 0.846 1.00 1.00 0.844 0.833 0.660 +## 2 Mono CD8 T Ptprc Dpp4 0.841 0.846 0.867 1.00 0.844 0.833 0.660 +## 3 Treg CD8 T Ptprc Dpp4 0.754 0.846 0.741 1.00 0.844 0.833 0.660 +## 4 Mono CD8 T Cxcl10 Dpp4 0.958 0.846 1.00 1.00 0.926 0.833 0.309 +## 5 B CD8 T Ptprc Dpp4 0.747 0.846 0.666 1.00 0.844 0.833 0.660 +## 6 Mono CD8 T Ebi3 Il6st 0.937 0.692 1.00 0.635 0.680 0.515 1.00 +## 7 Mono CD8 T Cxcl9 Dpp4 0.974 0.846 1.00 1.00 0.779 0.833 0.263 +## 8 DC CD8 T Icam1 Il2rg 0.878 0.723 1.00 0.995 0.713 0.985 0.273 +## 9 DC CD8 T B2m Tap2 0.862 0.631 1.00 0.781 1 0.864 0.254 +## 10 Mono CD8 T Cxcl11 Dpp4 0.972 0.846 1.00 1.00 0.721 0.833 0.273 +## # ℹ 1,202 more rows +## # ℹ abbreviated names: ¹​scaled_p_val_adapted_receptor, ²​scaled_avg_exprs_ligand, ³​scaled_avg_exprs_receptor, ⁴​scaled_p_val_adapted_ligand_group, ⁵​scaled_p_val_adapted_receptor_group ``` Note that we appended the suffix ’\_group’ to columns that refer to @@ -328,22 +329,22 @@ prior_table <- generate_prioritization_tables(processed_expr_table, prioritizing_weights) prior_table %>% head -## # A tibble: 6 × 51 -## sender receiver ligand receptor lfc_ligand lfc_receptor ligand_receptor_lfc_avg p_val_ligand p_adj_ligand p_val_receptor p_adj_receptor pct_expressed_sender -## -## 1 NK CD8 T Ptprc Dpp4 0.596 0.164 0.380 2.18e- 7 2.96e- 3 0.000663 1 0.894 -## 2 Mono CD8 T Ptprc Dpp4 0.438 0.164 0.301 3.52e- 5 4.77e- 1 0.000663 1 0.867 -## 3 Mono CD8 T Cxcl10 Dpp4 4.27 0.164 2.22 2.53e- 79 3.43e- 75 0.000663 1 0.867 -## 4 Mono CD8 T Cxcl9 Dpp4 4.33 0.164 2.25 3.83e-124 5.19e-120 0.000663 1 0.547 -## 5 Treg CD8 T Ptprc Dpp4 0.282 0.164 0.223 1.44e- 2 1 e+ 0 0.000663 1 0.685 -## 6 Mono CD8 T Cxcl11 Dpp4 2.36 0.164 1.26 9.28e-121 1.26e-116 0.000663 1 0.307 -## # ℹ 39 more variables: pct_expressed_receiver , avg_ligand , avg_receptor , ligand_receptor_prod , lfc_pval_ligand , -## # p_val_ligand_adapted , scaled_lfc_ligand , scaled_p_val_ligand , scaled_lfc_pval_ligand , scaled_p_val_ligand_adapted , activity , -## # rank , activity_zscore , scaled_activity , lfc_pval_receptor , p_val_receptor_adapted , scaled_lfc_receptor , -## # scaled_p_val_receptor , scaled_lfc_pval_receptor , scaled_p_val_receptor_adapted , scaled_avg_exprs_ligand , -## # scaled_avg_exprs_receptor , lfc_ligand_group , p_val_ligand_group , lfc_pval_ligand_group , p_val_ligand_adapted_group , -## # scaled_lfc_ligand_group , scaled_p_val_ligand_group , scaled_lfc_pval_ligand_group , scaled_p_val_ligand_adapted_group , -## # lfc_receptor_group , p_val_receptor_group , lfc_pval_receptor_group , p_val_receptor_adapted_group , scaled_lfc_receptor_group , … +## # A tibble: 6 × 52 +## sender receiver ligand receptor lfc_ligand lfc_receptor ligand_receptor_lfc_avg p_val_ligand p_adj_ligand p_val_receptor p_adj_receptor pct_expressed_sender pct_expressed_receiver avg_ligand +## +## 1 NK CD8 T Ptprc Dpp4 0.642 0.299 0.471 2.18e- 7 2.96e- 3 0.000663 1 0.894 0.148 16.6 +## 2 Mono CD8 T Ptprc Dpp4 0.474 0.299 0.386 3.52e- 5 4.77e- 1 0.000663 1 0.867 0.148 14.9 +## 3 Treg CD8 T Ptprc Dpp4 0.307 0.299 0.303 1.44e- 2 1 e+ 0 0.000663 1 0.685 0.148 13.2 +## 4 Mono CD8 T Cxcl10 Dpp4 4.86 0.299 2.58 2.53e-79 3.43e-75 0.000663 1 0.867 0.148 54.8 +## 5 B CD8 T Ptprc Dpp4 0.201 0.299 0.250 2.08e- 2 1 e+ 0 0.000663 1 0.669 0.148 12.3 +## 6 Mono CD8 T Ebi3 Il6st 4.00 0.141 2.07 9.77e-49 1.32e-44 0.0147 1 0.147 0.09 0.546 +## # ℹ 38 more variables: avg_receptor , ligand_receptor_prod , lfc_pval_ligand , p_val_adapted_ligand , scaled_lfc_ligand , scaled_p_val_ligand , +## # scaled_lfc_pval_ligand , scaled_p_val_adapted_ligand , activity , rank , activity_zscore , scaled_activity , lfc_pval_receptor , +## # p_val_adapted_receptor , scaled_lfc_receptor , scaled_p_val_receptor , scaled_lfc_pval_receptor , scaled_p_val_adapted_receptor , scaled_avg_exprs_ligand , +## # scaled_avg_exprs_receptor , lfc_ligand_group , p_val_ligand_group , lfc_pval_ligand_group , p_val_adapted_ligand_group , scaled_lfc_ligand_group , +## # scaled_p_val_ligand_group , scaled_lfc_pval_ligand_group , scaled_p_val_adapted_ligand_group , lfc_receptor_group , p_val_receptor_group , +## # lfc_pval_receptor_group , p_val_adapted_receptor_group , scaled_lfc_receptor_group , scaled_p_val_receptor_group , scaled_lfc_pval_receptor_group , +## # scaled_p_val_adapted_receptor_group , prioritization_score , prioritization_rank ``` # Prioritizing across multiple receivers @@ -357,8 +358,8 @@ the wrapper function to perform a basic NicheNet analysis on the other two: ``` r -nichenet_output <- lapply(c("CD4 T", "Treg"), function(receiver_ct){ - nichenet_seuratobj_aggregate(receiver = receiver_ct, +nichenet_outputs <- lapply(c("CD8 T", "CD4 T", "Treg"), function(receiver_ct){ + output <- nichenet_seuratobj_aggregate(receiver = receiver_ct, seurat_obj = seuratObj, condition_colname = "aggregate", condition_oi = condition_oi, @@ -369,7 +370,11 @@ nichenet_output <- lapply(c("CD4 T", "Treg"), function(receiver_ct){ weighted_networks = weighted_networks, expression_pct = 0.05) -}) %>% setNames(c("CD4 T", "Treg")) + # Add receiver cell type in ligand activity table + output$ligand_activities$receiver <- receiver_ct + return(output) +}) +## [1] "The RNA assay will be used for the analysis." ## [1] "Read in and process NicheNet's networks" ## [1] "Define expressed ligands and receptors in receiver and sender cells" ## [1] "Perform DE analysis in receiver cell" @@ -377,6 +382,15 @@ nichenet_output <- lapply(c("CD4 T", "Treg"), function(receiver_ct){ ## [1] "Infer active target genes of the prioritized ligands" ## [1] "Infer receptors of the prioritized ligands" ## [1] "Perform DE analysis in sender cells" +## [1] "The RNA assay will be used for the analysis." +## [1] "Read in and process NicheNet's networks" +## [1] "Define expressed ligands and receptors in receiver and sender cells" +## [1] "Perform DE analysis in receiver cell" +## [1] "Perform NicheNet ligand activity analysis" +## [1] "Infer active target genes of the prioritized ligands" +## [1] "Infer receptors of the prioritized ligands" +## [1] "Perform DE analysis in sender cells" +## [1] "The RNA assay will be used for the analysis." ## [1] "Read in and process NicheNet's networks" ## [1] "Define expressed ligands and receptors in receiver and sender cells" ## [1] "Perform DE analysis in receiver cell" @@ -391,39 +405,33 @@ change the `lr_network_filtered` argument to only calculate DE and expression values for ligand-receptor pairs of interest. ``` r -info_tables2 <- lapply(names(nichenet_output), function(receiver_ct) { - generate_info_tables(seuratObj, - celltype_colname = "celltype", - senders_oi = sender_celltypes, - receivers_oi = receiver_ct, - lr_network_filtered = lr_network %>% - filter(from %in% nichenet_output[[receiver_ct]]$ligand_activities$test_ligand & - to %in% nichenet_output[[receiver_ct]]$background_expressed_genes), - condition_colname = "aggregate", - condition_oi = condition_oi, - condition_reference = condition_reference, - scenario = "case_control") +# Calculate prioritization criteria for each receiver cell type +info_tables <- lapply(nichenet_outputs, function(output) { + lr_network_filtered <- lr_network %>% select(from, to) %>% + filter(from %in% output$ligand_activities$test_ligand & + to %in% output$background_expressed_genes) + + generate_info_tables(seuratObj, + celltype_colname = "celltype", + senders_oi = sender_celltypes, + receivers_oi = unique(output$ligand_activities$receiver), + lr_network_filtered = lr_network_filtered, + condition_colname = "aggregate", + condition_oi = condition_oi, + condition_reference = condition_reference, + scenario = "case_control") }) ``` We can then combine the results from `generate_info_tables` using -`bind_rows`, which will concatenate the rows together. For the ligand -activities, we will also add an additional column containing the -receiver cell type. Note that for the average expression table -(`sender_receiver_info`) and condition specificity (`lr_condition_de`), -we need to remove duplicate rows. +`bind_rows`, which will concatenate the rows together. Note that for the +average expression table (`sender_receiver_info`) and condition +specificity (`lr_condition_de`), we need to remove duplicate rows. ``` r -# Add CD8 T to list -info_tables2[[3]] <- info_tables - # bind rows of each element of info_tables using pmap -info_tables_combined <- purrr::pmap(info_tables2, bind_rows) - -# Combine ligand activities and add receiver information -ligand_activities_combined <- bind_rows(nichenet_output$`CD4 T`$ligand_activities %>% mutate(receiver = "CD4 T"), - nichenet_output$Treg$ligand_activities %>% mutate(receiver = "Treg"), - ligand_activities %>% mutate(receiver = "CD8 T")) +info_tables_combined <- purrr::pmap(info_tables, bind_rows) +ligand_activities_combined <- purrr::map_dfr(nichenet_outputs, "ligand_activities") prior_table_combined <- generate_prioritization_tables( sender_receiver_info = info_tables_combined$sender_receiver_info %>% distinct, @@ -433,22 +441,22 @@ prior_table_combined <- generate_prioritization_tables( scenario = "case_control") head(prior_table_combined) -## # A tibble: 6 × 51 -## sender receiver ligand receptor lfc_ligand lfc_receptor ligand_receptor_lfc_avg p_val_ligand p_adj_ligand p_val_receptor p_adj_receptor pct_expressed_sender -## -## 1 NK CD8 T Ptprc Dpp4 0.596 0.164 0.380 0.000000218 0.00296 6.63e- 4 1 e+ 0 0.894 -## 2 NK CD4 T Ptprc Cd4 0.596 0.996 0.796 0.000000218 0.00296 2.63e-34 3.56e-30 0.894 -## 3 B CD4 T H2-Eb1 Cd4 4.02 0.996 2.51 0 0 2.63e-34 3.56e-30 0.93 -## 4 Mono CD8 T Ptprc Dpp4 0.438 0.164 0.301 0.0000352 0.477 6.63e- 4 1 e+ 0 0.867 -## 5 Mono CD4 T Ptprc Cd4 0.438 0.996 0.717 0.0000352 0.477 2.63e-34 3.56e-30 0.867 -## 6 NK CD4 T Ptprc Cd247 0.596 0.457 0.526 0.000000218 0.00296 5.61e- 4 1 e+ 0 0.894 -## # ℹ 39 more variables: pct_expressed_receiver , avg_ligand , avg_receptor , ligand_receptor_prod , lfc_pval_ligand , -## # p_val_ligand_adapted , scaled_lfc_ligand , scaled_p_val_ligand , scaled_lfc_pval_ligand , scaled_p_val_ligand_adapted , activity , -## # rank , activity_zscore , scaled_activity , lfc_pval_receptor , p_val_receptor_adapted , scaled_lfc_receptor , -## # scaled_p_val_receptor , scaled_lfc_pval_receptor , scaled_p_val_receptor_adapted , scaled_avg_exprs_ligand , -## # scaled_avg_exprs_receptor , lfc_ligand_group , p_val_ligand_group , lfc_pval_ligand_group , p_val_ligand_adapted_group , -## # scaled_lfc_ligand_group , scaled_p_val_ligand_group , scaled_lfc_pval_ligand_group , scaled_p_val_ligand_adapted_group , -## # lfc_receptor_group , p_val_receptor_group , lfc_pval_receptor_group , p_val_receptor_adapted_group , scaled_lfc_receptor_group , … +## # A tibble: 6 × 52 +## sender receiver ligand receptor lfc_ligand lfc_receptor ligand_receptor_lfc_avg p_val_ligand p_adj_ligand p_val_receptor p_adj_receptor pct_expressed_sender pct_expressed_receiver avg_ligand +## +## 1 NK CD8 T Ptprc Dpp4 0.642 0.299 0.471 0.000000218 0.00296 6.63e- 4 1 e+ 0 0.894 0.148 16.6 +## 2 NK CD4 T Ptprc Cd4 0.642 1.71 1.18 0.000000218 0.00296 2.63e-34 3.56e-30 0.894 0.226 16.6 +## 3 Mono CD8 T Ptprc Dpp4 0.474 0.299 0.386 0.0000352 0.477 6.63e- 4 1 e+ 0 0.867 0.148 14.9 +## 4 B CD4 T H2-Eb1 Cd4 5.00 1.71 3.36 0 0 2.63e-34 3.56e-30 0.93 0.226 31.0 +## 5 Mono CD4 T Ptprc Cd4 0.474 1.71 1.09 0.0000352 0.477 2.63e-34 3.56e-30 0.867 0.226 14.9 +## 6 NK CD4 T Ptprc Cd247 0.642 0.599 0.620 0.000000218 0.00296 5.61e- 4 1 e+ 0 0.894 0.309 16.6 +## # ℹ 38 more variables: avg_receptor , ligand_receptor_prod , lfc_pval_ligand , p_val_adapted_ligand , scaled_lfc_ligand , scaled_p_val_ligand , +## # scaled_lfc_pval_ligand , scaled_p_val_adapted_ligand , activity , rank , activity_zscore , scaled_activity , lfc_pval_receptor , +## # p_val_adapted_receptor , scaled_lfc_receptor , scaled_p_val_receptor , scaled_lfc_pval_receptor , scaled_p_val_adapted_receptor , scaled_avg_exprs_ligand , +## # scaled_avg_exprs_receptor , lfc_ligand_group , p_val_ligand_group , lfc_pval_ligand_group , p_val_adapted_ligand_group , scaled_lfc_ligand_group , +## # scaled_p_val_ligand_group , scaled_lfc_pval_ligand_group , scaled_p_val_adapted_ligand_group , lfc_receptor_group , p_val_receptor_group , +## # lfc_pval_receptor_group , p_val_adapted_receptor_group , scaled_lfc_receptor_group , scaled_p_val_receptor_group , scaled_lfc_pval_receptor_group , +## # scaled_p_val_adapted_receptor_group , prioritization_score , prioritization_rank ``` ### Extra visualization of ligand-receptor pairs @@ -488,9 +496,11 @@ ligand-receptor-sender pair with `show_rankings`, as well as show all data points for context (`show_all_datapoints`). `true_color_range = TRUE` will adjust the limits of the color gradient to the min-max of the values, instead of the limit being from 0 to 1. -Note that the numbers displayed here are the rankings within the chosen -cell type and not across all receiver cell types (in case of multiple -receivers). +Note that the numbers displayed here are the rankings across all +receiver cell types (in case of multiple receivers), and by default the +`top_n` ligand-receptor pairs are shown despite the absolute ranking. To +show only pairs that have an absolute ranking within top_n across all +receivers, set `use_absolute_rank = TRUE`. ``` r receiver_oi <- "CD8 T" @@ -513,7 +523,10 @@ columns from the prioritization table (those with the `_ligand` or ``` r print(paste0("Column names that you can use are: ", paste0(prior_table %>% select(ends_with(c("_ligand", "_receptor", "_sender", "_receiver"))) %>% colnames() %>% str_remove("_ligand|_receptor|_sender|_receiver") %>% unique, collapse = ", "))) -## [1] "Column names that you can use are: lfc, p_val, p_adj, avg, lfc_pval, scaled_lfc, scaled_p_val, scaled_lfc_pval, scaled_avg_exprs, pct_expressed" +## [1] "Column names that you can use are: lfc, p_val, p_adj, avg, lfc_pval, p_val_adapted, scaled_lfc, scaled_p_val, scaled_lfc_pval, scaled_p_val_adapted, scaled_avg_exprs, pct_expressed" +``` + +``` r # Change size and color columns make_mushroom_plot(prior_table, top_n = 30, size = "pct_expressed", color = "scaled_avg_exprs") + @@ -525,64 +538,55 @@ make_mushroom_plot(prior_table, top_n = 30, size = "pct_expressed", color = "sca ``` r sessionInfo() -## R version 4.3.2 (2023-10-31) -## Platform: x86_64-redhat-linux-gnu (64-bit) +## R version 4.3.3 (2024-02-29) +## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: CentOS Stream 8 ## ## Matrix products: default -## BLAS/LAPACK: /usr/lib64/libopenblaso-r0.3.15.so; LAPACK version 3.9.0 +## BLAS/LAPACK: /usr/lib64/libopenblasp-r0.3.15.so; LAPACK version 3.9.0 ## ## locale: -## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 -## [6] LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C -## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C +## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 +## [8] LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C ## -## time zone: Asia/Bangkok +## time zone: Europe/Brussels ## tzcode source: system (glibc) ## ## attached base packages: ## [1] stats graphics grDevices utils datasets methods base ## ## other attached packages: -## [1] forcats_1.0.0 stringr_1.5.0 dplyr_1.1.4 purrr_1.0.2 readr_2.1.2 tidyr_1.3.0 tibble_3.2.1 ggplot2_3.4.4 -## [9] tidyverse_1.3.1 SeuratObject_5.0.1 Seurat_4.4.0 nichenetr_2.1.0 +## [1] Seurat_5.1.0 SeuratObject_5.0.2 sp_2.1-4 lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2 readr_2.1.5 tidyr_1.3.1 +## [11] tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0 nichenetr_2.2.0 ## ## loaded via a namespace (and not attached): -## [1] fs_1.6.3 matrixStats_1.2.0 spatstat.sparse_3.0-3 bitops_1.0-7 lubridate_1.9.3 httr_1.4.7 -## [7] RColorBrewer_1.1-3 doParallel_1.0.17 tools_4.3.2 sctransform_0.4.0 backports_1.4.1 utf8_1.2.4 -## [13] R6_2.5.1 lazyeval_0.2.2 uwot_0.1.16 GetoptLong_1.0.5 withr_2.5.2 sp_2.1-2 -## [19] gridExtra_2.3 fdrtool_1.2.17 progressr_0.14.0 cli_3.6.2 spatstat.explore_3.2-1 labeling_0.4.3 -## [25] spatstat.data_3.0-3 randomForest_4.7-1.1 proxy_0.4-27 ggridges_0.5.5 pbapply_1.7-2 foreign_0.8-85 -## [31] parallelly_1.36.0 limma_3.56.2 readxl_1.4.3 rstudioapi_0.15.0 gridGraphics_0.5-1 visNetwork_2.1.2 -## [37] generics_0.1.3 shape_1.4.6 ica_1.0-3 spatstat.random_3.2-2 car_3.1-2 Matrix_1.6-4 -## [43] fansi_1.0.6 S4Vectors_0.38.1 abind_1.4-5 lifecycle_1.0.4 yaml_2.3.8 carData_3.0-5 -## [49] recipes_1.0.7 Rtsne_0.17 grid_4.3.2 promises_1.2.1 crayon_1.5.2 miniUI_0.1.1.1 -## [55] lattice_0.21-9 haven_2.4.3 cowplot_1.1.2 pillar_1.9.0 knitr_1.45 ComplexHeatmap_2.16.0 -## [61] rjson_0.2.21 future.apply_1.11.0 codetools_0.2-19 leiden_0.3.9 glue_1.6.2 data.table_1.14.10 -## [67] vctrs_0.6.5 png_0.1-8 spam_2.10-0 cellranger_1.1.0 gtable_0.3.4 assertthat_0.2.1 -## [73] gower_1.0.1 xfun_0.41 mime_0.12 prodlim_2023.08.28 survival_3.5-7 timeDate_4032.109 -## [79] iterators_1.0.14 hardhat_1.3.0 lava_1.7.3 DiagrammeR_1.0.10 ellipsis_0.3.2 fitdistrplus_1.1-11 -## [85] ROCR_1.0-11 ipred_0.9-14 nlme_3.1-163 RcppAnnoy_0.0.21 irlba_2.3.5.1 KernSmooth_2.23-22 -## [91] rpart_4.1.21 colorspace_2.1-0 BiocGenerics_0.46.0 DBI_1.1.3 Hmisc_5.1-0 nnet_7.3-19 -## [97] tidyselect_1.2.0 compiler_4.3.2 rvest_1.0.2 htmlTable_2.4.1 xml2_1.3.6 plotly_4.10.0 -## [103] shadowtext_0.1.2 checkmate_2.3.1 scales_1.3.0 caTools_1.18.2 lmtest_0.9-40 digest_0.6.33 -## [109] goftest_1.2-3 spatstat.utils_3.0-4 rmarkdown_2.11 htmltools_0.5.7 pkgconfig_2.0.3 base64enc_0.1-3 -## [115] highr_0.10 dbplyr_2.1.1 fastmap_1.1.1 rlang_1.1.2 GlobalOptions_0.1.2 htmlwidgets_1.6.2 -## [121] shiny_1.7.1 farver_2.1.1 zoo_1.8-12 jsonlite_1.8.8 ModelMetrics_1.2.2.2 magrittr_2.0.3 -## [127] Formula_1.2-5 dotCall64_1.1-1 patchwork_1.1.3 munsell_0.5.0 Rcpp_1.0.11 ggnewscale_0.4.9 -## [133] reticulate_1.34.0 stringi_1.7.6 pROC_1.18.5 MASS_7.3-60 plyr_1.8.9 parallel_4.3.2 -## [139] listenv_0.9.0 ggrepel_0.9.4 deldir_2.0-2 splines_4.3.2 tensor_1.5 hms_1.1.3 -## [145] circlize_0.4.15 igraph_1.2.11 ggpubr_0.6.0 spatstat.geom_3.2-7 ggsignif_0.6.4 reshape2_1.4.4 -## [151] stats4_4.3.2 reprex_2.0.1 evaluate_0.23 modelr_0.1.8 tzdb_0.4.0 foreach_1.5.2 -## [157] tweenr_2.0.2 httpuv_1.6.13 RANN_2.6.1 polyclip_1.10-6 future_1.33.0 clue_0.3-64 -## [163] scattermore_1.2 ggforce_0.4.1 broom_0.7.12 xtable_1.8-4 e1071_1.7-14 rstatix_0.7.2 -## [169] later_1.3.2 viridisLite_0.4.2 class_7.3-22 IRanges_2.34.1 cluster_2.1.4 timechange_0.2.0 -## [175] globals_0.16.2 caret_6.0-94 +## [1] RcppAnnoy_0.0.22 splines_4.3.3 later_1.3.2 bitops_1.0-7 polyclip_1.10-6 hardhat_1.3.1 pROC_1.18.5 rpart_4.1.23 +## [9] fastDummies_1.7.3 lifecycle_1.0.4 rstatix_0.7.2 doParallel_1.0.17 globals_0.16.3 lattice_0.22-5 MASS_7.3-60.0.1 backports_1.4.1 +## [17] magrittr_2.0.3 limma_3.58.1 rmarkdown_2.27 Hmisc_5.1-2 plotly_4.10.4 yaml_2.3.8 httpuv_1.6.15 sctransform_0.4.1 +## [25] spam_2.10-0 spatstat.sparse_3.0-3 reticulate_1.37.0 cowplot_1.1.3 pbapply_1.7-2 RColorBrewer_1.1-3 abind_1.4-5 Rtsne_0.17 +## [33] presto_1.0.0 BiocGenerics_0.48.1 nnet_7.3-19 tweenr_2.0.3 ipred_0.9-14 circlize_0.4.16 lava_1.8.0 IRanges_2.36.0 +## [41] S4Vectors_0.40.2 ggrepel_0.9.5 irlba_2.3.5.1 listenv_0.9.1 spatstat.utils_3.0-4 goftest_1.2-3 RSpectra_0.16-1 spatstat.random_3.2-3 +## [49] fitdistrplus_1.1-11 parallelly_1.37.1 leiden_0.4.3.1 codetools_0.2-19 ggforce_0.4.2 tidyselect_1.2.1 shape_1.4.6.1 farver_2.1.2 +## [57] matrixStats_1.3.0 stats4_4.3.3 base64enc_0.1-3 spatstat.explore_3.2-7 jsonlite_1.8.8 caret_6.0-94 GetoptLong_1.0.5 e1071_1.7-14 +## [65] progressr_0.14.0 Formula_1.2-5 ggridges_0.5.6 survival_3.5-8 iterators_1.0.14 foreach_1.5.2 tools_4.3.3 ggnewscale_0.4.10 +## [73] ica_1.0-3 Rcpp_1.0.12 glue_1.7.0 prodlim_2023.08.28 gridExtra_2.3 xfun_0.44 withr_3.0.0 fastmap_1.2.0 +## [81] fansi_1.0.6 caTools_1.18.2 digest_0.6.35 gridGraphics_0.5-1 timechange_0.3.0 R6_2.5.1 mime_0.12 colorspace_2.1-0 +## [89] scattermore_1.2 tensor_1.5 spatstat.data_3.0-4 DiagrammeR_1.0.11 utf8_1.2.4 generics_0.1.3 data.table_1.15.4 recipes_1.0.10 +## [97] class_7.3-22 httr_1.4.7 htmlwidgets_1.6.4 uwot_0.2.2 ModelMetrics_1.2.2.2 pkgconfig_2.0.3 gtable_0.3.5 timeDate_4032.109 +## [105] ComplexHeatmap_2.18.0 lmtest_0.9-40 shadowtext_0.1.3 htmltools_0.5.8.1 carData_3.0-5 dotCall64_1.1-1 clue_0.3-65 scales_1.3.0 +## [113] png_0.1-8 gower_1.0.1 knitr_1.46 rstudioapi_0.16.0 tzdb_0.4.0 reshape2_1.4.4 rjson_0.2.21 checkmate_2.3.1 +## [121] visNetwork_2.1.2 nlme_3.1-164 proxy_0.4-27 zoo_1.8-12 GlobalOptions_0.1.2 KernSmooth_2.23-22 parallel_4.3.3 miniUI_0.1.1.1 +## [129] foreign_0.8-86 pillar_1.9.0 grid_4.3.3 vctrs_0.6.5 RANN_2.6.1 ggpubr_0.6.0 randomForest_4.7-1.1 promises_1.3.0 +## [137] car_3.1-2 xtable_1.8-4 cluster_2.1.6 htmlTable_2.4.2 evaluate_0.23 cli_3.6.2 compiler_4.3.3 rlang_1.1.3 +## [145] crayon_1.5.2 ggsignif_0.6.4 future.apply_1.11.2 labeling_0.4.3 fdrtool_1.2.17 plyr_1.8.9 stringi_1.8.4 viridisLite_0.4.2 +## [153] deldir_2.0-4 munsell_0.5.1 lazyeval_0.2.2 spatstat.geom_3.2-9 Matrix_1.6-5 RcppHNSW_0.6.0 hms_1.1.3 patchwork_1.2.0 +## [161] future_1.33.2 statmod_1.5.0 shiny_1.8.1.1 highr_0.10 ROCR_1.0-11 broom_1.0.5 igraph_2.0.3 ``` ### References -
+
diff --git a/vignettes/seurat_steps_prioritization_files/figure-gfm/lr-circos-1.png b/vignettes/seurat_steps_prioritization_files/figure-gfm/lr-circos-1.png index b2c79b5..400a49c 100644 Binary files a/vignettes/seurat_steps_prioritization_files/figure-gfm/lr-circos-1.png and b/vignettes/seurat_steps_prioritization_files/figure-gfm/lr-circos-1.png differ diff --git a/vignettes/seurat_steps_prioritization_files/figure-gfm/lr-circos-unused-1.png b/vignettes/seurat_steps_prioritization_files/figure-gfm/lr-circos-unused-1.png index bcbfffb..3972427 100644 Binary files a/vignettes/seurat_steps_prioritization_files/figure-gfm/lr-circos-unused-1.png and b/vignettes/seurat_steps_prioritization_files/figure-gfm/lr-circos-unused-1.png differ diff --git a/vignettes/seurat_steps_prioritization_files/figure-gfm/mushroom-plot-1-1.png b/vignettes/seurat_steps_prioritization_files/figure-gfm/mushroom-plot-1-1.png index 976c32c..5f3784c 100644 Binary files a/vignettes/seurat_steps_prioritization_files/figure-gfm/mushroom-plot-1-1.png and b/vignettes/seurat_steps_prioritization_files/figure-gfm/mushroom-plot-1-1.png differ diff --git a/vignettes/seurat_steps_prioritization_files/figure-gfm/mushroom-plot-2-1.png b/vignettes/seurat_steps_prioritization_files/figure-gfm/mushroom-plot-2-1.png index c662f46..1c6cb73 100644 Binary files a/vignettes/seurat_steps_prioritization_files/figure-gfm/mushroom-plot-2-1.png and b/vignettes/seurat_steps_prioritization_files/figure-gfm/mushroom-plot-2-1.png differ