diff --git a/R/combineContigs.R b/R/combineContigs.R index d5230bd..0735a10 100644 --- a/R/combineContigs.R +++ b/R/combineContigs.R @@ -1,5 +1,9 @@ # Adding Global Variables # data('v_gene','j_gene', 'c_gene', 'd_gene') +# note that currently the Rcpp internals have hardcoded column names so +# if some breaking change here is made, the Rcpp code will need to be updated, +# or functions need to be adjusted to take in expected column names that +# use these variables utils::globalVariables(c("v_gene", "j_gene", "c_gene", "d_gene", "chain")) heavy_lines <- c("IGH", "cdr3_aa1", "cdr3_nt1", "vgene1") @@ -21,42 +25,42 @@ utils::globalVariables(c( #' @title Combining the list of T cell receptor contigs into clones #' #' @description This function consolidates a list of TCR sequencing results to -#' the level of the individual cell barcodes. Using the **samples** and -#' **ID** parameters, the function will add the strings as prefixes to -#' prevent issues with repeated barcodes. The resulting new barcodes will -#' need to match the Seurat or SCE object in order to use, -#' [combineExpression()]. Several levels of filtering exist - -#' *removeNA*, *removeMulti*, or *filterMulti* are parameters -#' that control how the function deals with barcodes with multiple chains +#' the level of the individual cell barcodes. Using the **samples** and +#' **ID** parameters, the function will add the strings as prefixes to +#' prevent issues with repeated barcodes. The resulting new barcodes will +#' need to match the Seurat or SCE object in order to use, +#' [combineExpression()]. Several levels of filtering exist - +#' *removeNA*, *removeMulti*, or *filterMulti* are parameters +#' that control how the function deals with barcodes with multiple chains #' recovered. 
-#' +#' #' @examples -#' combined <- combineTCR(contig_list, -#' samples = c("P17B", "P17L", "P18B", "P18L", +#' combined <- combineTCR(contig_list, +#' samples = c("P17B", "P17L", "P18B", "P18L", #' "P19B","P19L", "P20B", "P20L")) -#' -#' @param input.data List of filtered contig annotations or +#' +#' @param input.data List of filtered contig annotations or #' outputs from [loadContigs()]. #' @param samples The labels of samples (recommended). #' @param ID The additional sample labeling (optional). #' @param removeNA This will remove any chain without values. #' @param removeMulti This will remove barcodes with greater than 2 chains. -#' @param filterMulti This option will allow for the selection of the 2 -#' corresponding chains with the highest expression for a single barcode. -#' @param filterNonproductive This option will allow for the removal of +#' @param filterMulti This option will allow for the selection of the 2 +#' corresponding chains with the highest expression for a single barcode. +#' @param filterNonproductive This option will allow for the removal of #' nonproductive chains if the variable exists in the contig data. Default #' is set to TRUE to remove nonproductive contigs. 
-#' +#' #' @importFrom assertthat assert_that is.flag #' @export #' @concept Loading_and_Processing_Contigs #' @return List of clones for individual cell barcodes -#' -combineTCR <- function(input.data, - samples = NULL, - ID = NULL, - removeNA = FALSE, - removeMulti = FALSE, +#' +combineTCR <- function(input.data, + samples = NULL, + ID = NULL, + removeNA = FALSE, + removeMulti = FALSE, filterMulti = FALSE, filterNonproductive = TRUE) { @@ -66,7 +70,7 @@ combineTCR <- function(input.data, assert_that(is.flag(removeNA)) assert_that(is.flag(removeMulti)) assert_that(is.flag(filterMulti)) - + input.data <- .checkList(input.data) input.data <- .checkContigs(input.data) out <- NULL @@ -80,8 +84,8 @@ combineTCR <- function(input.data, } input.data[[i]]$sample <- samples[i] input.data[[i]]$ID <- ID[i] - if (filterMulti) { - input.data[[i]] <- .filteringMulti(input.data[[i]]) + if (filterMulti) { + input.data[[i]] <- .filteringMulti(input.data[[i]]) } } #Prevents error caused by list containing elements with 0 rows @@ -104,10 +108,10 @@ combineTCR <- function(input.data, data2 <- .makeGenes(cellType = "T", out[[i]]) Con.df <- .constructConDfAndParseTCR(data2) Con.df <- .assignCT(cellType = "T", Con.df) - Con.df[Con.df == "NA_NA" | Con.df == "NA;NA_NA;NA"] <- NA - data3 <- merge(data2[,-which(names(data2) %in% c("TCR1","TCR2"))], + Con.df[Con.df == "NA_NA" | Con.df == "NA;NA_NA;NA"] <- NA + data3 <- merge(data2[,-which(names(data2) %in% c("TCR1","TCR2"))], Con.df, by = "barcode") - + columns_to_include <- c("barcode") # Conditionally add columns based on user input if (!is.null(samples)) { @@ -116,17 +120,17 @@ combineTCR <- function(input.data, if (!is.null(ID)) { columns_to_include <- c(columns_to_include, "ID") } - + # Add TCR and CT lines which are presumably always needed columns_to_include <- c(columns_to_include, tcr1_lines, tcr2_lines, CT_lines) - + # Subset the data frame based on the dynamically built list of columns data3 <- data3[, columns_to_include] - - 
final[[i]] <- data3 + + final[[i]] <- data3 } name_vector <- character(length(samples)) - for (i in seq_along(samples)) { + for (i in seq_along(samples)) { if (!is.null(samples) && !is.null(ID)) { curr <- paste(samples[i], "_", ID[i], sep="") } else if (!is.null(samples) & is.null(ID)) { @@ -155,42 +159,42 @@ combineTCR <- function(input.data, #' Combining the list of B cell receptor contigs into clones #' -#' This function consolidates a list of BCR sequencing results to the level -#' of the individual cell barcodes. Using the samples and ID parameters, -#' the function will add the strings as prefixes to prevent issues with -#' repeated barcodes. The resulting new barcodes will need to match the -#' Seurat or SCE object in order to use, [combineExpression()]. -#' Unlike [combineTCR()], combineBCR produces a column -#' **CTstrict** of an index of nucleotide sequence and the -#' corresponding V gene. This index automatically calculates the +#' This function consolidates a list of BCR sequencing results to the level +#' of the individual cell barcodes. Using the samples and ID parameters, +#' the function will add the strings as prefixes to prevent issues with +#' repeated barcodes. The resulting new barcodes will need to match the +#' Seurat or SCE object in order to use, [combineExpression()]. +#' Unlike [combineTCR()], combineBCR produces a column +#' **CTstrict** of an index of nucleotide sequence and the +#' corresponding V gene. This index automatically calculates the #' Levenshtein distance between sequences with the same V gene and will -#' index sequences using a normalized Levenshtein distance with the same -#' ID. After which, clone clusters are called using the -#' [igraph::components()] function. Clones that are clustered -#' across multiple sequences will then be labeled with "Cluster" in the +#' index sequences using a normalized Levenshtein distance with the same +#' ID. 
After which, clone clusters are called using the +#' [igraph::components()] function. Clones that are clustered +#' across multiple sequences will then be labeled with "Cluster" in the #' CTstrict header. #' #' @examples #' #Data derived from the 10x Genomics intratumoral NSCLC B cells #' BCR <- read.csv("https://www.borch.dev/uploads/contigs/b_contigs.csv") -#' combined <- combineBCR(BCR, -#' samples = "Patient1", +#' combined <- combineBCR(BCR, +#' samples = "Patient1", #' threshold = 0.85) -#' -#' @param input.data List of filtered contig annotations or outputs from +#' +#' @param input.data List of filtered contig annotations or outputs from #' [loadContigs()]. #' @param samples The labels of samples (required). #' @param ID The additional sample labeling (optional). -#' @param call.related.clones Use the nucleotide sequence and V gene -#' to call related clones. Default is set to TRUE. FALSE will return +#' @param call.related.clones Use the nucleotide sequence and V gene +#' to call related clones. Default is set to TRUE. FALSE will return #' a CTstrict or strict clone as V gene + amino acid sequence. -#' @param threshold The normalized edit distance to consider. The higher +#' @param threshold The normalized edit distance to consider. The higher #' the number the more similarity of sequence will be used for clustering. #' @param removeNA This will remove any chain without values. #' @param removeMulti This will remove barcodes with greater than 2 chains. -#' @param filterMulti This option will allow for the selection of the +#' @param filterMulti This option will allow for the selection of the #' highest-expressing light and heavy chains, if not calling related clones. -#' @param filterNonproductive This option will allow for the removal of +#' @param filterNonproductive This option will allow for the removal of #' nonproductive chains if the variable exists in the contig data. Default #' is set to TRUE to remove nonproductive contigs. 
#' @@ -198,18 +202,23 @@ combineTCR <- function(input.data, #' @concept Loading_and_Processing_Contigs #' @return List of clones for individual cell barcodes combineBCR <- function(input.data, - samples, + samples = NULL, ID = NULL, call.related.clones = TRUE, threshold = 0.85, - removeNA = FALSE, + removeNA = FALSE, removeMulti = FALSE, filterMulti = TRUE, filterNonproductive = TRUE) { + if (is.null(samples)) { + stop("combineBCR() requires the samples parameter for the calculation of edit distance.") + } + assert_that( - isListOfNonEmptyDataFrames(input.data) || isNonEmptyDataFrame(input.data), - is.character(samples), + isListOfNonEmptyDataFrames(input.data) || + isNonEmptyDataFrame(input.data), + is.character(samples) || is.null(samples), is.flag(call.related.clones), is.numeric(threshold), is.flag(removeNA), @@ -220,7 +229,8 @@ combineBCR <- function(input.data, final <- input.data %>% .checkList() %>% .checkContigs() %>% - lapply(function(x) { + unname() %>% + purrr::imap(function(x, i) { x <- subset(x, chain %in% c("IGH", "IGK", "IGL")) if (!is.null(ID)) x$ID <- ID[i] if (filterNonproductive && "productive" %in% colnames(x)) { @@ -230,7 +240,7 @@ combineBCR <- function(input.data, # Keep IGH / IGK / IGL info in save_chain x$save_chain <- x$chain # Collapse IGK and IGL chains - x$chain <- ifelse(x$chain=="IGH","IGH","IGLC") + x$chain <- ifelse(x$chain == "IGH", "IGH", "IGLC") x <- .filteringMulti(x) # Get back IGK / IGL distinction x$chain <- x$save_chain @@ -238,7 +248,13 @@ combineBCR <- function(input.data, } x }) %>% - .modifyBarcodes(samples, ID) %>% + (function(x) { + if (!is.null(samples)) { + .modifyBarcodes(x, samples, ID) + } else { # https://github.com/ncborcherding/scRepertoire/pull/450 + x + } + }) %>% lapply(function(x) { data2 <- data.frame(x) data2 <- .makeGenes(cellType = "B", data2) @@ -270,26 +286,23 @@ combineBCR <- function(input.data, } final[[i]]$sample <- samples[i] final[[i]]$ID <- ID[i] - final[[i]][final[[i]] == "NA_NA" | final[[i]] 
== "NA;NA_NA;NA"] <- NA + final[[i]][final[[i]] == "NA_NA" | final[[i]] == "NA;NA_NA;NA"] <- NA if (!is.null(sample) & !is.null(ID)) { - final[[i]]<- final[[i]][, c("barcode", "sample", "ID", + final[[i]]<- final[[i]][, c("barcode", "sample", "ID", heavy_lines[c(1,2,3)], light_lines[c(1,2,3)], CT_lines)] } else if (!is.null(sample) & is.null(ID)) { - final[[i]]<- final[[i]][, c("barcode", "sample", + final[[i]]<- final[[i]][, c("barcode", "sample", heavy_lines[c(1,2,3)], light_lines[c(1,2,3)], CT_lines)] } } - names <- NULL - for (i in seq_along(samples)) { - if (!is.null(samples) && !is.null(ID)) { - c <- paste(samples[i], "_", ID[i], sep="") - } else if (!is.null(samples) && is.null(ID)) { - c <- paste(samples[i], sep = "") - } - names <- c(names, c) + + names(final) <- if (!is.null(samples)) { + if (is.null(ID)) samples else paste0(samples, "_", ID) + } else { + paste0("S", seq_along(final)) } - names(final) <- names + for (i in seq_along(final)) { final[[i]] <- final[[i]][!duplicated(final[[i]]$barcode),] final[[i]] <- final[[i]][rowSums(is.na(final[[i]])) < 10, ] diff --git a/man/combineBCR.Rd b/man/combineBCR.Rd index 7363ce6..97964d9 100644 --- a/man/combineBCR.Rd +++ b/man/combineBCR.Rd @@ -6,7 +6,7 @@ \usage{ combineBCR( input.data, - samples, + samples = NULL, ID = NULL, call.related.clones = TRUE, threshold = 0.85, @@ -64,8 +64,8 @@ CTstrict header. \examples{ #Data derived from the 10x Genomics intratumoral NSCLC B cells BCR <- read.csv("https://www.borch.dev/uploads/contigs/b_contigs.csv") -combined <- combineBCR(BCR, - samples = "Patient1", +combined <- combineBCR(BCR, + samples = "Patient1", threshold = 0.85) } diff --git a/man/combineTCR.Rd b/man/combineTCR.Rd index 6c030fa..0c53b18 100644 --- a/man/combineTCR.Rd +++ b/man/combineTCR.Rd @@ -48,8 +48,8 @@ that control how the function deals with barcodes with multiple chains recovered. 
} \examples{ -combined <- combineTCR(contig_list, - samples = c("P17B", "P17L", "P18B", "P18L", +combined <- combineTCR(contig_list, + samples = c("P17B", "P17L", "P18B", "P18L", "P19B","P19L", "P20B", "P20L")) }