From d07e4a3386fbe475ad7674570cb1c233abc71dbf Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Sat, 23 Nov 2024 02:53:50 -0800 Subject: [PATCH 01/11] fix --- R/utils.R | 5 ++--- src/constructConDfAndparseTCR.cpp | 9 ++++----- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/R/utils.R b/R/utils.R index 6b31daec..28123927 100644 --- a/R/utils.R +++ b/R/utils.R @@ -371,8 +371,7 @@ #' @keywords internal .constructConDfAndParseTCR <- function(data2) { rcppConstructConDfAndParseTCR( - data2 %>% dplyr::arrange(., chain, cdr3_nt), - unique(data2[[1]]) # 1 is the index of the barcode column + data2, uniqueData2Barcodes = unique(data2$barcode) ) } @@ -383,7 +382,7 @@ .parseBCR <- function (Con.df, unique_df, data2) { barcodeIndex <- rcppConstructBarcodeIndex(unique_df, data2$barcode) for (y in seq_along(unique_df)) { - location.i <- barcodeIndex[[y]] # *may* be wrong but should be fine. Test on old version first + location.i <- barcodeIndex[[y]] for (z in seq_along(location.i)) { where.chain <- data2[location.i[z],"chain"] diff --git a/src/constructConDfAndparseTCR.cpp b/src/constructConDfAndparseTCR.cpp index 9e977b6a..f6d77a97 100644 --- a/src/constructConDfAndparseTCR.cpp +++ b/src/constructConDfAndparseTCR.cpp @@ -43,7 +43,7 @@ class TcrParser { TcrParser( Rcpp::DataFrame& data2, std::vector& uniqueData2Barcodes ) { - // construct conDf, initializaing the matrix to "NA" *strings* + // construct conDf, initializing the matrix to "NA" *strings* conDf = scRepHelper::initStringMatrix( 7, uniqueData2Barcodes.size(), "NA" ); @@ -65,7 +65,7 @@ class TcrParser { } // Rcpp implementation of .parseTCR() - void parseTCR() { + &TcrParser parseTCR() { for (int y = 0; y < (int) conDf[0].size(); y++) { for (int index : barcodeIndex[y]) { std::string chainType = std::string(data2ChainTypes[index]); @@ -78,6 +78,7 @@ class TcrParser { } } } + return *this; } // parseTCR() helpers @@ -122,7 +123,5 @@ class TcrParser { Rcpp::DataFrame rcppConstructConDfAndParseTCR( Rcpp::DataFrame& data2, std::vector uniqueData2Barcodes ) { - TcrParser parser = TcrParser(data2, uniqueData2Barcodes); - parser.parseTCR(); - return parser.getConDf(); + return TcrParser(data2, uniqueData2Barcodes).parseTCR().getConDf(); } From 41592f468407e449389b1da934d456fee273e2c8 Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Sat, 23 Nov 2024 02:56:30 -0800 Subject: [PATCH 02/11] fix amperstand position --- src/constructConDfAndparseTCR.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/constructConDfAndparseTCR.cpp b/src/constructConDfAndparseTCR.cpp index f6d77a97..44c3fcd9 100644 --- a/src/constructConDfAndparseTCR.cpp +++ b/src/constructConDfAndparseTCR.cpp @@ -65,7 +65,7 @@ class TcrParser { } // Rcpp implementation of .parseTCR() - &TcrParser parseTCR() { + TcrParser& parseTCR() { for (int y = 0; y < (int) conDf[0].size(); y++) { for (int index : barcodeIndex[y]) { std::string chainType = std::string(data2ChainTypes[index]); From 4cec5ce306840cd5d6cd5cdcf3a753f9fd31e572 Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Sat, 23 Nov 2024 13:00:31 -0800 Subject: [PATCH 03/11] add arrangement back --- R/utils.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/utils.R b/R/utils.R index 28123927..bf53a1d5 100644 --- a/R/utils.R +++ b/R/utils.R @@ -371,7 +371,8 @@ #' @keywords internal .constructConDfAndParseTCR <- function(data2) { rcppConstructConDfAndParseTCR( - data2, uniqueData2Barcodes = unique(data2$barcode) + dplyr::arrange(data2, chain, cdr3_nt), + uniqueData2Barcodes = unique(data2$barcode) ) } From b05e718a43080a4288132e8d70390f34e8f6e541 Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Sat, 23 Nov 2024 14:26:09 -0800 Subject: [PATCH 04/11] rm potentially redundant cast --- src/constructConDfAndparseTCR.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/constructConDfAndparseTCR.cpp b/src/constructConDfAndparseTCR.cpp index 44c3fcd9..623a5780 100644 --- a/src/constructConDfAndparseTCR.cpp +++ b/src/constructConDfAndparseTCR.cpp @@ -60,7 +60,7 @@ class TcrParser { // construct barcodeIndex barcodeIndex = constructBarcodeIndex( - uniqueData2Barcodes, Rcpp::as>(data2[data2.findName("barcode")]) + uniqueData2Barcodes, data2[data2.findName("barcode")] ); } From d4ea3f1efb610a103a1dcffb3f495745957c069b Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Sat, 23 Nov 2024 19:37:24 -0800 Subject: [PATCH 05/11] force samples argument in combineBCR --- R/combineContigs.R | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/R/combineContigs.R b/R/combineContigs.R index b2fdec90..36aec49b 100644 --- a/R/combineContigs.R +++ b/R/combineContigs.R @@ -197,27 +197,25 @@ combineTCR <- function(input.data, #' @export #' @concept Loading_and_Processing_Contigs #' @return List of clones for individual cell barcodes -combineBCR <- function(input.data, - samples = NULL, - ID = NULL, +combineBCR <- function(input.data, + samples, + ID = NULL, call.related.clones = TRUE, threshold = 0.85, removeNA = FALSE, removeMulti = FALSE, filterMulti = TRUE, filterNonproductive = TRUE) { - if(is.null(samples)) { - stop("combineBCR() requires the samples parameter for the calculation of edit distance.") - } - # rudimentary input checking - assert_that(is.character(samples) || is.null(samples)) - assert_that(is.character(ID) || is.null(ID)) - assert_that(is.flag(call.related.clones)) - assert_that(is.numeric(threshold)) - assert_that(is.flag(removeNA)) - assert_that(is.flag(removeMulti)) - assert_that(is.flag(filterMulti)) + assert_that( + isListOfNonEmptyDataFrames(input.data), + is.character(samples), + is.flag(call.related.clones), + is.numeric(threshold), + is.flag(removeNA), + is.flag(removeMulti), + is.flag(filterMulti) + ) input.data <- .checkList(input.data) input.data <- .checkContigs(input.data) @@ -256,9 +254,9 @@ combineBCR <- function(input.data, Con.df$barcode <- unique_df Con.df <- .parseBCR(Con.df, unique_df, data2) Con.df <- .assignCT(cellType = "B", Con.df) - data3<-Con.df %>% mutate(length1 = nchar(cdr3_nt1)) %>% + data3 <- Con.df %>% mutate(length1 = nchar(cdr3_nt1)) %>% mutate(length2 = nchar(cdr3_nt2)) - final[[i]] <- data3 + final[[i]] <- data3 } dictionary <- bind_rows(final) if(call.related.clones) { From 06e3e2c9e08cbe9e5e90fb2d0c3650180b0b37ad Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Sat, 23 Nov 2024 19:43:11 -0800 Subject: [PATCH 06/11] make combineBCR more consice --- R/combineContigs.R | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/R/combineContigs.R b/R/combineContigs.R index 36aec49b..f6385f36 100644 --- a/R/combineContigs.R +++ b/R/combineContigs.R @@ -217,34 +217,26 @@ combineBCR <- function(input.data, is.flag(filterMulti) ) - input.data <- .checkList(input.data) - input.data <- .checkContigs(input.data) - out <- NULL + input.data <- .checkContigs(.checkList(input.data)) final <- list() - chain1 <- "heavy" - chain2 <- "light" for (i in seq_along(input.data)) { input.data[[i]] <- subset(input.data[[i]], chain %in% c("IGH", "IGK", "IGL")) input.data[[i]]$ID <- ID[i] - if(c("productive") %in% colnames(input.data[[i]]) & filterNonproductive) { - input.data[[i]] <- subset(input.data[[i]], productive %in% c(TRUE, "TRUE", "True", "true")) + if ("productive" %in% colnames(input.data[[i]]) & filterNonproductive) { + input.data[[i]] <- subset(input.data[[i]], tolower(productive) == "true") } if (filterMulti) { - # Keep IGH / IGK / IGL info in save_chain - input.data[[i]]$save_chain <- input.data[[i]]$chain - # Collapse IGK and IGL chains - input.data[[i]]$chain <- ifelse(input.data[[i]]$chain=="IGH","IGH","IGLC") - input.data[[i]] <- .filteringMulti(input.data[[i]]) - # Get back IGK / IGL distinction - input.data[[i]]$chain <- input.data[[i]]$save_chain - input.data[[i]]$save_chain <- NULL + # Keep IGH / IGK / IGL info in save_chain + input.data[[i]]$save_chain <- input.data[[i]]$chain + # Collapse IGK and IGL chains + input.data[[i]]$chain <- ifelse(input.data[[i]]$chain=="IGH","IGH","IGLC") + input.data[[i]] <- .filteringMulti(input.data[[i]]) + # Get back IGK / IGL distinction + input.data[[i]]$chain <- input.data[[i]]$save_chain + input.data[[i]]$save_chain <- NULL } } - if (!is.null(samples)) { - out <- .modifyBarcodes(input.data, samples, ID) - } else { - out <- input.data - } + out <- .modifyBarcodes(input.data, samples, ID) for (i in seq_along(out)) { data2 <- data.frame(out[[i]]) data2 <- .makeGenes(cellType = "B", data2) From cdf96f1cfb90a03c299a0ba178a1c52262fbd75c Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Sat, 23 Nov 2024 20:01:23 -0800 Subject: [PATCH 07/11] favour lapply over for loop in combineBCR --- R/combineContigs.R | 63 +++++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 35 deletions(-) diff --git a/R/combineContigs.R b/R/combineContigs.R index f6385f36..421d6032 100644 --- a/R/combineContigs.R +++ b/R/combineContigs.R @@ -218,27 +218,28 @@ combineBCR <- function(input.data, ) input.data <- .checkContigs(.checkList(input.data)) - final <- list() - for (i in seq_along(input.data)) { - input.data[[i]] <- subset(input.data[[i]], chain %in% c("IGH", "IGK", "IGL")) - input.data[[i]]$ID <- ID[i] - if ("productive" %in% colnames(input.data[[i]]) & filterNonproductive) { - input.data[[i]] <- subset(input.data[[i]], tolower(productive) == "true") + + input.data <- lapply(input.data, function(x) { + x <- subset(x, chain %in% c("IGH", "IGK", "IGL")) + if (!is.null(ID)) x$ID <- ID[i] + if (filterNonproductive && "productive" %in% colnames(x)) { + x <- subset(x, tolower(productive) == "true") } if (filterMulti) { # Keep IGH / IGK / IGL info in save_chain - input.data[[i]]$save_chain <- input.data[[i]]$chain + x$save_chain <- x$chain # Collapse IGK and IGL chains - input.data[[i]]$chain <- ifelse(input.data[[i]]$chain=="IGH","IGH","IGLC") - input.data[[i]] <- .filteringMulti(input.data[[i]]) + x$chain <- ifelse(x$chain=="IGH","IGH","IGLC") + x <- .filteringMulti(x) # Get back IGK / IGL distinction - input.data[[i]]$chain <- input.data[[i]]$save_chain - input.data[[i]]$save_chain <- NULL + x$chain <- x$save_chain + x$save_chain <- NULL } - } - out <- .modifyBarcodes(input.data, samples, ID) - for (i in seq_along(out)) { - data2 <- data.frame(out[[i]]) + x + }) + + final <- .modifyBarcodes(input.data, samples, ID) %>% lapply(function(x) { + data2 <- data.frame(x) data2 <- .makeGenes(cellType = "B", data2) unique_df <- unique(data2$barcode) Con.df <- data.frame(matrix(NA, length(unique_df), 9)) @@ -246,11 +247,12 @@ combineBCR <- function(input.data, Con.df$barcode <- unique_df Con.df <- .parseBCR(Con.df, unique_df, data2) Con.df <- .assignCT(cellType = "B", Con.df) - data3 <- Con.df %>% mutate(length1 = nchar(cdr3_nt1)) %>% + Con.df %>% mutate(length1 = nchar(cdr3_nt1)) %>% mutate(length2 = nchar(cdr3_nt2)) - final[[i]] <- data3 - } + }) + dictionary <- bind_rows(final) + if(call.related.clones) { IGH <- .lvCompare(dictionary, "IGH", "cdr3_nt1", threshold) IGLC <- .lvCompare(dictionary, "IGLC", "cdr3_nt2", threshold) @@ -278,30 +280,21 @@ combineBCR <- function(input.data, } } names <- NULL - for (i in seq_along(samples)) { - if (!is.null(samples) & !is.null(ID)) { + for (i in seq_along(samples)) { + if (!is.null(samples) && !is.null(ID)) { c <- paste(samples[i], "_", ID[i], sep="") - } else if (!is.null(samples) & is.null(ID)) { - c <- paste(samples[i], sep="") + } else if (!is.null(samples) && is.null(ID)) { + c <- paste(samples[i], sep = "") } names <- c(names, c) } names(final) <- names for (i in seq_along(final)) { final[[i]] <- final[[i]][!duplicated(final[[i]]$barcode),] - final[[i]]<-final[[i]][rowSums(is.na(final[[i]])) < 10, ] + final[[i]] <- final[[i]][rowSums(is.na(final[[i]])) < 10, ] final[[i]][final[[i]] == "NA"] <- NA } - if (removeNA) { - final <- .removingNA(final) - } - if (removeMulti) { - final <- .removingMulti(final) - } - #Adding list element names to output if samples NULL - if(is.null(samples)) { - names(final) <- paste0("S", seq_len(length(final))) - } - return(final) + if (removeNA) final <- .removingNA(final) + if (removeMulti) final <- .removingMulti(final) + return(final) } - From 77868f01b810269d72ac4679590ea7ef70d76088 Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Sat, 23 Nov 2024 20:17:04 -0800 Subject: [PATCH 08/11] more lapply and fix combineBCR input check --- R/combineContigs.R | 79 +++++++++++++++++++++++----------------------- R/typecheck.R | 10 ++++-- 2 files changed, 48 insertions(+), 41 deletions(-) diff --git a/R/combineContigs.R b/R/combineContigs.R index 421d6032..d5230bd8 100644 --- a/R/combineContigs.R +++ b/R/combineContigs.R @@ -208,7 +208,7 @@ combineBCR <- function(input.data, filterNonproductive = TRUE) { assert_that( - isListOfNonEmptyDataFrames(input.data), + isListOfNonEmptyDataFrames(input.data) || isNonEmptyDataFrame(input.data), is.character(samples), is.flag(call.related.clones), is.numeric(threshold), @@ -217,47 +217,48 @@ combineBCR <- function(input.data, is.flag(filterMulti) ) - input.data <- .checkContigs(.checkList(input.data)) - - input.data <- lapply(input.data, function(x) { - x <- subset(x, chain %in% c("IGH", "IGK", "IGL")) - if (!is.null(ID)) x$ID <- ID[i] - if (filterNonproductive && "productive" %in% colnames(x)) { - x <- subset(x, tolower(productive) == "true") - } - if (filterMulti) { - # Keep IGH / IGK / IGL info in save_chain - x$save_chain <- x$chain - # Collapse IGK and IGL chains - x$chain <- ifelse(x$chain=="IGH","IGH","IGLC") - x <- .filteringMulti(x) - # Get back IGK / IGL distinction - x$chain <- x$save_chain - x$save_chain <- NULL - } - x - }) - - final <- .modifyBarcodes(input.data, samples, ID) %>% lapply(function(x) { - data2 <- data.frame(x) - data2 <- .makeGenes(cellType = "B", data2) - unique_df <- unique(data2$barcode) - Con.df <- data.frame(matrix(NA, length(unique_df), 9)) - colnames(Con.df) <- c("barcode", heavy_lines, light_lines) - Con.df$barcode <- unique_df - Con.df <- .parseBCR(Con.df, unique_df, data2) - Con.df <- .assignCT(cellType = "B", Con.df) - Con.df %>% mutate(length1 = nchar(cdr3_nt1)) %>% - mutate(length2 = nchar(cdr3_nt2)) - }) + final <- input.data %>% + .checkList() %>% + .checkContigs() %>% + lapply(function(x) { + x <- subset(x, chain %in% c("IGH", "IGK", "IGL")) + if (!is.null(ID)) x$ID <- ID[i] + if (filterNonproductive && "productive" %in% colnames(x)) { + x <- subset(x, tolower(productive) == "true") + } + if (filterMulti) { + # Keep IGH / IGK / IGL info in save_chain + x$save_chain <- x$chain + # Collapse IGK and IGL chains + x$chain <- ifelse(x$chain=="IGH","IGH","IGLC") + x <- .filteringMulti(x) + # Get back IGK / IGL distinction + x$chain <- x$save_chain + x$save_chain <- NULL + } + x + }) %>% + .modifyBarcodes(samples, ID) %>% + lapply(function(x) { + data2 <- data.frame(x) + data2 <- .makeGenes(cellType = "B", data2) + unique_df <- unique(data2$barcode) + Con.df <- data.frame(matrix(NA, length(unique_df), 9)) + colnames(Con.df) <- c("barcode", heavy_lines, light_lines) + Con.df$barcode <- unique_df + Con.df <- .parseBCR(Con.df, unique_df, data2) + Con.df <- .assignCT(cellType = "B", Con.df) + Con.df %>% mutate(length1 = nchar(cdr3_nt1)) %>% + mutate(length2 = nchar(cdr3_nt2)) + }) dictionary <- bind_rows(final) - if(call.related.clones) { - IGH <- .lvCompare(dictionary, "IGH", "cdr3_nt1", threshold) - IGLC <- .lvCompare(dictionary, "IGLC", "cdr3_nt2", threshold) - } - for(i in seq_along(final)) { + if (call.related.clones) { + IGH <- .lvCompare(dictionary, "IGH", "cdr3_nt1", threshold) + IGLC <- .lvCompare(dictionary, "IGLC", "cdr3_nt2", threshold) + } + for (i in seq_along(final)) { if(call.related.clones) { final[[i]]<-merge(final[[i]],IGH,by.x="cdr3_nt1",by.y="clone",all.x=TRUE) final[[i]]<-merge(final[[i]],IGLC,by.x="cdr3_nt2",by.y="clone",all.x=TRUE) diff --git a/R/typecheck.R b/R/typecheck.R index 21a55d4a..80accf91 100644 --- a/R/typecheck.R +++ b/R/typecheck.R @@ -1,8 +1,14 @@ # base R type check functions +isNonEmptyDataFrame <- function(obj) { + is.data.frame(obj) && sum(dim(obj)) > 0 +} +assertthat::on_failure(isNonEmptyDataFrame) <- function(call, env) { + paste0(deparse(call$obj), " is not a non-empty `data.frame`") +} + isListOfNonEmptyDataFrames <- function(obj) { - is.list(obj) && - all(sapply(obj, function(x) is.data.frame(x) && sum(dim(x)) > 0)) + is.list(obj) && all(sapply(obj, isNonEmptyDataFrame)) } assertthat::on_failure(isListOfNonEmptyDataFrames) <- function(call, env) { paste0(deparse(call$obj), " is not a list of non-empty `data.frame`s") From b25a6e1f84e9def48860bccafe4f44b54d319474 Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Sat, 23 Nov 2024 20:30:52 -0800 Subject: [PATCH 09/11] run document --- man/combineBCR.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/combineBCR.Rd b/man/combineBCR.Rd index 394af4cf..7363ce61 100644 --- a/man/combineBCR.Rd +++ b/man/combineBCR.Rd @@ -6,7 +6,7 @@ \usage{ combineBCR( input.data, - samples = NULL, + samples, ID = NULL, call.related.clones = TRUE, threshold = 0.85, From df1f3f7dcaa3a3d93ec0f2859308c5cfc46fec66 Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Sat, 23 Nov 2024 21:21:05 -0800 Subject: [PATCH 10/11] fix faq url --- R/combineExpression.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/combineExpression.R b/R/combineExpression.R index b43b227e..f1a32e47 100644 --- a/R/combineExpression.R +++ b/R/combineExpression.R @@ -223,5 +223,5 @@ getHighBarcodeMismatchWarning <- function() paste( "< 1% of barcodes match: Ensure the barcodes in the single-cell object", "match the barcodes in the combined immune receptor output from", "scRepertoire. If getting this error, please check", - "https://www.borch.dev/uploads/screpertoire/articles/faq." + "https://www.borch.dev/uploads/screpertoire/articles/faq" ) From 9811879da68c3ebfd2d79380a58d49d5e701cbd8 Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Mon, 25 Nov 2024 23:45:01 -0800 Subject: [PATCH 11/11] re-allow samples = NULL in combineBCR --- R/combineContigs.R | 159 ++++++++++++++++++++++++--------------------- man/combineBCR.Rd | 6 +- man/combineTCR.Rd | 4 +- 3 files changed, 91 insertions(+), 78 deletions(-) diff --git a/R/combineContigs.R b/R/combineContigs.R index d5230bd8..0735a10f 100644 --- a/R/combineContigs.R +++ b/R/combineContigs.R @@ -1,5 +1,9 @@ # Adding Global Variables # data('v_gene','j_gene', 'c_gene', 'd_gene') +# note that currently the Rcpp internals have hardcoded column names so +# if some breaking change here is made, the Rcpp code will need to be updated, +# or functions need to be adjusted to intake expected column names that +# uses these variables utils::globalVariables(c("v_gene", "j_gene", "c_gene", "d_gene", "chain")) heavy_lines <- c("IGH", "cdr3_aa1", "cdr3_nt1", "vgene1") @@ -21,42 +25,42 @@ utils::globalVariables(c( #' @title Combining the list of T cell receptor contigs into clones #' #' @description This function consolidates a list of TCR sequencing results to -#' the level of the individual cell barcodes. Using the **samples** and -#' **ID** parameters, the function will add the strings as prefixes to -#' prevent issues with repeated barcodes. The resulting new barcodes will -#' need to match the Seurat or SCE object in order to use, -#' [combineExpression()]. Several levels of filtering exist - -#' *removeNA*, *removeMulti*, or *filterMulti* are parameters -#' that control how the function deals with barcodes with multiple chains +#' the level of the individual cell barcodes. Using the **samples** and +#' **ID** parameters, the function will add the strings as prefixes to +#' prevent issues with repeated barcodes. The resulting new barcodes will +#' need to match the Seurat or SCE object in order to use, +#' [combineExpression()]. Several levels of filtering exist - +#' *removeNA*, *removeMulti*, or *filterMulti* are parameters +#' that control how the function deals with barcodes with multiple chains #' recovered. -#' +#' #' @examples -#' combined <- combineTCR(contig_list, -#' samples = c("P17B", "P17L", "P18B", "P18L", +#' combined <- combineTCR(contig_list, +#' samples = c("P17B", "P17L", "P18B", "P18L", #' "P19B","P19L", "P20B", "P20L")) -#' -#' @param input.data List of filtered contig annotations or +#' +#' @param input.data List of filtered contig annotations or #' outputs from [loadContigs()]. #' @param samples The labels of samples (recommended). #' @param ID The additional sample labeling (optional). #' @param removeNA This will remove any chain without values. #' @param removeMulti This will remove barcodes with greater than 2 chains. -#' @param filterMulti This option will allow for the selection of the 2 -#' corresponding chains with the highest expression for a single barcode. -#' @param filterNonproductive This option will allow for the removal of +#' @param filterMulti This option will allow for the selection of the 2 +#' corresponding chains with the highest expression for a single barcode. +#' @param filterNonproductive This option will allow for the removal of #' nonproductive chains if the variable exists in the contig data. Default #' is set to TRUE to remove nonproductive contigs. -#' +#' #' @importFrom assertthat assert_that is.flag #' @export #' @concept Loading_and_Processing_Contigs #' @return List of clones for individual cell barcodes -#' -combineTCR <- function(input.data, - samples = NULL, - ID = NULL, - removeNA = FALSE, - removeMulti = FALSE, +#' +combineTCR <- function(input.data, + samples = NULL, + ID = NULL, + removeNA = FALSE, + removeMulti = FALSE, filterMulti = FALSE, filterNonproductive = TRUE) { @@ -66,7 +70,7 @@ combineTCR <- function(input.data, assert_that(is.flag(removeNA)) assert_that(is.flag(removeMulti)) assert_that(is.flag(filterMulti)) - + input.data <- .checkList(input.data) input.data <- .checkContigs(input.data) out <- NULL @@ -80,8 +84,8 @@ combineTCR <- function(input.data, } input.data[[i]]$sample <- samples[i] input.data[[i]]$ID <- ID[i] - if (filterMulti) { - input.data[[i]] <- .filteringMulti(input.data[[i]]) + if (filterMulti) { + input.data[[i]] <- .filteringMulti(input.data[[i]]) } } #Prevents error caused by list containing elements with 0 rows @@ -104,10 +108,10 @@ combineTCR <- function(input.data, data2 <- .makeGenes(cellType = "T", out[[i]]) Con.df <- .constructConDfAndParseTCR(data2) Con.df <- .assignCT(cellType = "T", Con.df) - Con.df[Con.df == "NA_NA" | Con.df == "NA;NA_NA;NA"] <- NA - data3 <- merge(data2[,-which(names(data2) %in% c("TCR1","TCR2"))], + Con.df[Con.df == "NA_NA" | Con.df == "NA;NA_NA;NA"] <- NA + data3 <- merge(data2[,-which(names(data2) %in% c("TCR1","TCR2"))], Con.df, by = "barcode") - + columns_to_include <- c("barcode") # Conditionally add columns based on user input if (!is.null(samples)) { @@ -116,17 +120,17 @@ combineTCR <- function(input.data, if (!is.null(ID)) { columns_to_include <- c(columns_to_include, "ID") } - + # Add TCR and CT lines which are presumably always needed columns_to_include <- c(columns_to_include, tcr1_lines, tcr2_lines, CT_lines) - + # Subset the data frame based on the dynamically built list of columns data3 <- data3[, columns_to_include] - - final[[i]] <- data3 + + final[[i]] <- data3 } name_vector <- character(length(samples)) - for (i in seq_along(samples)) { + for (i in seq_along(samples)) { if (!is.null(samples) && !is.null(ID)) { curr <- paste(samples[i], "_", ID[i], sep="") } else if (!is.null(samples) & is.null(ID)) { @@ -155,42 +159,42 @@ combineTCR <- function(input.data, #' Combining the list of B cell receptor contigs into clones #' -#' This function consolidates a list of BCR sequencing results to the level -#' of the individual cell barcodes. Using the samples and ID parameters, -#' the function will add the strings as prefixes to prevent issues with -#' repeated barcodes. The resulting new barcodes will need to match the -#' Seurat or SCE object in order to use, [combineExpression()]. -#' Unlike [combineTCR()], combineBCR produces a column -#' **CTstrict** of an index of nucleotide sequence and the -#' corresponding V gene. This index automatically calculates the +#' This function consolidates a list of BCR sequencing results to the level +#' of the individual cell barcodes. Using the samples and ID parameters, +#' the function will add the strings as prefixes to prevent issues with +#' repeated barcodes. The resulting new barcodes will need to match the +#' Seurat or SCE object in order to use, [combineExpression()]. +#' Unlike [combineTCR()], combineBCR produces a column +#' **CTstrict** of an index of nucleotide sequence and the +#' corresponding V gene. This index automatically calculates the #' Levenshtein distance between sequences with the same V gene and will -#' index sequences using a normalized Levenshtein distance with the same -#' ID. After which, clone clusters are called using the -#' [igraph::components()] function. Clones that are clustered -#' across multiple sequences will then be labeled with "Cluster" in the +#' index sequences using a normalized Levenshtein distance with the same +#' ID. After which, clone clusters are called using the +#' [igraph::components()] function. Clones that are clustered +#' across multiple sequences will then be labeled with "Cluster" in the #' CTstrict header. #' #' @examples #' #Data derived from the 10x Genomics intratumoral NSCLC B cells #' BCR <- read.csv("https://www.borch.dev/uploads/contigs/b_contigs.csv") -#' combined <- combineBCR(BCR, -#' samples = "Patient1", +#' combined <- combineBCR(BCR, +#' samples = "Patient1", #' threshold = 0.85) -#' -#' @param input.data List of filtered contig annotations or outputs from +#' +#' @param input.data List of filtered contig annotations or outputs from #' [loadContigs()]. #' @param samples The labels of samples (required). #' @param ID The additional sample labeling (optional). -#' @param call.related.clones Use the nucleotide sequence and V gene -#' to call related clones. Default is set to TRUE. FALSE will return +#' @param call.related.clones Use the nucleotide sequence and V gene +#' to call related clones. Default is set to TRUE. FALSE will return #' a CTstrict or strict clone as V gene + amino acid sequence. -#' @param threshold The normalized edit distance to consider. The higher +#' @param threshold The normalized edit distance to consider. The higher #' the number the more similarity of sequence will be used for clustering. #' @param removeNA This will remove any chain without values. #' @param removeMulti This will remove barcodes with greater than 2 chains. -#' @param filterMulti This option will allow for the selection of the +#' @param filterMulti This option will allow for the selection of the #' highest-expressing light and heavy chains, if not calling related clones. -#' @param filterNonproductive This option will allow for the removal of +#' @param filterNonproductive This option will allow for the removal of #' nonproductive chains if the variable exists in the contig data. Default #' is set to TRUE to remove nonproductive contigs. #' @@ -198,18 +202,23 @@ combineTCR <- function(input.data, #' @concept Loading_and_Processing_Contigs #' @return List of clones for individual cell barcodes combineBCR <- function(input.data, - samples, + samples = NULL, ID = NULL, call.related.clones = TRUE, threshold = 0.85, - removeNA = FALSE, + removeNA = FALSE, removeMulti = FALSE, filterMulti = TRUE, filterNonproductive = TRUE) { + if (is.null(samples)) { + stop("combineBCR() requires the samples parameter for the calculation of edit distance.") + } + assert_that( - isListOfNonEmptyDataFrames(input.data) || isNonEmptyDataFrame(input.data), - is.character(samples), + isListOfNonEmptyDataFrames(input.data) || + isNonEmptyDataFrame(input.data), + is.character(samples) || is.null(samples), is.flag(call.related.clones), is.numeric(threshold), is.flag(removeNA), @@ -220,7 +229,8 @@ combineBCR <- function(input.data, final <- input.data %>% .checkList() %>% .checkContigs() %>% - lapply(function(x) { + unname() %>% + purrr::imap(function(x, i) { x <- subset(x, chain %in% c("IGH", "IGK", "IGL")) if (!is.null(ID)) x$ID <- ID[i] if (filterNonproductive && "productive" %in% colnames(x)) { @@ -230,7 +240,7 @@ combineBCR <- function(input.data, # Keep IGH / IGK / IGL info in save_chain x$save_chain <- x$chain # Collapse IGK and IGL chains - x$chain <- ifelse(x$chain=="IGH","IGH","IGLC") + x$chain <- ifelse(x$chain == "IGH", "IGH", "IGLC") x <- .filteringMulti(x) # Get back IGK / IGL distinction x$chain <- x$save_chain @@ -238,7 +248,13 @@ combineBCR <- function(input.data, } x }) %>% - .modifyBarcodes(samples, ID) %>% + (function(x) { + if (!is.null(samples)) { + .modifyBarcodes(x, samples, ID) + } else { # https://github.com/ncborcherding/scRepertoire/pull/450 + x + } + }) %>% lapply(function(x) { data2 <- data.frame(x) data2 <- .makeGenes(cellType = "B", data2) @@ -270,26 +286,23 @@ combineBCR <- function(input.data, } final[[i]]$sample <- samples[i] final[[i]]$ID <- ID[i] - final[[i]][final[[i]] == "NA_NA" | final[[i]] == "NA;NA_NA;NA"] <- NA + final[[i]][final[[i]] == "NA_NA" | final[[i]] == "NA;NA_NA;NA"] <- NA if (!is.null(sample) & !is.null(ID)) { - final[[i]]<- final[[i]][, c("barcode", "sample", "ID", + final[[i]]<- final[[i]][, c("barcode", "sample", "ID", heavy_lines[c(1,2,3)], light_lines[c(1,2,3)], CT_lines)] } else if (!is.null(sample) & is.null(ID)) { - final[[i]]<- final[[i]][, c("barcode", "sample", + final[[i]]<- final[[i]][, c("barcode", "sample", heavy_lines[c(1,2,3)], light_lines[c(1,2,3)], CT_lines)] } } - names <- NULL - for (i in seq_along(samples)) { - if (!is.null(samples) && !is.null(ID)) { - c <- paste(samples[i], "_", ID[i], sep="") - } else if (!is.null(samples) && is.null(ID)) { - c <- paste(samples[i], sep = "") - } - names <- c(names, c) + + names(final) <- if (!is.null(samples)) { + if (is.null(ID)) samples else paste0(samples, "_", ID) + } else { + paste0("S", seq_along(final)) } - names(final) <- names + for (i in seq_along(final)) { final[[i]] <- final[[i]][!duplicated(final[[i]]$barcode),] final[[i]] <- final[[i]][rowSums(is.na(final[[i]])) < 10, ] diff --git a/man/combineBCR.Rd b/man/combineBCR.Rd index 7363ce61..97964d98 100644 --- a/man/combineBCR.Rd +++ b/man/combineBCR.Rd @@ -6,7 +6,7 @@ \usage{ combineBCR( input.data, - samples, + samples = NULL, ID = NULL, call.related.clones = TRUE, threshold = 0.85, @@ -64,8 +64,8 @@ CTstrict header. \examples{ #Data derived from the 10x Genomics intratumoral NSCLC B cells BCR <- read.csv("https://www.borch.dev/uploads/contigs/b_contigs.csv") -combined <- combineBCR(BCR, - samples = "Patient1", +combined <- combineBCR(BCR, + samples = "Patient1", threshold = 0.85) } diff --git a/man/combineTCR.Rd b/man/combineTCR.Rd index 6c030fa7..0c53b18f 100644 --- a/man/combineTCR.Rd +++ b/man/combineTCR.Rd @@ -48,8 +48,8 @@ that control how the function deals with barcodes with multiple chains recovered. } \examples{ -combined <- combineTCR(contig_list, - samples = c("P17B", "P17L", "P18B", "P18L", +combined <- combineTCR(contig_list, + samples = c("P17B", "P17L", "P18B", "P18L", "P19B","P19L", "P20B", "P20L")) }