From d07e4a3386fbe475ad7674570cb1c233abc71dbf Mon Sep 17 00:00:00 2001
From: Qile0317 <qile0317@gmail.com>
Date: Sat, 23 Nov 2024 02:53:50 -0800
Subject: [PATCH 01/11] fix

---
 R/utils.R                         | 5 ++---
 src/constructConDfAndparseTCR.cpp | 9 ++++-----
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/R/utils.R b/R/utils.R
index 6b31daec..28123927 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -371,8 +371,7 @@
 #' @keywords internal
 .constructConDfAndParseTCR <- function(data2) {
   rcppConstructConDfAndParseTCR(
-    data2 %>% dplyr::arrange(., chain, cdr3_nt),
-    unique(data2[[1]]) # 1 is the index of the barcode column
+    data2, uniqueData2Barcodes = unique(data2$barcode)
   )
 }
 
@@ -383,7 +382,7 @@
 .parseBCR <- function (Con.df, unique_df, data2) {
   barcodeIndex <- rcppConstructBarcodeIndex(unique_df, data2$barcode)
   for (y in seq_along(unique_df)) {
-    location.i <- barcodeIndex[[y]] # *may* be wrong but should be fine. Test on old version first
+    location.i <- barcodeIndex[[y]]
     
     for (z in seq_along(location.i)) {
       where.chain <- data2[location.i[z],"chain"]
diff --git a/src/constructConDfAndparseTCR.cpp b/src/constructConDfAndparseTCR.cpp
index 9e977b6a..f6d77a97 100644
--- a/src/constructConDfAndparseTCR.cpp
+++ b/src/constructConDfAndparseTCR.cpp
@@ -43,7 +43,7 @@ class TcrParser {
     TcrParser(
         Rcpp::DataFrame& data2, std::vector<std::string>& uniqueData2Barcodes
     ) {
-        // construct conDf, initializaing the matrix to "NA" *strings*
+        // construct conDf, initializing the matrix to "NA" *strings*
         conDf = scRepHelper::initStringMatrix(
             7, uniqueData2Barcodes.size(), "NA"
         );
@@ -65,7 +65,7 @@ class TcrParser {
     }
 
     // Rcpp implementation of .parseTCR()
-    void parseTCR() {
+    &TcrParser parseTCR() {
         for (int y = 0; y < (int) conDf[0].size(); y++) {
             for (int index : barcodeIndex[y]) {
                 std::string chainType = std::string(data2ChainTypes[index]);
@@ -78,6 +78,7 @@ class TcrParser {
                 }
             }
         }
+        return *this;
     }
 
     // parseTCR() helpers
@@ -122,7 +123,5 @@ class TcrParser {
 Rcpp::DataFrame rcppConstructConDfAndParseTCR(
     Rcpp::DataFrame& data2, std::vector<std::string> uniqueData2Barcodes
 ) {
-    TcrParser parser = TcrParser(data2, uniqueData2Barcodes);
-    parser.parseTCR();
-    return parser.getConDf();
+    return TcrParser(data2, uniqueData2Barcodes).parseTCR().getConDf();
 }

From 41592f468407e449389b1da934d456fee273e2c8 Mon Sep 17 00:00:00 2001
From: Qile0317 <qile0317@gmail.com>
Date: Sat, 23 Nov 2024 02:56:30 -0800
Subject: [PATCH 02/11] fix amperstand position

---
 src/constructConDfAndparseTCR.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/constructConDfAndparseTCR.cpp b/src/constructConDfAndparseTCR.cpp
index f6d77a97..44c3fcd9 100644
--- a/src/constructConDfAndparseTCR.cpp
+++ b/src/constructConDfAndparseTCR.cpp
@@ -65,7 +65,7 @@ class TcrParser {
     }
 
     // Rcpp implementation of .parseTCR()
-    &TcrParser parseTCR() {
+    TcrParser& parseTCR() {
         for (int y = 0; y < (int) conDf[0].size(); y++) {
             for (int index : barcodeIndex[y]) {
                 std::string chainType = std::string(data2ChainTypes[index]);

From 4cec5ce306840cd5d6cd5cdcf3a753f9fd31e572 Mon Sep 17 00:00:00 2001
From: Qile0317 <qile0317@gmail.com>
Date: Sat, 23 Nov 2024 13:00:31 -0800
Subject: [PATCH 03/11] add arrangement back

---
 R/utils.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/R/utils.R b/R/utils.R
index 28123927..bf53a1d5 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -371,7 +371,8 @@
 #' @keywords internal
 .constructConDfAndParseTCR <- function(data2) {
   rcppConstructConDfAndParseTCR(
-    data2, uniqueData2Barcodes = unique(data2$barcode)
+    dplyr::arrange(data2, chain, cdr3_nt),
+    uniqueData2Barcodes = unique(data2$barcode)
   )
 }
 

From b05e718a43080a4288132e8d70390f34e8f6e541 Mon Sep 17 00:00:00 2001
From: Qile0317 <qile0317@gmail.com>
Date: Sat, 23 Nov 2024 14:26:09 -0800
Subject: [PATCH 04/11] rm potentially redundant cast

---
 src/constructConDfAndparseTCR.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/constructConDfAndparseTCR.cpp b/src/constructConDfAndparseTCR.cpp
index 44c3fcd9..623a5780 100644
--- a/src/constructConDfAndparseTCR.cpp
+++ b/src/constructConDfAndparseTCR.cpp
@@ -60,7 +60,7 @@ class TcrParser {
 
         // construct barcodeIndex
         barcodeIndex = constructBarcodeIndex(
-            uniqueData2Barcodes, Rcpp::as<std::vector<std::string>>(data2[data2.findName("barcode")])
+            uniqueData2Barcodes, data2[data2.findName("barcode")]
         );
     }
 

From d4ea3f1efb610a103a1dcffb3f495745957c069b Mon Sep 17 00:00:00 2001
From: Qile0317 <qile0317@gmail.com>
Date: Sat, 23 Nov 2024 19:37:24 -0800
Subject: [PATCH 05/11] force samples argument in combineBCR

---
 R/combineContigs.R | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/R/combineContigs.R b/R/combineContigs.R
index b2fdec90..36aec49b 100644
--- a/R/combineContigs.R
+++ b/R/combineContigs.R
@@ -197,27 +197,25 @@ combineTCR <- function(input.data,
 #' @export
 #' @concept Loading_and_Processing_Contigs
 #' @return List of clones for individual cell barcodes
-combineBCR <- function(input.data, 
-                       samples = NULL, 
-                       ID = NULL, 
+combineBCR <- function(input.data,
+                       samples,
+                       ID = NULL,
                        call.related.clones = TRUE,
                        threshold = 0.85,
                        removeNA = FALSE, 
                        removeMulti = FALSE,
                        filterMulti = TRUE,
                        filterNonproductive = TRUE) {
-    if(is.null(samples)) {
-      stop("combineBCR() requires the samples parameter for the calculation of edit distance.")
-    }
 
-    # rudimentary input checking
-    assert_that(is.character(samples) || is.null(samples))
-    assert_that(is.character(ID) || is.null(ID))
-    assert_that(is.flag(call.related.clones))
-    assert_that(is.numeric(threshold))
-    assert_that(is.flag(removeNA))
-    assert_that(is.flag(removeMulti))
-    assert_that(is.flag(filterMulti))
+    assert_that(
+        isListOfNonEmptyDataFrames(input.data),
+        is.character(samples),
+        is.flag(call.related.clones),
+        is.numeric(threshold),
+        is.flag(removeNA),
+        is.flag(removeMulti),
+        is.flag(filterMulti)
+    )
 
     input.data <- .checkList(input.data)
     input.data <- .checkContigs(input.data)
@@ -256,9 +254,9 @@ combineBCR <- function(input.data,
         Con.df$barcode <- unique_df
         Con.df <- .parseBCR(Con.df, unique_df, data2)
         Con.df <- .assignCT(cellType = "B", Con.df)
-        data3<-Con.df %>% mutate(length1 = nchar(cdr3_nt1)) %>%
+        data3 <- Con.df %>% mutate(length1 = nchar(cdr3_nt1)) %>%
             mutate(length2 = nchar(cdr3_nt2))
-        final[[i]] <- data3 
+        final[[i]] <- data3
     }
     dictionary <- bind_rows(final)
     if(call.related.clones) {

From 06e3e2c9e08cbe9e5e90fb2d0c3650180b0b37ad Mon Sep 17 00:00:00 2001
From: Qile0317 <qile0317@gmail.com>
Date: Sat, 23 Nov 2024 19:43:11 -0800
Subject: [PATCH 06/11] make combineBCR more consice

---
 R/combineContigs.R | 32 ++++++++++++--------------------
 1 file changed, 12 insertions(+), 20 deletions(-)

diff --git a/R/combineContigs.R b/R/combineContigs.R
index 36aec49b..f6385f36 100644
--- a/R/combineContigs.R
+++ b/R/combineContigs.R
@@ -217,34 +217,26 @@ combineBCR <- function(input.data,
         is.flag(filterMulti)
     )
 
-    input.data <- .checkList(input.data)
-    input.data <- .checkContigs(input.data)
-    out <- NULL
+    input.data <- .checkContigs(.checkList(input.data))
     final <- list()
-    chain1 <- "heavy"
-    chain2 <- "light"
     for (i in seq_along(input.data)) {
         input.data[[i]] <- subset(input.data[[i]], chain %in% c("IGH", "IGK", "IGL"))
         input.data[[i]]$ID <- ID[i]
-        if(c("productive") %in% colnames(input.data[[i]]) & filterNonproductive) {
-          input.data[[i]] <- subset(input.data[[i]], productive %in% c(TRUE, "TRUE", "True", "true"))
+        if ("productive" %in% colnames(input.data[[i]]) & filterNonproductive) {
+            input.data[[i]] <- subset(input.data[[i]], tolower(productive) == "true")
         }
         if (filterMulti) {
-                    # Keep IGH / IGK / IGL info in save_chain
-                    input.data[[i]]$save_chain <- input.data[[i]]$chain
-                    # Collapse IGK and IGL chains
-                    input.data[[i]]$chain <- ifelse(input.data[[i]]$chain=="IGH","IGH","IGLC")
-                    input.data[[i]] <- .filteringMulti(input.data[[i]])
-                    # Get back IGK / IGL distinction
-                    input.data[[i]]$chain <- input.data[[i]]$save_chain
-                    input.data[[i]]$save_chain <- NULL
+            # Keep IGH / IGK / IGL info in save_chain
+            input.data[[i]]$save_chain <- input.data[[i]]$chain
+            # Collapse IGK and IGL chains
+            input.data[[i]]$chain <- ifelse(input.data[[i]]$chain=="IGH","IGH","IGLC")
+            input.data[[i]] <- .filteringMulti(input.data[[i]])
+            # Get back IGK / IGL distinction
+            input.data[[i]]$chain <- input.data[[i]]$save_chain
+            input.data[[i]]$save_chain <- NULL
         }
     }
-    if (!is.null(samples)) {
-        out <- .modifyBarcodes(input.data, samples, ID)
-    } else {
-        out <- input.data
-    }
+    out <- .modifyBarcodes(input.data, samples, ID)
     for (i in seq_along(out)) { 
         data2 <- data.frame(out[[i]])
         data2 <- .makeGenes(cellType = "B", data2)

From cdf96f1cfb90a03c299a0ba178a1c52262fbd75c Mon Sep 17 00:00:00 2001
From: Qile0317 <qile0317@gmail.com>
Date: Sat, 23 Nov 2024 20:01:23 -0800
Subject: [PATCH 07/11] favour lapply over for loop in combineBCR

---
 R/combineContigs.R | 63 +++++++++++++++++++++-------------------------
 1 file changed, 28 insertions(+), 35 deletions(-)

diff --git a/R/combineContigs.R b/R/combineContigs.R
index f6385f36..421d6032 100644
--- a/R/combineContigs.R
+++ b/R/combineContigs.R
@@ -218,27 +218,28 @@ combineBCR <- function(input.data,
     )
 
     input.data <- .checkContigs(.checkList(input.data))
-    final <- list()
-    for (i in seq_along(input.data)) {
-        input.data[[i]] <- subset(input.data[[i]], chain %in% c("IGH", "IGK", "IGL"))
-        input.data[[i]]$ID <- ID[i]
-        if ("productive" %in% colnames(input.data[[i]]) & filterNonproductive) {
-            input.data[[i]] <- subset(input.data[[i]], tolower(productive) == "true")
+
+    input.data <- lapply(input.data, function(x) {
+        x <- subset(x, chain %in% c("IGH", "IGK", "IGL"))
+        if (!is.null(ID)) x$ID <- ID[i]
+        if (filterNonproductive && "productive" %in% colnames(x)) {
+            x <- subset(x, tolower(productive) == "true")
         }
         if (filterMulti) {
             # Keep IGH / IGK / IGL info in save_chain
-            input.data[[i]]$save_chain <- input.data[[i]]$chain
+            x$save_chain <- x$chain
             # Collapse IGK and IGL chains
-            input.data[[i]]$chain <- ifelse(input.data[[i]]$chain=="IGH","IGH","IGLC")
-            input.data[[i]] <- .filteringMulti(input.data[[i]])
+            x$chain <- ifelse(x$chain=="IGH","IGH","IGLC")
+            x <- .filteringMulti(x)
             # Get back IGK / IGL distinction
-            input.data[[i]]$chain <- input.data[[i]]$save_chain
-            input.data[[i]]$save_chain <- NULL
+            x$chain <- x$save_chain
+            x$save_chain <- NULL
         }
-    }
-    out <- .modifyBarcodes(input.data, samples, ID)
-    for (i in seq_along(out)) { 
-        data2 <- data.frame(out[[i]])
+        x
+    })
+
+    final <- .modifyBarcodes(input.data, samples, ID) %>% lapply(function(x) {
+        data2 <- data.frame(x)
         data2 <- .makeGenes(cellType = "B", data2)
         unique_df <- unique(data2$barcode)
         Con.df <- data.frame(matrix(NA, length(unique_df), 9))
@@ -246,11 +247,12 @@ combineBCR <- function(input.data,
         Con.df$barcode <- unique_df
         Con.df <- .parseBCR(Con.df, unique_df, data2)
         Con.df <- .assignCT(cellType = "B", Con.df)
-        data3 <- Con.df %>% mutate(length1 = nchar(cdr3_nt1)) %>%
+        Con.df %>% mutate(length1 = nchar(cdr3_nt1)) %>%
             mutate(length2 = nchar(cdr3_nt2))
-        final[[i]] <- data3
-    }
+    })
+
     dictionary <- bind_rows(final)
+
     if(call.related.clones) {
       IGH <- .lvCompare(dictionary, "IGH", "cdr3_nt1", threshold)
       IGLC <- .lvCompare(dictionary, "IGLC", "cdr3_nt2", threshold)
@@ -278,30 +280,21 @@ combineBCR <- function(input.data,
         }
     }
     names <- NULL
-    for (i in seq_along(samples)) { 
-        if (!is.null(samples) & !is.null(ID)) {
+    for (i in seq_along(samples)) {
+        if (!is.null(samples) && !is.null(ID)) {
             c <- paste(samples[i], "_", ID[i], sep="")
-        } else if (!is.null(samples) & is.null(ID)) {
-            c <- paste(samples[i], sep="")
+        } else if (!is.null(samples) && is.null(ID)) {
+            c <- paste(samples[i], sep = "")
         }
         names <- c(names, c)
     }
     names(final) <- names
     for (i in seq_along(final)) {
         final[[i]] <- final[[i]][!duplicated(final[[i]]$barcode),]
-        final[[i]]<-final[[i]][rowSums(is.na(final[[i]])) < 10, ]
+        final[[i]] <- final[[i]][rowSums(is.na(final[[i]])) < 10, ]
         final[[i]][final[[i]] == "NA"] <- NA
     }
-    if (removeNA) { 
-      final <- .removingNA(final) 
-    }
-    if (removeMulti) { 
-      final <- .removingMulti(final) 
-    }
-    #Adding list element names to output if samples NULL
-    if(is.null(samples)) {
-      names(final) <- paste0("S", seq_len(length(final)))
-    }
-    return(final) 
+    if (removeNA) final <- .removingNA(final)
+    if (removeMulti) final <- .removingMulti(final)
+    return(final)
 }
-

From 77868f01b810269d72ac4679590ea7ef70d76088 Mon Sep 17 00:00:00 2001
From: Qile0317 <qile0317@gmail.com>
Date: Sat, 23 Nov 2024 20:17:04 -0800
Subject: [PATCH 08/11] more lapply and fix combineBCR input check

---
 R/combineContigs.R | 79 +++++++++++++++++++++++-----------------------
 R/typecheck.R      | 10 ++++--
 2 files changed, 48 insertions(+), 41 deletions(-)

diff --git a/R/combineContigs.R b/R/combineContigs.R
index 421d6032..d5230bd8 100644
--- a/R/combineContigs.R
+++ b/R/combineContigs.R
@@ -208,7 +208,7 @@ combineBCR <- function(input.data,
                        filterNonproductive = TRUE) {
 
     assert_that(
-        isListOfNonEmptyDataFrames(input.data),
+        isListOfNonEmptyDataFrames(input.data) || isNonEmptyDataFrame(input.data),
         is.character(samples),
         is.flag(call.related.clones),
         is.numeric(threshold),
@@ -217,47 +217,48 @@ combineBCR <- function(input.data,
         is.flag(filterMulti)
     )
 
-    input.data <- .checkContigs(.checkList(input.data))
-
-    input.data <- lapply(input.data, function(x) {
-        x <- subset(x, chain %in% c("IGH", "IGK", "IGL"))
-        if (!is.null(ID)) x$ID <- ID[i]
-        if (filterNonproductive && "productive" %in% colnames(x)) {
-            x <- subset(x, tolower(productive) == "true")
-        }
-        if (filterMulti) {
-            # Keep IGH / IGK / IGL info in save_chain
-            x$save_chain <- x$chain
-            # Collapse IGK and IGL chains
-            x$chain <- ifelse(x$chain=="IGH","IGH","IGLC")
-            x <- .filteringMulti(x)
-            # Get back IGK / IGL distinction
-            x$chain <- x$save_chain
-            x$save_chain <- NULL
-        }
-        x
-    })
-
-    final <- .modifyBarcodes(input.data, samples, ID) %>% lapply(function(x) {
-        data2 <- data.frame(x)
-        data2 <- .makeGenes(cellType = "B", data2)
-        unique_df <- unique(data2$barcode)
-        Con.df <- data.frame(matrix(NA, length(unique_df), 9))
-        colnames(Con.df) <- c("barcode", heavy_lines, light_lines)
-        Con.df$barcode <- unique_df
-        Con.df <- .parseBCR(Con.df, unique_df, data2)
-        Con.df <- .assignCT(cellType = "B", Con.df)
-        Con.df %>% mutate(length1 = nchar(cdr3_nt1)) %>%
-            mutate(length2 = nchar(cdr3_nt2))
-    })
+    final <- input.data %>%
+        .checkList() %>%
+        .checkContigs() %>%
+        lapply(function(x) {
+            x <- subset(x, chain %in% c("IGH", "IGK", "IGL"))
+            if (!is.null(ID)) x$ID <- ID[i]
+            if (filterNonproductive && "productive" %in% colnames(x)) {
+                x <- subset(x, tolower(productive) == "true")
+            }
+            if (filterMulti) {
+                # Keep IGH / IGK / IGL info in save_chain
+                x$save_chain <- x$chain
+                # Collapse IGK and IGL chains
+                x$chain <- ifelse(x$chain=="IGH","IGH","IGLC")
+                x <- .filteringMulti(x)
+                # Get back IGK / IGL distinction
+                x$chain <- x$save_chain
+                x$save_chain <- NULL
+            }
+            x
+        }) %>%
+        .modifyBarcodes(samples, ID) %>%
+        lapply(function(x) {
+            data2 <- data.frame(x)
+            data2 <- .makeGenes(cellType = "B", data2)
+            unique_df <- unique(data2$barcode)
+            Con.df <- data.frame(matrix(NA, length(unique_df), 9))
+            colnames(Con.df) <- c("barcode", heavy_lines, light_lines)
+            Con.df$barcode <- unique_df
+            Con.df <- .parseBCR(Con.df, unique_df, data2)
+            Con.df <- .assignCT(cellType = "B", Con.df)
+            Con.df %>% mutate(length1 = nchar(cdr3_nt1)) %>%
+                mutate(length2 = nchar(cdr3_nt2))
+        })
 
     dictionary <- bind_rows(final)
 
-    if(call.related.clones) {
-      IGH <- .lvCompare(dictionary, "IGH", "cdr3_nt1", threshold)
-      IGLC <- .lvCompare(dictionary, "IGLC", "cdr3_nt2", threshold)
-    } 
-    for(i in seq_along(final)) {
+    if (call.related.clones) {
+        IGH <- .lvCompare(dictionary, "IGH", "cdr3_nt1", threshold)
+        IGLC <- .lvCompare(dictionary, "IGLC", "cdr3_nt2", threshold)
+    }
+    for (i in seq_along(final)) {
       if(call.related.clones) {
         final[[i]]<-merge(final[[i]],IGH,by.x="cdr3_nt1",by.y="clone",all.x=TRUE)
         final[[i]]<-merge(final[[i]],IGLC,by.x="cdr3_nt2",by.y="clone",all.x=TRUE)
diff --git a/R/typecheck.R b/R/typecheck.R
index 21a55d4a..80accf91 100644
--- a/R/typecheck.R
+++ b/R/typecheck.R
@@ -1,8 +1,14 @@
 # base R type check functions
 
+isNonEmptyDataFrame <- function(obj) {
+    is.data.frame(obj) && sum(dim(obj)) > 0
+}
+assertthat::on_failure(isNonEmptyDataFrame) <- function(call, env) {
+    paste0(deparse(call$obj), " is not a non-empty `data.frame`")
+}
+
 isListOfNonEmptyDataFrames <- function(obj) {
-    is.list(obj) &&
-        all(sapply(obj, function(x) is.data.frame(x) && sum(dim(x)) > 0))
+    is.list(obj) && all(sapply(obj, isNonEmptyDataFrame))
 }
 assertthat::on_failure(isListOfNonEmptyDataFrames) <- function(call, env) {
     paste0(deparse(call$obj), " is not a list of non-empty `data.frame`s")

From b25a6e1f84e9def48860bccafe4f44b54d319474 Mon Sep 17 00:00:00 2001
From: Qile0317 <qile0317@gmail.com>
Date: Sat, 23 Nov 2024 20:30:52 -0800
Subject: [PATCH 09/11] run document

---
 man/combineBCR.Rd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/man/combineBCR.Rd b/man/combineBCR.Rd
index 394af4cf..7363ce61 100644
--- a/man/combineBCR.Rd
+++ b/man/combineBCR.Rd
@@ -6,7 +6,7 @@
 \usage{
 combineBCR(
   input.data,
-  samples = NULL,
+  samples,
   ID = NULL,
   call.related.clones = TRUE,
   threshold = 0.85,

From df1f3f7dcaa3a3d93ec0f2859308c5cfc46fec66 Mon Sep 17 00:00:00 2001
From: Qile0317 <qile0317@gmail.com>
Date: Sat, 23 Nov 2024 21:21:05 -0800
Subject: [PATCH 10/11] fix faq url

---
 R/combineExpression.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/combineExpression.R b/R/combineExpression.R
index b43b227e..f1a32e47 100644
--- a/R/combineExpression.R
+++ b/R/combineExpression.R
@@ -223,5 +223,5 @@ getHighBarcodeMismatchWarning <- function() paste(
     "< 1% of barcodes match: Ensure the barcodes in the single-cell object",
     "match the barcodes in the combined immune receptor output from",
     "scRepertoire. If getting this error, please check",
-    "https://www.borch.dev/uploads/screpertoire/articles/faq."
+    "https://www.borch.dev/uploads/screpertoire/articles/faq"
 )

From 9811879da68c3ebfd2d79380a58d49d5e701cbd8 Mon Sep 17 00:00:00 2001
From: Qile0317 <qile0317@gmail.com>
Date: Mon, 25 Nov 2024 23:45:01 -0800
Subject: [PATCH 11/11] re-allow samples = NULL in combineBCR

---
 R/combineContigs.R | 159 ++++++++++++++++++++++++---------------------
 man/combineBCR.Rd  |   6 +-
 man/combineTCR.Rd  |   4 +-
 3 files changed, 91 insertions(+), 78 deletions(-)

diff --git a/R/combineContigs.R b/R/combineContigs.R
index d5230bd8..0735a10f 100644
--- a/R/combineContigs.R
+++ b/R/combineContigs.R
@@ -1,5 +1,9 @@
 # Adding Global Variables
 # data('v_gene','j_gene', 'c_gene', 'd_gene')
+# note that currently the Rcpp internals have hardcoded column names so
+# if some breaking change here is made, the Rcpp code will need to be updated,
+# or functions need to be adjusted to intake expected column names that
+# uses these variables
 utils::globalVariables(c("v_gene", "j_gene", "c_gene", "d_gene", "chain"))
 
 heavy_lines <- c("IGH", "cdr3_aa1", "cdr3_nt1", "vgene1")
@@ -21,42 +25,42 @@ utils::globalVariables(c(
 #' @title Combining the list of T cell receptor contigs into clones
 #'
 #' @description This function consolidates a list of TCR sequencing results to
-#' the level of  the individual cell barcodes. Using the **samples** and 
-#' **ID** parameters, the function will add the strings as prefixes to 
-#' prevent issues with repeated  barcodes. The resulting new barcodes will 
-#' need to match the Seurat or SCE object in order to use, 
-#' [combineExpression()]. Several levels of filtering exist - 
-#' *removeNA*, *removeMulti*, or *filterMulti* are parameters 
-#' that control how the function deals with barcodes with multiple chains 
+#' the level of  the individual cell barcodes. Using the **samples** and
+#' **ID** parameters, the function will add the strings as prefixes to
+#' prevent issues with repeated  barcodes. The resulting new barcodes will
+#' need to match the Seurat or SCE object in order to use,
+#' [combineExpression()]. Several levels of filtering exist -
+#' *removeNA*, *removeMulti*, or *filterMulti* are parameters
+#' that control how the function deals with barcodes with multiple chains
 #' recovered.
-#' 
+#'
 #' @examples
-#' combined <- combineTCR(contig_list, 
-#'                         samples = c("P17B", "P17L", "P18B", "P18L", 
+#' combined <- combineTCR(contig_list,
+#'                         samples = c("P17B", "P17L", "P18B", "P18L",
 #'                                     "P19B","P19L", "P20B", "P20L"))
-#' 
-#' @param input.data List of filtered contig annotations or 
+#'
+#' @param input.data List of filtered contig annotations or
 #' outputs from [loadContigs()].
 #' @param samples The labels of samples (recommended).
 #' @param ID The additional sample labeling (optional).
 #' @param removeNA This will remove any chain without values.
 #' @param removeMulti This will remove barcodes with greater than 2 chains.
-#' @param filterMulti This option will allow for the selection of the 2 
-#' corresponding chains with the highest expression for a single barcode. 
-#' @param filterNonproductive This option will allow for the removal of 
+#' @param filterMulti This option will allow for the selection of the 2
+#' corresponding chains with the highest expression for a single barcode.
+#' @param filterNonproductive This option will allow for the removal of
 #' nonproductive chains if the variable exists in the contig data. Default
 #' is set to TRUE to remove nonproductive contigs.
-#' 
+#'
 #' @importFrom assertthat assert_that is.flag
 #' @export
 #' @concept Loading_and_Processing_Contigs
 #' @return List of clones for individual cell barcodes
-#' 
-combineTCR <- function(input.data, 
-                       samples = NULL, 
-                       ID = NULL, 
-                       removeNA = FALSE, 
-                       removeMulti = FALSE, 
+#'
+combineTCR <- function(input.data,
+                       samples = NULL,
+                       ID = NULL,
+                       removeNA = FALSE,
+                       removeMulti = FALSE,
                        filterMulti = FALSE,
                        filterNonproductive = TRUE) {
 
@@ -66,7 +70,7 @@ combineTCR <- function(input.data,
     assert_that(is.flag(removeNA))
     assert_that(is.flag(removeMulti))
     assert_that(is.flag(filterMulti))
-    
+
     input.data <- .checkList(input.data)
     input.data <- .checkContigs(input.data)
     out <- NULL
@@ -80,8 +84,8 @@ combineTCR <- function(input.data,
         }
         input.data[[i]]$sample <- samples[i]
         input.data[[i]]$ID <- ID[i]
-        if (filterMulti) { 
-          input.data[[i]] <- .filteringMulti(input.data[[i]]) 
+        if (filterMulti) {
+          input.data[[i]] <- .filteringMulti(input.data[[i]])
         }
     }
     #Prevents error caused by list containing elements with 0 rows
@@ -104,10 +108,10 @@ combineTCR <- function(input.data,
         data2 <- .makeGenes(cellType = "T", out[[i]])
         Con.df <- .constructConDfAndParseTCR(data2)
         Con.df <- .assignCT(cellType = "T", Con.df)
-        Con.df[Con.df == "NA_NA" | Con.df == "NA;NA_NA;NA"] <- NA 
-        data3 <- merge(data2[,-which(names(data2) %in% c("TCR1","TCR2"))], 
+        Con.df[Con.df == "NA_NA" | Con.df == "NA;NA_NA;NA"] <- NA
+        data3 <- merge(data2[,-which(names(data2) %in% c("TCR1","TCR2"))],
             Con.df, by = "barcode")
-        
+      
         columns_to_include <- c("barcode")
         # Conditionally add columns based on user input
         if (!is.null(samples)) {
@@ -116,17 +120,17 @@ combineTCR <- function(input.data,
         if (!is.null(ID)) {
           columns_to_include <- c(columns_to_include, "ID")
         }
-        
+      
         # Add TCR and CT lines which are presumably always needed
         columns_to_include <- c(columns_to_include, tcr1_lines, tcr2_lines, CT_lines)
-        
+      
         # Subset the data frame based on the dynamically built list of columns
         data3 <- data3[, columns_to_include]
-        
-        final[[i]] <- data3 
+      
+        final[[i]] <- data3
     }
     name_vector <- character(length(samples))
-    for (i in seq_along(samples)) { 
+    for (i in seq_along(samples)) {
         if (!is.null(samples) && !is.null(ID)) {
             curr <- paste(samples[i], "_", ID[i], sep="")
         } else if (!is.null(samples) & is.null(ID)) {
@@ -155,42 +159,42 @@ combineTCR <- function(input.data,
 
 #' Combining the list of B cell receptor contigs into clones
 #'
-#' This function consolidates a list of BCR sequencing results to the level 
-#' of the individual cell barcodes. Using the samples and ID parameters, 
-#' the function will add the strings as prefixes to prevent issues with 
-#' repeated barcodes. The resulting new barcodes will need to match the 
-#' Seurat or SCE object in order to use, [combineExpression()]. 
-#' Unlike [combineTCR()], combineBCR produces a column 
-#' **CTstrict** of an index of nucleotide sequence and the 
-#' corresponding V gene. This index automatically calculates the 
+#' This function consolidates a list of BCR sequencing results to the level
+#' of the individual cell barcodes. Using the samples and ID parameters,
+#' the function will add the strings as prefixes to prevent issues with
+#' repeated barcodes. The resulting new barcodes will need to match the
+#' Seurat or SCE object in order to use, [combineExpression()].
+#' Unlike [combineTCR()], combineBCR produces a column
+#' **CTstrict** of an index of nucleotide sequence and the
+#' corresponding V gene. This index automatically calculates the
 #' Levenshtein distance between sequences with the same V gene and will
-#' index sequences using a normalized Levenshtein distance with the same 
-#' ID. After which, clone clusters are called using the 
-#' [igraph::components()] function. Clones that are clustered 
-#' across multiple sequences will then be labeled with "Cluster" in the 
+#' index sequences using a normalized Levenshtein distance with the same
+#' ID. After which, clone clusters are called using the
+#' [igraph::components()] function. Clones that are clustered
+#' across multiple sequences will then be labeled with "Cluster" in the
 #' CTstrict header.
 #'
 #' @examples
 #' #Data derived from the 10x Genomics intratumoral NSCLC B cells
 #' BCR <- read.csv("https://www.borch.dev/uploads/contigs/b_contigs.csv")
-#' combined <- combineBCR(BCR, 
-#'                        samples = "Patient1", 
+#' combined <- combineBCR(BCR,
+#'                        samples = "Patient1",
 #'                        threshold = 0.85)
-#' 
-#' @param input.data List of filtered contig annotations or outputs from 
+#'
+#' @param input.data List of filtered contig annotations or outputs from
 #' [loadContigs()].
 #' @param samples The labels of samples (required).
 #' @param ID The additional sample labeling (optional).
-#' @param call.related.clones Use the nucleotide sequence and V gene 
-#' to call related clones. Default is set to TRUE. FALSE will return 
+#' @param call.related.clones Use the nucleotide sequence and V gene
+#' to call related clones. Default is set to TRUE. FALSE will return
 #' a CTstrict or strict clone as V gene + amino acid sequence.
-#' @param threshold The normalized edit distance to consider. The higher 
+#' @param threshold The normalized edit distance to consider. The higher
 #' the number the more similarity of sequence will be used for clustering.
 #' @param removeNA This will remove any chain without values.
 #' @param removeMulti This will remove barcodes with greater than 2 chains.
-#' @param filterMulti This option will allow for the selection of the 
+#' @param filterMulti This option will allow for the selection of the
 #' highest-expressing light and heavy chains, if not calling related clones.
-#' @param filterNonproductive This option will allow for the removal of 
+#' @param filterNonproductive This option will allow for the removal of
 #' nonproductive chains if the variable exists in the contig data. Default
 #' is set to TRUE to remove nonproductive contigs.
 #'
@@ -198,18 +202,23 @@ combineTCR <- function(input.data,
 #' @concept Loading_and_Processing_Contigs
 #' @return List of clones for individual cell barcodes
 combineBCR <- function(input.data,
-                       samples,
+                       samples = NULL,
                        ID = NULL,
                        call.related.clones = TRUE,
                        threshold = 0.85,
-                       removeNA = FALSE, 
+                       removeNA = FALSE,
                        removeMulti = FALSE,
                        filterMulti = TRUE,
                        filterNonproductive = TRUE) {
 
+    if (is.null(samples)) {
+        stop("combineBCR() requires the samples parameter for the calculation of edit distance.")
+    }
+
     assert_that(
-        isListOfNonEmptyDataFrames(input.data) || isNonEmptyDataFrame(input.data),
-        is.character(samples),
+        isListOfNonEmptyDataFrames(input.data) ||
+            isNonEmptyDataFrame(input.data),
+        is.character(samples) || is.null(samples),
         is.flag(call.related.clones),
         is.numeric(threshold),
         is.flag(removeNA),
@@ -220,7 +229,8 @@ combineBCR <- function(input.data,
     final <- input.data %>%
         .checkList() %>%
         .checkContigs() %>%
-        lapply(function(x) {
+        unname() %>%
+        purrr::imap(function(x, i) {
             x <- subset(x, chain %in% c("IGH", "IGK", "IGL"))
             if (!is.null(ID)) x$ID <- ID[i]
             if (filterNonproductive && "productive" %in% colnames(x)) {
@@ -230,7 +240,7 @@ combineBCR <- function(input.data,
                 # Keep IGH / IGK / IGL info in save_chain
                 x$save_chain <- x$chain
                 # Collapse IGK and IGL chains
-                x$chain <- ifelse(x$chain=="IGH","IGH","IGLC")
+                x$chain <- ifelse(x$chain == "IGH", "IGH", "IGLC")
                 x <- .filteringMulti(x)
                 # Get back IGK / IGL distinction
                 x$chain <- x$save_chain
@@ -238,7 +248,13 @@ combineBCR <- function(input.data,
             }
             x
         }) %>%
-        .modifyBarcodes(samples, ID) %>%
+        (function(x) {
+            if (!is.null(samples)) {
+                .modifyBarcodes(x, samples, ID)
+            } else { # https://github.com/ncborcherding/scRepertoire/pull/450
+                x
+            }
+        }) %>%
         lapply(function(x) {
             data2 <- data.frame(x)
             data2 <- .makeGenes(cellType = "B", data2)
@@ -270,26 +286,23 @@ combineBCR <- function(input.data,
       }
         final[[i]]$sample <- samples[i]
         final[[i]]$ID <- ID[i]
-        final[[i]][final[[i]] == "NA_NA" | final[[i]] == "NA;NA_NA;NA"] <- NA 
+        final[[i]][final[[i]] == "NA_NA" | final[[i]] == "NA;NA_NA;NA"] <- NA
         if (!is.null(sample) & !is.null(ID)) {
-          final[[i]]<- final[[i]][, c("barcode", "sample", "ID", 
+          final[[i]]<- final[[i]][, c("barcode", "sample", "ID",
               heavy_lines[c(1,2,3)], light_lines[c(1,2,3)], CT_lines)]
         }
         else if (!is.null(sample) & is.null(ID)) {
-          final[[i]]<- final[[i]][, c("barcode", "sample", 
+          final[[i]]<- final[[i]][, c("barcode", "sample",
                     heavy_lines[c(1,2,3)], light_lines[c(1,2,3)], CT_lines)]
         }
     }
-    names <- NULL
-    for (i in seq_along(samples)) {
-        if (!is.null(samples) && !is.null(ID)) {
-            c <- paste(samples[i], "_", ID[i], sep="")
-        } else if (!is.null(samples) && is.null(ID)) {
-            c <- paste(samples[i], sep = "")
-        }
-        names <- c(names, c)
+
+    names(final) <- if (!is.null(samples)) {
+        if (is.null(ID)) samples else paste0(samples, "_", ID)
+    } else {
+        paste0("S", seq_along(final))
     }
-    names(final) <- names
+
     for (i in seq_along(final)) {
         final[[i]] <- final[[i]][!duplicated(final[[i]]$barcode),]
         final[[i]] <- final[[i]][rowSums(is.na(final[[i]])) < 10, ]
diff --git a/man/combineBCR.Rd b/man/combineBCR.Rd
index 7363ce61..97964d98 100644
--- a/man/combineBCR.Rd
+++ b/man/combineBCR.Rd
@@ -6,7 +6,7 @@
 \usage{
 combineBCR(
   input.data,
-  samples,
+  samples = NULL,
   ID = NULL,
   call.related.clones = TRUE,
   threshold = 0.85,
@@ -64,8 +64,8 @@ CTstrict header.
 \examples{
 #Data derived from the 10x Genomics intratumoral NSCLC B cells
 BCR <- read.csv("https://www.borch.dev/uploads/contigs/b_contigs.csv")
-combined <- combineBCR(BCR, 
-                       samples = "Patient1", 
+combined <- combineBCR(BCR,
+                       samples = "Patient1",
                        threshold = 0.85)
 
 }
diff --git a/man/combineTCR.Rd b/man/combineTCR.Rd
index 6c030fa7..0c53b18f 100644
--- a/man/combineTCR.Rd
+++ b/man/combineTCR.Rd
@@ -48,8 +48,8 @@ that control how the function deals with barcodes with multiple chains
 recovered.
 }
 \examples{
-combined <- combineTCR(contig_list, 
-                        samples = c("P17B", "P17L", "P18B", "P18L", 
+combined <- combineTCR(contig_list,
+                        samples = c("P17B", "P17L", "P18B", "P18L",
                                     "P19B","P19L", "P20B", "P20L"))
 
 }