From 12bac7e0bacb4c341e00323690e587dd6d5f5c21 Mon Sep 17 00:00:00 2001
From: Qile0317 <qile0317@gmail.com>
Date: Thu, 7 Dec 2023 19:04:20 -0800
Subject: [PATCH 1/5] minor refactoring

---
 R/utils.R                            |  4 ++--
 src/ntKmers.cpp                      | 10 +++++-----
 tests/testthat/test-combineContigs.R |  1 +
 3 files changed, 8 insertions(+), 7 deletions(-)
diff --git a/R/utils.R b/R/utils.R
index 618f9357..71855ab6 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -278,14 +278,14 @@ is_seurat_or_se_object <- function(obj) {
 }
 
 # Assigning positions for TCR contig data
-# Used to be .parseTCR(Con.df, unique_df, data2)
+# Used to be .parseTCR(Con.df, unique_df, data2) in v1
 # but now also constructs Con.df and runs the parseTCR algorithm on it, all in Rcpp
 #' @author Gloria Kraus, Nick Bormann, Nicky de Vrij, Nick Borcherding, Qile Yang
 #' @keywords internal
 .constructConDfAndParseTCR <- function(data2) {
   rcppConstructConDfAndParseTCR(
     data2 %>% arrange(., chain, cdr3_nt),
-    unique(data2[[1]])
+    unique(data2[[1]]) # 1 is the index of the barcode column
   )
 }
 
diff --git a/src/ntKmers.cpp b/src/ntKmers.cpp
index b7791ba5..54369fe8 100644
--- a/src/ntKmers.cpp
+++ b/src/ntKmers.cpp
@@ -1,4 +1,4 @@
-// 2-bit-based nucleotide kmer counting
+// 2-bit-based nucleotide kmer counting - unoptimized
 // by Qile Yang
 
 #include <Rcpp.h>
@@ -50,12 +50,13 @@ Rcpp::CharacterVector rcppGenerateUniqueNtMotifs(int k) {
     return motifs;
 }
 
-inline void updateSkip(int& skip, const char c, const int k) {
+inline bool updateSkipAndReturnIfShouldSkip(int& skip, const char c, const int k) {
     if (!isNt(c)) {
         skip = k;
     } else if (skip > 0) {
         skip--;
     }
+    return skip == 0;
 }
 
 // actual kmer counter - doesnt handle _NA_ for k = 1
@@ -71,13 +72,12 @@ inline void kmerCount(std::vector<double>& bins, const unsigned int mask, const
 
     for (int i = 0; i < (k - 1); i++) { // this segment to initialize the kmer should be deletable if skip is adjusted?
         kmer = (kmer << 2) | toNtIndex(seq[i]);
-        updateSkip(skip, seq[i], k);
+        updateSkipAndReturnIfShouldSkip(skip, seq[i], k);
     }
 
     for (int i = k - 1; i < n; i++) {
         kmer = ((kmer << 2) & mask) | toNtIndex(seq[i]);
-        updateSkip(skip, seq[i], k);
-        if (skip == 0) {
+        if (updateSkipAndReturnIfShouldSkip(skip, seq[i], k)) {
             bins[kmer]++;
         }
     }
diff --git a/tests/testthat/test-combineContigs.R b/tests/testthat/test-combineContigs.R
index 3092b57d..d6692fab 100644
--- a/tests/testthat/test-combineContigs.R
+++ b/tests/testthat/test-combineContigs.R
@@ -36,4 +36,5 @@ test_that("combineTCR works", {
 }) 
 
 # TODO combineTCR (need more edge cases, different args, errors, etc.)
+# TODO test combineTCR with the output of loadContigs
 # TODO combineBCR

From ba0195c077c48dd02ebece9b2346f257db1636aa Mon Sep 17 00:00:00 2001
From: Qile0317 <qile0317@gmail.com>
Date: Thu, 7 Dec 2023 19:12:19 -0800
Subject: [PATCH 2/5] undo accidental combineBCR testcase deleton

---
 tests/testthat/test-combineContigs.R | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/tests/testthat/test-combineContigs.R b/tests/testthat/test-combineContigs.R
index d6692fab..af7bc490 100644
--- a/tests/testthat/test-combineContigs.R
+++ b/tests/testthat/test-combineContigs.R
@@ -35,6 +35,14 @@ test_that("combineTCR works", {
 	expect_identical(trial4, getdata("combineContigs", "combineTCR_list_removeNA"))
 }) 
 
-# TODO combineTCR (need more edge cases, different args, errors, etc.)
-# TODO test combineTCR with the output of loadContigs
-# TODO combineBCR
+# TODO combineTCR & combineBCR (need more edge cases, different args, errors, etc.)
+
+test_that("combineBCR works", {
+
+  BCR <- read.csv("https://www.borch.dev/uploads/contigs/b_contigs.csv")
+  trial1 <- combineBCR(BCR, 
+                    samples = "Patient1")
+
+  expect_identical(trial1, getdata("combineContigs", "combineBCR_list_expected"))
+
+})

From 4a488c0262c071af16e1759784c307c8a3e13795 Mon Sep 17 00:00:00 2001
From: Qile0317 <qile0317@gmail.com>
Date: Thu, 7 Dec 2023 19:38:01 -0800
Subject: [PATCH 3/5] tiny optimization to src/ntKmers.cpp

---
 src/constructConDfAndparseTCR.cpp |  2 +-
 src/ntKmers.cpp                   | 20 +++++++++++---------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/src/constructConDfAndparseTCR.cpp b/src/constructConDfAndparseTCR.cpp
index 61c2edf1..d54d4599 100644
--- a/src/constructConDfAndparseTCR.cpp
+++ b/src/constructConDfAndparseTCR.cpp
@@ -43,7 +43,7 @@ class TcrParser {
     TcrParser(
         Rcpp::DataFrame& data2, std::vector<std::string>& uniqueData2Barcodes
     ) {
-        // construct conDf
+        // construct conDf, initializaing the matrix to "NA" *strings*
         conDf = scRepHelper::initStringMatrix(
             7, uniqueData2Barcodes.size(), "NA"
         );
diff --git a/src/ntKmers.cpp b/src/ntKmers.cpp
index 54369fe8..3fcbb31c 100644
--- a/src/ntKmers.cpp
+++ b/src/ntKmers.cpp
@@ -1,4 +1,5 @@
 // 2-bit-based nucleotide kmer counting - unoptimized
+// could use a kmercounter class with an uint_fast64_t[128] for toNtIndex instead of the switch statement
 // by Qile Yang
 
 #include <Rcpp.h>
@@ -15,13 +16,10 @@ inline unsigned short int toNtIndex(const char nt) {
     }
 }
 
+constexpr char Nts[4] = {'A', 'C', 'G', 'T'};
+
 inline char lastNt(unsigned int index) {
-    switch(index & 3) {
-        case 0: return 'A';
-        case 1: return 'C';
-        case 2: return 'G';
-        default: return 'T';
-    }
+    return Nts[index & 3];
 }
 
 inline std::string toNtKmer(unsigned long int index, int k) {
@@ -50,12 +48,16 @@ Rcpp::CharacterVector rcppGenerateUniqueNtMotifs(int k) {
     return motifs;
 }
 
-inline bool updateSkipAndReturnIfShouldSkip(int& skip, const char c, const int k) {
+inline void updateSkip(int& skip, const char c, const int k) {
     if (!isNt(c)) {
         skip = k;
     } else if (skip > 0) {
         skip--;
     }
+}
+
+inline bool updateSkipAndReturnIfShouldntSkip(int& skip, const char c, const int k) {
+    updateSkip(skip, c, k);
     return skip == 0;
 }
 
@@ -72,12 +74,12 @@ inline void kmerCount(std::vector<double>& bins, const unsigned int mask, const
 
     for (int i = 0; i < (k - 1); i++) { // this segment to initialize the kmer should be deletable if skip is adjusted?
         kmer = (kmer << 2) | toNtIndex(seq[i]);
-        updateSkipAndReturnIfShouldSkip(skip, seq[i], k);
+        updateSkip(skip, seq[i], k);
     }
 
     for (int i = k - 1; i < n; i++) {
         kmer = ((kmer << 2) & mask) | toNtIndex(seq[i]);
-        if (updateSkipAndReturnIfShouldSkip(skip, seq[i], k)) {
+        if (updateSkipAndReturnIfShouldntSkip(skip, seq[i], k)) {
             bins[kmer]++;
         }
     }

From f80163bb7e75878b4430169e1ad6a3c705765beb Mon Sep 17 00:00:00 2001
From: Qile0317 <qile0317@gmail.com>
Date: Thu, 7 Dec 2023 21:19:09 -0800
Subject: [PATCH 4/5] improved .theCall error handling

---
 R/utils.R | 70 ++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 56 insertions(+), 14 deletions(-)

diff --git a/R/utils.R b/R/utils.R
index 69d1c8e4..944fb801 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -250,23 +250,10 @@ is_seurat_or_se_object <- function(obj) {
     return(data1)
 }
 
-
 # This is to help sort the type of clonotype data to use
 #' @keywords internal
 .theCall <- function(df, x, check.df = TRUE) {
-    x <- switch(x,
-                "gene" = "CTgene",
-                "genes" = "CTgene", 
-                "CTgene" = "CTgene",
-                "nt" = "CTnt",
-                "nucleotides" = "CTnt",
-                "CTnt" = "CTnt",
-                "aa" = "CTaa", 
-                "amino" = "CTaa", 
-                "CTaa" = "CTaa", 
-                "strict" = "CTstrict", 
-                "gene+nt" = "CTstrict",
-                "CTstrict" = "CTstrict")
+    x <- .convertClonecall(x)
     if(check.df) {
       if(inherits(df, "list") & !any(colnames(df[[1]]) %in% x)) {
         stop("Check the clonal variabe (cloneCall) being used in the function, it does not appear in the data provided.")
@@ -277,6 +264,61 @@ is_seurat_or_se_object <- function(obj) {
     return(x)
 }
 
+# helper for .theCall
+.convertClonecall <- function(x) {
+
+  clonecall_dictionary <- hash::hash(
+    "gene" = "CTgene",
+		"genes" = "CTgene",
+		"ctgene" = "CTgene",
+		"ctstrict" = "CTstrict",
+		"nt" = "CTnt",
+		"nucleotide" = "CTnt",
+		"nucleotides" = "CTnt",
+		"ctnt" = "CTnt",
+		"aa" = "CTaa",
+		"amino" = "CTaa",
+		"ctaa" = "CTaa",
+		"gene+nt" = "CTstrict",
+		"strict" = "CTstrict",
+		"ctstrict" = "CTstrict"
+	)
+
+	x <- tolower(x)
+
+	if (!is.null(clonecall_dictionary[[x]])) {
+		return(clonecall_dictionary[[x]])
+	}
+
+	stop(paste(
+		"invalid input cloneCall, did you mean: '",
+		closest_word(
+			x,
+			c(names(clonecall_dictionary),
+			  unname(hash::values(clonecall_dictionary)))
+		),
+		"'?",
+		sep = ""
+	))
+}
+
+# helper for .convertClonecall
+closest_word <- function(s, strset) {
+    strset_lowercase <- tolower(strset)
+    s <- tolower(s)
+
+    closest_w <- strset_lowercase[1]
+    closest_dist <- utils::adist(s, closest_w)
+    for(i in 2:length(strset_lowercase)) {
+        curr_dist <- utils::adist(s, strset_lowercase[i])
+        if (curr_dist < closest_dist) {
+            closest_w <- strset[i]
+            closest_dist <- curr_dist
+        }
+    }
+    closest_w
+}
+
 # Assigning positions for TCR contig data
 # Used to be .parseTCR(Con.df, unique_df, data2) in v1
 # but now also constructs Con.df and runs the parseTCR algorithm on it, all in Rcpp

From 63adabff1a8ccf810097bc8c6577964dcb2cdb6e Mon Sep 17 00:00:00 2001
From: Qile0317 <qile0317@gmail.com>
Date: Thu, 7 Dec 2023 23:21:15 -0800
Subject: [PATCH 5/5] imported hash and minor comments

---
 DESCRIPTION           | 3 ++-
 R/combineContigs.R    | 4 ++--
 R/combineExpression.R | 4 ++--
 R/utils.R             | 2 +-
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index bca66487..6586b2d7 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -40,7 +40,8 @@ Imports:
     tidygraph,
     truncdist,
     utils,
-    VGAM
+    VGAM,
+    hash
 Suggests: 
     BiocManager,
     BiocStyle,
diff --git a/R/combineContigs.R b/R/combineContigs.R
index 9da8c38d..41c1802f 100644
--- a/R/combineContigs.R
+++ b/R/combineContigs.R
@@ -93,7 +93,7 @@ combineTCR <- function(input.data,
         Con.df[Con.df == "NA_NA" | Con.df == "NA;NA_NA;NA"] <- NA 
         data3 <- merge(data2[,-which(names(data2) %in% c("TCR1","TCR2"))], 
             Con.df, by = "barcode")
-        if (!is.null(samples) & !is.null(ID)) {
+        if (!is.null(samples) && !is.null(ID)) {
             data3 <- data3[, c("barcode", "sample", "ID", tcr1_lines, tcr2_lines,
                 CT_lines)] }
         else if (!is.null(samples) & is.null(ID)) {
@@ -104,7 +104,7 @@ combineTCR <- function(input.data,
     }
     name_vector <- character(length(samples))
     for (i in seq_along(samples)) { 
-        if (!is.null(samples) & !is.null(ID)) {
+        if (!is.null(samples) && !is.null(ID)) {
             curr <- paste(samples[i], "_", ID[i], sep="")
         } else if (!is.null(samples) & is.null(ID)) {
             curr <- paste(samples[i], sep="")
diff --git a/R/combineExpression.R b/R/combineExpression.R
index 58fbbdaf..34a0f402 100644
--- a/R/combineExpression.R
+++ b/R/combineExpression.R
@@ -62,7 +62,7 @@ combineExpression <- function(input.data,
     call_time <- Sys.time()
   
     options( dplyr.summarise.inform = FALSE )
-    if (!proportion & any(cloneSize < 1)) {
+    if (!proportion && any(cloneSize < 1)) {
         stop("Adjust the cloneSize parameter - there are groupings < 1")
     }
     cloneSize <- c(None = 0, cloneSize)
@@ -93,7 +93,7 @@ combineExpression <- function(input.data,
                              "clonalFrequency")]
             Con.df <- rbind.data.frame(Con.df, data)
         }
-    } else if (group.by != "none" | !is.null(group.by)) {
+    } else if (group.by != "none" || !is.null(group.by)) {
         data <- data.frame(bind_rows(input.data), stringsAsFactors = FALSE)
         data2 <- na.omit(unique(data[,c("barcode", cloneCall, group.by)]))
         data2 <- data2[data2[,"barcode"] %in% cell.names, ]
diff --git a/R/utils.R b/R/utils.R
index 944fb801..cdee4570 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -6,7 +6,7 @@ is_seurat_or_se_object <- function(obj) {
     is_seurat_object(obj) || is_se_object(obj)
 }
 
-#Use to shuffle between chains
+#Use to shuffle between chains Qile: the NA handling here *might* be related to the unnamed combineTCR bugs from the new rcpp con.df construction
 #' @keywords internal
 #' @author Ye-Lin Son Nick Borcherding
 .off.the.chain <- function(dat, chain, cloneCall) {