Merge pull request #285 from Qile0317/master

Sync master branch with Qile's fork & import hash
BorchLab · Dec 8, 2023 · 6f4d1c4 · 6f4d1c4
2 parents 28b4c40 + 63adabf
commit 6f4d1c4
Show file tree

Hide file tree

Showing 7 changed files with 81 additions and 37 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -40,7 +40,8 @@ Imports:
     tidygraph,
     truncdist,
     utils,
-    VGAM
+    VGAM,
+    hash
 Suggests: 
     BiocManager,
     BiocStyle,

diff --git a/R/combineContigs.R b/R/combineContigs.R
@@ -93,7 +93,7 @@ combineTCR <- function(input.data,
         Con.df[Con.df == "NA_NA" | Con.df == "NA;NA_NA;NA"] <- NA 
         data3 <- merge(data2[,-which(names(data2) %in% c("TCR1","TCR2"))], 
             Con.df, by = "barcode")
-        if (!is.null(samples) & !is.null(ID)) {
+        if (!is.null(samples) && !is.null(ID)) {
             data3 <- data3[, c("barcode", "sample", "ID", tcr1_lines, tcr2_lines,
                 CT_lines)] }
         else if (!is.null(samples) & is.null(ID)) {
@@ -104,7 +104,7 @@ combineTCR <- function(input.data,
     }
     name_vector <- character(length(samples))
     for (i in seq_along(samples)) { 
-        if (!is.null(samples) & !is.null(ID)) {
+        if (!is.null(samples) && !is.null(ID)) {
             curr <- paste(samples[i], "_", ID[i], sep="")
         } else if (!is.null(samples) & is.null(ID)) {
             curr <- paste(samples[i], sep="")

diff --git a/R/combineExpression.R b/R/combineExpression.R
@@ -62,7 +62,7 @@ combineExpression <- function(input.data,
     call_time <- Sys.time()
 
     options( dplyr.summarise.inform = FALSE )
-    if (!proportion & any(cloneSize < 1)) {
+    if (!proportion && any(cloneSize < 1)) {
         stop("Adjust the cloneSize parameter - there are groupings < 1")
     }
     cloneSize <- c(None = 0, cloneSize)
@@ -93,7 +93,7 @@ combineExpression <- function(input.data,
                              "clonalFrequency")]
             Con.df <- rbind.data.frame(Con.df, data)
         }
-    } else if (group.by != "none" | !is.null(group.by)) {
+    } else if (group.by != "none" || !is.null(group.by)) {
         data <- data.frame(bind_rows(input.data), stringsAsFactors = FALSE)
         data2 <- na.omit(unique(data[,c("barcode", cloneCall, group.by)]))
         data2 <- data2[data2[,"barcode"] %in% cell.names, ]

diff --git a/R/utils.R b/R/utils.R
@@ -6,7 +6,7 @@ is_seurat_or_se_object <- function(obj) {
     is_seurat_object(obj) || is_se_object(obj)
 }
 
-#Use to shuffle between chains
+# Use to shuffle between chains. Note (Qile): the NA handling here *might* be related to the unnamed combineTCR bugs from the new Rcpp Con.df construction.
 #' @keywords internal
 #' @author Ye-Lin Son Nick Borcherding
 .off.the.chain <- function(dat, chain, cloneCall) {
@@ -250,23 +250,10 @@ is_seurat_or_se_object <- function(obj) {
     return(data1)
 }
 
-
 # This is to help sort the type of clonotype data to use
 #' @keywords internal
 .theCall <- function(df, x, check.df = TRUE) {
-    x <- switch(x,
-                "gene" = "CTgene",
-                "genes" = "CTgene", 
-                "CTgene" = "CTgene",
-                "nt" = "CTnt",
-                "nucleotides" = "CTnt",
-                "CTnt" = "CTnt",
-                "aa" = "CTaa", 
-                "amino" = "CTaa", 
-                "CTaa" = "CTaa", 
-                "strict" = "CTstrict", 
-                "gene+nt" = "CTstrict",
-                "CTstrict" = "CTstrict")
+    x <- .convertClonecall(x)
     if(check.df) {
       if(inherits(df, "list") & !any(colnames(df[[1]]) %in% x)) {
         stop("Check the clonal variabe (cloneCall) being used in the function, it does not appear in the data provided.")
@@ -277,15 +264,70 @@ is_seurat_or_se_object <- function(obj) {
     return(x)
 }
 
+# Helper for .theCall: normalise a user-supplied cloneCall alias to the
+# canonical clonotype column name ("CTgene", "CTnt", "CTaa" or "CTstrict").
+# Matching is case-insensitive. An unrecognised alias raises an error whose
+# message suggests the closest known spelling.
+.convertClonecall <- function(x) {
+
+  # fail early with a readable message instead of a cryptic lookup error
+  if (!is.character(x) || length(x) != 1L || is.na(x)) {
+    stop("cloneCall must be a single, non-missing character string")
+  }
+
+  # lower-case alias -> canonical column name; every key appears exactly once
+  clonecall_dictionary <- hash::hash(
+    "gene" = "CTgene",
+    "genes" = "CTgene",
+    "ctgene" = "CTgene",
+    "nt" = "CTnt",
+    "nucleotide" = "CTnt",
+    "nucleotides" = "CTnt",
+    "ctnt" = "CTnt",
+    "aa" = "CTaa",
+    "amino" = "CTaa",
+    "ctaa" = "CTaa",
+    "gene+nt" = "CTstrict",
+    "strict" = "CTstrict",
+    "ctstrict" = "CTstrict"
+  )
+
+  x <- tolower(x)
+
+  # single lookup; a NULL result is treated as "unknown alias"
+  canonical <- clonecall_dictionary[[x]]
+  if (!is.null(canonical)) {
+    return(canonical)
+  }
+
+  # suggest the nearest spelling among both the aliases and the column names
+  stop(paste0(
+    "invalid input cloneCall, did you mean: '",
+    closest_word(
+      x,
+      c(hash::keys(clonecall_dictionary),
+        unname(hash::values(clonecall_dictionary)))
+    ),
+    "'?"
+  ))
+}
+
+# Helper for .convertClonecall: return the element of `strset` with the
+# smallest case-insensitive edit distance (utils::adist) to `s`.
+# The element is returned with its original capitalisation; on ties the
+# earliest element wins. Returns NA_character_ when `strset` is empty.
+closest_word <- function(s, strset) {
+    if (length(strset) == 0L) {
+        return(NA_character_)
+    }
+
+    # one vectorised adist() call replaces the 2:length() loop, which indexed
+    # past the end of a single-element `strset`
+    dists <- as.vector(utils::adist(tolower(s), tolower(strset)))
+
+    # which.min() keeps the first minimum, matching the strict `<` update rule
+    strset[which.min(dists)]
+}
+
 # Assigning positions for TCR contig data
-# Used to be .parseTCR(Con.df, unique_df, data2)
+# Used to be .parseTCR(Con.df, unique_df, data2) in v1
 # but now also constructs Con.df and runs the parseTCR algorithm on it, all in Rcpp
 #' @author Gloria Kraus, Nick Bormann, Nicky de Vrij, Nick Borcherding, Qile Yang
 #' @keywords internal
 .constructConDfAndParseTCR <- function(data2) {
   rcppConstructConDfAndParseTCR(
     data2 %>% arrange(., chain, cdr3_nt),
-    unique(data2[[1]])
+    unique(data2[[1]]) # 1 is the index of the barcode column
   )
 }
 

diff --git a/src/constructConDfAndparseTCR.cpp b/src/constructConDfAndparseTCR.cpp
@@ -43,7 +43,7 @@ class TcrParser {
     TcrParser(
         Rcpp::DataFrame& data2, std::vector<std::string>& uniqueData2Barcodes
     ) {
-        // construct conDf
+        // construct conDf, initializing the matrix to "NA" *strings*
         conDf = scRepHelper::initStringMatrix(
             7, uniqueData2Barcodes.size(), "NA"
         );

diff --git a/src/ntKmers.cpp b/src/ntKmers.cpp
@@ -1,4 +1,5 @@
-// 2-bit-based nucleotide kmer counting
+// 2-bit-based nucleotide kmer counting - unoptimized
+// could use a kmercounter class with a uint_fast64_t[128] array for toNtIndex instead of the switch statement
 // by Qile Yang
 
 #include <Rcpp.h>
@@ -15,13 +16,10 @@ inline unsigned short int toNtIndex(const char nt) {
     }
 }
 
+constexpr char Nts[4] = {'A', 'C', 'G', 'T'};
+
 inline char lastNt(unsigned int index) {
-    switch(index & 3) {
-        case 0: return 'A';
-        case 1: return 'C';
-        case 2: return 'G';
-        default: return 'T';
-    }
+    return Nts[index & 3];
 }
 
 inline std::string toNtKmer(unsigned long int index, int k) {
@@ -58,6 +56,11 @@ inline void updateSkip(int& skip, const char c, const int k) {
     }
 }
 
+inline bool updateSkipAndReturnIfShouldntSkip(int& skip, const char c, const int k) {
+    updateSkip(skip, c, k);
+    return skip == 0;
+}
+
 // actual kmer counter - doesnt handle _NA_ for k = 1
 inline void kmerCount(std::vector<double>& bins, const unsigned int mask, const std::string& seq, const int k) {
 
@@ -76,8 +79,7 @@ inline void kmerCount(std::vector<double>& bins, const unsigned int mask, const
 
     for (int i = k - 1; i < n; i++) {
         kmer = ((kmer << 2) & mask) | toNtIndex(seq[i]);
-        updateSkip(skip, seq[i], k);
-        if (skip == 0) {
+        if (updateSkipAndReturnIfShouldntSkip(skip, seq[i], k)) {
             bins[kmer]++;
         }
     }

diff --git a/tests/testthat/test-combineContigs.R b/tests/testthat/test-combineContigs.R
@@ -37,13 +37,12 @@ test_that("combineTCR works", {
 
 # TODO combineTCR & combineBCR (need more edge cases, different args, errors, etc.)
 
-
 test_that("combineBCR works", {
-  
+
   BCR <- read.csv("https://www.borch.dev/uploads/contigs/b_contigs.csv")
   trial1 <- combineBCR(BCR, 
                     samples = "Patient1")
-  
+
   expect_identical(trial1, getdata("combineContigs", "combineBCR_list_expected"))
-  
-})
+
+})