From 12bac7e0bacb4c341e00323690e587dd6d5f5c21 Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Thu, 7 Dec 2023 19:04:20 -0800 Subject: [PATCH 1/5] minor refactoring --- R/utils.R | 4 ++-- src/ntKmers.cpp | 10 +++++----- tests/testthat/test-combineContigs.R | 1 + 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/R/utils.R b/R/utils.R index 618f9357..71855ab6 100644 --- a/R/utils.R +++ b/R/utils.R @@ -278,14 +278,14 @@ is_seurat_or_se_object <- function(obj) { } # Assigning positions for TCR contig data -# Used to be .parseTCR(Con.df, unique_df, data2) +# Used to be .parseTCR(Con.df, unique_df, data2) in v1 # but now also constructs Con.df and runs the parseTCR algorithm on it, all in Rcpp #' @author Gloria Kraus, Nick Bormann, Nicky de Vrij, Nick Borcherding, Qile Yang #' @keywords internal .constructConDfAndParseTCR <- function(data2) { rcppConstructConDfAndParseTCR( data2 %>% arrange(., chain, cdr3_nt), - unique(data2[[1]]) + unique(data2[[1]]) # 1 is the index of the barcode column ) } diff --git a/src/ntKmers.cpp b/src/ntKmers.cpp index b7791ba5..54369fe8 100644 --- a/src/ntKmers.cpp +++ b/src/ntKmers.cpp @@ -1,4 +1,4 @@ -// 2-bit-based nucleotide kmer counting +// 2-bit-based nucleotide kmer counting - unoptimized // by Qile Yang #include @@ -50,12 +50,13 @@ Rcpp::CharacterVector rcppGenerateUniqueNtMotifs(int k) { return motifs; } -inline void updateSkip(int& skip, const char c, const int k) { +inline bool updateSkipAndReturnIfShouldSkip(int& skip, const char c, const int k) { if (!isNt(c)) { skip = k; } else if (skip > 0) { skip--; } + return skip == 0; } // actual kmer counter - doesnt handle _NA_ for k = 1 @@ -71,13 +72,12 @@ inline void kmerCount(std::vector& bins, const unsigned int mask, const for (int i = 0; i < (k - 1); i++) { // this segment to initialize the kmer should be deletable if skip is adjusted? kmer = (kmer << 2) | toNtIndex(seq[i]); - updateSkip(skip, seq[i], k); + updateSkipAndReturnIfShouldSkip(skip, seq[i], k); } for (int i = k - 1; i < n; i++) { kmer = ((kmer << 2) & mask) | toNtIndex(seq[i]); - updateSkip(skip, seq[i], k); - if (skip == 0) { + if (updateSkipAndReturnIfShouldSkip(skip, seq[i], k)) { bins[kmer]++; } } diff --git a/tests/testthat/test-combineContigs.R b/tests/testthat/test-combineContigs.R index 3092b57d..d6692fab 100644 --- a/tests/testthat/test-combineContigs.R +++ b/tests/testthat/test-combineContigs.R @@ -36,4 +36,5 @@ test_that("combineTCR works", { }) # TODO combineTCR (need more edge cases, different args, errors, etc.) +# TODO test combineTCR with the output of loadContigs # TODO combineBCR From ba0195c077c48dd02ebece9b2346f257db1636aa Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Thu, 7 Dec 2023 19:12:19 -0800 Subject: [PATCH 2/5] undo accidental combineBCR testcase deleton --- tests/testthat/test-combineContigs.R | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/testthat/test-combineContigs.R b/tests/testthat/test-combineContigs.R index d6692fab..af7bc490 100644 --- a/tests/testthat/test-combineContigs.R +++ b/tests/testthat/test-combineContigs.R @@ -35,6 +35,14 @@ test_that("combineTCR works", { expect_identical(trial4, getdata("combineContigs", "combineTCR_list_removeNA")) }) -# TODO combineTCR (need more edge cases, different args, errors, etc.) -# TODO test combineTCR with the output of loadContigs -# TODO combineBCR +# TODO combineTCR & combineBCR (need more edge cases, different args, errors, etc.) + +test_that("combineBCR works", { + + BCR <- read.csv("https://www.borch.dev/uploads/contigs/b_contigs.csv") + trial1 <- combineBCR(BCR, + samples = "Patient1") + + expect_identical(trial1, getdata("combineContigs", "combineBCR_list_expected")) + +}) From 4a488c0262c071af16e1759784c307c8a3e13795 Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Thu, 7 Dec 2023 19:38:01 -0800 Subject: [PATCH 3/5] tiny optimization to src/ntKmers.cpp --- src/constructConDfAndparseTCR.cpp | 2 +- src/ntKmers.cpp | 20 +++++++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/constructConDfAndparseTCR.cpp b/src/constructConDfAndparseTCR.cpp index 61c2edf1..d54d4599 100644 --- a/src/constructConDfAndparseTCR.cpp +++ b/src/constructConDfAndparseTCR.cpp @@ -43,7 +43,7 @@ class TcrParser { TcrParser( Rcpp::DataFrame& data2, std::vector& uniqueData2Barcodes ) { - // construct conDf + // construct conDf, initializaing the matrix to "NA" *strings* conDf = scRepHelper::initStringMatrix( 7, uniqueData2Barcodes.size(), "NA" ); diff --git a/src/ntKmers.cpp b/src/ntKmers.cpp index 54369fe8..3fcbb31c 100644 --- a/src/ntKmers.cpp +++ b/src/ntKmers.cpp @@ -1,4 +1,5 @@ // 2-bit-based nucleotide kmer counting - unoptimized +// could use a kmercounter class with an uint_fast64_t[128] for toNtIndex instead of the switch statement // by Qile Yang #include @@ -15,13 +16,10 @@ inline unsigned short int toNtIndex(const char nt) { } } +constexpr char Nts[4] = {'A', 'C', 'G', 'T'}; + inline char lastNt(unsigned int index) { - switch(index & 3) { - case 0: return 'A'; - case 1: return 'C'; - case 2: return 'G'; - default: return 'T'; - } + return Nts[index & 3]; } inline std::string toNtKmer(unsigned long int index, int k) { @@ -50,12 +48,16 @@ Rcpp::CharacterVector rcppGenerateUniqueNtMotifs(int k) { return motifs; } -inline bool updateSkipAndReturnIfShouldSkip(int& skip, const char c, const int k) { +inline void updateSkip(int& skip, const char c, const int k) { if (!isNt(c)) { skip = k; } else if (skip > 0) { skip--; } +} + +inline bool updateSkipAndReturnIfShouldntSkip(int& skip, const char c, const int k) { + updateSkip(skip, c, k); return skip == 0; } @@ -72,12 +74,12 @@ inline void kmerCount(std::vector& bins, const unsigned int mask, const for (int i = 0; i < (k - 1); i++) { // this segment to initialize the kmer should be deletable if skip is adjusted? kmer = (kmer << 2) | toNtIndex(seq[i]); - updateSkipAndReturnIfShouldSkip(skip, seq[i], k); + updateSkip(skip, seq[i], k); } for (int i = k - 1; i < n; i++) { kmer = ((kmer << 2) & mask) | toNtIndex(seq[i]); - if (updateSkipAndReturnIfShouldSkip(skip, seq[i], k)) { + if (updateSkipAndReturnIfShouldntSkip(skip, seq[i], k)) { bins[kmer]++; } } From f80163bb7e75878b4430169e1ad6a3c705765beb Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Thu, 7 Dec 2023 21:19:09 -0800 Subject: [PATCH 4/5] improved .theCall error handling --- R/utils.R | 70 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 14 deletions(-) diff --git a/R/utils.R b/R/utils.R index 69d1c8e4..944fb801 100644 --- a/R/utils.R +++ b/R/utils.R @@ -250,23 +250,10 @@ is_seurat_or_se_object <- function(obj) { return(data1) } - # This is to help sort the type of clonotype data to use #' @keywords internal .theCall <- function(df, x, check.df = TRUE) { - x <- switch(x, - "gene" = "CTgene", - "genes" = "CTgene", - "CTgene" = "CTgene", - "nt" = "CTnt", - "nucleotides" = "CTnt", - "CTnt" = "CTnt", - "aa" = "CTaa", - "amino" = "CTaa", - "CTaa" = "CTaa", - "strict" = "CTstrict", - "gene+nt" = "CTstrict", - "CTstrict" = "CTstrict") + x <- .convertClonecall(x) if(check.df) { if(inherits(df, "list") & !any(colnames(df[[1]]) %in% x)) { stop("Check the clonal variabe (cloneCall) being used in the function, it does not appear in the data provided.") @@ -277,6 +264,61 @@ is_seurat_or_se_object <- function(obj) { return(x) } +# helper for .theCall +.convertClonecall <- function(x) { + + clonecall_dictionary <- hash::hash( + "gene" = "CTgene", + "genes" = "CTgene", + "ctgene" = "CTgene", + "ctstrict" = "CTstrict", + "nt" = "CTnt", + "nucleotide" = "CTnt", + "nucleotides" = "CTnt", + "ctnt" = "CTnt", + "aa" = "CTaa", + "amino" = "CTaa", + "ctaa" = "CTaa", + "gene+nt" = "CTstrict", + "strict" = "CTstrict", + "ctstrict" = "CTstrict" + ) + + x <- tolower(x) + + if (!is.null(clonecall_dictionary[[x]])) { + return(clonecall_dictionary[[x]]) + } + + stop(paste( + "invalid input cloneCall, did you mean: '", + closest_word( + x, + c(names(clonecall_dictionary), + unname(hash::values(clonecall_dictionary))) + ), + "'?", + sep = "" + )) +} + +# helper for .convertClonecall +closest_word <- function(s, strset) { + strset_lowercase <- tolower(strset) + s <- tolower(s) + + closest_w <- strset_lowercase[1] + closest_dist <- utils::adist(s, closest_w) + for(i in 2:length(strset_lowercase)) { + curr_dist <- utils::adist(s, strset_lowercase[i]) + if (curr_dist < closest_dist) { + closest_w <- strset[i] + closest_dist <- curr_dist + } + } + closest_w +} + # Assigning positions for TCR contig data # Used to be .parseTCR(Con.df, unique_df, data2) in v1 # but now also constructs Con.df and runs the parseTCR algorithm on it, all in Rcpp From 63adabff1a8ccf810097bc8c6577964dcb2cdb6e Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Thu, 7 Dec 2023 23:21:15 -0800 Subject: [PATCH 5/5] imported hash and minor comments --- DESCRIPTION | 3 ++- R/combineContigs.R | 4 ++-- R/combineExpression.R | 4 ++-- R/utils.R | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index bca66487..6586b2d7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -40,7 +40,8 @@ Imports: tidygraph, truncdist, utils, - VGAM + VGAM, + hash Suggests: BiocManager, BiocStyle, diff --git a/R/combineContigs.R b/R/combineContigs.R index 9da8c38d..41c1802f 100644 --- a/R/combineContigs.R +++ b/R/combineContigs.R @@ -93,7 +93,7 @@ combineTCR <- function(input.data, Con.df[Con.df == "NA_NA" | Con.df == "NA;NA_NA;NA"] <- NA data3 <- merge(data2[,-which(names(data2) %in% c("TCR1","TCR2"))], Con.df, by = "barcode") - if (!is.null(samples) & !is.null(ID)) { + if (!is.null(samples) && !is.null(ID)) { data3 <- data3[, c("barcode", "sample", "ID", tcr1_lines, tcr2_lines, CT_lines)] } else if (!is.null(samples) & is.null(ID)) { @@ -104,7 +104,7 @@ combineTCR <- function(input.data, } name_vector <- character(length(samples)) for (i in seq_along(samples)) { - if (!is.null(samples) & !is.null(ID)) { + if (!is.null(samples) && !is.null(ID)) { curr <- paste(samples[i], "_", ID[i], sep="") } else if (!is.null(samples) & is.null(ID)) { curr <- paste(samples[i], sep="") diff --git a/R/combineExpression.R b/R/combineExpression.R index 58fbbdaf..34a0f402 100644 --- a/R/combineExpression.R +++ b/R/combineExpression.R @@ -62,7 +62,7 @@ combineExpression <- function(input.data, call_time <- Sys.time() options( dplyr.summarise.inform = FALSE ) - if (!proportion & any(cloneSize < 1)) { + if (!proportion && any(cloneSize < 1)) { stop("Adjust the cloneSize parameter - there are groupings < 1") } cloneSize <- c(None = 0, cloneSize) @@ -93,7 +93,7 @@ combineExpression <- function(input.data, "clonalFrequency")] Con.df <- rbind.data.frame(Con.df, data) } - } else if (group.by != "none" | !is.null(group.by)) { + } else if (group.by != "none" || !is.null(group.by)) { data <- data.frame(bind_rows(input.data), stringsAsFactors = FALSE) data2 <- na.omit(unique(data[,c("barcode", cloneCall, group.by)])) data2 <- data2[data2[,"barcode"] %in% cell.names, ] diff --git a/R/utils.R b/R/utils.R index 944fb801..cdee4570 100644 --- a/R/utils.R +++ b/R/utils.R @@ -6,7 +6,7 @@ is_seurat_or_se_object <- function(obj) { is_seurat_object(obj) || is_se_object(obj) } -#Use to shuffle between chains +#Use to shuffle between chains Qile: the NA handling here *might* be related to the unnamed combineTCR bugs from the new rcpp con.df construction #' @keywords internal #' @author Ye-Lin Son Nick Borcherding .off.the.chain <- function(dat, chain, cloneCall) {