Skip to content

Commit

Permalink
Merge pull request #275 from Qile0317/v2
Browse files Browse the repository at this point in the history
Sped up testing and AA kmer counting (with Rcpp)
  • Loading branch information
ncborcherding authored Nov 7, 2023
2 parents 940ef6e + afc62ee commit 357e63f
Show file tree
Hide file tree
Showing 47 changed files with 448 additions and 277 deletions.
4 changes: 4 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

rcppGetAaKmerPercent <- function(seqs, motifs, k) {
.Call(`_scRepertoire_rcppGetAaKmerPercent`, seqs, motifs, k)
}

rcppGenerateUniqueNtMotifs <- function(k) {
.Call(`_scRepertoire_rcppGenerateUniqueNtMotifs`, k)
}
Expand Down
19 changes: 10 additions & 9 deletions R/clonalBias.R
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,14 @@ clonalBias <- function(sc.data,
bias$cloneSize[i] <- as.vector(meta[which(meta[,cloneCall] == clone & meta[,split.by] == split),"cloneSize"])[1]
}

bias$cloneSize <- factor(bias$cloneSize , rev(levels(meta[,"cloneSize"])))
#Plotting
plot <- ggplot(bias, aes(x=ncells,y=bias)) +
bias$cloneSize <- factor(bias$cloneSize , rev(levels(meta[, "cloneSize"])))

if (exportTable) {
return(bias)
}

#else, return the plot
ggplot(bias, aes(x=ncells,y=bias)) +
geom_point(aes(fill=Top_state, size = cloneSize), shape = 21, stroke = 0.25) +
stat_quantile(data=df_shuffle,
quantiles = c(corrected_p),
Expand All @@ -115,13 +120,9 @@ clonalBias <- function(sc.data,
color = "black",
lty = 2) +
scale_fill_manual(values = .colorizer(palette, length(unique(bias[,"Top_state"])))) +
theme_classic() +
xlab("Clone Size") +
theme_classic() +
xlab("Clone Size") +
ylab("Clonotype Bias")
if (exportTable == TRUE) {
return(bias)
}
return(plot)
}

#Background summary of clones
Expand Down
7 changes: 4 additions & 3 deletions R/clonalNetwork.R
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,10 @@ clonalNetwork <- function(sc.data,
V(graph)$size <- unname(clone.number)
centers <- centers[rownames(centers) %in% names(V(graph)),]

if (exportTable) {
return(edge.list1)
}

plot <- ggraph(graph, layout = centers[match(names(V(graph)), rownames(centers)),]) +
geom_point(data = coord, aes(x = coord[,1], y = coord[,2], color = group.by)) +
geom_edge_bend(aes(edge_color = as.numeric(weight)),
Expand All @@ -228,9 +232,6 @@ clonalNetwork <- function(sc.data,
axis.line = element_line(colour = "black"),
legend.key=element_blank()
)
if (exportTable) {
return(edge.list1)
}
return(plot)
}

4 changes: 2 additions & 2 deletions R/clonalRarefaction.R
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,11 @@ clonalRarefaction <- function(input.data,
})
col <- length(input.data)
mat <- iNEXT(mat.list, q=hill.numbers, datatype="abundance",nboot = n.boots)
plot <- ggiNEXT(mat, type=plot.type) +
plot <- suppressMessages(ggiNEXT(mat, type=plot.type) +
scale_shape_manual(values = rep(16,col)) +
scale_fill_manual(values = rep("white", col)) +
scale_color_manual(values = c(.colorizer(palette,col))) +
theme_classic()
theme_classic())
if (exportTable == TRUE) {
return(plot["data"])
} else {
Expand Down
9 changes: 9 additions & 0 deletions R/percentKmer.R
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,15 @@ percentKmer <- function(input.data,
next
}

if (identical(cloneCall, "CTaa") && motif.length > 1L && motif.length <= 12L) {
mat[i, ] <- rcppGetAaKmerPercent(input.data[[i]][,cloneCall], unique.motifs, motif.length)
next
}

# this point could only be reached if motif.length is extremely long or is one.
# which wouldn't really be practical. Maybe the following should just be deleted
# and a cap could be put on motif.length?

motifs <- .tokenize_multiple_sequences(input.data[[i]][,cloneCall], motif.length)
motif.table <- table(unlist(motifs))
if(any(grepl("\\;", names(motif.table)))) {
Expand Down
14 changes: 14 additions & 0 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,19 @@ Rcpp::Rostream<true>& Rcpp::Rcout = Rcpp::Rcpp_cout_get();
Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
#endif

// rcppGetAaKmerPercent
Rcpp::NumericVector rcppGetAaKmerPercent(const std::vector<std::string>& seqs, const std::vector<std::string>& motifs, const int k);
RcppExport SEXP _scRepertoire_rcppGetAaKmerPercent(SEXP seqsSEXP, SEXP motifsSEXP, SEXP kSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< const std::vector<std::string>& >::type seqs(seqsSEXP);
Rcpp::traits::input_parameter< const std::vector<std::string>& >::type motifs(motifsSEXP);
Rcpp::traits::input_parameter< const int >::type k(kSEXP);
rcpp_result_gen = Rcpp::wrap(rcppGetAaKmerPercent(seqs, motifs, k));
return rcpp_result_gen;
END_RCPP
}
// rcppGenerateUniqueNtMotifs
Rcpp::CharacterVector rcppGenerateUniqueNtMotifs(int k);
RcppExport SEXP _scRepertoire_rcppGenerateUniqueNtMotifs(SEXP kSEXP) {
Expand All @@ -35,6 +48,7 @@ END_RCPP
}

static const R_CallMethodDef CallEntries[] = {
{"_scRepertoire_rcppGetAaKmerPercent", (DL_FUNC) &_scRepertoire_rcppGetAaKmerPercent, 3},
{"_scRepertoire_rcppGenerateUniqueNtMotifs", (DL_FUNC) &_scRepertoire_rcppGenerateUniqueNtMotifs, 1},
{"_scRepertoire_rcppGetNtKmerPercent", (DL_FUNC) &_scRepertoire_rcppGetNtKmerPercent, 2},
{NULL, NULL, 0}
Expand Down
114 changes: 114 additions & 0 deletions src/aaKmers.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
// naive implementation of hashmap-based AA counting, with a 5-bit AA representation.
// By Qile Yang

#include <Rcpp.h>
#include <vector>
#include <string>
#include <unordered_map>
#include "scRepHelper.h"

std::unordered_map<char, unsigned long int> allAaMap() {
std::unordered_map<char, unsigned long int> map;
map['A'] = 0; map['C'] = 1; map['D'] = 2; map['E'] = 3;
map['F'] = 4; map['G'] = 5; map['H'] = 6; map['I'] = 7;
map['K'] = 8; map['L'] = 9; map['M'] = 10; map['N'] = 11;
map['P'] = 12; map['Q'] = 13; map['R'] = 14; map['S'] = 15;
map['T'] = 16; map['V'] = 17; map['W'] = 18; map['Y'] = 19;
return map;
}

class AaKmerCounter {
private:
// ideally these are all constants except bins but this works :/
std::unordered_map<unsigned long int, int> aaUIntKmerMap;
int k;
unsigned long int mask;
std::unordered_map<char, unsigned long int> aaIndexMap;
std::vector<long double> bins;

std::unordered_map<unsigned long int, int> toAaUIntKmerMap(const std::vector<std::string>& motifs) {
std::unordered_map<unsigned long int, int> map;
for (int i = 0; i < (int) motifs.size(); i++) {
unsigned long int kmer = 0;
for (char aa : motifs[i]) {
kmer = (kmer << 5) | toAaIndex(aa);
}
map[kmer] = i;
}
return map;
}

inline unsigned long int toAaIndex(const char aa) {
if (aaIndexMap.find(aa) == aaIndexMap.end()) {
return 20;
}
return aaIndexMap[aa];
}

inline void updateSkip(int& skip, const char c) {
if (aaIndexMap.find(c) == aaIndexMap.end()) {
skip = k;
} else if (skip > 0) {
skip--;
}
}

public:
AaKmerCounter(const std::vector<std::string>& motifs, const int _k) {
aaIndexMap = allAaMap();
k = _k;
mask = (unsigned long int) ((1 << (_k * 5)) - 1);
aaUIntKmerMap = toAaUIntKmerMap(motifs);
bins = std::vector<long double> (motifs.size(), 0.0);
}

void countKmers(const std::vector<std::string>& seqs) {
for (std::string seq : seqs) {
if ((int) seq.size() < k) {
continue;
}

int skip = 0;
unsigned long int kmer = 0;

for (int i = 0; i < (k - 1); i++) { // this segment to initialize the kmer should be deletable if skip is adjusted?
kmer = (kmer << 5) | toAaIndex(seq[i]);
updateSkip(skip, seq[i]);
}

for (int i = (k - 1); i < (int) seq.size(); i++) {
kmer = ((kmer << 5) & mask) | toAaIndex(seq[i]);
updateSkip(skip, seq[i]);
if (skip == 0) {
bins[aaUIntKmerMap[kmer]]++;
}
}
}
}

std::vector<long double> getCounts() {
return bins;
}
};

// [[Rcpp::export]]
Rcpp::NumericVector rcppGetAaKmerPercent(
const std::vector<std::string>& seqs, const std::vector<std::string>& motifs, const int k
) {

AaKmerCounter counter = AaKmerCounter(motifs, k);
counter.countKmers(seqs);
std::vector<long double> bins = counter.getCounts();

long double binSum = scRepHelper::sum(bins);
if (binSum == 0.0) { // pretty sure this can only happen if there arent valid seqs?
return Rcpp::NumericVector (motifs.size(), R_NaReal);
}

double scaleFactor = 1 / binSum;
for (int i = 0; i < (int) motifs.size(); i++) {
bins[i] *= scaleFactor;
}

return scRepHelper::convertZerosToNA(bins, motifs.size());
}
23 changes: 14 additions & 9 deletions src/ntKmers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include <Rcpp.h>
#include <vector>
#include <string>
#include "screpUtils.h"
#include "scRepHelper.h"

inline unsigned short int toNtIndex(const char nt) {
switch(nt) {
Expand Down Expand Up @@ -44,13 +44,13 @@ inline bool isNt(char c) {
Rcpp::CharacterVector rcppGenerateUniqueNtMotifs(int k) {
long int numKmers = 1 << (k + k);
Rcpp::CharacterVector motifs (numKmers);
for (unsigned long int i = 0; i < numKmers; i++) {
for (long int i = 0; i < numKmers; i++) {
motifs[i] = toNtKmer(i, k);
}
return motifs;
}

inline void updateSkip(int& skip, char c, int k) {
inline void updateSkip(int& skip, const char c, const int k) {
if (!isNt(c)) {
skip = k;
} else if (skip > 0) {
Expand All @@ -59,19 +59,21 @@ inline void updateSkip(int& skip, char c, int k) {
}

// actual kmer counter - doesnt handle _NA_ for k = 1
inline void kmerCount(std::vector<long double>& bins, const unsigned int mask, const std::string& seq, int k) {
inline void kmerCount(std::vector<long double>& bins, const unsigned int mask, const std::string& seq, const int k) {
int skip = 0;
unsigned long int kmer = 0;

for (int i = 0; i < k - 1; i++) {
for (int i = 0; i < (k - 1); i++) { // this segment to initialize the kmer should be deletable if skip is adjusted?
kmer = (kmer << 2) | toNtIndex(seq[i]);
updateSkip(skip, seq[i], k);
}

for (int i = k - 1; i < (int) seq.size(); i++) {
kmer = ((kmer << 2) & mask) | toNtIndex(seq[i]);
updateSkip(skip, seq[i], k);
if (skip == 0) {bins[kmer]++;}
if (skip == 0) {
bins[kmer]++;
}
}
}

Expand All @@ -82,18 +84,21 @@ Rcpp::NumericVector rcppGetNtKmerPercent(const std::vector<std::string>& seqs, c
std::vector<long double> bins (numKmers, 0);

for (std::string seq : seqs) {
if (seq.size() < k) {
continue;
}
kmerCount(bins, mask, seq, k);
}

long double binSum = sum(bins);
long double binSum = scRepHelper::sum(bins);
if (binSum == 0.0) { // pretty sure this can only happen if there arent valid seqs?
return Rcpp::NumericVector (numKmers, R_NaReal);
}

const double scaleFactor = 1 / binSum;
double scaleFactor = 1 / binSum;
for (int i = 0; i < numKmers; i++) {
bins[i] *= scaleFactor;
}

return convertZerosToNA(bins, numKmers);
return scRepHelper::convertZerosToNA(bins, numKmers);
}
23 changes: 23 additions & 0 deletions src/scRepHelper.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#include <Rcpp.h>
#include <vector>

class scRepHelper {
public:
static const long double sum(std::vector<long double>& v) {
long double n = 0;
for (long double num : v) {
n += num;
}
return n;
}

static Rcpp::NumericVector convertZerosToNA(std::vector<long double>& v, int len) {
Rcpp::NumericVector converted (len, R_NaReal);
for (int i = 0; i < len; i++) {
if (v[i] > 0) {
converted[i] = v[i];
}
}
return converted;
}
};
21 changes: 0 additions & 21 deletions src/screpUtils.h

This file was deleted.

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 357e63f

Please sign in to comment.