Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sped up testing and AA kmer counting (with Rcpp) #275

Merged
merged 6 commits into from
Nov 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

rcppGetAaKmerPercent <- function(seqs, motifs, k) {
.Call(`_scRepertoire_rcppGetAaKmerPercent`, seqs, motifs, k)
}

rcppGenerateUniqueNtMotifs <- function(k) {
.Call(`_scRepertoire_rcppGenerateUniqueNtMotifs`, k)
}
Expand Down
19 changes: 10 additions & 9 deletions R/clonalBias.R
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,14 @@ clonalBias <- function(sc.data,
bias$cloneSize[i] <- as.vector(meta[which(meta[,cloneCall] == clone & meta[,split.by] == split),"cloneSize"])[1]
}

bias$cloneSize <- factor(bias$cloneSize , rev(levels(meta[,"cloneSize"])))
#Plotting
plot <- ggplot(bias, aes(x=ncells,y=bias)) +
bias$cloneSize <- factor(bias$cloneSize , rev(levels(meta[, "cloneSize"])))

if (exportTable) {
return(bias)
}

#else, return the plot
ggplot(bias, aes(x=ncells,y=bias)) +
geom_point(aes(fill=Top_state, size = cloneSize), shape = 21, stroke = 0.25) +
stat_quantile(data=df_shuffle,
quantiles = c(corrected_p),
Expand All @@ -115,13 +120,9 @@ clonalBias <- function(sc.data,
color = "black",
lty = 2) +
scale_fill_manual(values = .colorizer(palette, length(unique(bias[,"Top_state"])))) +
theme_classic() +
xlab("Clone Size") +
theme_classic() +
xlab("Clone Size") +
ylab("Clonotype Bias")
if (exportTable == TRUE) {
return(bias)
}
return(plot)
}

#Background summary of clones
Expand Down
7 changes: 4 additions & 3 deletions R/clonalNetwork.R
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,10 @@ clonalNetwork <- function(sc.data,
V(graph)$size <- unname(clone.number)
centers <- centers[rownames(centers) %in% names(V(graph)),]

if (exportTable) {
return(edge.list1)
}

plot <- ggraph(graph, layout = centers[match(names(V(graph)), rownames(centers)),]) +
geom_point(data = coord, aes(x = coord[,1], y = coord[,2], color = group.by)) +
geom_edge_bend(aes(edge_color = as.numeric(weight)),
Expand All @@ -228,9 +232,6 @@ clonalNetwork <- function(sc.data,
axis.line = element_line(colour = "black"),
legend.key=element_blank()
)
if (exportTable) {
return(edge.list1)
}
return(plot)
}

4 changes: 2 additions & 2 deletions R/clonalRarefaction.R
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,11 @@ clonalRarefaction <- function(input.data,
})
col <- length(input.data)
mat <- iNEXT(mat.list, q=hill.numbers, datatype="abundance",nboot = n.boots)
plot <- ggiNEXT(mat, type=plot.type) +
plot <- suppressMessages(ggiNEXT(mat, type=plot.type) +
scale_shape_manual(values = rep(16,col)) +
scale_fill_manual(values = rep("white", col)) +
scale_color_manual(values = c(.colorizer(palette,col))) +
theme_classic()
theme_classic())
if (exportTable == TRUE) {
return(plot["data"])
} else {
Expand Down
9 changes: 9 additions & 0 deletions R/percentKmer.R
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,15 @@ percentKmer <- function(input.data,
next
}

if (identical(cloneCall, "CTaa") && motif.length > 1L && motif.length <= 12L) {
mat[i, ] <- rcppGetAaKmerPercent(input.data[[i]][,cloneCall], unique.motifs, motif.length)
next
}

# this point could only be reached if motif.length is extremely long or is one.
# which wouldn't really be practical. Maybe the following should just be deleted
# and a cap could be put on motif.length?

motifs <- .tokenize_multiple_sequences(input.data[[i]][,cloneCall], motif.length)
motif.table <- table(unlist(motifs))
if(any(grepl("\\;", names(motif.table)))) {
Expand Down
14 changes: 14 additions & 0 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,19 @@ Rcpp::Rostream<true>& Rcpp::Rcout = Rcpp::Rcpp_cout_get();
Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
#endif

// rcppGetAaKmerPercent
Rcpp::NumericVector rcppGetAaKmerPercent(const std::vector<std::string>& seqs, const std::vector<std::string>& motifs, const int k);
RcppExport SEXP _scRepertoire_rcppGetAaKmerPercent(SEXP seqsSEXP, SEXP motifsSEXP, SEXP kSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< const std::vector<std::string>& >::type seqs(seqsSEXP);
Rcpp::traits::input_parameter< const std::vector<std::string>& >::type motifs(motifsSEXP);
Rcpp::traits::input_parameter< const int >::type k(kSEXP);
rcpp_result_gen = Rcpp::wrap(rcppGetAaKmerPercent(seqs, motifs, k));
return rcpp_result_gen;
END_RCPP
}
// rcppGenerateUniqueNtMotifs
Rcpp::CharacterVector rcppGenerateUniqueNtMotifs(int k);
RcppExport SEXP _scRepertoire_rcppGenerateUniqueNtMotifs(SEXP kSEXP) {
Expand All @@ -35,6 +48,7 @@ END_RCPP
}

static const R_CallMethodDef CallEntries[] = {
{"_scRepertoire_rcppGetAaKmerPercent", (DL_FUNC) &_scRepertoire_rcppGetAaKmerPercent, 3},
{"_scRepertoire_rcppGenerateUniqueNtMotifs", (DL_FUNC) &_scRepertoire_rcppGenerateUniqueNtMotifs, 1},
{"_scRepertoire_rcppGetNtKmerPercent", (DL_FUNC) &_scRepertoire_rcppGetNtKmerPercent, 2},
{NULL, NULL, 0}
Expand Down
114 changes: 114 additions & 0 deletions src/aaKmers.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
// naive implementation of hashmap-based AA counting, with a 5-bit AA representation.
// By Qile Yang

#include <Rcpp.h>
#include <vector>
#include <string>
#include <unordered_map>
#include "scRepHelper.h"

std::unordered_map<char, unsigned long int> allAaMap() {
std::unordered_map<char, unsigned long int> map;
map['A'] = 0; map['C'] = 1; map['D'] = 2; map['E'] = 3;
map['F'] = 4; map['G'] = 5; map['H'] = 6; map['I'] = 7;
map['K'] = 8; map['L'] = 9; map['M'] = 10; map['N'] = 11;
map['P'] = 12; map['Q'] = 13; map['R'] = 14; map['S'] = 15;
map['T'] = 16; map['V'] = 17; map['W'] = 18; map['Y'] = 19;
return map;
}

class AaKmerCounter {
private:
// ideally these are all constants except bins but this works :/
std::unordered_map<unsigned long int, int> aaUIntKmerMap;
int k;
unsigned long int mask;
std::unordered_map<char, unsigned long int> aaIndexMap;
std::vector<long double> bins;

std::unordered_map<unsigned long int, int> toAaUIntKmerMap(const std::vector<std::string>& motifs) {
std::unordered_map<unsigned long int, int> map;
for (int i = 0; i < (int) motifs.size(); i++) {
unsigned long int kmer = 0;
for (char aa : motifs[i]) {
kmer = (kmer << 5) | toAaIndex(aa);
}
map[kmer] = i;
}
return map;
}

inline unsigned long int toAaIndex(const char aa) {
if (aaIndexMap.find(aa) == aaIndexMap.end()) {
return 20;
}
return aaIndexMap[aa];
}

inline void updateSkip(int& skip, const char c) {
if (aaIndexMap.find(c) == aaIndexMap.end()) {
skip = k;
} else if (skip > 0) {
skip--;
}
}

public:
AaKmerCounter(const std::vector<std::string>& motifs, const int _k) {
aaIndexMap = allAaMap();
k = _k;
mask = (unsigned long int) ((1 << (_k * 5)) - 1);
aaUIntKmerMap = toAaUIntKmerMap(motifs);
bins = std::vector<long double> (motifs.size(), 0.0);
}

void countKmers(const std::vector<std::string>& seqs) {
for (std::string seq : seqs) {
if ((int) seq.size() < k) {
continue;
}

int skip = 0;
unsigned long int kmer = 0;

for (int i = 0; i < (k - 1); i++) { // this segment to initialize the kmer should be deletable if skip is adjusted?
kmer = (kmer << 5) | toAaIndex(seq[i]);
updateSkip(skip, seq[i]);
}

for (int i = (k - 1); i < (int) seq.size(); i++) {
kmer = ((kmer << 5) & mask) | toAaIndex(seq[i]);
updateSkip(skip, seq[i]);
if (skip == 0) {
bins[aaUIntKmerMap[kmer]]++;
}
}
}
}

std::vector<long double> getCounts() {
return bins;
}
};

// [[Rcpp::export]]
Rcpp::NumericVector rcppGetAaKmerPercent(
const std::vector<std::string>& seqs, const std::vector<std::string>& motifs, const int k
) {

AaKmerCounter counter = AaKmerCounter(motifs, k);
counter.countKmers(seqs);
std::vector<long double> bins = counter.getCounts();

long double binSum = scRepHelper::sum(bins);
if (binSum == 0.0) { // pretty sure this can only happen if there arent valid seqs?
return Rcpp::NumericVector (motifs.size(), R_NaReal);
}

double scaleFactor = 1 / binSum;
for (int i = 0; i < (int) motifs.size(); i++) {
bins[i] *= scaleFactor;
}

return scRepHelper::convertZerosToNA(bins, motifs.size());
}
23 changes: 14 additions & 9 deletions src/ntKmers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include <Rcpp.h>
#include <vector>
#include <string>
#include "screpUtils.h"
#include "scRepHelper.h"

inline unsigned short int toNtIndex(const char nt) {
switch(nt) {
Expand Down Expand Up @@ -44,13 +44,13 @@ inline bool isNt(char c) {
Rcpp::CharacterVector rcppGenerateUniqueNtMotifs(int k) {
long int numKmers = 1 << (k + k);
Rcpp::CharacterVector motifs (numKmers);
for (unsigned long int i = 0; i < numKmers; i++) {
for (long int i = 0; i < numKmers; i++) {
motifs[i] = toNtKmer(i, k);
}
return motifs;
}

inline void updateSkip(int& skip, char c, int k) {
inline void updateSkip(int& skip, const char c, const int k) {
if (!isNt(c)) {
skip = k;
} else if (skip > 0) {
Expand All @@ -59,19 +59,21 @@ inline void updateSkip(int& skip, char c, int k) {
}

// actual kmer counter - doesnt handle _NA_ for k = 1
inline void kmerCount(std::vector<long double>& bins, const unsigned int mask, const std::string& seq, int k) {
inline void kmerCount(std::vector<long double>& bins, const unsigned int mask, const std::string& seq, const int k) {
int skip = 0;
unsigned long int kmer = 0;

for (int i = 0; i < k - 1; i++) {
for (int i = 0; i < (k - 1); i++) { // this segment to initialize the kmer should be deletable if skip is adjusted?
kmer = (kmer << 2) | toNtIndex(seq[i]);
updateSkip(skip, seq[i], k);
}

for (int i = k - 1; i < (int) seq.size(); i++) {
kmer = ((kmer << 2) & mask) | toNtIndex(seq[i]);
updateSkip(skip, seq[i], k);
if (skip == 0) {bins[kmer]++;}
if (skip == 0) {
bins[kmer]++;
}
}
}

Expand All @@ -82,18 +84,21 @@ Rcpp::NumericVector rcppGetNtKmerPercent(const std::vector<std::string>& seqs, c
std::vector<long double> bins (numKmers, 0);

for (std::string seq : seqs) {
if (seq.size() < k) {
continue;
}
kmerCount(bins, mask, seq, k);
}

long double binSum = sum(bins);
long double binSum = scRepHelper::sum(bins);
if (binSum == 0.0) { // pretty sure this can only happen if there arent valid seqs?
return Rcpp::NumericVector (numKmers, R_NaReal);
}

const double scaleFactor = 1 / binSum;
double scaleFactor = 1 / binSum;
for (int i = 0; i < numKmers; i++) {
bins[i] *= scaleFactor;
}

return convertZerosToNA(bins, numKmers);
return scRepHelper::convertZerosToNA(bins, numKmers);
}
23 changes: 23 additions & 0 deletions src/scRepHelper.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#include <Rcpp.h>
#include <vector>

class scRepHelper {
public:
static const long double sum(std::vector<long double>& v) {
long double n = 0;
for (long double num : v) {
n += num;
}
return n;
}

static Rcpp::NumericVector convertZerosToNA(std::vector<long double>& v, int len) {
Rcpp::NumericVector converted (len, R_NaReal);
for (int i = 0; i < len; i++) {
if (v[i] > 0) {
converted[i] = v[i];
}
}
return converted;
}
};
21 changes: 0 additions & 21 deletions src/screpUtils.h

This file was deleted.

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading