Skip to content

Commit

Permalink
dont evaluate TLEN in duplication evaluation to save RAM and CPU
Browse files Browse the repository at this point in the history
  • Loading branch information
sfchen committed Jun 2, 2018
1 parent 8995b3e commit d711b0c
Show file tree
Hide file tree
Showing 8 changed files with 48 additions and 73 deletions.
68 changes: 29 additions & 39 deletions src/duplicate.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "duplicate.h"
#include "overlapanalysis.h"
#include <memory.h>
#include <math.h>

Duplicate::Duplicate(Options* opt) {
mOptions = opt;
Expand All @@ -10,10 +11,8 @@ Duplicate::Duplicate(Options* opt) {
memset(mDups, 0, sizeof(uint64)*mKeyLenInBit);
mCounts = new uint16[mKeyLenInBit];
memset(mCounts, 0, sizeof(uint16)*mKeyLenInBit);
mLength = new uint16[mKeyLenInBit];
memset(mLength, 0, sizeof(uint16)*mKeyLenInBit);
mGC = new uint16[mKeyLenInBit];
memset(mGC, 0, sizeof(uint16)*mKeyLenInBit);
mGC = new uint8[mKeyLenInBit];
memset(mGC, 0, sizeof(uint8)*mKeyLenInBit);
}

Duplicate::~Duplicate(){
Expand Down Expand Up @@ -48,11 +47,10 @@ uint64 Duplicate::seq2int(const char* data, int start, int keylen, bool& valid)
return ret;
}

void Duplicate::addRecord(uint32 key, uint64 kmer32, int tlen, int gc) {
void Duplicate::addRecord(uint32 key, uint64 kmer32, uint8 gc) {
if(mCounts[key] == 0) {
mCounts[key] = 1;
mDups[key] = kmer32;
mLength[key] = tlen;
mGC[key] = gc;
} else {
if(mDups[key] == kmer32)
Expand Down Expand Up @@ -89,7 +87,9 @@ void Duplicate::statRead(Read* r) {
}
}

addRecord(key, kmer32, r->length(), gc);
gc = round(255.0 * (double) gc / (double) r->length());

addRecord(key, kmer32, (uint8)gc);
}

void Duplicate::statPair(Read* r1, Read* r2) {
Expand All @@ -112,33 +112,27 @@ void Duplicate::statPair(Read* r1, Read* r2) {
int gc = 0;

// not calculated
int tlen = 0;
if(mCounts[key] == 0) {
OverlapResult ov = OverlapAnalysis::analyze(r1, r2);
if(ov.overlap_len > 30) {
if(ov.offset < 0)
tlen = ov.overlap_len;
else
tlen = r1->length() + r2->length() - ov.overlap_len;
}
for(int i=0; i<r1->length() && i<tlen; i++) {
for(int i=0; i<r1->length(); i++) {
if(data1[i] == 'G' || data1[i] == 'C')
gc++;
}
for(int i=0; i<r2->length() && i<tlen-r1->length(); i++) {
for(int i=0; i<r2->length(); i++) {
if(data2[i] == 'G' || data2[i] == 'C')
gc++;
}
}

addRecord(key, kmer32, tlen, gc);
gc = round(255.0 * (double) gc / (double)( r1->length() + r2->length()));

addRecord(key, kmer32, gc);
}

double Duplicate::statAll(vector<Duplicate*>& list, int* hist, double* meanTLEN, double* meanGC, int histSize) {
double Duplicate::statAll(vector<Duplicate*>& list, int* hist, double* meanGC, int histSize) {
long totalNum = 0;
long dupNum = 0;
int* gcTlenNum = new int[histSize];
memset(gcTlenNum, 0, sizeof(int)*histSize);
int* gcStatNum = new int[histSize];
memset(gcStatNum, 0, sizeof(int)*histSize);
for(int key=0; key<list[0]->mKeyLenInBit; key++) {
bool consistent = true;
for(int i=0; i<list.size()-1; i++) {
Expand All @@ -148,15 +142,13 @@ double Duplicate::statAll(vector<Duplicate*>& list, int* hist, double* meanTLEN,
}
if(consistent) {
int count = 0;
int numSum = 0;
double tlenSum = 0;
double gcSum = 0;
int num = 0;
for(int i=0; i<list.size(); i++) {
count += list[i]->mCounts[key];
if(list[i]->mLength[key] > 0) {
numSum++;
tlenSum += list[i]->mLength[key];
gcSum += (double)list[i]->mGC[key] / (double)list[i]->mLength[key];
if(list[i]->mGC[key]>0) {
gcSum += (double)list[i]->mGC[key];
num++;
}
}

Expand All @@ -166,32 +158,30 @@ double Duplicate::statAll(vector<Duplicate*>& list, int* hist, double* meanTLEN,

if(count >= histSize){
hist[histSize-1]++;
if(numSum > 0) {
meanTLEN[histSize-1] += tlenSum/numSum;
meanGC[histSize-1] += gcSum/numSum;
gcTlenNum[histSize-1]++;
if(num>0) {
meanGC[histSize-1] += gcSum/num;
gcStatNum[histSize-1]++;
}
}
else{
hist[count]++;
if(numSum > 0) {
meanTLEN[count] += tlenSum/numSum;
meanGC[count] += gcSum/numSum;
gcTlenNum[count]++;
if(num>0) {
meanGC[count] += gcSum/num;
gcStatNum[count]++;
}

}
}
}
}

for(int i=0; i<histSize; i++) {
if(gcTlenNum[i] > 0) {
meanTLEN[i] = meanTLEN[i] / gcTlenNum[i];
meanGC[i] = meanGC[i] / gcTlenNum[i];
if(gcStatNum[i] > 0) {
meanGC[i] = meanGC[i] / 255.0 / gcStatNum[i];
}
}

delete[] gcTlenNum;
delete[] gcStatNum;

if(totalNum == 0)
return 0.0;
Expand Down
7 changes: 3 additions & 4 deletions src/duplicate.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,18 @@ class Duplicate{
void statRead(Read* r1);
void statPair(Read* r1, Read* r2);
uint64 seq2int(const char* data, int start, int keylen, bool& valid);
void addRecord(uint32 key, uint64 kmer32, int tlen, int gc);
void addRecord(uint32 key, uint64 kmer32, uint8 gc);

// make histogram and get duplication rate
static double statAll(vector<Duplicate*>& list, int* hist, double* meanTLEN, double* meanGC, int histSize);
static double statAll(vector<Duplicate*>& list, int* hist, double* meanGC, int histSize);

private:
Options* mOptions;
int mKeyLenInBase;
int mKeyLenInBit;
uint64* mDups;
uint16* mCounts;
uint16* mLength;
uint16* mGC;
uint8* mGC;

};

Expand Down
12 changes: 7 additions & 5 deletions src/htmlreporter.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "htmlreporter.h"
#include <chrono>
#include <memory.h>

extern string command;

Expand All @@ -12,9 +13,8 @@ HtmlReporter::HtmlReporter(Options* opt){
HtmlReporter::~HtmlReporter(){
}

void HtmlReporter::setDupHist(int* dupHist, double* dupMeanTlen, double* dupMeanGC, double dupRate) {
void HtmlReporter::setDupHist(int* dupHist, double* dupMeanGC, double dupRate) {
mDupHist = dupHist;
mDupMeanTlen = dupMeanTlen;
mDupMeanGC = dupMeanGC;
mDupRate = dupRate;
}
Expand Down Expand Up @@ -185,8 +185,11 @@ void HtmlReporter::reportDuplication(ofstream& ofs) {
allCount += mDupHist[i+1];
}
double* percents = new double[total];
for(int i=0; i<total; i++) {
percents[i] = (double)mDupHist[i+1] * 100.0 / (double)allCount;
memset(percents, 0, sizeof(double)*total);
if(allCount > 0) {
for(int i=0; i<total; i++) {
percents[i] = (double)mDupHist[i+1] * 100.0 / (double)allCount;
}
}
int maxGC = total;
double* gc = new double[total];
Expand All @@ -196,7 +199,6 @@ void HtmlReporter::reportDuplication(ofstream& ofs) {
if(percents[i] <= 0.05 && maxGC == total)
maxGC = i;
}
double* tlen = mDupMeanTlen + 1;

json_str += "{";
json_str += "x:[" + Stats::list2string(x, total) + "],";
Expand Down
3 changes: 1 addition & 2 deletions src/htmlreporter.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class HtmlReporter{
public:
HtmlReporter(Options* opt);
~HtmlReporter();
void setDupHist(int* dupHist, double* dupMeanTlen, double* dupMeanGC, double dupRate);
void setDupHist(int* dupHist, double* dupMeanGC, double dupRate);
void report(FilterResult* result, Stats* preStats1, Stats* postStats1, Stats* preStats2 = NULL, Stats* postStats2 = NULL);

static void outputRow(ofstream& ofs, string key, long value);
Expand All @@ -35,7 +35,6 @@ class HtmlReporter{
private:
Options* mOptions;
int* mDupHist;
double* mDupMeanTlen;
double* mDupMeanGC;
double mDupRate;
};
Expand Down
10 changes: 1 addition & 9 deletions src/jsonreporter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,8 @@ JsonReporter::JsonReporter(Options* opt){
JsonReporter::~JsonReporter(){
}

void JsonReporter::setDupHist(int* dupHist, double* dupMeanTlen, double* dupMeanGC, double dupRate) {
void JsonReporter::setDupHist(int* dupHist, double* dupMeanGC, double dupRate) {
mDupHist = dupHist;
mDupMeanTlen = dupMeanTlen;
mDupMeanGC = dupMeanGC;
mDupRate = dupRate;
}
Expand Down Expand Up @@ -104,13 +103,6 @@ void JsonReporter::report(FilterResult* result, Stats* preStats1, Stats* postSta
ofs << ",";
}
ofs << "]," << endl;
ofs << "\t\t\"mean_tlen\": [";
for(int d=1; d<mOptions->duplicate.histSize; d++) {
ofs << mDupMeanTlen[d];
if(d!=mOptions->duplicate.histSize-1)
ofs << ",";
}
ofs << "]," << endl;
ofs << "\t\t\"mean_gc\": [";
for(int d=1; d<mOptions->duplicate.histSize; d++) {
ofs << mDupMeanGC[d];
Expand Down
3 changes: 1 addition & 2 deletions src/jsonreporter.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,12 @@ class JsonReporter{
JsonReporter(Options* opt);
~JsonReporter();

void setDupHist(int* dupHist, double* dupMeanTlen, double* dupMeanGC, double dupRate);
void setDupHist(int* dupHist, double* dupMeanGC, double dupRate);
void report(FilterResult* result, Stats* preStats1, Stats* postStats1, Stats* preStats2 = NULL, Stats* postStats2 = NULL);

private:
Options* mOptions;
int* mDupHist;
double* mDupMeanTlen;
double* mDupMeanGC;
double mDupRate;
};
Expand Down
9 changes: 3 additions & 6 deletions src/peprocessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,27 +154,25 @@ bool PairEndProcessor::process(){
if(mOptions->duplicate.enabled) {
dupHist = new int[mOptions->duplicate.histSize];
memset(dupHist, 0, sizeof(int) * mOptions->duplicate.histSize);
dupMeanTlen = new double[mOptions->duplicate.histSize];
memset(dupMeanTlen, 0, sizeof(double) * mOptions->duplicate.histSize);
dupMeanGC = new double[mOptions->duplicate.histSize];
memset(dupMeanGC, 0, sizeof(double) * mOptions->duplicate.histSize);
vector<Duplicate*> dupList;
for(int t=0; t<mOptions->thread; t++){
dupList.push_back(configs[t]->getDuplicate());
}
dupRate = Duplicate::statAll(dupList, dupHist, dupMeanTlen, dupMeanGC, mOptions->duplicate.histSize);
dupRate = Duplicate::statAll(dupList, dupHist, dupMeanGC, mOptions->duplicate.histSize);
cout << endl;
cout << "Duplication rate: " << dupRate * 100.0 << "%" << endl;
}

// make JSON report
JsonReporter jr(mOptions);
jr.setDupHist(dupHist, dupMeanTlen, dupMeanGC, dupRate);
jr.setDupHist(dupHist, dupMeanGC, dupRate);
jr.report(finalFilterResult, finalPreStats1, finalPostStats1, finalPreStats2, finalPostStats2);

// make HTML report
HtmlReporter hr(mOptions);
hr.setDupHist(dupHist, dupMeanTlen, dupMeanGC, dupRate);
hr.setDupHist(dupHist, dupMeanGC, dupRate);
hr.report(finalFilterResult, finalPreStats1, finalPostStats1, finalPreStats2, finalPostStats2);

// clean up
Expand All @@ -193,7 +191,6 @@ bool PairEndProcessor::process(){

if(mOptions->duplicate.enabled) {
delete[] dupHist;
delete[] dupMeanTlen;
delete[] dupMeanGC;
}

Expand Down
9 changes: 3 additions & 6 deletions src/seprocessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,27 +128,25 @@ bool SingleEndProcessor::process(){
if(mOptions->duplicate.enabled) {
dupHist = new int[mOptions->duplicate.histSize];
memset(dupHist, 0, sizeof(int) * mOptions->duplicate.histSize);
dupMeanTlen = new double[mOptions->duplicate.histSize];
memset(dupMeanTlen, 0, sizeof(double) * mOptions->duplicate.histSize);
dupMeanGC = new double[mOptions->duplicate.histSize];
memset(dupMeanGC, 0, sizeof(double) * mOptions->duplicate.histSize);
vector<Duplicate*> dupList;
for(int t=0; t<mOptions->thread; t++){
dupList.push_back(configs[t]->getDuplicate());
}
dupRate = Duplicate::statAll(dupList, dupHist, dupMeanTlen, dupMeanGC, mOptions->duplicate.histSize);
dupRate = Duplicate::statAll(dupList, dupHist, dupMeanGC, mOptions->duplicate.histSize);
cout << endl;
cout << "Duplication rate: " << dupRate * 100.0 << "%" << endl;
}

// make JSON report
JsonReporter jr(mOptions);
jr.setDupHist(dupHist, dupMeanTlen, dupMeanGC, dupRate);
jr.setDupHist(dupHist, dupMeanGC, dupRate);
jr.report(finalFilterResult, finalPreStats, finalPostStats);

// make HTML report
HtmlReporter hr(mOptions);
hr.setDupHist(dupHist, dupMeanTlen, dupMeanGC, dupRate);
hr.setDupHist(dupHist, dupMeanGC, dupRate);
hr.report(finalFilterResult, finalPreStats, finalPostStats);

// clean up
Expand All @@ -165,7 +163,6 @@ bool SingleEndProcessor::process(){

if(mOptions->duplicate.enabled) {
delete[] dupHist;
delete[] dupMeanTlen;
delete[] dupMeanGC;
}

Expand Down

0 comments on commit d711b0c

Please sign in to comment.