From d711b0c78837901b7a51cd8222bb28a0ccaad046 Mon Sep 17 00:00:00 2001 From: Shifu Chen Date: Sat, 2 Jun 2018 15:52:37 +0800 Subject: [PATCH] dont evaluate TLEN in duplication evaluation to save RAM and CPU --- src/duplicate.cpp | 68 +++++++++++++++++++------------------------- src/duplicate.h | 7 ++--- src/htmlreporter.cpp | 12 ++++---- src/htmlreporter.h | 3 +- src/jsonreporter.cpp | 10 +------ src/jsonreporter.h | 3 +- src/peprocessor.cpp | 9 ++---- src/seprocessor.cpp | 9 ++---- 8 files changed, 48 insertions(+), 73 deletions(-) diff --git a/src/duplicate.cpp b/src/duplicate.cpp index 0738b63..fa08d52 100644 --- a/src/duplicate.cpp +++ b/src/duplicate.cpp @@ -1,6 +1,7 @@ #include "duplicate.h" #include "overlapanalysis.h" #include +#include Duplicate::Duplicate(Options* opt) { mOptions = opt; @@ -10,10 +11,8 @@ Duplicate::Duplicate(Options* opt) { memset(mDups, 0, sizeof(uint64)*mKeyLenInBit); mCounts = new uint16[mKeyLenInBit]; memset(mCounts, 0, sizeof(uint16)*mKeyLenInBit); - mLength = new uint16[mKeyLenInBit]; - memset(mLength, 0, sizeof(uint16)*mKeyLenInBit); - mGC = new uint16[mKeyLenInBit]; - memset(mGC, 0, sizeof(uint16)*mKeyLenInBit); + mGC = new uint8[mKeyLenInBit]; + memset(mGC, 0, sizeof(uint8)*mKeyLenInBit); } Duplicate::~Duplicate(){ @@ -48,11 +47,10 @@ uint64 Duplicate::seq2int(const char* data, int start, int keylen, bool& valid) return ret; } -void Duplicate::addRecord(uint32 key, uint64 kmer32, int tlen, int gc) { +void Duplicate::addRecord(uint32 key, uint64 kmer32, uint8 gc) { if(mCounts[key] == 0) { mCounts[key] = 1; mDups[key] = kmer32; - mLength[key] = tlen; mGC[key] = gc; } else { if(mDups[key] == kmer32) @@ -89,7 +87,9 @@ void Duplicate::statRead(Read* r) { } } - addRecord(key, kmer32, r->length(), gc); + gc = round(255.0 * (double) gc / (double) r->length()); + + addRecord(key, kmer32, (uint8)gc); } void Duplicate::statPair(Read* r1, Read* r2) { @@ -112,33 +112,27 @@ void Duplicate::statPair(Read* r1, Read* r2) { int gc = 0; // not calculated - int tlen = 0; if(mCounts[key] == 0) { - OverlapResult ov = OverlapAnalysis::analyze(r1, r2); - if(ov.overlap_len > 30) { - if(ov.offset < 0) - tlen = ov.overlap_len; - else - tlen = r1->length() + r2->length() - ov.overlap_len; - } - for(int i=0; ilength() && ilength(); i++) { if(data1[i] == 'G' || data1[i] == 'C') gc++; } - for(int i=0; ilength() && ilength(); i++) { + for(int i=0; ilength(); i++) { if(data2[i] == 'G' || data2[i] == 'C') gc++; } } - addRecord(key, kmer32, tlen, gc); + gc = round(255.0 * (double) gc / (double)( r1->length() + r2->length())); + + addRecord(key, kmer32, gc); } -double Duplicate::statAll(vector& list, int* hist, double* meanTLEN, double* meanGC, int histSize) { +double Duplicate::statAll(vector& list, int* hist, double* meanGC, int histSize) { long totalNum = 0; long dupNum = 0; - int* gcTlenNum = new int[histSize]; - memset(gcTlenNum, 0, sizeof(int)*histSize); + int* gcStatNum = new int[histSize]; + memset(gcStatNum, 0, sizeof(int)*histSize); for(int key=0; keymKeyLenInBit; key++) { bool consistent = true; for(int i=0; i& list, int* hist, double* meanTLEN, } if(consistent) { int count = 0; - int numSum = 0; - double tlenSum = 0; double gcSum = 0; + int num = 0; for(int i=0; imCounts[key]; - if(list[i]->mLength[key] > 0) { - numSum++; - tlenSum += list[i]->mLength[key]; - gcSum += (double)list[i]->mGC[key] / (double)list[i]->mLength[key]; + if(list[i]->mGC[key]>0) { + gcSum += (double)list[i]->mGC[key]; + num++; } } @@ -166,32 +158,30 @@ double Duplicate::statAll(vector& list, int* hist, double* meanTLEN, if(count >= histSize){ hist[histSize-1]++; - if(numSum > 0) { - meanTLEN[histSize-1] += tlenSum/numSum; - meanGC[histSize-1] += gcSum/numSum; - gcTlenNum[histSize-1]++; + if(num>0) { + meanGC[histSize-1] += gcSum/num; + gcStatNum[histSize-1]++; } } else{ hist[count]++; - if(numSum > 0) { - meanTLEN[count] += tlenSum/numSum; - meanGC[count] += gcSum/numSum; - gcTlenNum[count]++; + if(num>0) { + meanGC[count] += gcSum/num; + gcStatNum[count]++; } + } } } } for(int i=0; i 0) { - meanTLEN[i] = meanTLEN[i] / gcTlenNum[i]; - meanGC[i] = meanGC[i] / gcTlenNum[i]; + if(gcStatNum[i] > 0) { + meanGC[i] = meanGC[i] / 255.0 / gcStatNum[i]; } } - delete[] gcTlenNum; + delete[] gcStatNum; if(totalNum == 0) return 0.0; diff --git a/src/duplicate.h b/src/duplicate.h index 1fe9870..f982271 100644 --- a/src/duplicate.h +++ b/src/duplicate.h @@ -18,10 +18,10 @@ class Duplicate{ void statRead(Read* r1); void statPair(Read* r1, Read* r2); uint64 seq2int(const char* data, int start, int keylen, bool& valid); - void addRecord(uint32 key, uint64 kmer32, int tlen, int gc); + void addRecord(uint32 key, uint64 kmer32, uint8 gc); // make histogram and get duplication rate - static double statAll(vector& list, int* hist, double* meanTLEN, double* meanGC, int histSize); + static double statAll(vector& list, int* hist, double* meanGC, int histSize); private: Options* mOptions; @@ -29,8 +29,7 @@ class Duplicate{ int mKeyLenInBit; uint64* mDups; uint16* mCounts; - uint16* mLength; - uint16* mGC; + uint8* mGC; }; diff --git a/src/htmlreporter.cpp b/src/htmlreporter.cpp index 284b865..a93b6ca 100644 --- a/src/htmlreporter.cpp +++ b/src/htmlreporter.cpp @@ -1,5 +1,6 @@ #include "htmlreporter.h" #include +#include extern string command; @@ -12,9 +13,8 @@ HtmlReporter::HtmlReporter(Options* opt){ HtmlReporter::~HtmlReporter(){ } -void HtmlReporter::setDupHist(int* dupHist, double* dupMeanTlen, double* dupMeanGC, double dupRate) { +void HtmlReporter::setDupHist(int* dupHist, double* dupMeanGC, double dupRate) { mDupHist = dupHist; - mDupMeanTlen = dupMeanTlen; mDupMeanGC = dupMeanGC; mDupRate = dupRate; } @@ -185,8 +185,11 @@ void HtmlReporter::reportDuplication(ofstream& ofs) { allCount += mDupHist[i+1]; } double* percents = new double[total]; - for(int i=0; i 0) { + for(int i=0; iduplicate.histSize; d++) { - ofs << mDupMeanTlen[d]; - if(d!=mOptions->duplicate.histSize-1) - ofs << ","; - } - ofs << "]," << endl; ofs << "\t\t\"mean_gc\": ["; for(int d=1; dduplicate.histSize; d++) { ofs << mDupMeanGC[d]; diff --git a/src/jsonreporter.h b/src/jsonreporter.h index 2fa296a..f64a95d 100644 --- a/src/jsonreporter.h +++ b/src/jsonreporter.h @@ -17,13 +17,12 @@ class JsonReporter{ JsonReporter(Options* opt); ~JsonReporter(); - void setDupHist(int* dupHist, double* dupMeanTlen, double* dupMeanGC, double dupRate); + void setDupHist(int* dupHist, double* dupMeanGC, double dupRate); void report(FilterResult* result, Stats* preStats1, Stats* postStats1, Stats* preStats2 = NULL, Stats* postStats2 = NULL); private: Options* mOptions; int* mDupHist; - double* mDupMeanTlen; double* mDupMeanGC; double mDupRate; }; diff --git a/src/peprocessor.cpp b/src/peprocessor.cpp index 52db15d..c9269df 100644 --- a/src/peprocessor.cpp +++ b/src/peprocessor.cpp @@ -154,27 +154,25 @@ bool PairEndProcessor::process(){ if(mOptions->duplicate.enabled) { dupHist = new int[mOptions->duplicate.histSize]; memset(dupHist, 0, sizeof(int) * mOptions->duplicate.histSize); - dupMeanTlen = new double[mOptions->duplicate.histSize]; - memset(dupMeanTlen, 0, sizeof(double) * mOptions->duplicate.histSize); dupMeanGC = new double[mOptions->duplicate.histSize]; memset(dupMeanGC, 0, sizeof(double) * mOptions->duplicate.histSize); vector dupList; for(int t=0; tthread; t++){ dupList.push_back(configs[t]->getDuplicate()); } - dupRate = Duplicate::statAll(dupList, dupHist, dupMeanTlen, dupMeanGC, mOptions->duplicate.histSize); + dupRate = Duplicate::statAll(dupList, dupHist, dupMeanGC, mOptions->duplicate.histSize); cout << endl; cout << "Duplication rate: " << dupRate * 100.0 << "%" << endl; } // make JSON report JsonReporter jr(mOptions); - jr.setDupHist(dupHist, dupMeanTlen, dupMeanGC, dupRate); + jr.setDupHist(dupHist, dupMeanGC, dupRate); jr.report(finalFilterResult, finalPreStats1, finalPostStats1, finalPreStats2, finalPostStats2); // make HTML report HtmlReporter hr(mOptions); - hr.setDupHist(dupHist, dupMeanTlen, dupMeanGC, dupRate); + hr.setDupHist(dupHist, dupMeanGC, dupRate); hr.report(finalFilterResult, finalPreStats1, finalPostStats1, finalPreStats2, finalPostStats2); // clean up @@ -193,7 +191,6 @@ bool PairEndProcessor::process(){ if(mOptions->duplicate.enabled) { delete[] dupHist; - delete[] dupMeanTlen; delete[] dupMeanGC; } diff --git a/src/seprocessor.cpp b/src/seprocessor.cpp index a8921c3..cef3ec5 100644 --- a/src/seprocessor.cpp +++ b/src/seprocessor.cpp @@ -128,27 +128,25 @@ bool SingleEndProcessor::process(){ if(mOptions->duplicate.enabled) { dupHist = new int[mOptions->duplicate.histSize]; memset(dupHist, 0, sizeof(int) * mOptions->duplicate.histSize); - dupMeanTlen = new double[mOptions->duplicate.histSize]; - memset(dupMeanTlen, 0, sizeof(double) * mOptions->duplicate.histSize); dupMeanGC = new double[mOptions->duplicate.histSize]; memset(dupMeanGC, 0, sizeof(double) * mOptions->duplicate.histSize); vector dupList; for(int t=0; tthread; t++){ dupList.push_back(configs[t]->getDuplicate()); } - dupRate = Duplicate::statAll(dupList, dupHist, dupMeanTlen, dupMeanGC, mOptions->duplicate.histSize); + dupRate = Duplicate::statAll(dupList, dupHist, dupMeanGC, mOptions->duplicate.histSize); cout << endl; cout << "Duplication rate: " << dupRate * 100.0 << "%" << endl; } // make JSON report JsonReporter jr(mOptions); - jr.setDupHist(dupHist, dupMeanTlen, dupMeanGC, dupRate); + jr.setDupHist(dupHist, dupMeanGC, dupRate); jr.report(finalFilterResult, finalPreStats, finalPostStats); // make HTML report HtmlReporter hr(mOptions); - hr.setDupHist(dupHist, dupMeanTlen, dupMeanGC, dupRate); + hr.setDupHist(dupHist, dupMeanGC, dupRate); hr.report(finalFilterResult, finalPreStats, finalPostStats); // clean up @@ -165,7 +163,6 @@ bool SingleEndProcessor::process(){ if(mOptions->duplicate.enabled) { delete[] dupHist; - delete[] dupMeanTlen; delete[] dupMeanGC; }