diff --git a/configs/sdm_src/DNAconsts.cpp b/configs/sdm_src/DNAconsts.cpp deleted file mode 100644 index b81ada1..0000000 --- a/configs/sdm_src/DNAconsts.cpp +++ /dev/null @@ -1,92 +0,0 @@ -/* sdm: simple demultiplexer -Copyright (C) 2013 Falk Hildebrand -email: Falk.Hildebrand@gmail.com - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -*/ - -#include "DNAconsts.h" -char DNA_trans[256]; -short DNA_amb[256];//to count amb chars -short DNA_IUPAC[256 * 256]; -short NT_POS[256]; - - -void ini_DNAconstants(){ -//static char* DNA_trans[256] = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"; - //DNA_trans.resize(256,'X'); - for (int i = 0; i<256; i++){ DNA_trans[i] = 'N'; } - DNA_trans['A'] = 'T'; DNA_trans['T'] = 'A'; - DNA_trans['C'] = 'G'; DNA_trans['G'] = 'C'; - DNA_trans['a'] = 'T'; DNA_trans['t'] = 'A'; - DNA_trans['c'] = 'G'; DNA_trans['g'] = 'C'; - for (int i = 0; i<256; i++){ DNA_amb[i] = 0; } - DNA_amb['A'] = 1; DNA_amb['T'] = 1; - DNA_amb['C'] = 1; DNA_amb['G'] = 1; - DNA_amb['a'] = 1; DNA_amb['t'] = 1; - DNA_amb['c'] = 1; DNA_amb['g'] = 1; - - - for ( int i = 0; i < 256 ; i++ ) { NT_POS[i] = 5; } - NT_POS['A'] = 0; NT_POS['T'] = 1; NT_POS['G'] = 2; NT_POS['C'] = 3; - NT_POS['N'] = 4; - NT_POS['a'] = 0; NT_POS['t'] = 1; NT_POS['g'] = 2; NT_POS['c'] = 3; - NT_POS['n'] = 4; - for ( int i = 0; i < 256 * 256; i++ ) { DNA_IUPAC[i] = 1; } - for ( int i = 0; i<14; i++ ) {//first: N is always a hit - DNA_IUPAC['N'+256*DNA_SPACE[i]]= 0; - DNA_IUPAC[256*'N'+DNA_SPACE[i]]= 0; - } - for ( int i = 0; i<5; i++ ) {//first: N is always a hit - DNA_IUPAC['B'+256*DNA_SPACE[i]]=0; - DNA_IUPAC[256*'B'+DNA_SPACE[i]]=0; - DNA_IUPAC['H'+256*DNA_SPACE[i]]=0; - DNA_IUPAC[256*'H'+DNA_SPACE[i]]=0; - DNA_IUPAC['D'+256*DNA_SPACE[i]]=0; - DNA_IUPAC[256*'D'+DNA_SPACE[i]]=0; - DNA_IUPAC['V'+256*DNA_SPACE[i]]=0; - DNA_IUPAC[256*'V'+DNA_SPACE[i]]=0; - } - - DNA_IUPAC['B'+256*'A']=1;DNA_IUPAC[256*'B'+'A']=1; - DNA_IUPAC[('D'+256*'C')]=1;DNA_IUPAC[256*'D'+'C']=1; - DNA_IUPAC['H'+256*'G']=1;DNA_IUPAC[256*'H'+'G']=1; - DNA_IUPAC['V'+256*'T']=1;DNA_IUPAC[256*'V'+'T']=1; - - - - DNA_IUPAC['R'+256*'A']=0;DNA_IUPAC[256*'R'+'A']=0; - DNA_IUPAC['R'+256*'G']=0;DNA_IUPAC[256*'R'+'G']=0; - DNA_IUPAC['M'+256*'C']=0;DNA_IUPAC[256*'M'+'C']=0; - DNA_IUPAC['M'+256*'A']=0;DNA_IUPAC[256*'M'+'A']=0; - DNA_IUPAC['Y'+256*'C']=0;DNA_IUPAC[256*'Y'+'C']=0; - DNA_IUPAC['Y'+256*'T']=0;DNA_IUPAC[256*'Y'+'T']=0; - DNA_IUPAC['K'+256*'G']=0;DNA_IUPAC[256*'K'+'G']=0; - DNA_IUPAC['K'+256*'T']=0;DNA_IUPAC[256*'K'+'T']=0; - DNA_IUPAC['W'+256*'A']=0;DNA_IUPAC[256*'W'+'A']=0; - DNA_IUPAC['W'+256*'T']=0;DNA_IUPAC[256*'W'+'T']=0; - DNA_IUPAC['S'+256*'C']=0;DNA_IUPAC[256*'S'+'C']=0; - DNA_IUPAC['S'+256*'G']=0;DNA_IUPAC[256*'S'+'G']=0; - - DNA_IUPAC['A'+256*'A']=0;DNA_IUPAC[256*'A'+'A']=0; - DNA_IUPAC['T'+256*'T']=0;DNA_IUPAC[256*'T'+'T']=0; - DNA_IUPAC['G'+256*'G']=0;DNA_IUPAC[256*'G'+'G']=0; - DNA_IUPAC['C'+256*'C']=0;DNA_IUPAC[256*'C'+'C']=0; - //DEBUG - - //fake use to surpress compiler warnings - //if (sdm_version == 0.f){ cout << "too low version" << sdm_status; } - //if (strcmp(sdm_status, "XX")){ cout << "Some"; } - -} \ No newline at end of file diff --git a/configs/sdm_src/DNAconsts.h b/configs/sdm_src/DNAconsts.h deleted file mode 100644 index 0265bfb..0000000 --- a/configs/sdm_src/DNAconsts.h +++ /dev/null @@ -1,157 +0,0 @@ -/* sdm: simple demultiplexer -Copyright (C) 2013 Falk Hildebrand - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -*/ - - -#ifndef _DNAconsts_h -#define _DNAconsts_h - -//compile with multi threading support -#define _THREADE //D - -//do vector based DNA matching -#define _NEWMATCH - -//sum up uc file to OTU abundance matrix in seed extension step -#define matrix_sum - -//match barcodes based on maps -#define _fastBCmatch - -//keep a map of dereplicated file -#define _MAPDEREPLICATE - -//KHASH for faster / lower mem access -#define KHAS_H - -//disable win warning about fopen -#define _CRT_SECURE_NO_WARNINGS - - -//read gzip'd files using zlib.h -#if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__) -#define _gziprea//d -#else -#define _gzipread -#endif - -//DEBUG mode: more output -#define DEB//UG - - -#include -#include -#include -#include -#include -#include -//#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -//#include -#ifdef KHASH -#include "khash.hh" -#endif - -#ifdef _gzipread -#include "gzstream.h" -#endif - - -#ifdef _THREADED -#include -#include -#endif -//#include - -#ifdef _WIN32 -//#include -#endif // _WIN32 - - -static const float sdm_version = 1.50f; -static const char* sdm_status = "beta"; - - -using namespace std; - -static const double SAqualP[110] = {1.000000e+00,7.943282e-01,6.309573e-01,5.011872e-01,3.981072e-01,3.162278e-01,2.511886e-01,1.995262e-01,1.584893e-01,1.258925e-01 -,1.000000e-01,7.943282e-02,6.309573e-02,5.011872e-02,3.981072e-02,3.162278e-02,2.511886e-02,1.995262e-02,1.584893e-02,1.258925e-02 -,1.000000e-02,7.943282e-03,6.309573e-03,5.011872e-03,3.981072e-03,3.162278e-03,2.511886e-03,1.995262e-03,1.584893e-03,1.258925e-03 -,1.000000e-03,7.943282e-04,6.309573e-04,5.011872e-04,3.981072e-04,3.162278e-04,2.511886e-04,1.995262e-04,1.584893e-04,1.258925e-04 -,1.000000e-04,7.943282e-05,6.309573e-05,5.011872e-05,3.981072e-05,3.162278e-05,2.511886e-05,1.995262e-05,1.584893e-05,1.258925e-05 -,1.000000e-05,7.943282e-06,6.309573e-06,5.011872e-06,3.981072e-06,3.162278e-06,2.511886e-06,1.995262e-06,1.584893e-06,1.258925e-06 -,1.000000e-06,7.943282e-07,6.309573e-07,5.011872e-07,3.981072e-07,3.162278e-07,2.511886e-07,1.995262e-07,1.584893e-07,1.258925e-07 -,1.000000e-07, 1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07 -,1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07 -, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07 -, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07 }; - -static const char DNA_SPACE[15] = {'A','C','G','T','N','R','Y','M','K','W','S','B','D','H','V'}; -static const int DNAinMemory = 5000; -static const unsigned int maxFileStreams = 500; -static const int RDBUFFER = 4096; - -typedef unsigned int uint; -typedef unsigned long ulong; -typedef int qual_score; //used for quality scores in vectors - -//seeding -static const float BestLengthRatio = 0.83f; -static const float RefLengthRatio = 0.9f; -static const qual_score MinQualDiff = 5; - - - -void ini_DNAconstants(); - - -// first base is [ACTG], second can be IUPAC -/*code description -A Adenine -C Cytosine -G Guanine -T Thymine -U Uracil -R Purine (A or G) -Y Pyrimidine (C, T, or U) -M C or A -K T, U, or G -W T, U, or A -S C or G -B C, T, U, or G (not A) -D A, T, U, or G (not C) -H A, T, U, or C (not G) -V A, C, or G (not T, not U) -N Any base (A, C, G, T, or U) -*/ - - - -#endif \ No newline at end of file diff --git a/configs/sdm_src/DNAconsts.o b/configs/sdm_src/DNAconsts.o deleted file mode 100644 index e0d0472..0000000 Binary files a/configs/sdm_src/DNAconsts.o and /dev/null differ diff --git a/configs/sdm_src/Demultipl.cpp b/configs/sdm_src/Demultipl.cpp deleted file mode 100644 index 3f3769e..0000000 --- a/configs/sdm_src/Demultipl.cpp +++ /dev/null @@ -1,85 +0,0 @@ -/* sdm: simple demultiplexer -Copyright (C) 2013 Falk Hildebrand -email: Falk.Hildebrand@gmail.com - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -*/ - - - -#include "IO.h" - - - -int main(int argc, char* argv[]) -{ - if (argc<3){ - //help_options,help_map,help_commands - if (argc==2){ - if (string(argv[1])=="-help_commands"){ - printCmdsHelp(); - } else if (string(argv[1])=="-help_options"){ - printOptionHelp(); - } else if (string(argv[1])=="-help_map"){ - printMapHelp(); - } - else if (string(argv[1]) == "-version" || string(argv[1]) == "-v"){ - printVersion(); - } - exit(0); - } - general_help(); - exit(0); - } -#ifdef DEBUG - cerr << "DEBUG mode"< fil = make_shared(cmdArgs); - bool bReads = fil->readMap(cmdArgs); -#ifdef DEBUG - cerr << "filter setup & map is read" << endl; -#endif - if (!bReads){cerr<<"Failed to read Map.\n";exit(3);} - //cerr<setcmdArgsFiles(cmdArgs); - - clock_t tStart = clock(); - //main function - - separateByFile(fil,cmdArgs); - //cerr << "\nXXXX\n\n"; -// delete fil; - - fprintf(stderr,"Time taken: %.2fs\n", (double)(clock() - tStart) / CLOCKS_PER_SEC); - - return 0; -} - - - diff --git a/configs/sdm_src/Demultipl.o b/configs/sdm_src/Demultipl.o deleted file mode 100644 index 6d09da8..0000000 Binary files a/configs/sdm_src/Demultipl.o and /dev/null differ diff --git a/configs/sdm_src/IO.cpp b/configs/sdm_src/IO.cpp deleted file mode 100644 index 74c6965..0000000 --- a/configs/sdm_src/IO.cpp +++ /dev/null @@ -1,1282 +0,0 @@ -/* sdm: simple demultiplexer -Copyright (C) 2013 Falk Hildebrand -email: Falk.Hildebrand@gmail.com - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -*/ - - -#include "IO.h" -//#include -void threadAnalyzeDNA(shared_ptr tdn, shared_ptr MD,int thrCnt){ - //if (threadActive){ - // threads[thrCnt].join(); - //} - //threads[thrdsCnt] = - - //auto f1 = std::async(&MultiDNA::analyzeDNA,this,tdn); - int tagIdx(-2); - MD->analyzeDNA(tdn,thrCnt,-1,tagIdx); - MD->saveForWrite(tdn); -} -/*void trippleThreadAnalyzeDNA(shared_ptr MD, shared_ptr tdn,shared_ptr tdn2, - shared_ptr MIDseq,bool changePHead){//,int thrCnt){ - - - int thrCnt = 0; - vector chs = MD->analyzeDNA(tdn,tdn2,MIDseq,changePHead,thrCnt); - - if (chs[0] && chs[1]){ - MD->saveForWrite(tdn,1); - MD->saveForWrite(tdn2,2); - } else if (chs[0]){ - MD->saveForWrite(tdn,3); - MD->getFilters(thrCnt)->colStats[0].singleton++; - } else if (chs[1]){ - MD->saveForWrite(tdn2,4); - MD->getFilters(thrCnt)->colStats[1].singleton++; - } - -}*/ - -void read_single(OptContainer& cmdArgs, shared_ptr MD, shared_ptr IS){ - //output files -#ifdef _THREADED - int Nthrds = atoi(cmdArgs["-threads"].c_str()) -2 ; - int thrCnt = 0; - MD->setSubfilters(Nthrds); - bool threadActive(false); - bool writeThread(false); - vector threads( 0 ); - if (Nthrds>=0){ - MD->createWriteThread();writeThread=true; - threads.resize(Nthrds); - } -#endif - shared_ptr curFil = MD->getFilters(); - bool cont(true); bool sync(false); - while (cont){ - shared_ptr tdn1 = IS->getDNA(cont,0,sync); - if (tdn1 == NULL) { -#ifdef DEBUG - cerr << "NULL read returned" << endl; -#endif - break; - } - - /*if (!tdn1->isPassed()){ - MD->addNoHeadDNA(tdn1); - tdn1 = tdn2; tdn2 = new DNA("",""); - continue; - }*/ -#ifdef _THREADED - if (Nthrds>0){ - if (threadActive){ - threads[thrCnt].join(); - } - - //threadAnalyzeDNA(MD,tdn); - threads[thrCnt] = std::thread(threadAnalyzeDNA,tdn,MD,thrCnt); - thrCnt++; - if (thrCnt >= Nthrds){ - thrCnt=0; - threadActive=true; - } - } else { //single Core - MD->analyzeDNA(tdn); - MD->saveForWrite(tdn); - } -#else - curFil->sTotalPlus(0); - int tagIdx(-2); - MD->analyzeDNA(tdn1,-1,-1,tagIdx); - //here BC has to be correctly set within DNA object - if (tagIdx == -1 ) { - tdn1->setBarcodeDetected(false); - } - MD->depPrep(tdn1,NULL); - curFil->write2Demulti(tdn1, 0,MD->getfastQoutVer()); - - if (!MD->saveForWrite(tdn1)) { - cont = false; - break; - } - -#endif - //if (tdn!=NULL && ch1 != tdn->isPassed()){cerr<<"isPassed is != ch1! Aborting..\n";exit(12);} - } -#ifdef _THREADED - if (threadActive){ - for (uint i=0; icloseOutStreams(); -} - - -//is called from a while loop, that reads the DNA pairs -bool read_paired_DNAready(shared_ptr tdn, shared_ptr tdn2, shared_ptr MIDseq, bool MIDuse, shared_ptr MD, int& revConstellation) { - - if (tdn == NULL) { return true; } //|| tdn->length()==0 - - shared_ptr curFil = MD->getFilters(); - - //register read at all with stat counter: - curFil->sTotalPlus(0); curFil->sTotalPlus(1); - - //prep some variables - int BCoffs = curFil->getBCoffset(); - bool checkBC2ndRd = curFil->checkBC2ndRd(); - bool dualBCs = curFil->doubleBarcodes(); - bool doBCsAtAll = curFil->doBarcodes(); - bool checkReversedRead = curFil->checkRevRd(); - - - int tagIdx(-2); int tagIdx2(-2); - string presentBC(""); int c_err(0); - bool isReversed(false);//was a reversion detected? - - if (MIDuse && MIDseq!= 0) { - tagIdx = curFil->cutTag(MIDseq, presentBC, c_err, true); -// delete MIDseq; - tdn->setBCnumber(tagIdx, BCoffs); - } - if (checkBC2ndRd ) { - if (!dualBCs) { - bool revT = false; - bool Pr1 = curFil->findPrimer(tdn, 0, false, 0); - bool Pr2 = curFil->findPrimer(tdn2, 0, false, 0); - tagIdx = curFil->findTag(tdn, presentBC, c_err, true); - tagIdx2 = curFil->findTag(tdn2, presentBC, c_err, true); - if ( true &&checkReversedRead && (tagIdx2 < 0 && tagIdx < 0) ) { - tdn->reverse_transcribe(); tdn2->reverse_transcribe(); - Pr1 = curFil->findPrimer(tdn, 0, false, 0); - Pr2 = curFil->findPrimer(tdn2, 0, false, 0); - tagIdx = curFil->findTag(tdn, presentBC, c_err, true); - tagIdx2 = curFil->findTag(tdn2, presentBC, c_err, true); - revT = true; - } - if ((tagIdx2 >= 0 && tagIdx < 0 && !Pr1) || (Pr2 && !Pr1)) { //swap first & second read - swap(tdn, tdn2); - revConstellation++; - } - /*else if (tagIdx2 < 0 && tagIdx < 0) { - int x = 0; - }*/ - if (revT) { - tdn->reverse_transcribe(); tdn2->reverse_transcribe(); - } - } - tagIdx2 = -2; tagIdx = -2; - tdn2->setpairREV(); tdn->setpairFWD(); - } - - //tdn->reverse_transcribe(); - MD->analyzeDNA(tdn, -1, 0, tagIdx); - //tdn->matchSeqRev - bool ch1(false); if (tdn != NULL) { ch1 = tdn->isPassed(); } - bool ch2(false); bool ch2n(false); - - //this is all about barcodes.. - if (checkReversedRead && tdn != NULL && tagIdx < 0) { - if (!MIDuse) { tagIdx = -2; } -// curFil->sTotalMinus(0); - tdn->reverse_transcribe(); - MD->analyzeDNA(tdn, -1, 0, tagIdx); - ch1 = tdn->isPassed(); - isReversed = ch1; - if (!isReversed) {//reset - tdn->reverse_transcribe(); - } - } - - //test for reverse complemented reads (mohammad samples), when BC not found (NOT dual BC) - //in that case, this is the first read - if (false &&checkBC2ndRd && tagIdx < 0 && tdn2 != NULL) {// && !tdn->getBarcodeDetected() ) { - //tdn2->reverse_transcribe(); - if (!MIDuse) { tagIdx = -2; } -// curFil->sTotalMinus(0); - MD->analyzeDNA(tdn2, -1, 0, tagIdx); - ch2n = tdn2->isPassed(); - if (!ch2n && checkReversedRead) { - if (!MIDuse) { tagIdx = -2; } -// curFil->sTotalMinus(0); - tdn2->reverse_transcribe(); - MD->analyzeDNA(tdn2, -1, 0, tagIdx); - ch2n = tdn2->isPassed(); - isReversed = ch2n; - if (!ch2n) { tdn2->reverse_transcribe(); }//reset to ori - } - if (ch2n) {//passed ch2 through BC filter, now really reverse - //1st, now 2nd pair - tdn2->setpairFWD(); - ch1 = ch2n; - if (tdn != NULL) { - //tdn->reverse_transcribe(); - tdn->setpairREV(); - tdn->reset(); - if (!dualBCs) { tagIdx2 = tdn->getBCnumber(); } // no 2nd BC, thus no BC search in 2nd read - MD->analyzeDNA(tdn, -1, 1, tagIdx2); - ch2 = tdn->isPassed(); - } - swap(tdn, tdn2); - revConstellation++; - } - - } - - - - //if ( ch1 ) { cerr << cnt << " \n"; } - //normal case for check 2nd read - if (!ch2 && tdn2 != NULL) { //ch1&& - //tdn2->setBCnumber(tdn->getBCnumber()); - if (doBCsAtAll && !dualBCs) { //only check in read1 for BC, if not dual BCing!! - tagIdx2 = tdn->getBCnumber(); // no 2nd BC, thus no BC search in 2nd read - if (tagIdx2 >= 0) { - tagIdx2 -= BCoffs; - }else if (tagIdx2 < -1) {//something wrong with BCoffs - cerr << "tagidx2 wrongly truncated to " << tagIdx2 << endl; - } - } - if (isReversed) { tdn2->reverse_transcribe(); } - MD->analyzeDNA(tdn2, -1, 1, tagIdx2); - ch2 = tdn2->isPassed(); - } - - //set up BC in DNA header - //remember that dual BCs are only valid after this step! - if (dualBCs) { - //tagIdx2 = -2; //reset just to be sure - curFil->dblBCeval(tagIdx, tagIdx2, presentBC, tdn, tdn2); - c_err = -1; - - //check a second time that barcode was correctly identified, just to be double sure... - if (tagIdx != tagIdx2 || tdn->getBCnumber() != tdn2->getBCnumber()) { - cerr << "Unequal BC numbers:" << tagIdx << " : " << tagIdx2 << "; in object: " << tdn->getBCnumber() << " : " << tdn2->getBCnumber() << endl; - cerr << "In read:" << tdn->getID() << endl; - exit(835); - } - } - else if (tagIdx >= 0) { - if (MIDuse&&ch1) { curFil->BCintoHead(tagIdx, tdn, presentBC, c_err, true); } - else { curFil->setBCdna(tagIdx, tdn); } - if (ch2) { curFil->BCintoHead(tagIdx, tdn2, presentBC, c_err, true); } - } - - if (tagIdx == -1 || tagIdx2 == -1) { - if (ch1) { - tdn->setBarcodeDetected(false); - } - if (ch2) { - tdn2->setBarcodeDetected(false); - } - } - - //demultiplex write? do this first before DNA is deleted.. - if (curFil->Demulti2Fls()) { - curFil->write2Demulti(tdn, 0, MD->getfastQoutVer()); - curFil->write2Demulti(tdn2, 1, MD->getfastQoutVer()); - } - - - //at this point the tagIDX *MUST* be correctly set + BCoffset (in the DNA object, tagIDX doesn;t matter) - MD->depPrep(tdn, tdn2); - MD->writeNonBCReads(tdn, tdn2); - - int idx1 = 1; int idx2 = 2; - if (ch1 && !ch2) { - idx1 = 3; idx2 = 4; - if (tdn2 != NULL) { tdn2->failed(); } -// delete tdn2; - } - else if (ch2 && !ch1) { - idx2 = 4; idx1 = 3; - if (tdn != NULL) { tdn->failed(); } -// delete tdn; - } - else if (!ch1 && !ch2){ //nothing passes - if (tdn != NULL) { tdn->failed(); } - if (tdn2 != NULL) { tdn2->failed(); } -// delete tdn; delete tdn2; - } - - //save for later .. and collect stats - if (!MD->saveForWrite(tdn, idx1) || - !MD->saveForWrite(tdn2, idx2)) { - return false; - } - return true; -} - -bool read_paired(OptContainer& cmdArgs, shared_ptr MD, shared_ptr IS, bool MIDuse) { - DNAmap oldMIDs; - bool fqHeadVer(true); - shared_ptr MIDseq(NULL); - bool syncedMID(MD->getFilters()->synRdPairs()); - - bool sync2pair(true); - DNAmap pair1rem,pair2rem,MIDrem; - - /*if (sync2pair && MIDuse) { - cout << "Can not sync read pairs, while explicit MID sequences are being used! (not supported, sorry)\n"; - sync2pair = false; - } */ - - //bool syncedMID = false; -#ifdef DEBUG - cerr << "Read paired routine" << endl; -#endif -#ifdef _THREADED - vector threads( Nthrds ); - int Nthrds = atoi(cmdArgs["-threads"].c_str()) -1 ; - int thrCnt = 0; - bool threadActive(false); - MD->setSubfilters(Nthrds); - int DNAinMem(0); -#endif - bool cont(true),cont2(true),cont3(true); - int revConstellation(0); - int cnt(0); - string tdnSh(""), tdnSh2(""); - bool switching(true); // important to keep track of this, to fix swapped read pairs - - while ( cont ) { - - - bool settdnSh(false); - shared_ptr tdn = IS->getDNA(cont, 0, sync2pair); - - cnt++; - if ( tdn == NULL && !cont) { break; } - //tagIdx = -2; tagIdx2 = -2; - shared_ptr tdn2 = IS->getDNA(cont2, 1, sync2pair);//read_fastq_entry(fna2,fastQver,minQScore,lnCnt); - if (!sync2pair && !cont2 && cont ) { - cerr << "Second provided file has not the same number of entries as first file.\n"; - exit(5); - } - //syn 2nd pair - if (sync2pair) { - if (!settdnSh) { tdnSh = tdn->getIDshort(); settdnSh = true; } - if (tdn2 != NULL) { tdnSh2 = tdn2->getIDshort(); }else { tdnSh2 = ""; } - DNAmap::iterator search; - if (tdnSh2 != tdnSh) {//something wrong at all? ini search on old pair2 - if (switching) { - search = pair2rem.find(tdnSh); - if (search != pair2rem.end()) { pair2rem[tdnSh2] = tdn2; tdn2 = search->second; pair2rem.erase(search); tdnSh2 = tdn2->getIDshort(); switching = !switching;} - } else { - search = pair1rem.find(tdnSh2); - if (search != pair1rem.end()) { pair1rem[tdnSh] = tdn; tdn = search->second; pair1rem.erase(search); tdnSh = tdn->getIDshort(); switching = !switching;} - } - } - while (tdnSh2 != tdnSh) {// still wrong? search in old unmatched reads, switching between 1/2 - if (switching) { - search = pair1rem.find(tdnSh2); pair1rem[tdnSh] = tdn; - if (search != pair1rem.end()) { - tdn = search->second; pair1rem.erase(search); - }else {//nothing? try getting new reads, maybe there is a match here - tdn = IS->getDNA(cont, 0, sync2pair); - } - if (tdn != NULL) { tdnSh = tdn->getIDshort(); } else {tdnSh = "";} - } else { - search = pair2rem.find(tdnSh); pair2rem[tdnSh2] = tdn2; - if (search != pair2rem.end()) { tdn2 = search->second; pair2rem.erase(search); - } else { - tdn2 = IS->getDNA(cont2, 1, sync2pair); - } - if (tdn2 != NULL) { tdnSh2 = tdn2->getIDshort(); } else { tdnSh2 = ""; } - } - - //read 1 / read 2 - switching = !switching; - } - } - if (cnt % 50 == 0) { switching = !switching; } //add some randomness to the process. - //security check that pairs are in sync - if (!sync2pair && cnt % 1000 == 0) {//default check for synced reads, no matter what - if (!settdnSh) { tdnSh = tdn->getIDshort(); settdnSh = true; } - if (!tdn2->sameHead(tdnSh) ) { cerr << "WARNING: read pairs out of sync (" << cnt << "): " << tdnSh << " " << tdnSh2 << endl; } - } - //sync mid sequence header - if (MIDuse) { - if (!settdnSh) {tdnSh = tdn->getIDshort(); settdnSh = true; } - //1st try to find in old heap - if ( !syncedMID && (cont || cont2) ) { - auto search = oldMIDs.find(tdnSh); - if ( search != oldMIDs.end() ) { - //it's in old.. give it to MIDseq, other rountines will del it - MIDseq = (*search).second; oldMIDs.erase(search); - } else { - //read some new lines, maybe here? - MIDseq = IS->getDNA(cont3, 2, sync2pair); - bool SameMIDHead(MIDseq->sameHead(tdnSh)); - while ( !SameMIDHead && cont3 ) { - //current MID not matching.. stove away - oldMIDs[MIDseq->getIDshort()] = MIDseq; MIDseq = IS->getDNA(cont3, 2, sync2pair); - SameMIDHead = MIDseq->sameHead(tdnSh); - } - } - } else {MIDseq = IS->getDNA(cont3, 2, sync2pair);} - MIDseq->setMIDseq(true); - //FQ header version changes have to occur, before the MID tag is labelled - - } - //check if the PE format is right - if ( fqHeadVer ) { MD->checkFastqHeadVersion(tdn); fqHeadVer = false; } - - cont = read_paired_DNAready(tdn,tdn2, MIDseq, MIDuse, MD, revConstellation); - } - - //check on remainders in pair2rem / pair1rem - if (sync2pair) { - if (pair1rem.size() > 0 && pair2rem.size() > 0) { - cerr << "Trying to match " << pair1rem.size() << " / " << pair2rem.size() << " out-of-sync read pairs..\n"; - DNAmap::iterator search; - for (DNAmap::iterator sr = pair1rem.begin(); sr != pair1rem.end(); sr++) { - search = pair2rem.find(sr->first); - if (search != pair2rem.end()) { - cont = read_paired_DNAready(sr->second, search->second, MIDseq, MIDuse, MD, revConstellation); - pair2rem[search->first] = NULL; pair1rem[sr->first] = NULL; - pair2rem.erase(search); pair1rem.erase( sr ); - } - } - cerr << "Writing remaining " << pair1rem.size() << " / " << pair2rem.size() << " out-of-sync reads as singletons.\n"; - } else if (pair1rem.size() > 0 || pair2rem.size() > 0) { - cerr << "Found " << pair1rem.size() << " / " << pair2rem.size() << " out-of-sync read pairs.\n"; - } - for (DNAmap::iterator sr = pair1rem.begin(); sr != pair1rem.end(); sr++) { - cont = read_paired_DNAready(sr->second, NULL, MIDseq, MIDuse, MD, revConstellation); - pair1rem[sr->first] = NULL;//object needs to remain in mem - } - for (DNAmap::iterator sr = pair2rem.begin(); sr != pair2rem.end(); sr++) { - cont = read_paired_DNAready(NULL, sr->second, MIDseq, MIDuse, MD, revConstellation); - pair2rem[sr->first] = NULL; -// pair2rem.erase(sr); - } - - } - //close shop - MD->revConstellationCnts(revConstellation); - MD->closeOutStreams(); - return true; -} - -bool readCmdArgs(int argc, char* argv[],OptContainer& cmdArgs){ - if (argc%2!=1){ - cerr<<"It seems command line arguments were not passed in pairs. Aborting.\n"; - exit(666); - } - for (int i=1; i\n"; - exit(2); - } - if (cmdArgs.find("-i_qual") == cmdArgs.end()){ - string newQ = cmdArgs["-i_fna"]; - int pos = (int)newQ.find_last_of("."); - newQ = newQ.substr(0,pos); - newQ += string(".qual"); - fstream fin; - fin.open(newQ.c_str(),ios::in); - if( fin.is_open() ) { - cerr<<"Using quality file: "<\n"; - newQ = ""; - //fin.close(); exit(2); - } - fin.close(); - cmdArgs["-i_qual"] = newQ; - } - } - //auto create output file name - if (cmdArgs.find("-o_fna") == cmdArgs.end()){ - if (cmdArgs.find("-o_fastq") == cmdArgs.end()){ - //cmdArgs["-o_fna"] = cmdArgs["-i_fna"]+string(".sdm"); - //cerr<<"Writing output fasta into "<\n"; - exit(2); - } */ - if (cmdArgs.find("-o_qual") == cmdArgs.end()){ - cmdArgs["-o_qual"] = ""; - } else { - if (cmdArgs.find("-o_fastq") != cmdArgs.end()){ - cerr<<"\"-o_qual\" was over-writen by \"-o_fastq\"\n"; - cmdArgs["-o_qual"] = ""; - } - } - if (cmdArgs.find("-options") == cmdArgs.end()){ - cmdArgs["-options"] = string("sdm_options.txt"); - } - if (cmdArgs.find("-threads") == cmdArgs.end()){ - cmdArgs["-threads"] = "1"; - } - if (cmdArgs.find("-log") == cmdArgs.end()){ - string ofile1 = cmdArgs["-o_fna"]; - if (ofile1==""){ofile1 = cmdArgs["-o_fastq"];} - vector tvec = splitByComma(ofile1,false); - ofile1 = tvec[0]; - //remove file ending - unsigned int pos = (unsigned int) ofile1.find_last_of("."); - if (pos != string::npos){ofile1 = ofile1.substr(0,pos); } - if (tvec.size()==2){ - ofile1+= "_" + getFileNoPath(tvec[1]); - pos = (unsigned int) ofile1.find_last_of("."); - if (pos != string::npos){ofile1 = ofile1.substr(0,pos); } - } - cmdArgs["-log"] = ofile1 + string(".log"); - } - string ofile1 = cmdArgs["-log"]; - ofile1.find_last_of(".log"); - size_t logPos = ofile1.find_last_of("."); - if (logPos != std::string::npos){ - ofile1 = ofile1.substr(0,logPos); - } - if (cmdArgs.find("-length_hist") == cmdArgs.end()){ - cmdArgs["-length_hist"] = ofile1 + string("_lenHist.txt"); - } - if (cmdArgs.find("-qual_hist") == cmdArgs.end()){ - cmdArgs["-qual_hist"] = ofile1 + string("_qualHist.txt"); - } - //-length_hist -qual_hist - - if (cmdArgs.find("-sample_sep") == cmdArgs.end()){ - cmdArgs["-sample_sep"] = DEFAULT_BarcodeNameSep; - } else if (cmdArgs["-sample_sep"]==""){ - cerr<<"Invalid sample separator (empty).\nAborting..\n";exit(82); - } - - - if (cmdArgs.find("-o_qual_offset") == cmdArgs.end()) { - cmdArgs["-o_qual_offset"] = DEFAULT_output_qual_offset; - } - - if (cmdArgs.find("-ignore_IO_errors") == cmdArgs.end()) { - cmdArgs["-ignore_IO_errors"] = DEFAULT_ignore_IO_errors; - } else if (cmdArgs["-ignore_IO_errors"] != "0" && cmdArgs["-ignore_IO_errors"] != "1") { - cerr << "Argument \"ignore_IO_errors\" can only be \"1\" or \"0\". Instead it has value: " << cmdArgs["-ignore_IO_errors"] << endl; - exit(323); - } - if (cmdArgs.find("-o_dereplicate") == cmdArgs.end()) { - cmdArgs["-o_dereplicate"] = ""; - } - if (cmdArgs.find("-derep_map") == cmdArgs.end()) { - cmdArgs["-derep_map"] = ""; - } - - - - //if (cmdArgs.count("-i_fna")==0){} - - return true; -} - - -/******************************************* -* read_fasta * -******************************************* - -void openOutFiles(string files, string fmt, string xtr){ - ofstream fnaOut; - - vector tfnaout(0); - if (files.find(",") != string::npos){ - tfnaout = splitByCommas(files); - } else { - tfnaout.push_back(files); - } - bool multiple = tfnaout.size() > 1; - string xtr2 = ""; - if (multiple){xtr2 = "paired ";} - for (uint i =0; i< tfnaout.size(); i++){ - fnaOut.open ( tfnaout[i].c_str(),ios_base::out); - if (!fnaOut){ cerr<<"Could not open "< tfnaout = splitByComma(cmdArgs["-o_fna"],false); - for (unsigned int i=0; i tqout = splitByComma(cmdArgs["-o_qual"],false); - for (unsigned int i=0; i mainFil,OptContainer& cmdArgs){ -#ifdef DEBUG - cerr << "separateByFile"< FastaF = mainFil->getFastaFiles(); - vector QualF = mainFil->getQualFiles(); - vector FastqF = mainFil->getFastqFiles(); - vector MIDfq = mainFil->getMIDfqFiles(); - vector tar; - vector < vector > idx(0); - bool bFASTQ = true; - //prepareOutFiles(cmdArgs); - string path=""; - if (cmdArgs.find("-i_path") != cmdArgs.end() && cmdArgs["-i_path"].length() > 2){ - path=cmdArgs["-i_path"] + string("/"); - } - - - if (FastaF.size()>0){ //fasta way - tar = FastaF; - bFASTQ = false; - } else { // fastq way - tar = FastqF; - if (FastqF.size()==0){ - cerr<<"No FastQ or Fasta file given.\n Aborting..\n"; - exit(12); - } - } - - vector uniqueFas(1,tar[0]); - idx.push_back(vector (1,0)); - - for (unsigned int i=1; i (1,i)); - } - } - - //unique Fas files set up.. check for their existence - shared_ptr testFiles = - make_shared(!bFASTQ, mainFil->getuserReqFastqVer(), "1"); - for (unsigned int i = 0; i < uniqueFas.size(); i++) { - int tarID = idx[i][0]; string tmp; - string x = testFiles->setupInput(path, i, tarID, uniqueFas, FastqF, FastaF, QualF, MIDfq, mainFil->isPaired(), cmdArgs["-onlyPair"], tmp, true); - } -// delete testFiles; - - string mainFile = "", outFile = cmdArgs["-o_fna"]; - - //prepare for Seed extension or Read subselection, if requested - UClinks *ucl = NULL; shared_ptr RDSset; - shared_ptr Dere ; - if (mainFil->doOptimalClusterSeq()){ - ucl = new UClinks(cmdArgs); - if (cmdArgs.find("-mergedPairs") != cmdArgs.end() && cmdArgs["-mergedPairs"] == "1"){ - ucl->pairedSeqsMerged(mainFil); - } - else { - mainFil->setFloatingEWin(10, 25); - } - //are fallback fasta sequences available? - if (cmdArgs["-OTU_fallback"] != ""){ - shared_ptr FALL = make_shared(true, mainFil->getuserReqFastqVer(), cmdArgs["-ignore_IO_errors"]); - FALL->setupFna(cmdArgs["-OTU_fallback"]); - ucl->setupDefSeeds(FALL,mainFil); - } - } - else if (mainFil->doSubselReads()){ - //this will select a list of reads and distribute these into multiple files - RDSset = make_shared(cmdArgs["-specificReads"],""); - } else if (mainFil->doDereplicate()) { - Dere = make_shared(cmdArgs); - } - //needs to attach to existing file sometimes - std::ios_base::openmode writeStatus = ios_base::out; - bool shortStats = false; - string shrtLog = ""; - - - // main loop that goes over different files - int maxRds = mainFil->getXreads(); - int totReadsRead(0); - for (unsigned int i=0; i0 && maxRds - totReadsRead <= 0) { break; } - shared_ptr fil = make_shared(mainFil, idx[i][0]); - unsigned int tarSi = (unsigned int) idx[i].size(); - fil->allResize(tarSi); - int tarID=-1; - bool BC2mode = mainFil->doubleBarcodes(); - //int readsRead(0); - - - for (unsigned int j=0; jPrimerIdx[tarID]>-1) { - fil->addPrimerL(mainFil->PrimerL[mainFil->PrimerIdx[tarID]], j); - } - if (mainFil->doReversePrimers() && mainFil->PrimerIdxRev[tarID]>-1) { - fil->addPrimerR(mainFil->PrimerR[mainFil->PrimerIdxRev[tarID]], j); - } - fil->Barcode[j] = mainFil->Barcode[tarID]; - if ( BC2mode ) { - fil->Barcode2[j] = mainFil->Barcode2[tarID]; - } - fil->SampleID[j] = mainFil->SampleID[tarID]; - fil->SampleID_Combi[j] = mainFil->SampleID_Combi[tarID]; - fil->HeadSmplID[j] = mainFil->HeadSmplID[tarID]; - if (fil->Demulti2Fls()) { - fil->demultiSinglFiles[j] = mainFil->demultiSinglFiles[tarID]; - fil->demultiSinglFilesF[j] = mainFil->demultiSinglFilesF[tarID]; - - //closing of ofstreams is only handled on the main object - //mainFil->demultiSinglFiles[tarID] = vector (2,NULL); - } - } - fil->checkDoubleBarcode(); - - if (tarID==-1){cerr<<"tar == -1. abort.\n";exit(10);} - - //initialize object to handle all input file combinations - - shared_ptr IS = make_shared(!bFASTQ, mainFil->getuserReqFastqVer(), cmdArgs["-ignore_IO_errors"]); - if (tarSi < 2 && uniqueFas.size() > 1) { - IS->atFileYofX(i + 1, (uint)uniqueFas.size(), tarSi); - } - string mainFileShort = ""; - mainFile = IS->setupInput(path, i, tarID, uniqueFas, FastqF, FastaF, QualF, MIDfq, fil->isPaired(), cmdArgs["-onlyPair"], mainFileShort, false); - if (!IS->qualityPresent()) { - fil->deactivateQualFilter(); - cerr << "\n*********\nWarning:: Quality file is not present.\nRecommended to abort demultiplexing.\n*********\n\n"; - } - fil->BarcodePreStats(); - fil->checkDoubleBarcode(); - fil->checkDoubleSampleIDHead(); - - - if (mainFil->doOptimalClusterSeq()){ - ucl->findSeq2UCinstruction(IS,bFASTQ,mainFil); - continue; - } - - -#ifdef DEBUG - cerr << "Setting up output" << endl; -#endif - //MultiDNA MD = MultiDNA(&fil, cmdArgs, writeStatus, RDSset); - shared_ptr MD = make_shared(fil, cmdArgs, writeStatus, RDSset); - fil->setMultiDNA(MD); - if (maxRds > 0) { MD->setReadLimit(maxRds - totReadsRead); } - writeStatus = ofstream::app; - //prepare for BC checking (rev/fwd) - if (fil->doDemultiplex()){ - MD->setBCfixed(false, true); - if (MD->isPEseq() == 2) { MD->setBCfixed(false, false); } - } - if (cmdArgs.find("-oneLineFastaFormat") != cmdArgs.end() && cmdArgs["-oneLineFastaFormat"] == "1") { - MD->setOneLinerFastaFmt(true); - } - //cout << Dere->Nms_size() << " DEBCs\n"; - MD->attachDereplicator(Dere); - //only pull out a subset of sequences - if (mainFil->doSubselReads()) { - if (cmdArgs.find("-mocatFix") != cmdArgs.end()) { - cerr << "MOCAT fix appplies\n"; - RDSset->findMatches(IS, MD, true); - }else{ - RDSset->findMatches(IS, MD, false); - } - //delete MD; - continue; - } - - -#ifdef DEBUG - cerr << "Processing reads" << endl; -#endif - - - - - - //********************** - //heavy reading routine - //********************** - if (MD->isPEseq() == 2){ - //read_paired(cmdArgs,MD,IS); - while ( !read_paired(cmdArgs, MD, IS, IS->hasMIDseqs()) ) { - //reset output files to previous state - MD->resetOutFilesAndFilter(); - } - } else { - read_single(cmdArgs,MD,IS); - } - - - - -#ifdef DEBUG - cerr << "All read processed" << endl; -#endif - outFile = MD->leadOutFile(); -// delete MD; -#ifdef DEBUG - cerr << "MD deleted" << endl; -#endif - - //stats - fil->prepStats(); - if (IS->getCurFileN() == 0) { - fil->printStats(cerr, mainFile, outFile, true); - } else { - cerr<shortStats(""); shortStats = true; - } - - totReadsRead += fil->totalAccepts(); - - shrtLog += fil->shortStats( mainFileShort); -// delete IS; - //write log file - if (uniqueFas.size() > 1){//only print sub log if neccessary - ofstream log; - string logF = cmdArgs["-log"] + string("0") + itos(i); - log.open (logF.c_str() ,ios_base::out); - fil->printStats(log,mainFile,outFile,true); - log.close(); - } - mainFil->addStats(fil,idx[i]); -#ifdef DEBUG - cerr << "Delete tmp filter" << endl; -#endif - //and cleanup - // - //fil; - } -#ifdef DEBUG - cerr << "Prep final logging" << endl; -#endif -//write log files - if (uniqueFas.size() > 1){ - mainFile = "several"; - } - - ofstream log; string deLog(""); - string logF = cmdArgs["-log"], logFA = cmdArgs["-log"].substr(0, cmdArgs["-log"].length()-3) + "add.log"; - - //different logfile for SEED extension - if (mainFil->doOptimalClusterSeq()){ - //finish up dereplication file (creating pseudo seeds with counts) - ucl->finishMAPfile(); - if (cmdArgs["-ucAdditionalCounts"] != ""){ - ucl->set2UC(); - ucl->finishUCfile(mainFil, cmdArgs["-ucAdditionalCounts"], true);//with smplHead (.mid) - ucl->finishUCfile(mainFil, cmdArgs["-ucAdditionalCounts1"], false);//without smplHead (.rest) - } - if (cmdArgs["-ucAdditionalCounts_refclust"] != ""){ - //reference based clustering has some high qual seqs (no replacement with reads..) - shared_ptr FALL = make_shared(true, mainFil->getuserReqFastqVer()); - //this reads in the SLV fna's & creates matrix entries for these - FALL->setupFna(cmdArgs["-OTU_fallback_refclust"]); - ucl->setRefMode(); - ucl->addDefSeeds(FALL, mainFil); - ucl->set2UC(); - //mapping from ref OTU clustering - ucl->finishUCfile(mainFil, cmdArgs["-optimalRead2Cluster_ref"], false); - //mid / rest mappings - ucl->finishUCfile(mainFil, cmdArgs["-ucAdditionalCounts_refclust"], true);//with smplHead (.rest) - ucl->finishUCfile(mainFil, cmdArgs["-ucAdditionalCounts_refclust1"], false);//without smplHead (.rest) - - } - if (cmdArgs["-log"] != "nolog") { - log.open(logF.c_str(), ios_base::out); - ucl->printStats(cerr); - ucl->printStats(log); - log.close(); - } - - //everything done on DNA? Then write & delete - if (cmdArgs["-otu_matrix"] != "") { - ucl->writeOTUmatrix(cmdArgs["-otu_matrix"], mainFil); - } - //needs to be written after OTU matrix (renaming scheme) - shared_ptr MD = make_shared(mainFil, cmdArgs, ios::out, RDSset); - mainFil->setMultiDNA(MD); - ucl->writeNewSeeds(MD, mainFil,false); - //delete MD; - //new fastas also need to be written.. - MD.reset(new MultiDNA(mainFil, cmdArgs, ios::out, RDSset, ".ref", 1));//force fna output - mainFil->setMultiDNA( MD ); - ucl->writeNewSeeds(MD, mainFil,true,true); - //delete MD; - - - return; - } else if (mainFil->doDereplicate()) { -#ifdef DEBUG - cerr << "write Dereplicated DNA" << endl; -#endif - deLog = Dere->writeDereplDNA(mainFil); -#ifdef DEBUG - cerr << "done write Dere" << endl; -#endif - } -#ifdef DEBUG - cerr << "Logging almost finished" << endl; -#endif - - if (cmdArgs["-log"] == "nolog") { -// delete Dere; - return; - } -#ifdef DEBUG - cerr << "DereLog start" << endl; -#endif - if (mainFil->doDereplicate()) { - string dereLog = logF.substr(0,logF.length()-3) + "dere"; - Dere->writeLog(dereLog, deLog); -// delete Dere; - } - -#ifdef DEBUG - cerr << "DereLog end" << endl; -#endif - if (shortStats) { - mainFil->printStats(std::cerr, mainFile, outFile, true); - } -#ifdef DEBUG - cerr << "other logs start" << endl; -#endif - //per sample success rate - string logPS = logF.substr(0, logF.length() - 3) + "acceptsPerSample.log"; - log.open(logPS.c_str(), ios_base::out); - mainFil->SmplSpecStats(log); - log.close(); - log.open(logF.c_str(), ios_base::out); - mainFil->printStats(log,mainFile,outFile,true); - log.close(); - - string logFs = logF.substr(0, logF.length() - 3) + "acceptsPerFile.log"; - log.open(logFs.c_str(), ios_base::out); - log << shrtLog; - log.close(); - - string logFGC = logF.substr(0, logF.length() - 3) + "GC.txt"; - log.open(logFGC.c_str(), ios_base::out); - mainFil->printGC(log, mainFil->isPaired()); - log.close(); -#ifdef DEBUG - cerr << "other logs end" << endl; -#endif - - -//for additional files - if (mainFil->secondaryOutput()){ - log.open (logFA.c_str() ,ios_base::out); - mainFil->printStats(log,mainFile,outFile,false); - log.close(); - } - - - //length histogram - logF = cmdArgs["-length_hist"]; - log.open (logF.c_str() ,ios_base::out); - mainFil->printHisto(log,0); - log.close(); - //quality histogram - logF = cmdArgs["-qual_hist"]; - log.open (logF.c_str() ,ios_base::out); - mainFil->printHisto(log,1); - log.close(); - mainFil->close_outFiles_demulti(); -#ifdef DEBUG - cerr << "separateByFile finished" << endl; -#endif - -} - -void rewriteNumbers(OptContainer& cmdArgs){ - //no renumbering asked for - if (!(cmdArgs.find("-number") != cmdArgs.end() && cmdArgs["-number"]=="T")){ - return; - } - string prefix=""; - if (cmdArgs.find("-prefix") != cmdArgs.end()){ - prefix = cmdArgs["-prefix"]; - } - //read fasta & write with new headers - int cnt=0; - - string line; - //ofstream qualOut,fnaOut; - ifstream fna; - ofstream ofna; - - // rerwite input fasta file - string tname="",tseq=""; - fna.open(cmdArgs["-i_fna"].c_str(),ios::in); - ofna.open(cmdArgs["-o_fna"].c_str(),ios::out); - while (getline(fna,line,'\n')){ - - if (line[0]=='$'){ //$ marks comment - continue; - } - if(line[0] == '>'){ //fasta description - if (cnt!=0){ - tname = ">"+prefix+itos(cnt)+"\n"; - ofna << tname << tseq; - } - cnt++;tseq=""; - continue; - } - tseq += line+"\n"; - } - tname = ">"+prefix+itos(cnt)+"\n"; - ofna << tname << tseq; - ofna.close(); fna.close(); - - exit(0); -} - -void Announce_sdm(){ - cerr << endl << "This is sdm (simple demultiplexer) " << sdm_version << " " << sdm_status << "." << endl << endl; -} -void help_head(){ - cout <<"------------------------------\nThis is sdm version "<\n------OR------\n -i \n------OR------\n -i_fastq \n------OR------\n -i_fna (required)\n -i_qual (required, unless quality file is \"xx1.qual\" and fasta is \"xx1.yy\")\n\n -map (optional)\n -o_fna (optional)\n -o_qual (optional)\n -o_fastq (optional)\n -log (optional). Set to \"nolog\" to deactivate alltogether.\n \n-sample_sep \"X\" string X is used to delimit samples and ID (optional, default:\"" << def_sep << "\")\n -paired 1/2/3 (input is paired end sequenced(2), assumes two input files delimited by \',\'. 1=singleton (default); 3=paired end (R1,R3) + one file with MID (R2))\n"; - cout << " -o_demultiplex [path] write input into single, demultiplexed files\n"; - cout << " -onlyPair [1/2] consider only read pair 1 or 2. Useful when streamlining inputs (LotuS) or considering double barcoding.\n -i_MID_fastq fastq file with only MID sequences; if paired reads are supplied with -i_fna/-i_fastq and the MID identifier via -i_MID_fastq, paired has to be set to 2. If e.g. merged reads are supplied + mids, paired has to be set to 1.\n"; - cout << " -oneLineFastaFormat [0/1] write Fasta and Quality file sequence string in one line, opposed to default 80 characters per line.\n -o_dereplicate of dereplicated DNA reads (with size in header)\n -dere_size_fmt [0/1] either (0) usearch format \"size=X;\" or (1) \"_X\"\n -min_derep_copies only print seq if at least X copies present. Can be complex terms like \"10:1,3:3\" -> meaning at least 10x in 1 sample or 3x in 3 different samples.\n"; - cout << " -SyncReadPairs [T/F] sdm can check, if read pairs occur in the same (correct) order in the input files, and correct this in case not (T).\n"; - cout << " -maxReadsPerOutput number of filtered reads in output files. If more reads, a new file is created. Only works with -o_fna\n -mergedPairs <1/0> 1: paired sequences were merged externally, important for assumption that read quality is detoriating.\n -OTU_fallback : Fallback fasta sequences for OTU's, only used in SEED extension mode\n"; - cout << " -i_qual_offset [0-64] fastq offset for quality values. Set this to \'0\' or \'auto\' if you are unsure which fastq version is being used (default: read from sdm option file)\n -o_qual_offset [0-64] set quality offset for fastq outfile. Default: 33\n"; - cout << " -ignore_IO_errors [0/1]: 1=Errors in fastq reads are ignored, with sdm trying to sync reads pairs after corrupted single reads (default: 0)\n"; - //-binomialFilterBothPairs [1/0] - //-count_chimeras [T/F] - // ucAdditionalCounts_refclust -OTU_fallback_refclust -optimalRead2Cluster_ref - cout<<"\nMinimal Example:\n./sdm -i test.fna -map mapping.txt (assuming quality file is \"test.qual\")\n"; - // further options (undocumented) : - //-length_hist -qual_hist - //-suppressOutput[0/1] - // -} -void printOptionHelp(){ - help_head(); - cout<<"The option file, specified via the \"-options\" argument, provides more specific control over filtering, barcode handling, and sequencing technologies, among others. A reference option file is printed below.\n\n"; - string helpOptionFile=""; - /*helpOptionFile += "minSeqLength - minimal accepted Sequence Length\nmaxSeqLength - maximal Length of Sequence\nminAvgQuality - minimal average Quality\nmaxAmbiguousNT - max number of Ambigous nt's in sequence\nQualWindowThreshhold - Q threshold where seq is rejected\nQualWindowWidth - average quality in this windows is used for QualWindowThreshhold\n"; - helpOptionFile += string("TrimWindowThreshhold - Q value below which sequence is 3' trimmed\nTrimWindowWidth - window size used for TrimWindowThreshhold\nmaxBarcodeErrs - max accepted barcode errors\nmaxPrimerErrs - max accepted Primer errors\nkeepBarcodeSeq - leave Barcode Seq on read? (0/1)\n"); - helpOptionFile += "keepPrimerSeq - keep Primer attached to seq? (0/1)\nmaxHomonucleotide - sequences with a homonucleotide run longer will be rejected\nmaxAccumulatedError - if P is surpassed, sequence is trimmed at that point\nTechnicalAdapter - sequence of the technical adapter (will be removed, if found 5')\nPEheaderPairFmt - ?\nTrimStartNTs - trim X nucleotides from the start of the sequence "; - helpOptionFile += "fastqVersion - 1 = ";*/ - - helpOptionFile += "#--- Example ---\n#copy into new file\n#sequence length refers to sequence length AFTER removal of Primers, Barcodes and trimming. this ensures that downstream analyis tools will have appropiate sequence information\nminSeqLength 250\nmaxSeqLength 1000\nminAvgQuality 25\n\n"; - helpOptionFile += "#Ambiguous bases in Sequence - uclust only supports 0 ambiguous nucleotides\nmaxAmbiguousNT 0\n\n#Homonucleotide Runs.. this should normally be filtered by sequencer software\nmaxHomonucleotide 8\n\n"; - helpOptionFile += "#Filter whole sequence if one window of quality scores is below average\nQualWindowWidth 50\nQualWindowThreshhold 25\n\n#Trim the end of a sequence if a window falls below quality threshhold. Useful for removing low qulaity trailing ends of sequence\n\nTrimWindowWidth 20\nTrimWindowThreshhold 25\n\n#Max number of accumulated P for a mismatch. After this length, the rest of the sequence will be deleted. Complimentary to TrimWindowThreshhold. (-1) deactivates this option.\nmaxAccumulatedError 1\n\n"; - helpOptionFile += "#Barcode Errors - currently this can only be 0; \nmaxBarcodeErrs 0\nmaxPrimerErrs 0\n\n#keep Barcode / Primer Sequence in the output fasta file - in a normal 16S analysis this should be deactivated (0) for Barcode and de-activated (0) for primer\nkeepBarcodeSeq 0\nkeepPrimerSeq 0\n\n"; - helpOptionFile += "#set fastqVersion to 1 if you use Sanger, Illumina 1.8+ or NCBI SRA files. Set fastqVersion to 2, if you use Illumina 1.3+ - 1.7+ or Solexa fastq files.\n\nfastqVersion 1\n\n#if one or more files have a technical adapter still included (e.g. TCAG 454) this can be removed by setting this option\n\nTechnicalAdapter TCAG\n\n#delete X NTs (e.g. if the first 5 bases are known to have strange biases)\n\nTrimStartNTs 0\n"; - helpOptionFile += "#truncate total Sequence length to X (length after Barcode, Adapter and Primer removals)\nTruncateSequenceLength 200\n"; - helpOptionFile += "#correct PE header format (1/2) this is to accomodate the illumina miSeq paired end annotations 2=\"@XXX 1:0:4\" instead of 1=\"@XXX/1\". Note that the format will be automatically detected\nPEheaderPairFmt 1\n\n#sets if sequences without match to reverse primer will be accepted (T=reject ; F=accept all); default=F\nRejectSeqWithoutRevPrim F\n"; - helpOptionFile += "#sets if sequences without a forward (LinkerPrimerSequence) primer will be accepted (T=reject ; F=accept all); default=T\nRejectSeqWithoutFwdPrim T\n\n"; - helpOptionFile += "#checks if the whole amplicon was reverse-transcribed sequenced (Default = F)\nCheckForReversedSeqs F\n\n"; - helpOptionFile += "#this option should be \"T\" if your amplicons are possibly shorter than a read in a paired end sequencing run (e.g. amplicon of 300 in 250x2 miSeq is \"T\")\nAmpliconShortPE T\n\n"; - //CheckForMixedPairs CheckForReversedSeqs - cout<& cmdArgs); diff --git a/configs/sdm_src/IO.h b/configs/sdm_src/IO.h deleted file mode 100644 index 88d593f..0000000 --- a/configs/sdm_src/IO.h +++ /dev/null @@ -1,63 +0,0 @@ -/* sdm: simple demultiplexer -Copyright (C) 2013 Falk Hildebrand - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -*/ -//most obvious input / output operations - -#ifndef _IO_h -#define _IO_h - - -#include "containers.h" - - -typedef std::map> DNAmap; - - -//void openOutFiles(string files, string fmt,string ); -//void prepareOutFiles(OptContainer& cmdArgs); -//void read_fastq(OptContainer& cmdArgs, MultiDNA* MD,string fileS); -bool read_paired(OptContainer& cmdArgs, shared_ptr MD, shared_ptr,bool ); -bool read_paired_DNAready(shared_ptr tdn, shared_ptr tdn2, shared_ptr MIDseq, bool MIDuse, MultiDNA* MD, int& revConstellation); - -//bool read_tripple(OptContainer& cmdArgs, MultiDNA* MD, InputStreamer*); - -void separateByFile(shared_ptr mainFil,OptContainer& cmdArgs); - -void threadAnalyzeDNA(shared_ptr tdn, shared_ptr MD,int thrCnt); -//void trippleThreadAnalyzeDNA(shared_ptr MD, shared_ptr tdn,shared_ptrtdn2,shared_ptr MIDseq,bool changePHead);//,int thrCnt=0); - -void read_single(OptContainer& cmdArgs, shared_ptr MD, shared_ptr IS); - -bool readCmdArgs(int argc, char* argv[],OptContainer& cmdArgs); - - - - -//specialized functions .. end sdm after execution -void rewriteNumbers(OptContainer& cmdArgs); - - -void Announce_sdm(); -void help_head(); -void general_help(); -void printCmdsHelp(); -void printOptionHelp(); -void printMapHelp(); -void printVersion(); - - -//bool readCmdArgs(int argc, char* argv[],map& cmdArgs); -#endif \ No newline at end of file diff --git a/configs/sdm_src/IO.o b/configs/sdm_src/IO.o deleted file mode 100644 index b2a1b21..0000000 Binary files a/configs/sdm_src/IO.o and /dev/null differ diff --git a/configs/sdm_src/InputStream.cpp b/configs/sdm_src/InputStream.cpp deleted file mode 100644 index ea76b4a..0000000 --- a/configs/sdm_src/InputStream.cpp +++ /dev/null @@ -1,1974 +0,0 @@ -/* sdm: simple demultiplexer -Copyright (C) 2013 Falk Hildebrand -email: Falk.Hildebrand@gmail.com - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -*/ - -#include "InputStream.h" -string spaceX(uint k){ - string ret = ""; - for (uint i = 0; i < k; i++){ - ret += " "; - } - return ret; -} - -int digitsInt(int x){ - int length = 1; - while (x /= 10) - length++; - return length; -} -int digitsFlt(float x){ - std::stringstream s; - s << x; - return (int)s.str().length(); -} -string intwithcommas(int value) { - string numWithCommas = std::to_string((long long)value); - int insertPosition = (int)numWithCommas.length() - 3; - while (insertPosition > 0) { - numWithCommas.insert(insertPosition, ","); - insertPosition -= 3; - } - return (numWithCommas); -} - -std::string itos(int number) { - std::stringstream ss; - ss << number; - return ss.str(); -} -std::string ftos(float number) { - std::stringstream ss; - ss << number; - return ss.str(); -} -bool isGZfile(const string fi) { - string subst = fi.substr(fi.length() - 3); - if (subst == ".gz") { - return true; - } - return false; -} - -std::istream& safeGetline(std::istream& is, std::string& t) { - t.clear(); - //from http://stackoverflow.com/questions/6089231/getting-std-ifstream-to-handle-lf-cr-and-crlf - // The characters in the stream are read one-by-one using a std::streambuf. - // That is faster than reading them one-by-one using the std::istream. - // Code that uses streambuf this way must be guarded by a sentry object. - // The sentry object performs various tasks, - // such as thread synchronization and updating the stream state. - - std::istream::sentry se(is, true); - std::streambuf* sb = is.rdbuf(); - - for (;;) { - int c = sb->sbumpc(); - switch (c) { - case '\n': - return is; - case '\r': - if (sb->sgetc() == '\n') - sb->sbumpc(); - return is; - case EOF: - // Also handle the case when the last line has no line ending - if (t.empty()) - is.setstate(std::ios::eofbit); - return is; - default: - t += (char)c; - } - } -} -//MOCAT -std::vector header_string_split(const std::string str, const std::string sep) { - std::vector tokens; - tokens.reserve(13); - size_t start = 0; - size_t pos = 0; - while ((pos = str.find_first_of(sep, start)) != std::string::npos) { - tokens.push_back(str.substr(start, pos - start)); - start = pos + 1; - } - if (start < str.length()) { - tokens.push_back(str.substr(start)); - } else if (start == str.length()) { - tokens.push_back("0"); - } - return tokens; -} -void remove_paired_info(string &s, short RP) { - size_t f1 = s.find(" "); - if (f1 != string::npos) { - s = s.substr(0, f1); - f1 = string::npos; - } - if (RP == 0) { - f1 = s.find("/1"); - } - else if (RP == 1) { - f1 = s.find("/2"); - } - else { - f1 = s.find("/1"); - if (f1 == string::npos) { f1 = s.find("/2"); } - } - if (f1 != string::npos && f1 == s.size() - 2) { - s = s.substr(0, f1); - } -} -std::string header_stem(string& header) { - const size_t slash = header.find('/'); - if (slash != std::string::npos) { - return header.substr(0, slash); - } - - std::vector tokens = header_string_split(header, ": "); - - if (tokens.size() == 11) { - //if (tokens[8] == "Y") seq.clear(); - return tokens[0] + ":" + tokens[3] + ":" + tokens[4] - + ":" + tokens[5] + ":" + tokens[6];// +"#" + tokens[10]; - } else if (tokens.size() == 6 || tokens.size() == 7) { - return tokens[0] + ":" + tokens[2] + ":" + tokens[3] - + ":" + tokens[4] + ":" + tokens[5];// + "#0"; - } else { - cerr << "fastq header format error\n"; exit(98); - } - //throw std::runtime_error("fastq header format error\n" + header); } -} -void reverseTS(string & Seq) { - int qs = (int)Seq.length() - 1; - string S2 = Seq.c_str(); - for (int i = qs; i >= 0; i--) { - Seq[i] = DNA_trans[(int)S2[qs - i]]; - } -} -string reverseTS2(const string & Seq) { - int qs = (int)Seq.length() - 1; - string S2 = Seq.c_str(); - for (int i = qs; i >= 0; i--) { - S2[i] = DNA_trans[(int)Seq[qs - i]]; - } - return S2; -} -bool any_lowered(const string& is) { - for (uint i = 0; i < is.length(); i++) { - char c = is[i]; - if (islower(c)) { return true; } - } - return false; -} -string applyFileIT(string x, int it,const string xtr){ - - size_t pos = x.find_last_of("."); - if (pos != string::npos && isGZfile(x)) { - pos = x.find_last_of(".", pos-1); - } - if (it == 0) { - if (pos == string::npos) { - return x + xtr; - } - return x.substr(0, pos) + xtr + x.substr(pos); - } - ostringstream ss; - ss << it; - if (pos == string::npos) { - return x + "." + ss.str() + xtr ; - } - return x.substr(0,pos) + "."+ss.str() + xtr + x.substr(pos); - - -} -bool fileExists(const std::string& name, int i, bool extiffail) { - if (name == "") { return true; } - if (FILE *file = fopen(name.c_str(), "r")) { - fclose(file); return true; - } else { - if (extiffail) { - cerr << "ERROR: Could not find file " << name << endl; - if (i >= 0) { - cerr << "on mapping file line " << i << endl; - } - exit(92); - } - return false; - } -} - -string detectSeqFmt(const string inF) { - - vector tfasP = splitByCommas(inF, ';'); - for (size_t i = 0; i < tfasP.size(); i++) { - vector tfas = splitByCommas(tfasP[i]); - string fileS = tfas[0]; - istream* fnax(NULL); - string file_type = "test file"; - if (fileS != "") { - if (isGZfile(fileS)) { -#ifdef _gzipread - file_type = "gzipped fasta file"; - fnax = new igzstream(fileS.c_str(), ios::in); -#else - cerr << "gzip not supported in your sdm build\n" << fileS; exit(50); -#endif - } - else { - fnax = new ifstream(fileS.c_str(), ios::in); - - } - if (!*(fnax)) { cerr << "\nCouldn't find " << file_type << " file \"" << fileS << "\"!\n Aborting..\n"; exit(4); } - //char Buffer[RDBUFFER]; - //fna_u[0]->rdbuf()->pubsetbuf(Buffer, RDBUFFER); - } - string tmp(""); - string ret = ""; - while (safeGetline(*fnax, tmp)) { - if (tmp[0] == '>') { - ret = "-i_fna"; break; - } - else if (tmp[0] == '@') { - ret = "-i_fastq"; break; - } - else if (tmp.length() == 0) {//do nothing - ; - } - else { - cerr << " Could not auto detect input format. First non-empty line of your file looked like:\n" << tmp << endl; - exit(888); - } - } - if (ret == "") { - cerr << "Empty input file detected:\n" << fileS << endl; - } else { - return ret; - } - - delete fnax; - } - - return "empty"; -} - - -/*vector orderOfVec(vector& vin) { - struct MyStruct - { - int key; - int Value; - MyStruct() :key(0), Value(0) {} - MyStruct(int k, const int s) : key(k), Value(s) {} - - bool operator < (const MyStruct& str) const { - return (key > str.key); - } - }; - std::vector < MyStruct > vec(vin.size()); - //fill vector - for (int i = 0; i < (int)vin.size(); i++) { - vec[i] = MyStruct(vin[i], i); - } - - sort(vec.begin(), vec.end()); - //extract from sorted vector - vector ret(vin.size(), 0); - for (size_t i = 0; i < vin.size(); i++) { - ret[i] = vec[i].Value; - } - return ret; - } - */ -bool DNA::seal() {//DN = Seq.c_str(); - size_t QSi = Qual.size(); - if (QSi == 0 && Seq == "" ) { - - this->setPassed(false); - return true;//nothing to be done, just empty DNA - } - if (QSi != Seq.length()) { - cerr << "Unequal length of seq and quality for name " << this->getID() << "\n"; - this->setPassed(false); - return false; - } - //uppercase DNA - std::transform(Seq.begin(), Seq.end(), Seq.begin(), ::toupper); - SeqLength = Seq.length(); - return true; -} - -string DNA::getIDPosFree() { // remove /1 /2 #1:0 etc - string s = this->getIDshort(); - remove_paired_info(s,Read_position); - return s; -} - - -int DNA::numACGT(){ - int DNAch = 0; - for (unsigned int i = 0; i < length(); i++){ - - DNAch += DNA_amb[(int)Seq[i]]; -// if (tmp == 'A' || tmp == 'C' || tmp == 'G' || tmp == 'T'){ DNAch++; } - } - return static_cast (length()) - DNAch; -} -void DNA::Qappend(const vector &q) -{ - Qual.insert(Qual.end(), q.begin(), q.end()); - /*unsigned int Qsiz = (unsigned int) Qual.size(); - Qual.resize(Qsiz+q.size(),0); - for (register unsigned int i=0; ilength(); i++){ - Qsum += Qual[i]; - } - } - avgQual = static_cast (Qsum) / static_cast (this->length()); - - } - return avgQual; -} - -/*float DNA::qualWinfloat_hybr(int W, float T, int W2, float T2, int& reason){//not used - //if (T==0.f){return true;} - int AQS=0, AQL; - int TotQ = 0; - int upTs = static_cast(T * W); - int upTl = static_cast(T2 * W2); - int QS = int (Qual.size()); - if (W>=QS){W = QS;} // too short - int smallerW = W, largerW = W2; - - bool W1IsSmall = true; - if (smallerW > W2){ largerW=W; smallerW = W2; W1IsSmall=false; - std::swap(upTl,upTs); } - - for (unsigned int i=0; i<(unsigned int) smallerW; i++){ - AQS += Qual[i]; - } - AQL = AQS; - - //hybrid schleife - for (unsigned int i=smallerW; i<(unsigned int) largerW; i++){ - AQL += Qual[i]; - AQS += Qual[i]; AQS -= Qual[i-smallerW]; - if (AQS < upTs){ - if (W1IsSmall){ reason=0; return 0.f; - } else {reason=1; this->cutSeq(i/2,this->length()); return float(AQL)/float(QS);} - } - } - - TotQ = AQL; - for (unsigned int i=W; i<(unsigned int) QS; i++){ - AQS += Qual[i]; AQL += Qual[i]; - TotQ += Qual[i]; - AQS -= Qual[i-W];AQL -= Qual[i-W]; - if (AQS < upTs || AQL < upTl){ - if (W1IsSmall){ reason=0; return 0.f; - } else {reason=1; this->cutSeq(i/2,this->length()); return float(AQL)/float(QS);} - } - } - //if (averageQ > static_cast (TotQ) /static_cast ( Qual.size())){return false;} - return static_cast (TotQ) /static_cast ( QS); - } - */ - -// modified from https://github.com/fpusan/moira/blob/master/moira/bernoullimodule.c -float DNA::interpolate(int errors1, float prob1, int errors2, float prob2, float alpha) -{ - float result = errors1 + ((errors2 - errors1) * ((1 - alpha) - prob1) / (prob2 - prob1)); - if (result < 0) //Happens only for very-short high qual sequences in which the probability of having 0 errors is higher than 1 - alpha. - { - result = 0; - } - return result; -} -float DNA::prob_j_errors(float p, float j, float n) //Where p is the error probability, j is the number of errors and n the number of observations. -{ - float per_position_accum_probs; - if (j > n) { - return 0.0f; //The formula below would also return 0. - } - per_position_accum_probs = pow((1 - p), n); //For j == 0. - float i(1); - for (; i <= j; i += 1.f) {//For j > 0. - per_position_accum_probs = ((n - i + 1.f) / (1.0f*i)) * (p / (1.f - p)) * per_position_accum_probs; - } - return per_position_accum_probs; - -} -float DNA::sum_of_binomials(const float j, int k, float n, int qual_length, const vector& error_probs, const vector< vector> & per_position_accum_probs) -//#Where j is the number of errors and k is the position in the sequence. -{ - float probability = 0; - float i(0); int k1 = (int) k - 1; - - for (; i <= j; i+=1.f) - { - probability += DNA::prob_j_errors(error_probs[k], i, n) * per_position_accum_probs[int(j - i)][k1]; - //Where error_probs[k] is the error probability of the k-th position. - //Where per_position_accum_probs[j-i][k-1] is the probability that all the bases from position 0 to k-1 had a total of j-i errors. - } - - return probability; -} - -float DNA::binomialFilter(int maxErr, float alpha){ - - if (alpha == -1.f|| this->length()<3){ return 0; }//deactivated - - ///Initialize some variables. - - int SeqLengthI = (int)SeqLength; - int n = 1; //Since we have a Bernoulli distribution. - float n_f = (float)n; - float alpha1 = 1.f - alpha; - - ///Translate quality scores into error probabilities. - vector error_probs (SeqLength,0.f); - for (size_t i = 0; i < SeqLength; i++){ - error_probs[i] = (float)SAqualP[Qual[i]];//pow(10, (contig_quals[i] / -10.0)); //Since we want a continuous list of non-N error_probs. - } - - ///Actually get the job done. - int max_expected_errors = maxErr + 3;// (int)SeqLength + 1; - int expected_errors = 0; - float probability; - vector accumulated_probs (max_expected_errors,0.f); - //int j; - int k; - vector empty(SeqLength, 0.f); - vector< vector> per_position_accum_probs(max_expected_errors, empty); - - while (1) - { - float expected_errors_f = (float)expected_errors; - //vector per_position_accum_probs(SeqLength, 0.f); - for (k = 0; k < (int)SeqLength; k++) { - if (k == 0) { - per_position_accum_probs[expected_errors][k] = DNA::prob_j_errors(error_probs[k], expected_errors_f, n_f); - } else { - - per_position_accum_probs[expected_errors][k] = DNA::sum_of_binomials((float)expected_errors, k, n_f, SeqLengthI, error_probs, per_position_accum_probs); - } - } - probability = per_position_accum_probs[expected_errors][SeqLengthI - 1]; - - if (expected_errors == 0){ - accumulated_probs[expected_errors] = probability; - }else{ - accumulated_probs[expected_errors] = accumulated_probs[expected_errors - 1] + probability; - } - - if (accumulated_probs[expected_errors] > (alpha1) || expected_errors >= (max_expected_errors-1)){ - break; - }else{ - expected_errors++; - } - } - if (expected_errors == 0){ - return 0; - } - float EXE = interpolate(expected_errors - 1, accumulated_probs[expected_errors - 1], expected_errors, accumulated_probs[expected_errors], alpha); - return EXE; -} - -float DNA::qualWinfloat(unsigned int W, float T, int& reason){ - //if (T==0.f){return true;} - int AQS=0; - int TotQ = 0; - int upTs = static_cast(T * W); - unsigned int QS = this->length();//static_cast (Qual.size()); - if (W>=QS){W = QS;} // too short - -//1st loop to ini window - for (unsigned int i=0; i<(unsigned int) W; i++){ - AQS += Qual[i]; - } - TotQ = AQS; - - for (unsigned int i=W; i<(unsigned int) QS; i++){ - AQS += Qual[i] - Qual[i - W]; - TotQ += Qual[i]; - if (AQS < upTs ){ - reason=1; return 0.f; - } - } - //if (averageQ > static_cast (TotQ) /static_cast ( Qual.size())){return false;} - return static_cast (TotQ) /static_cast ( QS); -} - -int DNA::qualAccumulate(double d){ - unsigned int i(0);double accErr(0.0); - for (; ilength(); i++) { - accErr+=SAqualP[Qual[i]]; - if (accErr >= d){break;} - } - - this->AccumError = accErr; - - return i; -} -void DNA::NTspecQualScores(vector& qsc, vector& ntcnt) { - size_t sql = Seq.length(); - if (qsc.size() < sql) { - // cerr << "qsc"; - qsc.resize(sql,0); - } - if (ntcnt.size() < sql) { - // cerr << "ntcnt"; - ntcnt.resize(sql,0); - } - for ( uint i = 0; i < sql; i++ ) { - short p = NT_POS[(int)Seq[i]]; - qsc[p] += Qual[i]; - ntcnt[p]++; - } -} - -bool DNA::qualAccumTrim(double d){ - if (d == -1.) { - return true; - } - unsigned int i(qualAccumulate(d)); - if (i != this->length()){ - //cut 3' end - this->cutSeq(i,this->length()); - this->QualCtrl.AccErrTrimmed = true; - return false; - } - //did not cut this sequence: - return true; -} -bool DNA::qualWinPos(unsigned int W, float T){ - if (T==0.f){return true;} - int AQ=0; - int unT = static_cast((float)W*T); - unsigned int QS = this->length();// (unsigned int)Qual.size(); - unsigned int QSh = QS >> 2; - QSh = max(QSh,W); - if (W>=QS){return true;} // too short - - vector WQ((int) W); - //TODO: check that the right num of nucs is taken.. - int cnt=0; - if (QS < W) {return true;} - for (unsigned int i=QS-1; i> QS-(unsigned int) W-1; i--){ - AQ += Qual[i]; - cnt++; - } - if (AQ > unT){return true;} - int curW = QS-(unsigned int) W; - for (uint i=QS-(unsigned int) W-1; i > QSh; i--){ - AQ += Qual[i]; - AQ -= Qual[i+W]; - - if (AQ < unT){ //min Window qual was broken.. kick seq - curW = i; - } else { - break; - } - } - - //partial seq removal - int pos = curW - (W>>1); - this->cutSeq(pos); - this->QualCtrl.QWinTrimmed = true; - return false; -} - -//removes part of seq and qual indexes between start & stop -bool DNA::cutSeq(int start, int stop,bool Pseudo){ - - if (stop == -1) { - if (start >= (int) SeqLength) { return false; } - } else if (start >= stop || stop > (int)Qual.size() || start >= (int)Qual.size()) { - return false; - } - - //pseudo deactivates cutting of 3' - if (Pseudo) { - if (stop == -1 || stop <= (int) SeqLength) { - SeqLength = start; - return true; - } - } - - string se = Seq; - if (stop == -1) { - stop = (int)Seq.length(); - } - if (start == 0) { - Seq = se.substr(stop); - } else { - Seq = se.substr(0, start) + se.substr(stop); - } - - //DN = Seq.c_str(); - //Quali - Qual.erase(Qual.begin()+start, Qual.begin()+stop); - - SeqLength = Seq.length(); - - return true; -} - -int DNA::matchSeq(std::string PrSt,int Err,int tolerance, int startPos){ - //const char* DN = Seq.c_str(); - //const char* Pr = PrSt.c_str(); - int PrL = (int) PrSt.length(); - int mthL = this->length() - PrL; - //int wantSc = PrL - Err; - int endPos(-1),pos(startPos), Prp(0), c_err(0),Prp2(0); - //bool res(false); - for (; pos< tolerance; pos++){ - if (pos > mthL) { break; } - c_err=0;Prp=0; Prp2=pos; - do { - -#ifdef _NEWMATCH - //new vector based matching - c_err += DNA_IUPAC[Seq[Prp2]+256*PrSt[Prp]]; if (c_err > Err){break;} -#else - //old, direct match - if (!matchDNA(Seq[Prp2],PrSt[Prp])){c_err++;if (c_err > Err){break;}} -#endif - Prp++; Prp2++; - } while ( Prp < PrL); - if (c_err<=Err ){endPos=pos;break;} - } - //if(!suc){pos=-1;} - return endPos; -} -void DNA::reset() { - AccumError = 0.; goodQual = false; midQual = false; - //FtsDetected.reset(); - avgQual = -1.f; Qsum = 0; tempFloat = 0.f; - QualTraf = ""; - - this->resetTruncation(); -} - -void DNA::reverse_transcribe() { - reverseTS(Seq); - std::reverse(Qual.begin(), Qual.end()); - AccumError = 0.; goodQual = false; midQual = false; - avgQual = -1.f; Qsum = 0; tempFloat = 0.f; - QualTraf = ""; - -} - -//match from end of Seq to find rev primer -int DNA::matchSeqRev(std::string PrSt,int Err, int check_l, - int coverage){ - //fail::ret -1 - int PrL = (int) PrSt.length(); - if (coverage==0){coverage=5;} //default seed set to 5 - int SeL = (int) Seq.size(); - //int wantSc = PrL - Err; - int pos(SeL-coverage), Prp(0), c_err(0),endPos(-1); - for (; pos> check_l; pos--){ - c_err=0;Prp=0; - int PrL2 = min(PrL,SeL-pos); - do { -#ifdef _NEWMATCH - c_err += DNA_IUPAC[Seq[pos+Prp]+256*PrSt[Prp]]; if (c_err > Err){break;} -#else - if(!matchDNA(Seq[pos+Prp],PrSt[Prp])){c_err++;if (c_err > Err){break;} } -#endif - Prp++; - } while (Prp < PrL2); - if (c_err<=Err ){ - endPos=pos;break;} - } - //secondary check for last few NT's - if (endPos==-2){ - pos = (SeL-1); - for (; pos> (SeL-coverage); pos--){ - c_err=0;Prp=0; - int PrL2 = min(PrL,SeL-pos); - do { -#ifdef _NEWMATCH - c_err += DNA_IUPAC[Seq[pos+Prp]+256*PrSt[Prp]]; if (c_err > Err){break;} -#else - if(!matchDNA(Seq[pos+Prp],PrSt[Prp])){c_err++; if (c_err > Err){break;}} -#endif - Prp++; - } while (Prp < PrL2); - if (c_err<=Err ){ - endPos=pos;break;} - } - } - return endPos; -} -// looks through total DNA seq -int DNA::matchSeq_tot(std::string Pr,int Err,int MaxPos, int& c_err){ - //const char* Pr = PrSt.c_str(); - int PrL = (int) Pr.length(); - int pos(0), Prp(0), Prp2(0); - bool suc(false); - for (pos=0; pos< MaxPos; pos++){ - c_err=0;Prp=0;Prp2=pos; - do { - if(Seq[Prp2]!=Pr[Prp]){ - c_err++; - } - Prp++; Prp2++; - } while (c_err <= Err && Prp< PrL); - if (c_err<=Err){suc=true;break;} - } - if(!suc){pos=-1;} - return pos; -} - - -bool DNA::matchDNA(char t1,char t2){ - if (t1==t2){ - return true; - } - switch (t2){ - case 'N': return true; - case 'R': if (t1=='A' || t1=='G' ) {return true;}break; - case 'Y': if (t1=='T' || t1=='C' ) {return true;}break; - case 'M': if (t1=='C' || t1=='A' ) {return true;}break; - case 'K': if (t1=='T' || t1=='G' ) {return true;}break; - case 'W': if (t1=='T' || t1=='A' ) {return true;}break; - case 'S': if (t1=='C' || t1=='G' ) {return true;}break; - case 'B': if (t1!='A') {return true;}break; - case 'D': if (t1!='C' ) {return true;}break; - case 'H': if (t1!='G' ) {return true;}break; - case 'V': if (t1!='T' ) {return true;}break; - } - return false; -} -bool DNA::HomoNTRuns(int max){ - char lastC = Seq[0]; - int rowC=1; - for (unsigned int i=1;i= max){ - return false; - } - } else { - rowC = 1; - lastC = Seq[i]; - } - } - return true; -} - -/* -void DNA::writeSeq(ofstream& wr){ - int cnt=0; - if (Seq.size()==0){return;} - wr<<">"<0){ - wr<"<0){ - wr<"<0){ - wr< Q2(Qual); - for (int i=qs;i>=0;i--){ - Qual[i] = Q2[qs-i]; - } - reverseTS(Seq); -} -*/ -bool DNA::sameHead(shared_ptr d){ - if (d == NULL){return false;} - return sameHead(d->getIDshort()); -} -bool DNA::sameHead(const string& oID) { - size_t pos = getShorterHeadPos(ID); - if (oID.size() < pos) { return false; } - if (ID.substr(0, pos) == oID.substr(0, pos)) { - return true; - } - return false; -} - - -void DNA::setPassed(bool b){ - goodQual=b; - if (goodQual && midQual) { - midQual = false; - } -} -int DNA::getBCnumber() { - //if (Sample==-1 ) - return Sample; -} - -void DNA::prepareWrite(int ofastQver) { - uint len = length(); - if (QualTraf.size() == len) { - return; - } - QualTraf.resize(len); - unsigned int i = 0; - for (; i < len; i++) { - QualTraf[i] = char(Qual[i] + ofastQver); - } - QualTraf[i] = '\0'; -} -void DNA::resetQualOffset(int x, bool fqSol) { - for (size_t i = 0; i < Qual.size(); i++) { Qual[i] += x; } - if (fqSol) {//quick hack, since q<13 scores are the only deviants and uninteresting in most cases.. - for (size_t i = 0; i < Qual.size(); i++) { - if (Qual[i] < 0) { - Qual[i] = 0; - } - } - } -} - -/////////////////////////////////////////////////////////////// -//INPUT STREAMER - - - -void DNAunique::addSmpl(int k) { - if (k < 0) { - return; - } - Count++; -#ifdef _MAPDEREPLICATE - unordered_map::iterator smID = occurence.find(k); - if (smID == occurence.end()) { - occurence[k] = 1; - } else { - smID->second++; - } -#endif -} -void DNAunique::setOccurence(int smpl, int N) { - unordered_map::iterator smID = occurence.find(smpl); - if (smID == occurence.end()) { - occurence[smpl] = N; - } else { - smID->second += N; - } -} -/*vector> DNAunique::getDerepMapSort2(size_t wh ){ - typedef std::pair mypair; - size_t siz = occurence.size(); - if (wh > siz) { wh = siz; } - - struct IntCmp { - bool operator()(const mypair &lhs, const mypair &rhs) { - return lhs.second > rhs.second; - } - }; - - - vector myvec(occurence.begin(), occurence.end()); - std::partial_sort(myvec.begin(), myvec.begin() + wh, myvec.end(), IntCmp()); - - return myvec; -}*/ - -bool sortDescending(int i, int j) { return (i>j); }//descending sort -vector DNAunique::getDerepMapSort(size_t wh) { - vector vals; - size_t siz = occurence.size(); - if (wh > siz) { wh = siz;} - vals.reserve(siz); - for (auto kv = occurence.begin(); kv != occurence.end(); kv++) { - vals.push_back(kv->second); - } - //partial sort doesn't make sense, as I want to break border asap - //partial_sort(vals.begin(), vals.begin() + wh, vals.end(), sortDescending); - sort(vals.begin(), vals.end(), sortDescending); - return vals; -} - -bool DNAunique::pass_deprep_smplSpc(const vector& cv) { - unordered_map occ; - //combined samples will not be considered - //occ = occurence; - for (std::unordered_map::iterator iter = occurence.begin(); iter != occurence.end(); ++iter) { - //int cnts = iter->second; - int ref = cv[iter->first]; - if (ref != -1 && iter->second >= ref ) { - return true; - } - } - return false; - -} - - -void DNAunique::transferOccurence(shared_ptr odu) { - if (occurence.size() == 0) { - occurence = odu->occurence; - Count = odu->Count; - } else { - //which sample contains this dna? - unordered_map oldMap = odu->getDerepMap(); - unordered_map::iterator smID; - for (std::unordered_map::iterator oID = oldMap.begin(); oID != oldMap.end(); ++oID) { - smID = occurence.find(oID->first); - if (smID == occurence.end()) { - occurence[oID->first] = oID->second; - } else { - smID->second += oID->second; - } - } - //size track - Count += odu->Count; - } -} -void DNAunique::writeMap(ofstream & o, const string & hd, - vector & cntspersmpl, const vector& combiID) { - if (occurence.size() == 0) { return; } - int totCnt(0); - unordered_map occ; - if (combiID.size() > 0){//combine all counts on combined categories - std::unordered_map::iterator fnd; - for (std::unordered_map::iterator iter = occurence.begin(); iter != occurence.end(); ++iter) { - //aim: occ[combiID[iter->first]] += iter->second; - fnd = occ.find(combiID[iter->first]); - if (fnd == occ.end()){ - occ[combiID[iter->first]] = iter->second; - }else{ - fnd->second += iter->second; - } - } - } - else { - occ = occurence; - } - - //prints combined sample counts - o << hd; - for (std::unordered_map::iterator iter = occ.begin(); iter != occ.end(); ++iter) { - int cnts = iter->second; - totCnt += cnts; - o << "\t"; - o << iter->first << ":" << cnts; - } - //counts non-combined sample counts - for (std::unordered_map::iterator iter = occurence.begin(); iter != occurence.end(); ++iter) { - //int cnts = iter->second; - cntspersmpl[iter->first] += iter->second; - } - o << endl; - - if (totCnt != Count) { - cerr << "Unequal Counts in Map("<getID()<getIDshort(); - if (!usFmt) { - NewID += "_" + itos(Count); - } else { - NewID += ";size=" + itos(Count) + ";"; - } - IDfixed = true; -} - - -/////////////////////////////////////////////////////////////// -//INPUT STREAMER - -InputStreamer::~InputStreamer(){ - allStreamClose(); -// for (uint i = 0; i < tdn1.size(); i++) { if (tdn1[i] != NULL) { delete tdn1[i]; } } -// for (uint i = 0; i < tdn2.size(); i++) { if (tdn2[i] != NULL) { delete tdn2[i]; } } -} - -bool InputStreamer::getFastaQualLine(istream&fna, string&line) { - - if (!safeGetline(fna, line)) { return false; } - while (line[0] == '$') { //$ marks comment - safeGetline(fna, line); - } - return true; -} -bool InputStreamer::read_fasta_entry(istream&fna,istream&qual,shared_ptr tdn1, shared_ptrtdn2,int &cnt){ - - if(fna.eof()){return false;} - - //int in_int; //char in_char; - //int cnt=0; - string tqual(""),tseq(""); - string line(""), lineQ(""); - if (!getFastaQualLine(fna, line)) { return false; } - if (line == "" && fna.eof()){return false;} - if (!qualAbsent) { getFastaQualLine( qual, lineQ); } - cnt++; - - if (cnt == 1) { //fasta description - if (line[0] != '>' && fna) { cerr << "ERROR: Line 1 in fasta file does not start with \">\" \n"; exit(23); } - //new DNA, set up in tdn1 - tdn1->newHEad(line.substr(1)); - if (!getFastaQualLine(fna, line)) { return false; } - if (!qualAbsent) { getFastaQualLine(qual, lineQ); } - cnt++; - } - //continous read in until ">" is hit - while (line[0] != '>') { - tseq += line; - if (!getFastaQualLine(fna, line)) { break; } - } - if (!qualAbsent) { - while (lineQ[0] != '>') { - tqual += " " + lineQ; - if (!getFastaQualLine(qual, lineQ)) { break; } - } - } - - //fna - tdn1->setSeq(tseq); - size_t lsize = tseq.size(); - vector Iqual(lsize, 0); - //qual - if (!qualAbsent) { - const char* lQ = rtrim(tqual).c_str(); - uint ii = 0; int nn(0); - for (; ii < lsize; ii++) { - nn = parseInt(&lQ);// , posStr); - Iqual[ii] = nn; - if (*lQ == '\0') { - break; - } - //issQ >> Iqual[ii]; - } - if (ii != (lsize - 1)) { - cerr << "ERROR: quality counts (" << ii << ") not the same length as DNA base counts in Sequence (" << (lsize - 1) << ")\n" << tdn1->getID() << "\n"; - exit(54); - } - } - tdn1->Qappend(Iqual); - - - //since already read in 1 more line, this line needs to be used on new object - if (fna ) { - if (!qualAbsent && (line[0] != '>' || lineQ[0] != '>')) { cerr << "ERROR: Desynced fasta reader\n"; exit(23); } - tdn2->newHEad(line.substr(1)); - } else { - return false; - } - //a new DNA obj was set up, return to process tdni, tdno will be completed next call - return true; -} -inline int InputStreamer::parseInt(const char** p1){//,int& pos){//, const char ** curPos) { - /*from http://stackoverflow.com/questions/5830868/c-stringstream-is-too-slow-how-to-speed-up*/ - //size_t nxtPos = input.find_first_of(' ', curPos); - //const char *p = input.substr(curPos,nxtPos).c_str(); - //if (!*p || *p == '?') return 0; - //int s = 1; - //p = (const char*)curPos; - const char* p = *p1; - while (*p == ' ') p++; - - int acc = 0; - while (*p >= '0' && *p <= '9') - acc = acc * 10 + *p++ - '0'; - - - *p1 = p; - //curPos = (size_t) p; - return acc; -} -void InputStreamer::jmp_fastq(istream & fna, int &lnCnt) { - string line; - string tname = "", tseq = ""; //temporary storage - size_t cnt = 0, qcnt = 0, DNAlength = 0; - bool mode = true; //mode=T:fna,F:qual - bool needsAT = true, needsPlus = false; // checks if quality was completly read in and a '@' char is now expected in the next line - // getline(fna2,line2,'\n'); - - while (getline(fna, line, '\n')) { - lnCnt++; - if (line == "") { return ; } - if (line[0] == '@' && needsAT) { //fasta description - if (cnt != 0) { - cerr << "Line " << lnCnt << ": Could not find \'@\' when expected: on line \n" << line << endl; - } - //tname = line; - needsAT = false; - needsPlus = true; - continue; - } else if (line[0] == '+' && needsPlus) { - needsPlus = false; - mode = false; - continue; - } else if (needsAT) { - cerr << "Line " << lnCnt << " :Could not find \'@\' symbol where expected on line \n " << line << endl; - exit(6); - } - - istringstream iss(line); - iss >> tseq; - if (mode) { - DNAlength += tseq.length(); - } else { - qcnt += tseq.length(); - if (DNAlength == qcnt) { return; } - } - - cnt++; - } -} -//reads out single seq + quality entry from fastq file -shared_ptr InputStreamer::read_fastq_entry(istream & fna, int &minQScore, int& lnCnt, - bool&corrupt,bool repairInStream) { - string line; - //string tseq = ""; //temporary storage - uint qcnt = 0; - bool mode = true; //mode=T:fna,F:qual - shared_ptr tdn = make_shared("", ""); - vector Iqual(0); - bool needsAT = true, needsPlus = false; // checks if quality was completly read in and a '@' char is now expected in the next line - - if (!fna) { return NULL; cerr << "Read Fastq: Input stream does not exist" << lnCnt << endl; exit(53); } - - while (safeGetline(fna, line)) { - lnCnt++; - while (repairInStream) { - if (line[0] == '@') {repairInStream = false;} - else {safeGetline(fna, line);} - } - //if (line.length() == 0) { return tdn; } - if (needsAT) { //fasta description - if (line[0] == '@') { - if ((lnCnt-1) % 4 != 0) { - fqPassedFQsdt = false; - } - tdn->newHEad(line.substr(1)); - mode = true; qcnt = 0; - needsAT = false; - needsPlus = true; - continue; - } else { - corrupt = true;//try again,could be last empty line in file.. - if (line.length() != 0) { - IO_Error("Line " + itos(lnCnt) + " :Could not find \'@\' symbol where expected on line \n " +line);// << endl; - } - return tdn; - } - } else if (needsPlus && line[0] == '+') { - if ((lnCnt-3) % 4 != 0) { - fqPassedFQsdt = false; - } - - - Iqual.resize(tdn->length(), 0); - needsPlus = false; - mode = false; - continue; - } - - //istringstream iss(line); - //iss >> tseq; - if (mode) { - //fna - if ((lnCnt-2) % 4 != 0) { - fqPassedFQsdt = false; - } - - //if (std::any_of(line.begin(), line.end(), [](char c) {return (islower(c)); })) { - if (any_lowered(line)){ - fqPassedFQsdt = false; - std::transform(line.begin(), line.end(), line.begin(), ::toupper); - } - tdn->append(line); - } else if (!mode) { - //qual - if ((lnCnt ) % 4 != 0) { - fqPassedFQsdt = false; - } - - for (size_t i = 0; i < line.length(); i++) { - //really 33? - Iqual[qcnt] = minmaxQscore((qual_score)line[i] - fastQver); - qcnt++; - } - if (qcnt == tdn->length()) { - //needsAT=true; - tdn->Qappend(Iqual); - - } else if (line.length() + qcnt > tdn->length()) { - //check that quality gets not more length than DNA - IO_Error("ERROR: More quality positions than nucleotides detected for sequence\n " + tdn->getID()); - //tdn->setPassed(false); - corrupt = true; - return tdn; - } - break; - - } - } - //if (tdn!= NULL && !tdn->control()){delete tdn; tdn=NULL;} - corrupt = false; - return tdn; - - //to check for fast fastq reader: 1. DNA in uppercase? 2. DNA/QUAL in single line? -} -void InputStreamer::IO_Error(string x) { - cerr << x << endl; - if (DieOnError) { - exit(632); - } - ErrorLog.push_back(x); -} -//reads out single seq + quality entry from fastq file -shared_ptr InputStreamer::read_fastq_entry_fast(istream & fna, int& lnCnt, bool& corrupt) { - string line; - if (!fna) { return NULL; cerr << "Read Fastq_f: Input stream does not exist " << lnCnt << ".\n"; exit(53); } - - if (!safeGetline(fna, line)) { return NULL; } - shared_ptr tdn = make_shared("", ""); - if (line.length() == 0) { return tdn; } - while (line[0] != '@') { - IO_Error("ERROR on line " + itos(lnCnt) + ": Could not find \'@\' when expected (file likely corrupt, trying to recover):\n" + line);// << endl; - //exit(55); - //recover instead and go to next entry.. - corrupt = true; - if (!safeGetline(fna, line)) { corrupt = true; return NULL; }//delete tdn; - } - tdn->newHEad(line.substr(1)); - //cerr << line.substr(1); - if (!safeGetline(fna, tdn->getSeq())) { corrupt = true; return NULL; }//delete tdn; - //if (line.length() == 0) { return NULL; } - //std::transform(line.begin(), line.end(), line.begin(), ::toupper); - //tdn->append(line); - //"+" - if (!safeGetline(fna, line)) { corrupt = true; return NULL; }//delete tdn; - while (line[0] != '+') { - //recovery is hard, just give up this read - IO_Error("Error input line " + itos(lnCnt + 2) + ": Could not find \'+\' when expected (file likely corrupt, aborting):\n" + line);// << endl; - corrupt = true; - return tdn; - //if (!safeGetline(fna, line)) { delete tdn; return NULL; } - } - - //qual score - vector Iqual(tdn->mem_length(), 0); - if (!safeGetline(fna, line)) { corrupt = true; return NULL; }//delete tdn; - uint qcnt(0); uint lline = (uint)line.length(); - for (; qcnt < lline; qcnt++) { - Iqual[qcnt] = minmaxQscore((qual_score)line[qcnt] - fastQver); - - } - - if (qcnt == tdn->mem_length()) { - //needsAT=true; - tdn->setQual(Iqual); - } else if (line.length() + qcnt != tdn->length()) { - //check that quality gets not more length than DNA - corrupt = true; - IO_Error("Error input line " + itos(lnCnt + 3) + ": More quality positions than nucleotides detected for sequence\n " +tdn->getID());// << endl; -// exit(7); - } - lnCnt+=4; - //if (tdn!= NULL && !tdn->control()){delete tdn; tdn=NULL;} - corrupt = false; - return tdn; -} - -int InputStreamer::minmaxQscore(qual_score t) { - if (t < 0) { ////quick hack, since q<13 scores are the only deviants and uninteresting in most cases.. - if (fqSolexaFmt){ - if (t < -5) { - cerr << "Unusually low sloexa quality score (" << t << "); setting to 0.\n"; - } - } else { - if (t >= -5) { - cerr << "Resetting auto format to Solexa (illumina 1.0-1.3) format.\n"; - fqSolexaFmt = true; - } else { - cerr << "Unusually low quality score (" << t << "); setting to 0.\n"; - } - } - t = 0; - } - if (minQScore > t) { - minQScore = t; - if (minQScore < 0) { - } - } else if (maxQScore < t) { - maxQScore = t; - } - return t; -} -bool InputStreamer::checkInFileStatus() { - for (uint i = 0; i < 3; i++) { - if (fnaRead) { - if (fna_u[i] != NULL && *fna_u[i]) { - return true; - } - } else { - if (fastq_u[i] != NULL && *fastq_u[i]) { - return true; - } - } - } - return false; -} -void InputStreamer::allStreamReset() { - resetLineCounts(); -#ifdef DEBUG - cerr << "Resetting input streams" << endl; -#endif - //reopen streams in gz case // sdm 1.01: make default - if (true || openedGZ) { - allStreamClose(); - setupFastq_2(inFiles_fq[0], inFiles_fq[1], inFiles_fq[2]); - setupFastaQual2(inFiles_fna[0], inFiles_qual[0]); - } else { - for (uint i = 0; i < 3; i++) { - if (fna_u[i] != NULL && * (fna_u[i])) { fna_u[i]->clear(); fna_u[i]->seekg(0, ios::beg); } - if (qual_u[i] != NULL && *(qual_u[i])) { qual_u[i]->clear(); qual_u[i]->seekg(0, ios::beg); } - if (fastq_u[i] != NULL && *(fastq_u[i])) { fastq_u[i]->clear(); fastq_u[i]->seekg(0, ios::beg); } - } - } - //checkInFileStatus(); -} -void InputStreamer::allStreamClose(){ - for (uint i = 0; i < 3; i++){ -/* if (*(fna[i])){ fna[i]->close(); } - if (*(qual[i])){ qual[i]->close(); } - if (*(fastq[i])){ fastq[i]->close(); } -#ifdef _gzipread - if (gzfna[i]){ gzfna[i].close(); } - if (gzqual[i]){ gzqual[i].close(); } - if (gzfastq[i]){ gzfastq[i].close(); } -#endif - */ - if (fna_u[i] != NULL) { delete fna_u[i]; } fna_u[i] = NULL; - if (qual_u[i] != NULL) { delete qual_u[i]; } qual_u[i] = NULL; - if (fastq_u[i] != NULL) { delete fastq_u[i]; }fastq_u[i] = NULL; - } - - if (!fnaRead && minQScore < 1000) { - maxminQualWarns_fq( ); - } -} -void InputStreamer::jumpToNextDNA(bool&stillMore, int pos) { - if (fnaRead) {//get DNA from fasta + qual files - //shared_ptr ret; - stillMore = read_fasta_entry(*(fna_u[pos]), *(qual_u[pos]), tdn1[pos], tdn2[pos], lnCnt[pos]); - //tdn1 will be completed, tdn2 will have a header set up - //ret = tdn1[pos]; - tdn1[pos] = tdn2[pos]; tdn2[pos].reset(new DNA("", "")); - } else { - jmp_fastq(*fastq_u[pos], lnCnt[pos]); - if (!*(fastq_u[pos])) { - stillMore = false; - } - } -} - -shared_ptr InputStreamer::getDNA(bool& stillMore, int pos, bool& sync){ - //if (sync) { - // while (desync(pos)) { - // jumpToNextDNA(stillMore, pos); - // } - //} - if (pos == 1 && numPairs <= 1) { - return NULL; - } - shared_ptr ret; - bool corrupt(true); //corrupt state isn't implemented for fnaread - - bool repairInStream(false); - while (corrupt) { - if (fnaRead) {//get DNA from fasta + qual files - stillMore = read_fasta_entry(*(fna_u[pos]), *(qual_u[pos]), tdn1[pos], tdn2[pos], lnCnt[pos]); - corrupt = false; - //tdn1 will be completed, tdn2 will have a header set up - ret = tdn1[pos]; - tdn1[pos] = tdn2[pos]; tdn2[pos].reset( new DNA("", "")); - if (!ret->seal() || ret->isEmpty()) { ret = NULL; } - if (pos == 0 && pairs_read[pos] % 100 == 0) { - _drawbar(*(fna_u[pos])); - } - else if (!stillMore) { _drawbar(*(fna_u[pos])); } - } - else { //fqRead - if (fqReadSafe) { - ret = read_fastq_entry(*(fastq_u[pos]), minQScore, lnCnt[pos], corrupt, repairInStream); - if (fastQver == 0 && ret->length() > 5 && !corrupt) {//autodetect - ret->resetQualOffset(auto_fq_version(), fqSolexaFmt); - //reset streams - } - if (lnCnt[pos] > 100) {//tmp set back to 500 - if (fqPassedFQsdt) { - fqReadSafe = false; -#ifdef DEBUG - cerr << "Switching to fast fastq reader..\n "; -#endif - } - } - } - else { - ret = read_fastq_entry_fast(*(fastq_u[pos]), lnCnt[pos], corrupt); - } - if (!stillMore || fastq_u[pos]->eof() || (!*(fastq_u[pos])) ) { - if (ret != NULL) { if (!ret->seal() || ret->isEmpty()) { ret = NULL; } } //delete ret; - stillMore = false; break; - } else if (ret == NULL || !ret->seal() || ret->isEmpty()) { - corrupt = true; - } - if (pos == 0 && pairs_read[pos] % 100 == 0) { - if (_drawbar(*(fastq_u[pos]))) { stillMore = false; break; } - } - else if (!stillMore) { _drawbar(*(fastq_u[pos])); } - - - if (corrupt) { - //delete ret; - ret = NULL; - sync = true; - repairInStream = true; - } - } - pairs_read[pos]++; - //last check - } - // - return ret; -} - -void InputStreamer::openMIDseqs(string p,string in){ - if (in==""){return;} - - if (fastq_u[2] != NULL){ - cerr << "MID file was already initialized" << endl; - } -#ifdef DEBUG - cerr << "Open Mid Seq file" << endl; -#endif - - string file_type = "MID specific fastq"; - string tmp = (p + in); - inFiles_fq[2] = tmp; - if (isGZfile(tmp)){ -#ifdef _gzipread - fastq_u[2] = new igzstream(tmp.c_str(), ios::in); - file_type = "MID specific gzipped fastq"; -#else - cerr << "gzip not supported in your sdm build\n" << tmp; exit(50); -#endif - } - else { - fastq_u[2] = new ifstream(tmp.c_str(), ios::in); - } - if (!*(fastq_u[2])){ cerr << "\nCouldn't find " << file_type<<": " << in << " !\n Aborting..\n"; exit(4); } - - hasMIDs=true; -} - -bool InputStreamer::setupFastq(string path, string fileS, int& pairs, string subsPairs,bool simu) { - allStreamClose(); openedGZ = false; - minQScore = 1000; maxQScore = -1; fqSolexaFmt = false; - resetLineCounts(); - vector tfas = splitByCommas(fileS); - if ( pairs == -1 ) { - pairs = (int) tfas.size(); - if ( pairs > 1 ) { cerr << "Paired input (\",\" separated) detected\n"; } - } - numPairs = pairs; - string p1(""), p2(""), midp(""); - string xtraMsg = ""; - if (getCurFileN() > 0) { - xtraMsg = " " + itos(getCurFileN()) + " of " + itos(totalFiles); - if (BCnumber > 1) { - xtraMsg = ", looking for " + itos(BCnumber) + "BCs.\n"; - } else { - xtraMsg = ".\n"; - } - } - - if (tfas.size() != (uint)pairs && subsPairs == "") { - cerr << "Unequal number of files (" << tfas.size() << ") and option-set paired files (" << pairs << ").\n Aborting...\n"; exit(52); - } - if (tfas.size() == 3) { - if (tfas.size() != 3) { cerr << "Could not detect 3 input files in string\n" << fileS << "\n Aborting.." << endl; exit(76); } - midp = path + tfas[1]; - p1 = path + tfas[0]; - p2 = path + tfas[2]; -// cerr << p1 << " + " << p2 << " and " << midp << endl; - } else if (tfas.size() == 2) { - p1 = path + tfas[0]; - p2 = path + tfas[1]; - } else if (tfas.size() == 1) { - p1 = path + fileS; - //cerr << "Reading fastq " << p1 << endl; - } - if (subsPairs == "1") { - p2 = ""; - } else if (subsPairs == "2") { - p1 = p2; p2 = ""; - } - - if (simu) { - return fileExists(p1) && fileExists(p2) && fileExists(midp); - } - - - if (p1 != ""&&p2 != "") { - if (midp != "") { - cerr << "Reading paired fastq + MID file" << xtraMsg<<"."< 1.f) { _print(_max + 1, 1); _fileLength = 0; return false; } - - // Number of #'s as function of current progress "prog" - int cur((int) ceil(prog * (float) _max)); - if (_last != cur) _last = cur, _print(cur, prog); - if (prog == 1.f) { - return true; - } - return false; - -} -inline void InputStreamer::_measure(istream& tar) { - tar.seekg(0, ios_base::end); - _fileLength = (int) tar.tellg(); - tar.seekg(0, ios_base::beg); - tar.clear(); -} - -bool InputStreamer::setupFastq_2(string p1, string p2, string midp) { - //setupFastq_2(inFiles_fq[0],inFiles_fq[1],inFiles_fq[2]); -#ifdef DEBUG - cerr << "setupFastq2 " << p1<openMIDseqs("", midp); - } - return true; -} -//mainFile = IS->setupInput(path, uniqueFas[i], FastqF[tarID], FastaF[tarID], QualF[tarID], MIDfq[tarID], fil->isPaired(), cmdArgs["-onlyPair"]); -string InputStreamer::setupInput(string path, int i, int t, const vector& uF, const vector& FQ, const vector& Fas, - const vector& Qual, const vector& midf, int &paired, string onlyPair, - string& shMain, bool simu) { - string mainFile(""); - if (fnaRead) { - if (Fas[t] != uF[i]) { - cerr << "Error in matching FASTA target filenames.\n"; - exit(11); - } - this->setupFastaQual(path, Fas[t], Qual[t], paired, onlyPair,simu); - mainFile = path + Fas[t]; - shMain = Fas[t]; - } else { - if (FQ[t] != uF[i]) { - cerr << "Error in matching target filenames.\n"; - exit(11); - } - this->setupFastq(path, FQ[t], paired, onlyPair,simu); - mainFile = path + FQ[t]; - shMain = FQ[t]; - } - this->openMIDseqs(path, midf[t]); - return mainFile; -} - - -bool InputStreamer::setupFastaQual(string path, string Sfil, string Qfil, int& pairs, string subsPairs, bool simu) { - resetLineCounts(); - allStreamClose(); - vector tfas = splitByCommas(Sfil); - if ( pairs == -1 ) { - pairs = (int) tfas.size(); - if ( pairs > 1 ) { cerr << "Paired input (\",\" separated) detected\n"; } - } - if (pairs > 1) { cerr << "Paired fasta+qual is currently not implemented\n"; exit(72); } - if (subsPairs == "1") { - // - } else if (subsPairs == "2") { - // - } - - numPairs = pairs; - tdn1[0].reset( new DNA("", "")); tdn2[0].reset( new DNA("", "")); - - inFiles_fna[0] = path + Sfil; - inFiles_qual[0] = path + Qfil; - if (simu) { - if (!fileExists(inFiles_qual[0],-1,false)) { - cerr << "Warning: corresponding quality file is missing: " << inFiles_qual[0]<rdbuf()->pubsetbuf(Buffer, RDBUFFER); - } - //quality file - if (fileQ != "") { - if (isGZfile(fileQ)) { -#ifdef _gzipread - qual_u[0] = new igzstream(fileQ.c_str(), ios::in); - file_typeq = "gzipped quality file"; -#else - cerr << "gzip not supported in your sdm build \n" << fileQ; exit(50); -#endif - } else if (fileQ == "") { - //setup empty stream - qual_u[0] = new ifstream(); - qualAbsent = true; - } else { - qual_u[0] = new ifstream(fileQ.c_str(), ios::in); - } - if (!*(qual_u[0])) { - cerr << "\nCouldn't find " << file_typeq << " file \"" << fileQ << "\"!\n Running in no qual filter mode\n"; - qualAbsent = true; - //exit(55); - } - if (getCurFileN() > 0) { - cerr << "Reading Fasta + Quality file " << getCurFileN() << " of " << totalFiles << ".\n"; - } else { - cerr << "Reading Fasta + Quality file.\n"; - } - cerr<< file_type << " : " << fileS << endl << file_typeq << " : " << fileQ << endl; - } else if (fileS != "") { - if (getCurFileN() > 0) { - cerr << "Reading Fasta file " << getCurFileN() << " of " << totalFiles << ".\n"; - } else { - cerr << "Reading Fasta.\n"; - } - qual_u[0] = new ifstream(); - qualAbsent = true; - } - return true; -} -void InputStreamer::setupFna(string fileS){ - allStreamClose(); - resetLineCounts(); - numPairs = 1; - tdn1[0].reset(new DNA("", "")); tdn2[0].reset(new DNA("", "")); - setupFastaQual2(fileS, "","seq"); - /*cerr << "Reading Fasta file.\n" << fileS << endl; - string file_type = "seq"; - // INPUT file - if (isGZfile(fileS)){ -#ifdef _gzipread - fna_u[0] = new igzstream(fileS.c_str(), ios::in); - file_type = "gzipped seq"; -#else - cerr << "gzip not supported in your sdm build"; exit(50); -#endif - } - else { - //fna[0].open(fileS.c_str(), ios::in); - fna_u[0] = new ifstream(fileS.c_str(), ios::in); - } - if (!fna_u[0]){ cerr << "\nCouldn't find "<= 100 || maxQScore < 2) { - return fqDiff; - } - fqSolexaFmt = false; - if (minQScore >= 59 && maxQScore > 74){ - fqDiff = (fastQver - 64); fastQver = 64; - if (minQScore < 64) { //set to illumina1.0 (solexa) - fqSolexaFmt = true; - cerr << "\nSetting to illumina 1.0-1.3 (solexa) fastq version (q offset = 64, min Q=-5).\n\n"; - } else { - cerr << "\nSetting to illumina 1.3-1.8 fastq version (q offset = 64).\n\n"; - } - } else if (minQScore >= 33 && maxQScore <= 74) { - fqDiff = (fastQver - 33); fastQver = 33; - cerr << "\nSetting to Sanger fastq version (q offset = 33).\n\n"; - } else { - cerr << "\nUndecided fastq version..\n"; - fqDiff = (fastQver - 33); fastQver = 0; - //exit(53); - } - QverSet = true; - return fqDiff; -} - -void InputStreamer::maxminQualWarns_fq(){ - if (minQScore >= 59-33 && fastQver==33 ){ //set to sanger version, but no low qual over whole dataset -> probably illumina version - cerr << " WARNING :: \nQuality scores in your dataset are unusually high (min Q=" << minQScore<<"). Please check that you have a fastQ file in NCBI SRA, Sanger or Illumina 1.8+ version.\nIf not, set fastqVersion in option file to \"3\" (Illumina 1.0) or \"2\" (Illumina 1.3 < 1.8) .\n\n"; - } - if (minQScore < 0 ){ //set to sanger version, but no low qual over whole dataset -> probably illumina version - cerr << " WARNING :: \nQuality scores in your dataset are unusually low (min Q=" << minQScore << "). Please check that you have a fastQ file in Illumina 1.0 or Illumina 1.3 < 1.8 version.\nIf not, set fastqVersion in option file to \"1\" (NCBI SRA, Sanger or Illumina 1.8+ version).\n\n"; - } -} - -#ifdef _gzipread2 - -std::vector< char > readline(gzFile f) { - // gzFile fp =gzopen(fname,"r"); - - std::vector< char > v(2056); - int pos = 0; - for (;;) { - if (gzgets(f, &v[pos], (int)v.size() - pos) == 0) { - // end-of-file or error - int err; - //const char *msg = gzerror(f, &err); - if (err != Z_OK) { - // handle error - } - break; - } - unsigned read = strlen(&v[pos]); - if (v[pos + read - 1] == '\n') { - if (v[pos + read - 2] == '\r') { - pos = pos + read - 2; - } - else { - pos = pos + read - 1; - } - break; - } - if (read == 0 || pos + read < v.size() - 1) { - pos = read + pos; - break; - } - pos = v.size() - 1; - v.resize(v.size() * 2); - } - v.resize(pos); - return v; -} -#endif \ No newline at end of file diff --git a/configs/sdm_src/InputStream.h b/configs/sdm_src/InputStream.h deleted file mode 100644 index 3874649..0000000 --- a/configs/sdm_src/InputStream.h +++ /dev/null @@ -1,539 +0,0 @@ -/* sdm: simple demultiplexer -Copyright (C) 2013 Falk Hildebrand - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -*/ - - -#ifndef _InputStr_h -#define _InputStr_h - - -#include "DNAconsts.h" -#include -#include -#include - -extern char DNA_trans[256]; -extern short DNA_amb[256]; -extern short NT_POS[256]; -extern short DNA_IUPAC[256 * 256]; -typedef float matrixUnit; - - -string spaceX(uint k); -int digitsInt(int x); -int digitsFlt(float x); -string intwithcommas(int value); -std::string itos(int number); -std::string ftos(float number); -bool isGZfile(const string fileS);//test if file is gzipped input - -static inline std::string &rtrim(std::string &s) { - s.erase(std::find_if(s.rbegin(), s.rend(), std::not1(std::ptr_fun(std::isspace))).base(), s.end()); - return s; -} - - -//MOCAT header fix -std::vector header_string_split(const std::string str, const std::string sep); -void remove_paired_info(string&, short = -1); -//MOCAT header fix -std::string header_stem(string& header); -std::istream& safeGetline(std::istream& is, std::string& t); -string reverseTS2(const std::string & Seq); -void reverseTS(std::string & Seq); - - -bool any_lowered(const string& is); -//this function changes input string (file location) to have consistent file names -string applyFileIT(string x, int it, const string xtr = ""); -bool fileExists(const std::string& name, int i=-1,bool extiffail=true); -//vector orderOfVec(vector&); - - - - -class ofbufstream { -public: - ofbufstream(const string IF, int mif) :file(IF), modeIO(mif), used(0) { - if (modeIO == ios::out) { - remove(file.c_str()); - } - keeper = new char[bufS]; - } - ~ofbufstream() { - writeStream(); - delete[] keeper; - } - void operator<< (const string& X) { - size_t lX(X.length()); - if (lX + used > bufS) { - writeStream(); - } - memcpy(keeper + used, X.c_str(), lX); - used += lX; - } -private: - void writeStream() { - if (used == 0) { return; } - ofstream of(file.c_str(), ios::app); - of.write(keeper, used); - of.close(); - used = 0; - } - string file; - char *keeper; - int modeIO; - size_t used; - static const size_t bufS = 500000; -}; - - - -inline vector splitByComma(const string& fileS,bool requireTwo, char SrchStr=','){ - string::size_type pos = fileS.find(SrchStr); - if (pos == string::npos){ - if (requireTwo){ - cerr< (1,fileS); - } - } - vector tfas(2,""); - tfas[0] = fileS.substr(0,pos); - tfas[1] = fileS.substr(pos+1); - return tfas; -} - -inline vector splitByCommas(const string& fileS, char SrchStr = ',') { - if (fileS.find(SrchStr) == string::npos) { return vector(1, fileS); } - vector res = splitByComma(fileS, true, SrchStr); - vector ret(0); ret.push_back(res[0]); - while (res[1].find(SrchStr) != string::npos) { - res = splitByComma(res[1], true, SrchStr); - ret.push_back(res[0]); - } - ret.push_back(res[1]); - return ret; -} - - -//requires sorted vector with the entries being actual datapoints -template -TYPE calc_median2(vector& in, float perc){ - size_t sum = in.size(); - size_t tar = (size_t)(((float)sum) * perc); - return in[tar]; -} - - -//returns "i_fna" or "i_fastq" -string detectSeqFmt(const string); - - -class Filters; - -class DNA{ -public: - DNA(string seq, string names) :Seq(seq), SeqLength(Seq.length()), - ID(names), NewID(names), - Qual(0),QualTraf(""),Sample(-1),avgQual(-1.f), - Qsum(0),AccumError(0.),goodQual(false),midQual(false), - Read_position(-1), - FtsDetected(), - IDfixed(false), tempFloat(0.f){} - DNA():Seq(""),SeqLength(0),ID(""),NewID(""),Qual(0),QualTraf(""), - Sample(-1), avgQual(-1.f), - Qsum(0), AccumError(0.), goodQual(false), midQual(false), - Read_position(-1), - FtsDetected(), - IDfixed(false), tempFloat(0.f) { - } - bool operator==(DNA i) { - if (i.getSeqPseudo() == this->getSeqPseudo()) { - return true; - } else { - return false; - } - } - bool operator==(shared_ptr i) { - if (i->getSeqPseudo() == this->getSeqPseudo()) { - return true; - } else { - return false; - } - } - - //~DNA(){} - void append(const string &s) { Seq += s; SeqLength = Seq.length(); } - void Qappend(const vector &q); - void setSeq(string & s) { Seq = s; SeqLength = Seq.length();/*DN = Seq.c_str();*/ } - string &getSeq() { return Seq; } - string getSeq_c() { return Seq; } - string getSeqPseudo() { return Seq.substr(0, SeqLength); } - void setQual(vector& Q) { Qual = Q; avgQual = -1.f; } - const string& getID() { if (IDfixed) { return NewID; }return ID; } - string getID_copy() { string x = getID(); string y = x; return y; } - string getIDPosFree(); // remove /1 /2 #1:0 etc - const string& getOldID() { return ID; } - string getIDshort() { return ID.substr(0, getShorterHeadPos(ID)); } - string getNewIDshort() { return NewID.substr(0, getShorterHeadPos(NewID)); } - bool seal(); - bool isEmpty() { if (ID.length() == 0 && Seq.length() == 0) { this->setPassed(false); return true; } return false; } - - int numACGT(); - float getAvgQual(); - unsigned int getQsum(){return Qsum;} - float qualWinfloat(unsigned int,float,int&); - - - float binomialFilter(int, float); - //float qualWinfloat_hybr(int,float,int,float,int&); - bool qualWinPos(unsigned int,float); - bool qualAccumTrim(double d); - int qualAccumulate(double d); - double getAccumError(){ - if (AccumError == 0.f) { for (uint i = 0; i < Qual.size(); i++) { if (Qual[i] >= 0) { AccumError += SAqualP[Qual[i]]; } } } - if (std::isinf((double)AccumError)) { - AccumError = 5.f; - } - return AccumError;} - int minQual(){int mq=50; for (uint i=0;i&, vector&); - - - inline uint length() { return (uint)SeqLength; } - inline uint mem_length() { return (uint) Seq.length(); } - bool cutSeq(int start, int stop=-1, bool = false); - bool HomoNTRuns(int); - int matchSeq(string, int, int, int); - void reverse_transcribe(); - int matchSeqRev(string, int, int, int=0); - int matchSeq_tot(string, int, int, int&); - void writeSeq(ostream&,bool singleLine=false); - void writeQual(ostream&, bool singleLine = false); - void writeFastQ(ostream&,bool=true); - void writeFastQ(ofbufstream&, bool = true); - void writeFastQEmpty(ostream&); - void setNewID(string x) { NewID = x; } - void newHEad(string x){NewID=x;ID=x;} - void changeHeadPver(int ver); - void setTA_cut(bool x) { FtsDetected.TA_cut = x; } - bool getTA_cut() { return FtsDetected.TA_cut; } - void setBarcodeCut() { FtsDetected.Barcode_cut = true; FtsDetected.Barcode_detected = true; } - bool getBarcodeCut() { return FtsDetected.Barcode_cut; } - void setBarcodeDetected(bool x){ FtsDetected.Barcode_detected = x; } - bool getBarcodeDetected() { return FtsDetected.Barcode_detected; } - bool isMIDseq() { if (Read_position == 3) { return true; } return false; } - void setMIDseq(bool b){ if (b){ Read_position = 3; } } - void setpairFWD(){ Read_position = 0; } - void setpairREV(){ Read_position = 1; } - int getReadMatePos() { return (int) Read_position; } - bool sameHead(shared_ptr); - bool sameHead(const string&); - //inline void reverseTranscribe(); - void setTempFloat(float i){tempFloat = i;} - float getTempFloat(){return tempFloat;} - void adaptHead(shared_ptr,const int,const int); - void failed(){goodQual=false;midQual=false;} - bool control(){ if (Qual.size()==0){return false;}return true;} - void setBCnumber(int i, int BCoff) { if (i < 0) { Sample = i ; FtsDetected.Barcode_detected = false; } else { Sample = i + BCoff; FtsDetected.Barcode_detected = true; } } - int getBCnumber();//always return BC tag IDX global (no local filter idx accounted for, use getBCoffset() to correct) - - void prepareWrite(int fastQver); - void reset(); - void resetTruncation() { SeqLength = Seq.length(); } - void setPassed(bool b); - void setMidQual(bool b) { midQual = b; } - bool isPassed(void){return goodQual;} - bool isMidQual(void){return midQual;} - string getSubSeq(int sta, int sto){return Seq.substr(sta,sto);} - void resetQualOffset(int off, bool solexaFmt); - - //control & check what happened to any primers (if) - bool has2PrimersDetected() { return (FtsDetected.reverse && FtsDetected.forward); } - bool getRevPrimCut() { return FtsDetected.reverse; } - bool getFwdPrimCut() { return FtsDetected.forward; } - void setRevPrimCut() { FtsDetected.reverse = true; } - void setFwdPrimCut() { FtsDetected.forward = true; } - //only used in pre best seed step - //float getSeedScore() { return tempFloat; } - //void setSeedScore(float i) { tempFloat = (float)i; } - - struct QualStats { - bool maxL; bool PrimerFail; bool AvgQual; //sAvgQual - bool HomoNT; bool PrimerRevFail; bool minL; - bool minLqualTrim; //<-sMinQTrim trimmed due to quality - bool TagFail; bool MaxAmb; bool QualWin;//sQualWin - bool AccErrTrimmed; bool QWinTrimmed; // either of these makes bool Trimmed; - bool fail_correct_BC; bool suc_correct_BC; bool - failedDNAread; - //bool adapterRem; -> setTA_cut - bool RevPrimFound; - bool BinomialErr; bool dblTagFail; - QualStats() : - maxL(false), PrimerFail(false), AvgQual(false), HomoNT(false), - PrimerRevFail(true), minL(false), minLqualTrim(false), - TagFail(false), MaxAmb(false), QualWin(false), - AccErrTrimmed(false), QWinTrimmed(false), - fail_correct_BC(false), suc_correct_BC(false), - failedDNAread(false), RevPrimFound(false), - BinomialErr(false), - dblTagFail(false) - {} - } QualCtrl; - -protected: - size_t getShorterHeadPos(const string & x, int fastQheadVer=-1) { - - size_t pos(string::npos); - if (fastQheadVer != 0) { - if (Read_position == 1) { - pos = x.find("/2"); - // if (pos == string::npos) { pos = x.find_first_of(" 1:");} - // if (pos == string::npos) { pos = x.find_first_of("/1"); } - } - else if (Read_position == 0) { - pos = x.find("/1"); - } - else { - pos = x.find("/1"); - if (pos == string::npos) { pos = x.find("/2"); } - } - } - //if (pos == string::npos){pos=x.length()-min((size_t)5,x.length());}} - //if(pos<0){pos=0;} - if (pos == string::npos) { pos = min(x.find(' '), x.find('\t')); } - - if (pos == string::npos) { pos = x.length(); } - return pos; - } - //mainly used to mark if rev/Fwd primer was detected - string xtraHdStr(); - size_t getSpaceHeadPos(const string & x) { - size_t pos = x.find(' '); - if (pos == string::npos) { pos = x.length(); } - return pos; - } - //binomial accumulated error calc - inline float interpolate(int errors1, float prob1, int errors2, float prob2, float alpha); - float sum_of_binomials(const float j, int k, float n, int qual_length, const vector& error_probs, const vector< vector> & per_position_accum_probs); - inline float prob_j_errors(float p, float j, float n); - - inline bool matchDNA(char,char); - - string Seq; - size_t SeqLength; - string ID,NewID; //original and newly constructed ID - vector Qual; - string QualTraf; - int Sample; - - //const char* DN; - float avgQual; - unsigned int Qsum; - double AccumError; - bool goodQual,midQual; - //bool TA_cut, Barcode_cut; //technical adapter, barcode (tag) - short Read_position;//-1=unkown; 0=pair1 (fwd primer); 1=pair2 (rev primer); 3=MID seq ; - - struct ElementsDetection{ - bool forward; bool reverse;//primers detected - bool TA_cut; bool Barcode_detected; bool Barcode_cut; - ElementsDetection() :forward(false), reverse(false), TA_cut(false), Barcode_detected(false), Barcode_cut(false) {} - void reset() { forward = false; reverse = false; TA_cut = false; Barcode_detected = false; Barcode_cut = false; } - } FtsDetected; - - - - bool IDfixed; - float tempFloat; -}; - -struct DNAHasher -{ - size_t operator()(shared_ptr k) const - { - // Compute individual hash values for two data members and combine them using XOR and bit shifting - return ((hash()(k->getSeqPseudo())) >> 1); - } -}; - - - - - -class DNAunique : public DNA{//used for dereplication -public: - DNAunique() : DNA(), Count(0), pair(0){}//chimeraCnt((matrixUnit) 1), - DNAunique(string s, string x) :DNA(s, x), Count(1) {} - DNAunique(shared_ptrd, int BC) : DNA(*d), Count(0), BestSeedLength( (uint)Seq.size()),pair(0){ addSmpl(BC); } - ~DNAunique() { ; }// if (pair != NULL) { delete pair; } - //string Seq; string ID; - void Count2Head(bool); - void addSmpl(int k); - void writeMap(ofstream & o, const string&, vector&, const vector&); - inline int getCount() { return Count; } - uint getBestSeedLength() { return BestSeedLength; } - void setBestSeedLength(uint i) { BestSeedLength = i; } - void setOccurence(int smpl, int N); - void transferOccurence(shared_ptr); - const unordered_map & getDerepMap() { return occurence; } - vector getDerepMapSort(size_t); - //vector> getDerepMapSort2(size_t wh); - void getDerepMapSort(vector&, vector&); - void saveMem() { QualTraf = ""; NewID = ID.substr(0, getSpaceHeadPos(ID)); ID = ""; } - void attachPair(shared_ptr d) { pair = d; pair->saveMem(); } - shared_ptr getPair(void) { return pair; } - //estimates if one sample occurence covers the unique counts required for sample specific derep min counts - bool pass_deprep_smplSpc( const vector&); - - //matrixUnit chimeraSplitNum() { return chimeraCnt; } - //void setChimSplitNum(matrixUnit x) { chimeraCnt = x; } - //sort - //bool operator < (const DNAunique& str) const { return (Count < str.Count); } -private: - int Count; - //matrixUnit chimeraCnt; - int BestSeedLength; - unordered_map occurence; - shared_ptr pair; - -}; - -class InputStreamer{ -public: - InputStreamer(bool fnRd, int fq, string ignoreInptEr="0") : - _fileLength(10), _max(60), _last(0), - fna_u(3, NULL), qual_u(3, NULL), fastq_u(3,NULL), - inFiles_fna(3, ""), inFiles_qual(3, ""), inFiles_fq(3, ""), - //fna(3,NULL), qual(3,NULL), fastq(3,NULL), - -#ifdef _gzipread - //gzfna(3,NULL), gzqual(3,NULL), gzfastq(3,NULL), -#endif - tdn1(3, NULL), tdn2(3, NULL), - fnaRead(fnRd), hasMIDs(false), - lnCnt(3, 0), fastQver(fq), - minQScore(1000), maxQScore(-1), - QverSet(true), numPairs(1), - pairs_read(3, 0), opos(3,0), - currentFile(0), totalFiles(0), BCnumber(0), - qualAbsent(false), - fqReadSafe(true), fqPassedFQsdt(true), - fqSolexaFmt(false), openedGZ(false), - ErrorLog(0), DieOnError(true) - { - opos[0] = 1; if (fastQver == 0) { QverSet = false; } - if (ignoreInptEr=="1") { DieOnError = false; } - } - ~InputStreamer(); - //path, fasta, qual, pairNum - string setupInput(string path, int i, int tarID, - const vector& uF, const vector& FQ, const vector& Fas, const vector& Qual, - const vector& midf, int &paired, string onlyPair, - string& shortMainFile, bool simu = false); - bool setupFastaQual(string,string, string, int&, string,bool=false); - void setupFna(string); - //path, fastq, fastqVer, pairNum - bool setupFastq(string,string, int&,string,bool = false); - //0=pair 1; 1=pair 2; 2=midSeq; sync=synchronize read pairs (ie only first pair read so far, jump to same DNA reads with second pair) - shared_ptr getDNA(bool&,int,bool& sync); - void jumpToNextDNA(bool&, int); - //shared_ptr getDNA2(bool&); - //shared_ptr getDNA_MID(bool&); - bool hasMIDseqs(){return hasMIDs;} - void allStreamClose(); - void allStreamReset(); - void openMIDseqs(string,string); - int pairNum() { return numPairs; } - bool qualityPresent() { return !qualAbsent; } - bool checkInFileStatus(); - void atFileYofX(uint cF, uint tF, uint BCn) { currentFile = cF; totalFiles = tF; BCnumber = BCn; } - uint getCurFileN() { return currentFile; } - -private: - inline qual_score minmaxQscore(qual_score t);// , int lnCnt); - int parseInt(const char** p1);// , int &pos);// , const char ** &curPos); - bool setupFastq_2(string, string, string); - bool setupFastaQual2(string, string, string = "fasta file"); - shared_ptr read_fastq_entry(istream & fna, int &minQScore, - int&,bool&,bool); - shared_ptr read_fastq_entry_fast(istream & fna, int&,bool&); - void jmp_fastq(istream &, int&); - bool read_fasta_entry(istream&fna,istream&qual,shared_ptr in,shared_ptr,int&); - bool getFastaQualLine(istream&fna, string&); - void maxminQualWarns_fq(); - int auto_fq_version(); - int auto_fq_version(int minQScore, int maxQScore=0); - void resetLineCounts(){ lnCnt[0] = 0; lnCnt[1] = 0; lnCnt[2] = 0; } - bool desync(int pos) { if ( abs(pairs_read[pos] - pairs_read[opos[pos]]) > 1 ) {return true; } return false; } - void IO_Error(string x); - //bar on file read progress - void _measure(istream &); - inline bool _drawbar(istream &); - inline void _print(int cur, float prog); - int _fileLength, _max, _last; - - - - //abstraction to real file type - vector fna_u, qual_u, fastq_u; - vector inFiles_fna, inFiles_qual, inFiles_fq; - //0,1,2 refers to pairs / MID fasta files - //vector fna, qual, fastq; - //ifstream qual, fastq, - //second pair - //ifstream fna2, qual2, fastq2, - //usually used for MID - //fna3, qual3, fastq3; - - //required for Fasta in term storage - vector> tdn1; vector> tdn2; - //shared_ptr tdn21; shared_ptr tdn22; - //shared_ptr tdn31; shared_ptr tdn32; - bool fnaRead, hasMIDs; - vector lnCnt;// , lnCnt2, lnCnt3;//line count - int fastQver,minQScore,maxQScore;//which version of Fastq? minima encountered Qscore - bool QverSet; - //1 or 2? - int numPairs; - //keep track of sequences read for each pair; other position (1=2,2=1) - vector pairs_read, opos; - //some stats to print, nothing really relevant - uint currentFile, totalFiles, BCnumber; - //is quality information even available? - bool qualAbsent; - //fq format not checked for completeness - bool fqReadSafe, fqPassedFQsdt, fqSolexaFmt; - bool openedGZ; - - //collects errors, handles errors - vector ErrorLog; - bool DieOnError; -}; - - - - - - -#ifdef _gzipread2 -std::vector< char > readline(gzFile f); -#endif - -#endif \ No newline at end of file diff --git a/configs/sdm_src/InputStream.o b/configs/sdm_src/InputStream.o deleted file mode 100644 index 86049c7..0000000 Binary files a/configs/sdm_src/InputStream.o and /dev/null differ diff --git a/configs/sdm_src/Makefile b/configs/sdm_src/Makefile deleted file mode 100644 index f23813b..0000000 --- a/configs/sdm_src/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -# copyright: Michael Safyan -# modified by Falk Hildebrand - -program_NAME := sdm -program_C_SRCS := $(wildcard *.c) -program_CXX_SRCS := $(wildcard *.cpp) -program_C_OBJS := ${program_C_SRCS:.c=.o} -program_CXX_OBJS := ${program_CXX_SRCS:.cpp=.o} -program_OBJS := $(program_C_OBJS) $(program_CXX_OBJS) -program_INCLUDE_DIRS := -program_LIBRARY_DIRS := ${CPATH} -program_LIBRARIES := - - -CPPFLAGS +=-O3 -lz -std=c++0x -D__USE_XOPEN2K8 -CPPFLAGS += $(foreach includedir,$(program_INCLUDE_DIRS),-I$(includedir)) -LDFLAGS += $(foreach librarydir,$(program_LIBRARY_DIRS),-L$(librarydir)) -LDFLAGS += $(foreach library,$(program_LIBRARIES),-l$(library)) - -.PHONY: all clean distclean - -all: $(program_NAME) - -$(program_NAME): $(program_OBJS) - $(LINK.cc) $(program_OBJS) -lz -o $(program_NAME) - -clean: - @- $(RM) $(program_NAME) - @- $(RM) $(program_OBJS) - -distclean: clean - diff --git a/configs/sdm_src/containers.cpp b/configs/sdm_src/containers.cpp deleted file mode 100644 index b89c7b0..0000000 --- a/configs/sdm_src/containers.cpp +++ /dev/null @@ -1,5422 +0,0 @@ -/* sdm: simple demultiplexer -Copyright (C) 2013 Falk Hildebrand -email: Falk.Hildebrand@gmail.com - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -*/ - - -#include "containers.h" -using namespace std; - - -void trim(string& str){ - // trim trailing spaces - size_t endpos = str.find_last_not_of(" \t"); - if( string::npos != endpos ) - { - str = str.substr( 0, endpos+1 ); - } - - // trim leading spaces - size_t startpos = str.find_first_not_of(" \t"); - if( string::npos != startpos ) - { - str = str.substr( startpos ); - } -} -//from http://stackoverflow.com/questions/8888748/how-to-check-if-given-c-string-or-char-contains-only-digits -bool is_digits(const std::string &str) -{ - return std::all_of(str.begin(), str.end(), ::isdigit); // C++11 -} - - - -bool betterPreSeed(shared_ptr d1, shared_ptr d2, shared_ptr ref) { - //0.2% difference is still ok, but within 0.5% of the best found seed (prevent detoriating sequence match) - //float blen = (float)ref->length() + (float)d1->length(); - shared_ptr ref2 = ref->getPair(); - uint curL = d1->mem_length(); - if (d2 != NULL) { curL += d2->mem_length(); } - else { - if (d1->has2PrimersDetected() && !ref->has2PrimersDetected()) { return true; } - if (!d1->has2PrimersDetected() && ref->has2PrimersDetected()) { return false; } - } - uint bestL = ref->getBestSeedLength(); - if (d1->getFwdPrimCut() && !ref->getFwdPrimCut()){ return true; }//hard reason - if (!d1->getFwdPrimCut() && ref->getFwdPrimCut()) { return false; } - - if (float(curL) / float(bestL) < BestLengthRatio) { return false; } - - //at least 90% length of "good" hit - if (d1->mem_length() / ref->mem_length() < RefLengthRatio) { return false; } - - //checks if the new DNA has a better overall quality - //1 added to qual, in case no Qual DNA is used - float thScore = (1 + d1->getAvgQual())* log((float)d1->mem_length()); - float rScore = (1 + ref->getAvgQual())* log((float)ref->mem_length()); - if (thScore > rScore) { - //also check for stable lowest score - if (d1->minQual() > ref->minQual() - MinQualDiff && (d2 == NULL || ref2 == NULL)) { - if (curL > bestL) { ref->setBestSeedLength(curL); } - return true; - } - } - if (d2 == NULL || ref2 == NULL) { - return false; - } - if (d2->getRevPrimCut() && !ref2->getRevPrimCut()) { return true; }//hard reason - if (!d2->getRevPrimCut() && ref2->getRevPrimCut()) { return false; }//hard reason - - //at least 90% length of "good" hit - if (d2->mem_length() / ref2->mem_length() < RefLengthRatio) { return false; } - - //checks if the new DNA has a better overall quality - //weigh with average id to OTU seed - thScore += (1 + d2->getAvgQual()) * log((float)d2->mem_length()) * 97; - rScore += (1 + ref2->getAvgQual()) * log((float)ref2->mem_length()) * 97; - if (thScore > rScore) { - //update best seed length score - if (curL > bestL) { ref->setBestSeedLength(curL); } - return true; - } - - return false; -} - -dualPrimerDistrStats::dualPrimerDistrStats(const vector&, const vector&){ - -} - -ReadSubset::ReadSubset(const string inf, const string default_outfile): -RemainderStrPos(-1), newHD(0), outFiles(0), outFilesIdx(0) { - string line; - ifstream in(inf.c_str()); - if (!in){ - cerr << "Could not find " << inf << " read subset file. Exiting.\n"; exit(90); - } - int ini_ColPerRow(0), cnt(0), skips(0); - - //check read subset format - while (!safeGetline(in, line).eof()) { - if (line.substr(0, 1) == "#"){ skips++; continue; } - string segments; - int ColsPerRow = 0; // Initialize counter. - stringstream ss; - ss << line; - while (getline(ss, segments, '\t')) { - ColsPerRow++; - } - - if (cnt == 0){ - ini_ColPerRow = ColsPerRow; - } - else { - if (ColsPerRow != ini_ColPerRow){ - cerr << "Number of columns in read subset file on line " << cnt + skips << " is " << ColsPerRow << ". Expected " << ini_ColPerRow << " columns.\n"; - exit(91); - } - if (cnt > 1000){ - break; - } - } - cnt++; - } - if (ini_ColPerRow == 0){ - cerr << "Read Subset File exists, but appears to be badly formated (0 columns detected). Exiting\n"; exit(92); - } - if (cnt == 0){ - cerr << "Read Subset File exists, but appears to be badly formated (0 lines detected). Exiting\n"; exit(92); - } - in.clear(); - in.seekg(0, ios::beg); - //extract read subset content - cnt = 0; - map tmpFiles; - map::iterator tmpFilIT; - unordered_map ::iterator TarIT; - //parameters were set for in matrix, now read line by line - while (!safeGetline(in, line).eof()) { - // while(getline(in,line,'\n')) { - if (cnt != 0 && line.substr(0, 1) == "#"){ continue; } - if (line.length() < 5){ continue; } - stringstream ss; string segments; int cnt2 = 0; - ss << line; - while (getline(ss, segments, '\t')) { - if (cnt2 == 0){ - //free target RD from paired end info - remove_paired_info(segments); - TarIT = Targets.find(segments); - if (TarIT == Targets.end()){ - Targets[segments] = cnt; - } else { - break; - } - } - else if (cnt2 == 1){ - newHD.push_back(segments); - } - else if (cnt2 == 2){ - uint Idx(0); - //1: test if outfile already exists - tmpFilIT = tmpFiles.find(segments); - if (tmpFilIT != tmpFiles.end()){ - Idx = (*tmpFilIT).second; - } - else {//create new key - Idx = (uint) outFiles.size(); - outFiles.push_back(segments); - tmpFiles[segments] = Idx; - } - //2: add the index to outfile to this position - outFilesIdx.push_back(Idx); - } - cnt2++; - } - if (cnt2 == 1) {//no specific header - newHD.push_back(""); - tmpFilIT = tmpFiles.find("Default"); - uint Idx(0); - if (tmpFilIT != tmpFiles.end()) { - Idx = (*tmpFilIT).second; - } else {//create new key - Idx = (uint)outFiles.size(); - outFiles.push_back("Default"); - tmpFiles["Default"] = Idx; - } - - outFilesIdx.push_back(Idx); - } - if (cnt2>0) { cnt++; } - } - //is extra file info in read subset file? - if (ini_ColPerRow < 3){ - outFilesIdx.resize(Targets.size(), 0); - outFiles.resize(1, default_outfile); - } - -} - -void ReadSubset::findMatches(shared_ptr IS, shared_ptr MD, bool mocatFix) { - shared_ptr match(NULL); shared_ptr match2(NULL); - bool cont(true), cont2(true); - int idx(0); - int pairs = IS->pairNum(); - unordered_map ::iterator SEEK; - bool b_doHD = newHD.size() > 0; - bool sync(false);//meaningless placeholder - while (cont) { - match = IS->getDNA(cont, 0, sync); - if (match == NULL) {break;} - if (pairs > 1) { - match2 = IS->getDNA(cont2, 1, sync); - } - string curID = match->getIDPosFree(); - if (mocatFix && curID.length()>5 ) { - curID = header_stem(curID); - //cerr << curID << endl; - //exit(0); - } - //cerr << curID << endl; - SEEK = Targets.find(curID); - if (SEEK == Targets.end()) {//no hit - if (RemainderStrPos != -1) { - MD->writeSelectiveStream(match, 0, RemainderStrPos); - if (pairs > 1) { - MD->writeSelectiveStream(match2, 1, RemainderStrPos); - } - } else {// nothing written, but still need to delete -// delete match; if (pairs > 1) {delete match2;} - } - continue; - } - //serious work - //cerr << "H"; - idx = SEEK->second; - if (b_doHD && newHD[idx] != "" && pairs>1) { - match->newHEad(newHD[idx] + "#1:0"); - match2->newHEad(newHD[idx] + "#2:0"); - } - MD->writeSelectiveStream(match, 0, outFilesIdx[idx]); - if (pairs > 1) { - MD->writeSelectiveStream(match2, 1, outFilesIdx[idx]); - } - - //finished, clean up to reduce search space (faster??) - Targets.erase(SEEK); - if (Targets.size() == 0 && RemainderStrPos==-1) { - return; - } - } - cerr << Targets.size() << " seqs remaining (not found in current file)\n"; -} - - - - - -MultiDNA::MultiDNA(shared_ptr fil,OptContainer& cmdArgs, - std::ios_base::openmode writeStatus, shared_ptr RDSset, - string fileExt,int forceFmt) : - MFil(fil),subFilter(0),DNAsP1(0),DNAsP2(0),DNAsS1(0),DNAsS2(0), - DNAsNoHead(0),DNAsP1_alt(0),DNAsP2_alt(0),DNAsS1_alt(0),DNAsS2_alt(0), - suppressOutWrite(0),write2File(true), mem_used(false), - DNAinMem(0),writeThreadStatus(0), - fastQver(fil->getuserReqFastqVer()), - fastQoutVer(fil->getuserReqFastqOutVer()),BWriteQual(false), - BWriteFastQ(false), b_multiOutStream(false), b_changeFQheadVer(false), - b_oneLinerFasta(false), b_doDereplicate(false), b_writePassed(true), b_writeMidPass(true), - maxReadsPerOFile(fil->maxReadsOutput()),ReadsWritten(fil->writtenReads()), - maxRdsOut(-1), stopAll(false), - leadingOutf(""), locCmdArgs(cmdArgs), Derepl(NULL), cntDerep(0), wrMode(ios::out), - sFile(0), qFile(0), fqFile(0), - sFileStr(0), qFileStr(0), fqFileStr(0), fqNoBCFile(0), totalFileStrms(0) -/* qFilePos(0), sFilePos(0), fqFilePos(0), - qFile2Pos(0), sFile2Pos(0), fqFile2Pos(0),//second pair - qFileSPos(0), sFileSPos(0), fqFileSPos(0),//singleton - qFileS2Pos(0), sFileS2Pos(0), fqFileS2Pos(0)//singleton*/ -{ - - pairedSeq = MFil->isPaired(); - if (cmdArgs.find("-suppressOutput") != cmdArgs.end()) { - suppressOutWrite = atoi( cmdArgs["-suppressOutput"].c_str() ); - } - if (suppressOutWrite == 3 || suppressOutWrite == 1){ - b_writePassed = false; - } - if (suppressOutWrite == 3 || suppressOutWrite == 2){ - b_writeMidPass = false; - } - if (fil->doSubselReads() && RDSset != nullptr ) { - this->openSeveralOutstreams(locCmdArgs,RDSset, writeStatus); - } else {//standard one file output stream - this->openOutStreams(locCmdArgs, MFil->getFileIncrementor(), writeStatus, fileExt, forceFmt); - } - if (cmdArgs["-o_fastq_noBC"] != "") { - openNoBCoutstrean(cmdArgs["-o_fastq_noBC"]); - } - - //threads = futures(num_threads); - //if (pairedSeq){cerr<<"paired MFil\n";} -} -MultiDNA::~MultiDNA(){ - //delete MFil; -#ifdef DEBUG - cerr << "Destr MultiDNA" ; -#endif - delAllDNAvectors(); -#ifdef DEBUG - cerr << ".. done" << endl; -#endif - -/* for (uint i=0; icloseOutStreams(true); -#ifdef DEBUG - cerr << "Subfilters deleted, streams closed" << endl; -#endif - //delete optim; -} -void MultiDNA::delAllDNAvectors(){ -#ifdef DEBUG - cerr << "cleaning MD.."; -#endif - /*for (unsigned int i=0; i < DNAsP1.size(); i++){ delete DNAsP1[i]; } - for (unsigned int i=0; i < DNAsP2.size(); i++){ delete DNAsP2[i]; } - for (unsigned int i=0; i < DNAsS1.size(); i++){ delete DNAsS1[i]; } - for (unsigned int i=0; i < DNAsS2.size(); i++){ delete DNAsS2[i]; } - for (unsigned int i=0; i < DNAsNoHead.size(); i++){ delete DNAsNoHead[i]; } - for (unsigned int i=0; i < DNAsP1_alt.size(); i++){ delete DNAsP1_alt[i]; } - for (unsigned int i=0; i < DNAsP2_alt.size(); i++){ delete DNAsP2_alt[i]; } - for (unsigned int i=0; i < DNAsS1_alt.size(); i++){ delete DNAsS1_alt[i]; } - for (unsigned int i=0; i < DNAsS2_alt.size(); i++){ delete DNAsS2_alt[i]; } - */ - DNAsP1.resize(0);DNAsP2.resize(0);DNAsS1.resize(0); - DNAsS2.resize(0);DNAsNoHead.resize(0); - DNAsP1_alt.resize(0);DNAsP2_alt.resize(0); - DNAsS1_alt.resize(0);DNAsS2_alt.resize(0); -#ifdef DEBUG - cerr << " finished\n"; -#endif - -} - - -void MultiDNA::analyzeDNA(shared_ptr d, int FilterUse, int pair,int& idx) { - if (d==NULL){ - return ; - } - //collect some info on general run parameters - MFil->preFilterSeqStat(d, pair); - - - if ( !MFil->doFilterAtAll() ) { - bool isP1 = max(0, pair) == 0; - if (idx < 0 && !isP1 && !MFil->doubleBarcodes()) { - ; - } else if (idx < 0) { - idx = MFil->cutTag(d, isP1); //still need to check for BC - } - if (idx >= 0) { //prevent second read pair from being flagged as true - d->setPassed(true); - } - return; - } - - if (MFil->secondaryOutput() ){ - MFil->checkXtra(d, pair, idx); - } else if (FilterUse==-1){ - MFil->check(d, false, pair, idx); - } else { - cerr << "Invalid control path in analyzeDNA\n"; exit(55); - subFilter[FilterUse]->check(d, false, pair, idx); - } - //count this as failure if BC was present - //d->prepareWrite(fastQoutVer); -} -vector MultiDNA::analyzeDNA(shared_ptr p1, shared_ptr p2, shared_ptr mid, - bool changePHead, int FilterUse){ - cerr << "deprecated analuze DNA"; exit(2323); - vector ret(2, true); - //1st: check if DNA pointer valid - if (p1 == NULL){ - ret[0] = false; - } else { - MFil->preFilterSeqStat(p1, 0); - } - if (p2 == NULL){ - ret[1] = false; - } else { - MFil->preFilterSeqStat(p2, 1); - } - if (mid == NULL){//no MID? kill - ret[1] = false; ret[0] = false; return ret; - } - else { - mid->setMIDseq(true); - } - - //2nd: check if basic DNA signatures are valid - - //3rd: check if all quality criteria are met, BC is true etc. - //if (FilterUse == -1){ - //ret = MFil->check_pairs(p1, p2, mid, ret, changePHead); - //} else { //mutithread, to avoid race conditions etc use separate filter - // ret = subFilter[FilterUse]->check_pairs(p1, p2, mid, ret, changePHead); - //} - //if (ret[0]){ p1->prepareWrite(fastQoutVer); } - //if (ret[1]){ p2->prepareWrite(fastQoutVer);} - - return ret; -} - -bool MultiDNA::checkFastqHeadVersion(shared_ptr d,bool disable){ - b_changeFQheadVer = false; - if (pairedSeq==1){return false;} - int fastQheadVer = 0; - string head = d->getID(); - int shouldVer = MFil->FQheadV(); - if (head.find("/1") != string::npos || head.find("/2") != string::npos){ - fastQheadVer = 1; - } - if (head.find(" 1:") != string::npos|| head.find(" 2:") != string::npos){ - fastQheadVer = 2; - } - if (shouldVer!=0 && shouldVer != fastQheadVer){ - b_changeFQheadVer=true; - } - bool ret = b_changeFQheadVer; - if (disable){ - b_changeFQheadVer = false; - } - return ret; -} - - - -void MultiDNA::writeAllStoredDNA(){ -#ifdef DEBUG - printStorage(); - cerr << "Writting stored DNA" << DNAsP1.size() <<" " <0){ - if (writeThreadStatus>1){wrThread.join();} - writeThreadStatus++; - wrThread = thread(&MultiDNA::writeAllStoredDNA2t,this); - } else { - writeAllStoredDNA2(); - } -#else - writeAllStoredDNA2(); -#endif -} -void MultiDNA::writeAllStoredDNA2(){ - mem_used=false; - // - if (b_writePassed) { - - // - if (DNAsP1.size() != DNAsP2.size()) { - for (unsigned int i = 0; i < DNAsP1.size(); i++) { - writeAndDel(DNAsP1[i], 0); - ReadsWritten++; - } - for (unsigned int i = 0; i < DNAsP2.size(); i++) { - writeAndDel(DNAsP2[i], 1); - } - } else { - for (unsigned int i = 0; i < DNAsP1.size(); i++) { - writeAndDel(DNAsP1[i], 0); - writeAndDel(DNAsP2[i], 1); - ReadsWritten++; - } - } - for (unsigned int i = 0; i < DNAsS1.size(); i++) { - writeAndDel(DNAsS1[i], 2); - } - - for (unsigned int i = 0; i < DNAsS2.size(); i++) { - writeAndDel(DNAsS2[i], 3); - } - DNAsP1.resize(0); DNAsP2.resize(0); - DNAsS1.resize(0); DNAsS2.resize(0); - } - if (b_writeMidPass) { - - //Xtra file - if (DNAsP1_alt.size() != DNAsP2_alt.size()) { - for (unsigned int i = 0; i < DNAsP1_alt.size(); i++) { - writeAndDel(DNAsP1_alt[i], 0); - ReadsWritten++; - } - for (unsigned int i = 0; i < DNAsP2_alt.size(); i++) { - writeAndDel(DNAsP2_alt[i], 1); - } - } else { - for (unsigned int i = 0; i < DNAsP1_alt.size(); i++) { - writeAndDel(DNAsP1_alt[i], 0); - writeAndDel(DNAsP2_alt[i], 1); - ReadsWritten++; - } - } - for (unsigned int i = 0; i < DNAsS1_alt.size(); i++) { - writeAndDel(DNAsS1_alt[i], 2); - } - - for (unsigned int i = 0; i < DNAsS2_alt.size(); i++) { - writeAndDel(DNAsS2_alt[i], 3); - } - DNAsP1_alt.resize(0); DNAsP2_alt.resize(0); - DNAsS1_alt.resize(0); DNAsS2_alt.resize(0); - } - //just to be on the safe side.. - delAllDNAvectors(); -#ifdef DEBUG - cerr << " .. Finished" << endl; -#endif -} -#ifdef _THREADED -void MultiDNA::writeAllStoredDNA2t(){ - std::lock_guard guard(mutex); - write2File=true;mem_used=false; - if (DNAsP1.size() != DNAsP2.size()){ - for (unsigned int i=0; iincrementFileIncrementor(); - this->closeOutStreams(true); - //this is definetely a new file - - this->openOutStreams(locCmdArgs,MFil->getFileIncrementor(),ios_base::out); - ReadsWritten = 0; -} -void Filters::addDNAtoCStats(shared_ptr d,int Pair) { - //here should be the only place to count Barcodes! - int easyPair = Pair < 3 ? Pair - 1 : Pair - 3; - colStats[easyPair].total2++; - if (d->isPassed() || d->isMidQual()) { - this->DNAstatLQ(d, easyPair, d->isMidQual()); - colStats[easyPair].totalSuccess++; - } else { - colStats[easyPair].totalRejected++; - - } - - //some general stats that always apply: - if (d->QualCtrl.PrimerFail) { - colStats[easyPair].PrimerFail++; - } - if (d->QualCtrl.PrimerRevFail) { - colStats[easyPair].PrimerRevFail++; - } - if (d->QualCtrl.minLqualTrim) { - colStats[easyPair].minLqualTrim++; - } - if (d->QualCtrl.TagFail) { - colStats[easyPair].TagFail++; - } - if (d->QualCtrl.fail_correct_BC) { - colStats[easyPair].fail_correct_BC++; - } - if (d->QualCtrl.suc_correct_BC) { - colStats[easyPair].suc_correct_BC++; - } - if (d->QualCtrl.RevPrimFound) { - colStats[easyPair].RevPrimFound++; - } - if (d->QualCtrl.QWinTrimmed || d->QualCtrl.AccErrTrimmed) { - colStats[easyPair].Trimmed++; - } - if (d->getTA_cut()) { - colStats[easyPair].adapterRem++; - } - - - if (d->isPassed() || d->isMidQual()) { - countBCdetected(d->getBCnumber(), easyPair, false); - //and register as success - } else { - if (d->getBarcodeDetected()) { - //DNA is no longer useful - failedStats2(d, easyPair); - } - //delete d; - if (d->QualCtrl.AvgQual) { - colStats[easyPair].AvgQual++; - } - if (d->QualCtrl.minL) { - colStats[easyPair].minL++; - } - if (d->QualCtrl.maxL) { - colStats[easyPair].maxL++; - } - if (d->QualCtrl.HomoNT) { - colStats[easyPair].HomoNT++; - } - if (d->QualCtrl.MaxAmb) { - colStats[easyPair].MaxAmb++; - } - if (d->QualCtrl.BinomialErr) { - colStats[easyPair].BinomialErr++; - } - if (d->QualCtrl.QualWin) { - colStats[easyPair].QualWin++; - } - } - - -} -bool MultiDNA::saveForWrite(shared_ptr d,int Pair){ - //second most important part: collect stats on DNA passing through here (should be all read) - //most important part: save DNA to be written later (or discard) - if (d == NULL || stopAll) { - return !stopAll; - } - //int easyPair = Pair < 3 ? Pair - 1 : Pair - 3; - MFil->addDNAtoCStats(d, Pair); - - if( d->isPassed()){ - - if (b_writePassed){ - d->prepareWrite(fastQoutVer); - //lock MultiDNA -#ifdef _THREADED - std::lock_guard guard(mutex); -#endif - //dereplicate & create copy of DNA? - - mem_used = true; - if (Pair == 1){ - DNAsP1.push_back(d); - } - else if (Pair == 2){ - DNAsP2.push_back(d); - } - else if (Pair == 3){ - DNAsS1.push_back(d); - MFil->colStats[0].singleton++; - } - else if (Pair == 4){ - DNAsS2.push_back(d); - MFil->colStats[1].singleton++; - } - DNAinMem++; - } - - } else if (d->isMidQual()){ - - if (b_writeMidPass){ - d->prepareWrite(fastQoutVer); - mem_used = true; - if (Pair == 1){ - DNAsP1_alt.push_back(d); - } - else if (Pair == 2){ - DNAsP2_alt.push_back(d); - } - else if (Pair == 3){ - MFil->statAddition.singleton++; - DNAsS1_alt.push_back(d); - } - else if (Pair == 4){ - MFil->statAddition.singleton++; - DNAsS2_alt.push_back(d); - } - DNAinMem++; - } - - } - //automatic mechanism to write to File, once enough DNA is in memory - if (write2File && DNAinMem>DNAinMemory){ - writeAllStoredDNA(); - DNAinMem=0; - } - if (maxReadsPerOFile>0 && ReadsWritten+DNAinMem >= maxReadsPerOFile){ - //cerr << "ReadsWritten " << ReadsWritten << " DNAinMem " << DNAinMem << endl; - DNAinMem=0; - incrementOutputFile(); - } - if (maxRdsOut > 0 && ReadsWritten + DNAinMem >= maxRdsOut) { - writeAllStoredDNA(); - stopAll = true; - - } - return !stopAll; -} -void MultiDNA::writeAndDel(shared_ptr d,int Pair){ - //ofstream tmpS, tmpQ, tmpFQ; -// int PairC = Pair; -// if (Pair > 1) { -// PairC = Pair - 2; -// } - - - int ClsVec = -1; - if (d != NULL) { - if (MFil->doFilterAtAll()) { - if (d->isPassed()) { - ClsVec = 0; - } else if (d->isMidQual()) { - ClsVec = 1; - } - } - else { - ClsVec = 0; - } - } - if (ClsVec >= 0) { - if (BWriteFastQ && b_changeFQheadVer) {//check if header PE naming needs to be changed - d->changeHeadPver(MFil->FQheadV()); - } - if (BWriteFastQ) { - if (fqFile[ClsVec][Pair ] == NULL ) { - //cerr << "_"; - openOFstreamFQ(fqFileStr[ClsVec][Pair], wrMode, ClsVec, Pair , "Appending"); - } - //cerr << "X"; - d->writeFastQ(*(fqFile[ClsVec][Pair])); - } else { - if (sFile[ClsVec][Pair]==NULL ) { - openOFstreamFNA(sFileStr[ClsVec][Pair], wrMode, ClsVec, Pair, "Appending"); - } - - d->writeSeq(*sFile[ClsVec][Pair ], b_oneLinerFasta); - if (BWriteQual) { - if (qFile[ClsVec][Pair]==NULL) { - openOFstreamQL(qFileStr[ClsVec][Pair], wrMode, ClsVec, Pair, "Appending"); - } - d->writeQual(*qFile[ClsVec][Pair], b_oneLinerFasta); - } - } - } - -// delete d; - -} -void MultiDNA::writeSelectiveStream(shared_ptr d, int Pair,int FS) { - //ofstream tmpS, tmpQ, tmpFQ; - if (d == 0) { - return; - } - d->prepareWrite(fastQoutVer); - int PairC = Pair; - if (Pair > 1) { - PairC = Pair - 2; - } - if (d->isPassed()) { - MFil->DNAstatLQ(d, PairC, false); - } else if ( d->isMidQual()) { - MFil->DNAstatLQ(d, PairC, true); - } - if (BWriteFastQ && b_changeFQheadVer) {//check if header PE naming needs to be changed - d->changeHeadPver(MFil->FQheadV()); - } - if (BWriteFastQ) { - if (fqFile[FS][Pair]==NULL||!*fqFile[FS][Pair]) { - openOFstreamFQ(fqFileStr[FS][Pair], ios_base::app, FS, Pair, "Appending"); - } - d->writeFastQ(*(fqFile[FS][Pair])); - delete fqFile[FS][Pair]; fqFile[FS][Pair] = NULL; - } else { - if (sFile[FS][Pair]==NULL||!*sFile[FS][Pair]) { - openOFstreamFNA(sFileStr[FS][Pair], ios_base::app, FS, Pair, "Appending"); - } - d->writeSeq(*sFile[FS][Pair],b_oneLinerFasta); - if (BWriteQual) { - if (qFile[FS][Pair]==NULL||!*qFile[FS][Pair]) { - openOFstreamQL(sFileStr[FS][Pair], ios_base::app, FS, Pair, "Appending"); - } - d->writeQual(*qFile[FS][Pair], b_oneLinerFasta); - } - } - - -// delete d; - -} -void MultiDNA::writeNonBCReads(shared_ptr d, shared_ptr d2) { - if (fqNoBCFile.size() == 2) { - if (!d->getBarcodeDetected() && !d2->getBarcodeDetected()) { - if ((!d->getBarcodeDetected() && d2->getBarcodeDetected()) || - (d->getBarcodeDetected() && !d2->getBarcodeDetected())) { - cerr << "Barcode only set in 1 reads.. something wrong!\n"; - } - d->writeFastQ(*(fqNoBCFile[0])); - d2->writeFastQ(*(fqNoBCFile[1])); - } - } -} - -void MultiDNA::setSubfilters(int num){ - if (num<2){return;} - subFilter.resize(num); - for (uint i=0;i idx (MFil->Barcode.size(),0); - for (uint i=0;iaddStats(subFilter[i],idx); - } -} -void MultiDNA::attachDereplicator(shared_ptr de) { - if (de != NULL) { - Derepl = de; b_doDereplicate = true; - Derepl->setPaired(pairedSeq>1); - //insert code here to fix BC offset in filter & add to derep info on sample names from the current filter - int curBCOffset = Derepl->getHighestBCoffset(); -#ifdef DEBUG - cerr << "BARCODE INCREMENT: " << curBCOffset << endl; -#endif - - MFil->setBCoffset(curBCOffset); - Derepl->BCnamesAdding(MFil); - } -} - -/*void MultiDNA::depPrep(shared_ptr tdn) { - - if (b_doDereplicate && tdn->isPassed()) { - cntDerep++; - Derepl->addDNA(tdn); - } -}*/ -void MultiDNA::depPrep(shared_ptr tdn, shared_ptr tdn2) { - bool added = false; - if (!b_doDereplicate) { - return; - } - Derepl->addDNA(tdn, tdn2, added); - if (added){ - cntDerep++; - if (tdn->getBarcodeDetected() && !tdn->isPassed() && !tdn->isMidQual() ){ - MFil->statAddDerepBadSeq(tdn->getBCnumber()); - - } - } - -} - -void MultiDNA::resetOutFilesAndFilter(){ -#ifdef DEBUG - cerr << "resetOutFilesAndFilter" << endl; -#endif - //reset count stats - MFil->resetStats(); - //reset all stored DNA - delAllDNAvectors(); - Derepl->reset(); - //close streams -- why? no reason, since nothing has been writen - //this->closeOutStreams(); - totalFileStrms = 0; -} - -void MultiDNA::closeOutStreams(bool wr){ - - - write2File = true; - if (wr){ - this->writeAllStoredDNA(); - } - -#ifdef DEBUG - cerr << "closing output streams"; -#endif - - for (int i = 0; i < (int)sFile.size(); i++) { - for (int j = 0; j < (int) sFile[i].size(); j++) { - if (sFileStr[i][j] != "T") { - delete sFile[i][j]; sFile[i][j] = NULL; - } - } - } - for (int i = 0; i < (int)qFile.size(); i++) { - for (int j = 0; j < (int)qFile[i].size(); j++) { - if (qFileStr[i][j] != "T") { - delete qFile[i][j]; qFile[i][j] = NULL; - } - } - } - for (int i = 0; i < (int)fqFile.size(); i++) { - for (int j = 0; j < (int)fqFile[i].size(); j++) { - if (fqFileStr[i][j] != "T") { - delete fqFile[i][j]; fqFile[i][j] = NULL; - } - } - } - //if(qFile){qFile.close();}if(sFile){sFile.close();}if(fqFile){fqFile.close(); } - //other housekeeping tasks - this->mergeSubFilters(); - MFil->setWrittenReads(ReadsWritten); -#ifdef DEBUG - cerr << ".. closed\n"; -#endif - -} -/*void MultiDNA::resetOutStreams(){ - if(qFile){qFile.seekp(qFilePos);}if(sFile){sFile.seekp(sFilePos);}if(fqFile){fqFile.seekp(fqFilePos); } - if(qFile2){qFile2.seekp(qFile2Pos);}if(sFile2){sFile2.seekp(sFile2Pos);}if(fqFile2){fqFile2.seekp(fqFile2Pos); } - if(qFileS){qFileS.seekp(qFileSPos);}if(sFileS){sFileS.seekp(sFileSPos);}if(fqFileS){fqFileS.seekp(fqFileSPos); } - if(qFileS2){qFileS2.seekp(qFileS2Pos);}if(sFileS2){sFileS2.seekp(sFileS2Pos);}if(fqFileS2){fqFileS2.seekp(fqFileS2Pos); } -}*/ -void MultiDNA::openOFstream(const string opOF, std::ios_base::openmode wrMode, int p1, int p2, string errMsg, bool onlyPrep, int wh) { - switch (wh) { - case 1: - openOFstreamFNA(opOF, wrMode, p1, p2, errMsg, onlyPrep); - break; - case 0: - openOFstreamFQ(opOF, wrMode, p1, p2, errMsg, onlyPrep); - break; - case 2: - openOFstreamQL(opOF, wrMode, p1, p2, errMsg, onlyPrep); - break; - default: - cerr << "Wrong wh specified"; exit(1002); - } -} -void MultiDNA::openNoBCoutstrean(const string inS) { - vector tfnaout = splitByCommas(inS); - fqNoBCFile.resize(2, NULL); - fqNoBCFile[0] = new ofstream(tfnaout[0], wrMode); - fqNoBCFile[1] = new ofstream(tfnaout[1], wrMode); - -} - -void MultiDNA::openOFstreamFQ(const string opOF, std::ios_base::openmode wrMode, int p1, int p2, string errMsg, bool onlyPrep) { - if (p2 > 3) { cerr << "internal error: can't have more than 4 entries in output file stream\n"; exit(1001); } - if (p1+1 >= (int)fqFile.size()) { - vector nullVec(4, NULL); - fqFile.resize(p1+1, nullVec); - } - if ((int)fqFileStr.size() - 1 <= p1) { - fqFileStr.resize(p1 + 1, vector(4, "")); - } - fqFileStr[p1][p2] = opOF; - if (onlyPrep) { return; } - if (p1 == 1 && !b_writeMidPass ){ return; }//p1==1: mid passed suppressOutWrite >= 2 - if (p1 == 0 && !b_writePassed){ return; }//suppressOutWrite == 1 - if (opOF == "T") { - fqFile[p1][p2] = &std::cout; - } else if (isGZfile(opOF)) { -#ifdef DEBUG - cerr << "open fq.gz with wrmode" << wrMode << endl; -#endif -#ifdef _gzipread - fqFile[p1][p2] = new ogzstream(opOF.c_str(), wrMode);// wrMode); -#else - cerr << "gzip outpout not supported in your sdm build\n" << opOF; exit(50); -#endif - - }else{ - fqFile[p1][p2] = new ofstream(opOF, wrMode); - } - - if (!*fqFile[p1][p2]) { - cerr << "Could not open " << errMsg << " fastq output file " << opOF << endl << p1 << " " << p2 << " " << totalFileStrms << endl; - exit(4); - } -} -void MultiDNA::openOFstreamFNA(const string opOF, std::ios_base::openmode wrMode, int p1, int p2, string errMsg, bool onlyPrep) { - if (p2 > 3) { cerr << "internal error: can't have more than 4 entries in output file stream\n"; exit(1001); } - if (p1+1 >= (int)sFile.size()) { - vector nullVec(4, NULL); - sFile.resize(p1+1, nullVec); - } - if ((int)sFileStr.size() - 1 <= p1) { - sFileStr.resize(p1 + 1, vector(4, "")); - } - sFileStr[p1][p2] = opOF; - if (onlyPrep ) { return; } - if (p1 == 1 && !b_writeMidPass){ return; }//p1==1: mid passed suppressOutWrite >= 2 - if (p1 == 0 && !b_writePassed){ return; }//suppressOutWrite == 1 - if (opOF == "T") { - sFile[p1][p2] = &std::cout; - } else if (isGZfile(opOF)) { -#ifdef _gzipread - sFile[p1][p2] = new ogzstream(opOF.c_str(), wrMode); -#else - cerr << "gzip outpout not supported in your sdm build\n" << opOF; exit(50); -#endif - } - else { - sFile[p1][p2] = new ofstream(opOF, wrMode); - } - if (!*sFile[p1][p2]) { - cerr << "Could not open " << errMsg << " fasta output file " << opOF << endl; - exit(4); - } -} -void MultiDNA::openOFstreamQL(const string opOF, std::ios_base::openmode wrMode, int p1, int p2, string errMsg, bool onlyPrep) { - if (p2 > 3) { cerr << "internal error: can't have more than 4 entries in output file stream\n"; exit(1001); } - if (p1+1 >= (int)qFile.size()) { - vector nullVec(4, NULL); - qFile.resize(p1+1, nullVec); - } - if ((int)qFileStr.size() - 1 <= p1) { - qFileStr.resize(p1 + 1, vector(4, "")); - } - qFileStr[p1][p2] = opOF; - if (onlyPrep) { return; } - if (p1 == 1 && !b_writeMidPass){ return; }//p1==1: mid passed suppressOutWrite >= 2 - if (p1 == 0 && !b_writePassed){ return; }//suppressOutWrite == 1 - if (opOF == "T") { - qFile[p1][p2] = &std::cout; - } - else if (isGZfile(opOF)) { -#ifdef _gzipread - qFile[p1][p2] = new ogzstream(opOF.c_str(), wrMode); -#else - cerr << "gzip outpout not supported in your sdm build\n" << opOF; exit(50); -#endif - } else { - qFile[p1][p2] = new ofstream(opOF, wrMode); - } - if (!*qFile[p1][p2]) { - cerr << "Could not open " << errMsg << " quality output file " << opOF << endl; - exit(4); - } -} - -void MultiDNA::openSeveralOutstreams(OptContainer& cmdArgs, shared_ptr RDS, std::ios_base::openmode wrMode) { -#ifdef DEBUG - cerr << " openining multiple out streams" << endl; -#endif - string path = "", fileEnd(".fna"); - vector outFile = RDS->getOFiles(); - bool openStrms = true; int omode(1); - vector>& tmp = sFileStr; - if (cmdArgs.find("-o_fastq") != cmdArgs.end() && cmdArgs["-o_fastq"] != "" && cmdArgs["-o_fna"] == "") { //write fastq - BWriteFastQ = true; - path = cmdArgs["-o_fastq"]; - tmp = fqFileStr; omode = 0; - fileEnd = ".fastq"; - } else if (cmdArgs["-o_fna"] != "") { - BWriteFastQ = false; - path = cmdArgs["-o_fna"]; - } else { - cerr << "Could not find valid sequence output file path. Exiting\n"; - exit(55); - } - if (RDS->multiFile()) { - b_multiOutStream = true; - } - int i(0); - for (i = 0; i < (int)outFile.size(); i++) { - if (totalFileStrms >= maxFileStreams && openStrms) { - cerr << "Too many output file streams\nSwitching to dynamical file appending\n"; - openStrms = false; - //this->closeOutStreams(); - } - string baseFile = path + removeFileEnding(outFile[i]); - if (pairedSeq > 1) { - if (i < 4) { - cerr << "Outfile " << i << ": "<setRemainingFilepipe(i); - if (pairedSeq == 1) { - openOFstream(cmdArgs["-excludeFile"], wrMode, i, 0, "excludeFile file ", !openStrms, omode); - } else { - string baseFile = path + removeFileEnding(cmdArgs["-excludeFile"]); - if (i < 4) { - cerr << "out file " << i << baseFile + ".1" << fileEnd << "\n"; - } - openOFstream(baseFile + ".1" + fileEnd, wrMode, i, 0, "paired 1st " + itos(i), !openStrms, omode); - //2nd pair - openOFstream(baseFile + ".2" + fileEnd, wrMode, i, 1, "paired 2nd " + itos(i), !openStrms, omode); - //1st singleton - //if (openStrms) { openOFstream(baseFile + ".1" + fileEnd + SingletonFileDescr, wrMode, i, 2, "Singleton 1 " + itos(i), omode); } - //2nd singleton - //if (openStrms) { openOFstream(baseFile + ".2" + fileEnd + SingletonFileDescr, wrMode, i, 3, "Singleton 2 " + itos(i), omode); } - totalFileStrms += 2; - } - } -} - -void MultiDNA::openOutStreams(OptContainer& cmdArgs,int fileIt,std::ios_base::openmode wrMode_i, - string fileExt, int forceFmt){ - this->setwriteMode(wrMode_i); - if ( suppressOutWrite == 3 || (cmdArgs["-o_fastq"] == "" && cmdArgs["-o_fna"] == "" && cmdArgs["-o_qual"] == "") ){ - suppressOutWrite = 3; b_writePassed = false; b_writeMidPass = false; return; - } - if (forceFmt != -1){ - if (forceFmt == 1 && (cmdArgs.find("-o_fna") == cmdArgs.end() || cmdArgs["-o_fna"] == "") ){//force fna, no qual, required for seed ref fastas - cmdArgs["-o_fna"] = cmdArgs["-o_fastq"]; - cmdArgs["-o_fastq"] = ""; - } - } - - - if (cmdArgs.find("-o_fastq") != cmdArgs.end() && cmdArgs["-o_fastq"] != "" && cmdArgs["-o_fna"] == ""){ //write fastq - this->setFastQWrite(true); - if (pairedSeq>1){ //open second pair + singleton -#ifdef DEBUG - cerr << " paired fastq out " << endl; -#endif - vector tfnaout = splitByCommas(cmdArgs["-o_fastq"]); - if (tfnaout.size()!=2){ - cerr<<"Paired sequences given as input, requires paired output file (2 files separated by \",\"). Given output file = "<1){ //write fastq - tfnaout = splitByCommas(cmdArgs["-o_fastq2"]); - openOFstreamFQ(applyFileIT(tfnaout[0] + fileExt, fileIt).c_str(), wrMode, 1, 0, "additional paired 1st"); - openOFstreamFQ(applyFileIT(tfnaout[1] + fileExt, fileIt).c_str(), wrMode, 1, 1, "additional paired 2nd"); - openOFstreamFQ(applyFileIT(tfnaout[0] + fileExt , fileIt, SingletonFileDescr).c_str(), wrMode, 1, 2, "additional Singleton 1", true); - openOFstreamFQ(applyFileIT(tfnaout[1] + fileExt , fileIt, SingletonFileDescr).c_str(), wrMode, 1, 3, "additional Singleton 2", true); - } - } else { -#ifdef DEBUG - cerr << " single fastq out " << endl; -#endif - openOFstreamFQ(applyFileIT(cmdArgs["-o_fastq"] + fileExt, fileIt).c_str(), wrMode, 0, 0, "the main"); - leadingOutf = applyFileIT(cmdArgs["-o_fastq"] + fileExt, fileIt); - //additional file - if (cmdArgs.find("-o_fastq2") != cmdArgs.end() && cmdArgs["-o_fastq2"].length()>1) { //write fastq - openOFstreamFQ(applyFileIT(cmdArgs["-o_fastq2"] + fileExt, fileIt).c_str(), wrMode, 1, 0, "the additional"); - } - } - - return; - } - BWriteFastQ=false; - if (pairedSeq==1){ -#ifdef DEBUG - cerr << " fasta singleton output " << endl; -#endif - openOFstreamFNA(applyFileIT(cmdArgs["-o_fna"] + fileExt, fileIt).c_str(), wrMode, 0, 0, "main"); - leadingOutf = applyFileIT(cmdArgs["-o_fna"] + fileExt, fileIt); - if (cmdArgs["-o_qual"] != ""){ - openOFstreamQL(applyFileIT(cmdArgs["-o_qual"] + fileExt, fileIt).c_str(), wrMode, 0, 0, "main"); - this->setQualWrite(true); - } else { - this->setQualWrite(false); - } - //additional file (secondary filter) - if (cmdArgs.find("-o_fna2") != cmdArgs.end() && cmdArgs["-o_fna2"].length()>1) { //add file - openOFstreamFNA(applyFileIT(cmdArgs["-o_fna2"] + fileExt, fileIt).c_str(), wrMode, 1, 0, "additional"); - } - if (cmdArgs.find("-o_qual2") != cmdArgs.end() && cmdArgs["-o_qual2"].length()>1) { //add file - openOFstreamQL(applyFileIT(cmdArgs["-o_qual2"] + fileExt, fileIt).c_str(), wrMode, 1, 0, "additional"); - this->setQualWrite(true); - } - - } else { -#ifdef DEBUG - cerr << " fasta paired output " << endl; -#endif - - vector tfnaout = splitByCommas(cmdArgs["-o_fna"]); - if (tfnaout.size()!=2){ - cerr<<"Paired sequences given as input, requires paired output file (2 files separated by \",\"). Given output file = "<1){ //write fastq - tfnaout = splitByCommas(cmdArgs["-o_fna2"]); - - openOFstreamFNA(applyFileIT(tfnaout[0] + fileExt, fileIt).c_str(), wrMode, 1, 0, "additional paired 1st", true); - openOFstreamFNA(applyFileIT(tfnaout[1] + fileExt, fileIt).c_str(), wrMode, 1, 1, "additional paired 2nd", true); - openOFstreamFNA(applyFileIT(tfnaout[0] + fileExt + SingletonFileDescr, fileIt).c_str(), wrMode, 1, 2, "additional Singleton 1", true); - openOFstreamFNA(applyFileIT(tfnaout[1] + fileExt + SingletonFileDescr, fileIt).c_str(), wrMode, 1, 3, "additional Singleton 2", true); - } - if (cmdArgs["-o_qual"] != ""){ - this->setQualWrite(true); - vector tqout = splitByComma(cmdArgs["-o_qual"],true); - openOFstreamQL(applyFileIT(tqout[0] + fileExt, fileIt).c_str(), wrMode, 0, 0, "paired 1st"); - openOFstreamQL(applyFileIT(tqout[1] + fileExt, fileIt).c_str(), wrMode, 0, 1, "paired 2nd"); - openOFstreamQL(applyFileIT(tqout[0] + fileExt + SingletonFileDescr, fileIt).c_str(), wrMode, 0, 2, "Singleton 1", true); - openOFstreamQL(applyFileIT(tqout[1] + fileExt + SingletonFileDescr, fileIt).c_str(), wrMode, 0, 3, "Singleton 2", true); - if (cmdArgs["-o_qual2"] != "") { - vector tqout = splitByComma(cmdArgs["-o_qual2"], true); - openOFstreamQL(applyFileIT(tqout[0] + fileExt, fileIt).c_str(), wrMode, 1, 0, "additional paired 1st", true); - openOFstreamQL(applyFileIT(tqout[1] + fileExt, fileIt).c_str(), wrMode, 1, 1, "additional paired 2nd", true); - openOFstreamQL(applyFileIT(tqout[0] + fileExt + SingletonFileDescr, fileIt).c_str(), wrMode, 1, 2, "additional Singleton 1", true); - openOFstreamQL(applyFileIT(tqout[1] + fileExt + SingletonFileDescr, fileIt).c_str(), wrMode, 1, 3, "additional Singleton 2", true); - } - } else { - this->setQualWrite(false); - } - - } - - -} -//******************************************* -//* DEREPLICATE OBJECT -//******************************************* -Dereplicate::Dereplicate(OptContainer& cmdArgs): -BCN2SmplID(0), b_usearch_fmt(true), b_singleLine(true), b_pairedInput(false), -minCopies(1,0), minCopiesStr("0"), //default minCopies accepts every derep -totSize(0), tmpCnt(0), curBCoffset(0){ - outfile = cmdArgs["-o_dereplicate"]; - if (cmdArgs.find("-dere_size_fmt") != cmdArgs.end() && cmdArgs["-dere_size_fmt"] == "1") { - b_usearch_fmt = false; - } - if (cmdArgs.find("-min_derep_copies") != cmdArgs.end()) { - minCopies[0] = -1;//in this case reset to -1 the first entry.. - minCopiesStr = cmdArgs["-min_derep_copies"]; - string x = cmdArgs["-min_derep_copies"]; - - vector xs = splitByCommas(x, ','); - for (size_t i = 0; i < xs.size(); i++) { - vector tmp = splitByComma(xs[i], false, ':'); - if (tmp.size() > 2) { - cerr << "Derep string " << xs[i] << " has to be two integers seqparated by \":\"\n"; exit(623); - } - int pos = 1; int cnt = -1; - if (tmp.size() == 1) {//this is the 1 sample position, if not set - cnt = atoi(tmp[0].c_str()); - }else{ - pos = atoi(tmp[1].c_str());//number of samples - cnt = atoi(tmp[0].c_str());//16s copy numbers - } - if (pos <= 0) { cerr << "wrong derplicate position (<=0):" << xs[i] << endl; exit(312); } - if (pos > 3000) { cerr << "too large derplicate position (>3000):" << xs[i] << endl; exit(313); } - if (minCopies.size() < (size_t)pos) { - minCopies.resize(pos, -1); - } - minCopies[pos-1] = cnt; - } - minCopiesSiz = minCopies.size(); - //minCopies = atoi(cmdArgs["-min_derep_copies"].c_str()); - - } -} -/*bool Dereplicate::addDNA(shared_ptr d) { - //1st build hash of DNA - string seq = d->getSeqPseudo(); - int BCN = d->getBCnumber(); - if (BCN >= 0) { tmpCnt++; } - HashDNAIT spotted = Tracker.find(seq); - if (spotted != Tracker.end()) {// found something - int hpos ( spotted->second ); - Dnas[hpos]->addSmpl(BCN); - if (betterPreSeed(d, NULL, Dnas[hpos])) { - //replace old DNA - DNAunique *du = new DNAunique(d, -1); - du->saveMem(); - du->setBestSeedLength(Dnas[hpos]->getBestSeedLength()); - du->transferOccurence(Dnas[hpos]); - delete Dnas[hpos]; - Dnas[hpos] = du; - } - return false; - } else { - //create entry - Tracker[seq] = (int)Dnas.size(); - DNAunique *tmp = new DNAunique(d, BCN); - tmp->saveMem(); - Dnas.push_back(tmp); - return true; - } - return true; -}*/ - -bool Dereplicate::addDNA( shared_ptr d, shared_ptr d2, bool& added) { - //1st build hash of DNA - if (!d->getBarcodeDetected()) { - return false; - } - string seq = d->getSeqPseudo(); - int BCN = d->getBCnumber(); - bool pass = d->isPassed(); - shared_ptr tmp = make_shared(d, BCN); - - //if (!) { return false; } - auto spotted = Tracker.find(tmp); - if (spotted != Tracker.end()) {// found something - added = true; //d->setMidQual(false); tmp->setMidQual(false); - //int idx (spotted->second); - (*spotted)->addSmpl(BCN); - if (pass && betterPreSeed(d, d2, (*spotted))) { - //replace old DNA - //shared_ptr du = make_shared(d, -1); - tmp->saveMem(); - tmp->setBestSeedLength((*spotted)->getBestSeedLength()); - tmp->transferOccurence((*spotted)); - if (d2 != NULL) { - tmp->attachPair( make_shared(d2, -1)); - } else {//do nothing - //du->attachPair(new DNAunique()); - } - //delete Dnas[idx]; - Tracker.erase(*spotted); - Tracker.insert(tmp); - } - return false;//added to seeds - } else if (pass){ - added = true; d->setMidQual(false); - //create entry - //Tracker[seq] = (int)Dnas.size(); - //shared_ptr tmp = make_shared(d, BCN); - tmp->saveMem(); - if (d2 != NULL) { - tmp->attachPair( make_shared(d2, BCN)); - } else { - //tmp->attachPair(new DNAunique()); - } - Tracker.insert(tmp); - //Dnas.push_back(tmp); - //size_t idx = Dnas.size(); - return true;//new seed - } - - return true;//didn't do a thing.. -} -void Dereplicate::BCnamesAdding(shared_ptr fil) { - vector refSID = fil->SampleID; - for (size_t i = 0; i < refSID.size(); i++) { - BCN2SmplID.push_back(refSID[i]); - } - //this will in the end be used to synchronize the different barcodes between samples - curBCoffset = (int)BCN2SmplID.size(); - //cerr << curBCoffset << " Added BC list\n"; -} -void Dereplicate::reset() { -// for (size_t i = 0; i < Dnas.size(); i++) { delete Dnas[i]; } -// Dnas.resize(0); - totSize = 0; tmpCnt = 0; - //BCN2SmplID.resize(0); - Tracker.clear(); -} -bool DNAuPointerCompare(shared_ptr l, shared_ptr r) { return l->getCount() > r->getCount(); } - -string Dereplicate::writeDereplDNA(shared_ptr mf) { - ofstream of, omaps, of2, ofRest, of2p2; - cerr << "Evaluating and writing dereplicated reads..\n"; - int fastqVer = mf->getuserReqFastqOutVer(); - string mapF = outfile.substr(0, outfile.find_last_of('.')) + ".map"; - string outHQf = outfile.substr(0, outfile.find_last_of('.')) + ".hq.fq"; - string outHQf_p2 = outfile.substr(0, outfile.find_last_of('.')) + ".2.hq.fq"; - string outRest = outfile + ".rest"; - //setup map header - omaps.open(mapF.c_str(), ios::out);//| ios::binary - of2.open(outHQf.c_str(), ios::out); - of.open(outfile.c_str(), ios::out); - ofRest.open(outRest.c_str(), ios::out); - if (b_pairedInput) { - of2p2.open(outHQf_p2, ios::out); - } - - //sample specific derep filter - bool bDerepSmpSpcfc(false); - vector derepSmpSpcfc = mf->getDrerepSampleSpecifity(); - if (derepSmpSpcfc.size() > 0) { bDerepSmpSpcfc = true; } - - //combiner relevant vars - bool bCombiSmpl = mf->combineSamples(); - vector smplId2comb(0,0); - unordered_map & combiMapCollectGrp = mf->combiMapCollectGrp; - - //vector refSID = mf->SampleID; - omaps << "#SMPLS"; - if (!bCombiSmpl){ - for (size_t i = 0; i < BCN2SmplID.size(); i++) { - omaps << "\t" + itos((int)i) + ":" + BCN2SmplID[i]; - } - } else { - smplId2comb = mf->combiSmplConvergeVec(BCN2SmplID); - if (BCN2SmplID.size() != smplId2comb.size()){ - cerr << "FATAL: BCN2SmplID != smplId2comb\n"; exit(234); - } - for(unordered_map::iterator IT = combiMapCollectGrp.begin(); IT != combiMapCollectGrp.end(); IT++){ - omaps << "\t" << itos(IT->second) + ":" + IT->first; - } - } - omaps << "\n"; - - //convert to vector, that can than be written out - vector> Dnas (Tracker.size()); - int cnt = 0; - for (const auto& dd : Tracker) { - Dnas[cnt] = dd; - cnt++; - } -// bool DNAuPointerCompare(shared_ptr l, shared_ptr r) { return l->getCount() < r->getCount();} - sort(Dnas.begin(), Dnas.end() , DNAuPointerCompare); - totSize = 0; size_t passed_hits(0); - //bool thrHit = false; - - //sanity check - vector cntspersmpl(BCN2SmplID.size(), 0); - //print unique DNAs - for (size_t i = 0; i < Dnas.size(); i++) { - //for (const auto& dd : Tracker){ - //reformat header - shared_ptr dd = Dnas[i]; - dd->Count2Head(b_usearch_fmt); - - if (( bDerepSmpSpcfc && dd->pass_deprep_smplSpc(derepSmpSpcfc)) || - pass_deprep_conditions(dd) ) { - - passed_hits++; - totSize += dd->getCount(); - dd->writeSeq(of, b_singleLine); - - } else { - dd->writeSeq(ofRest, b_singleLine); - } - dd->writeMap(omaps, dd->getID(), cntspersmpl, smplId2comb); - - //write out full seq + fastq - dd->resetTruncation(); - dd->prepareWrite(fastqVer); - dd->writeFastQ(of2); - if (b_pairedInput) { - shared_ptr oD = dd->getPair(); - if (oD != NULL) { - oD->resetTruncation(); - oD->prepareWrite(fastqVer); - oD->writeFastQ(of2p2); - } else { - shared_ptr tmp = make_shared< DNA>("", dd->getID()); - tmp->writeFastQEmpty(of2p2); - - } - } - } -// if (tmpCnt != totSize) { -// cerr << "Counting failed\n" << tmpCnt << " " << totSize< 0) { - report += "; average size in this set is " + ftos(avgSize) + ".\nUniques with insufficient abundance : " + intwithcommas(int(Tracker.size() - passed_hits)) + " not passing derep conditions\n"; - } - //cerr << tmpCnt << endl; - cerr << "\n" << report << endl << endl; - int uneqCnts(0); - //check cntspersmpl vector - vector SampleID = mf->SampleID; - for (unsigned int i = 0; icolStats[0].BarcodeDetected[i] != cntspersmpl[j] && mf->colStats[0].BarcodeDetected[i] != -1) { - //cerr << "ERROR: Unequal counts for " << SampleID[i] << ": " << mf->colStats.BarcodeDetected[i] << " vs " << cntspersmpl[j] << endl; - uneqCnts++; - - } else if (!detected) { - cerr << "Could not detect Sample " << BCN2SmplID[j] << " in ref set (check that mapping file is correctly formatted?).\n"; exit(87); - } - } -#ifdef DEBUG - cerr << "Derep Fin" << endl; -#endif - if (b_pairedInput) { of2.close(); } - if (uneqCnts>0) { - cerr << "Unequal counts in " << uneqCnts << " cases. \n"; - //exit(66); - } - return report; -} - - -bool Dereplicate::pass_deprep_conditions(shared_ptr d) { - vector x = d->getDerepMapSort(minCopiesSiz); - int cumSum(0); - for (size_t i = 0; i < x.size(); i++) { - cumSum += x[i]; - //if (minCopies[i] == -1) { continue; } - size_t yy(minCopiesSiz); - if (yy > i) { yy = i+1; }//if checking for 1 smpl copy, don't need to check for 2 samlpe allowed copy number.. - for (size_t j = 0; j < yy; j++) { - if (minCopies[j] == -1) { continue; } - - //if (i > minCopiesSiz) { break; } - if (cumSum >= minCopies[j]) { return true; } - } - } - return false; -} - -void writeLog(string& logf) { - ofstream of; - of.open(logf.c_str()); - of.close(); -} - -string additionalFileName(const string& in){ - if (in.find(",") == std::string::npos){ - return additionalFileName2(in); - } - else { - vector vi = splitByCommas(in); - string out = additionalFileName2(vi[0]); - for (uint i = 1; i < vi.size(); i++){ - out += "," + additionalFileName2(vi[i]); - } - return out; - } -} -string additionalFileName2(const string& in){ - size_t point = in.find_last_of('.'); - if (point == (size_t)-1){ return in + ".add"; } - return in.substr(0, point) + ".add." + in.substr(point + 1); -} - - -//******************************************* -//* FILTERS OBJECT -//******************************************* - - -Filters::Filters(OptContainer& cmdArgs) : - PrimerL(0), PrimerR(0), PrimerL_RC(0), PrimerR_RC(0), PrimerIdx(0), - Barcode(0), revBarcode(0), Barcode2(0), revBarcode2(0), - HeadSmplID(0), - hetPrimer(2,vector(0)), - demultiSinglFiles(0), demultiSinglFilesF(0), - colStats(2), - FastaF(0), QualF(0), FastqF(0), MIDfqF(0), - derepMinNum(0), - lMD(NULL), RepStat(2, NULL), RepStatAddition(2, NULL), - tAdapter(""), tAdapterLength(0), - bDoAdapter(false), bDoMultiplexing(true), bDoBarcode(true), - bDoBarcode2(false), - bDoHeadSmplID(false), bBarcodeSameSize(false), - bOneFileSample(false), curBCnumber(-1), BCoffset(0), - bAdditionalOutput(false), b2ndRDBcPrimCk(false), - bRevRdCk(false), bChkRdPrs(true), - min_l(0), alt_min_l(0), min_l_p(-1.f), alt_min_l_p(-1.f), - maxReadLength(0), norm2fiveNTs(false), - max_l(10000), min_q(0.f), alt_min_q(0.f), - BcutPrimer(true), alt_BcutPrimer(true), bPrimerR(false), - bRequireRevPrim(false), alt_bRequireRevPrim(false), - bRequireFwdPrim(false), alt_bRequireFwdPrim(false), BcutTag(true), - bCompletePairs(false), bShortAmplicons(false), - MinTagLen(0), MinTagLen2(0),MaxTagLen(0), MaxTagLen2(0), MinPrimLen(0), maxHomonucleotide(0), - PrimerErrs(0), alt_PrimerErrs(0), TagErrs(0), - MaxAmb(-1), alt_MaxAmb(-1), - FQWwidth(0), EWwidth(0), - RevPrimSeedL(5), - b_BinFilBothPairs(false), - BinFilErr(2.5), BinFilP(-1.f), - alt_FQWthr(0), alt_EWthr(0), - PEheaderVerWr(0), TrimStartNTs(0), TruncSeq(-1), - userReqFastqVer(0), userReqFastqOutVer(33), maxAccumQP(-1), - alt_maxAccumQP(-1), - pairedSeq(-1), - revConstellationN(0), - BCdFWDREV(2), - Xreads(-1), - restartSet(false),b_optiClusterSeq(false), - b_subselectionReads(false), b_doQualFilter(true), - b_doFilter(true), - bDoDereplicate(false), bDoCombiSamples(false), - bDoDemultiplexIntoFiles(false), - maxReadsPerOFile(0),ReadsWritten(0),OFileIncre(0), - Barcode_len(0), Barcode2_len(0) -// dPDS(make_unique()), dHDS(make_unique()) - { - - colStats[0] = collectstats(); colStats[1] = collectstats(); - if (bAdditionalOutput){ - statAddition = collectstats(); - } - bool alt_bRequireRevPrimSet=false; - - - string optF (""); - if (cmdArgs.find("-options") != cmdArgs.end()) { - optF = cmdArgs["-options"]; - } - - iniSpacer = cmdArgs["-sample_sep"]; - //*************************************** - //default options - int maxAmb(0),PrimerErrs(1),TagErrs(0); - float minQual(25); - float minL(250.f); - int maxL(1000); - int QualWinWidth = 50; - float QualWinThr = 0; - int EndWinWidth = 15; - float EndWinThr = 20; - int maxHomoNT(12); - bool keepTag(false),keepPrimer(false); - bool addModConf = false; - - //set up some basic objects - if ( cmdArgs.find("-paired") != cmdArgs.end() ) { - pairedSeq = atoi(cmdArgs["-paired"].c_str()); //fakeEssentials(); - if ( pairedSeq<1 || pairedSeq>3 ) { cerr << "Argument \"-paired\" supplied with unknown parameter. Aborting.\n"; exit(28); } - if ( cmdArgs["-onlyPair"] == "1" || cmdArgs["-onlyPair"] == "2" ) { - pairedSeq = 1; - } - } - if (cmdArgs.find("-normRdsToFiveNTs") != cmdArgs.end()) { - norm2fiveNTs = true; - cerr << "Warning: normRdsToFiveNTs is not implemented!\n"; - } - //delimit output file size to X reads - if ( cmdArgs.find("-maxReadsPerOutput") != cmdArgs.end() ) { - maxReadsPerOFile = atoi(cmdArgs["-maxReadsPerOutput"].c_str()); - } - //important for fastq format - if ( cmdArgs.find("-i_qual_offset") != cmdArgs.end() ) { - if ( cmdArgs["-i_qual_offset"] == "auto" ) { - userReqFastqVer = 0; - } else { - userReqFastqVer = atoi(cmdArgs["-i_qual_offset"].c_str()); - } - } - //cerr<(true); - RepStatAddition[i] = make_shared(true); - } - PreFiltP1 = make_shared(true); - PreFiltP2 = make_shared(true); - //do new SEED sequence selection? - if ( cmdArgs.find("-optimalRead2Cluster") != cmdArgs.end() ) { - b_optiClusterSeq = true; - } - //do selection of specific reads? - if ( cmdArgs["-specificReads"] != "" ) { - b_subselectionReads = true; - } - if (cmdArgs.find("-binomialFilterBothPairs") != cmdArgs.end() && cmdArgs["-binomialFilterBothPairs"] == "1") { - b_BinFilBothPairs = true; - } - //*************************************** - //read options - ifstream opt; - opt.open(optF.c_str(),ios::in); - if ( !opt || optF=="") { - cerr << "NO filtering will be done on your reads (just rewriting / log files created)." << endl; - b_doFilter = false; - return; - } - string line; - while (getline(opt,line,'\n')){ - - if (line.length()<=1 || line.substr(0,1)=="#"){ - continue; - } - - bool addMod = false; - if (line.substr(0,1)=="*"){ - addMod= true; - line = line.substr(1); - } - string segs; - string segs2; - stringstream ss; - ss << line; - getline(ss,segs,'\t'); - getline(ss,segs2,'\t'); - - if (cmdArgs["-XfirstReads"] != "") { - Xreads = atoi(cmdArgs["-XfirstReads"].c_str()); - } - - - if (strcmp(segs.c_str(),"minSeqLength") == 0){ - if (addMod){ - float tmp = (float)atof(segs2.c_str()); - if (tmp>1.f) { - alt_min_l = (int)tmp; - } else { - alt_min_l = -1; - alt_min_l_p = tmp; - } - if (alt_min_l != minL){ addModConf = true; } - } else { - minL = (float)atof(segs2.c_str()); - } - } else if (strcmp(segs.c_str(),"maxSeqLength") == 0){ - maxL = atoi(segs2.c_str()); - } else if (strcmp(segs.c_str(),"minAvgQuality") == 0){ - if (addMod){ - alt_min_q = (float) atof(segs2.c_str()); - if (alt_min_q!= minQual){addModConf = true;} - } else { - minQual = (float) atof(segs2.c_str()); - } - } else if (strcmp(segs.c_str(),"maxAmbiguousNT") == 0){ - if (addMod){ - alt_MaxAmb = atoi(segs2.c_str()); - if (MaxAmb!= alt_MaxAmb){addModConf = true;} - } else { - maxAmb = atoi(segs2.c_str()); - } - } else if (strcmp(segs.c_str(),"QualWindowThreshhold") == 0){ - if (addMod){ - alt_FQWthr = (float) atof(segs2.c_str()); - if (alt_FQWthr!= QualWinThr){addModConf = true;} - } else { - QualWinThr = (float) atof(segs2.c_str()); - } - } - else if (strcmp(segs.c_str(), "QualWindowWidth") == 0){ - QualWinWidth = atoi(segs2.c_str()); - } - else if (strcmp(segs.c_str(), "BinErrorModelMaxExpError") == 0){ - BinFilErr = (float) atof(segs2.c_str()); - if (BinFilErr < 0){ - cerr << "BinErrorModelMaxExpError was set to <0. Set to 0 instead.\n"; - BinFilErr = 0; - } - } - else if (strcmp(segs.c_str(), "BinErrorModelAlpha") == 0){ - BinFilP = (float)atof(segs2.c_str()); - if (BinFilP != -1.f && (BinFilP<0.f || BinFilP>1.f)){ - cerr << "BinErrorModelAlpha has to be between 0 and 1 (or -1 to deactivate).\nAborting..\n"; - exit(542); - } - - } else if (strcmp(segs.c_str(),"TrimWindowWidth") == 0){ - EndWinWidth = atoi(segs2.c_str()); - } else if (strcmp(segs.c_str(),"TrimWindowThreshhold") == 0){ - if (addMod){ - alt_EWthr = (float) atof(segs2.c_str()); - if (alt_EWthr != EndWinThr){addModConf = true;} - } else { - EndWinThr = (float) atof(segs2.c_str()); - } - } else if (strcmp(segs.c_str(),"maxBarcodeErrs") == 0){ - TagErrs = atoi(segs2.c_str()); - } else if (strcmp(segs.c_str(),"maxPrimerErrs") == 0){ - if (addMod){ - alt_PrimerErrs = atoi(segs2.c_str()); - if (alt_PrimerErrs!=PrimerErrs){ addModConf = true;} - } else { - PrimerErrs = atoi(segs2.c_str()); - } - } else if (strcmp(segs.c_str(),"keepBarcodeSeq") == 0){ - atoi(segs2.c_str())==0 ? keepTag=false : keepTag=true; - } else if (strcmp(segs.c_str(),"keepPrimerSeq") == 0){ - if (addMod){ - atoi(segs2.c_str())==0 ? alt_BcutPrimer=false : alt_BcutPrimer=true; - if(alt_BcutPrimer!=keepPrimer) {addModConf = true;} - } else { - atoi(segs2.c_str())==0 ? keepPrimer=false : keepPrimer=true; - } - } else if (strcmp(segs.c_str(),"maxHomonucleotide") == 0){ - maxHomoNT = atoi(segs2.c_str()); - } else if (strcmp(segs.c_str(),"maxAccumulatedError") == 0){ - if (addMod){ - alt_maxAccumQP = double(atof(segs2.c_str())); - if(alt_maxAccumQP!=maxAccumQP) {addModConf = true;} - } else{ - maxAccumQP = double(atof(segs2.c_str())); - } - } else if (strcmp(segs.c_str(),"TechnicalAdapter") == 0){ - tAdapter = segs2.c_str(); - transform(tAdapter.begin(), tAdapter.end(),tAdapter.begin(), ::toupper); - tAdapterLength = (int) tAdapter.length(); - bDoAdapter = true; - }else if (segs == "PEheaderPairFmt"){ - PEheaderVerWr = atoi(segs2.c_str()); - } else if (segs == "TrimStartNTs"){ - TrimStartNTs = atoi(segs2.c_str()); - } else if (segs == "fastqVersion"){ - if (segs2 == "auto") { - userReqFastqVer = 0; - } else { - userReqFastqVer = FastqVerMod(atoi(segs2.c_str())); - } - } else if (segs == "RejectSeqWithoutRevPrim"){ - if (addMod){ - alt_bRequireRevPrimSet=true; - if (segs2=="T"){alt_bRequireRevPrim=true; - } else {alt_bRequireRevPrim=false;} - if(alt_bRequireRevPrim!=bRequireRevPrim){addModConf = true;} - } else { - if (segs2=="T"){bRequireRevPrim=true; - } else {bRequireRevPrim=false;} - } - } else if (segs == "RejectSeqWithoutFwdPrim"){ - if (addMod){ - alt_bRequireFwdPrim=true; - if (segs2=="T"){alt_bRequireFwdPrim=true; - } else {alt_bRequireFwdPrim=false;} - if(alt_bRequireFwdPrim!=bRequireFwdPrim){addModConf = true;} - } else { - if (segs2=="T"){bRequireFwdPrim=true; - } else {bRequireFwdPrim=false;} - } - } else if (segs == "TruncateSequenceLength") { - TruncSeq = atoi(segs2.c_str()); - if (TruncSeq != -1 && TruncSeq < (int)minL) { minL = (float)TruncSeq; } - } else if (segs == "AmpliconShortPE") { - if (segs2 == "T") { - bShortAmplicons = true; - } else { bShortAmplicons = false; } - } else if (segs == "CheckForMixedPairs") { - if (segs2 == "T") { - b2ndRDBcPrimCk = true; - } else { b2ndRDBcPrimCk = false; } - } else if ( segs == "CheckForReversedSeqs" ) { - if ( segs2 == "T" ) { - bRevRdCk = true; - } else { bRevRdCk = false; } - } - else if (segs == "SyncReadPairs") { - if (segs2 == "T") { - bChkRdPrs = true; - } - else { bChkRdPrs = false; } - } - - - } - - //report some non-std options - if (bShortAmplicons){ - cerr << "Checking for reverse primers on 1st read.\n"; - } - if (b2ndRDBcPrimCk){ - cerr << "Checking for switched pairs.\n"; - } - - opt.close(); - //set in filter object - this->setSeqLength(minL,maxL); - this->setPrimerErrs(PrimerErrs); - this->setTagErrs(TagErrs); - this->removePrimer(!keepPrimer); - this->removeTag(!keepTag); - this->setMaxAmb(maxAmb); - this->setAvgMinQual(minQual); - this->setFloatingQWin(QualWinWidth,QualWinThr); - this->setFloatingEWin(EndWinWidth,EndWinThr); - this->setMaxHomo(maxHomoNT); - //alternative options (mid qual filtering) - if (addModConf){ - if (!alt_bRequireRevPrimSet){alt_bRequireRevPrim = bRequireRevPrim;} - if (cmdArgs.find("-o_fna") != cmdArgs.end() && cmdArgs["-o_fna"].length()>1){ - if (cmdArgs.find("-o_fna2") == cmdArgs.end()){ - cmdArgs["-o_fna2"] = additionalFileName(cmdArgs["-o_fna"]); - //cmdArgs["-o_fna2"] = cmdArgs["-o_fna"].substr(0,cmdArgs["-o_fna"].length()-4)+".add.fna"; - } - } else if (cmdArgs.find("-o_fastq") != cmdArgs.end() && cmdArgs["-o_fastq"].length()>1){ - if (cmdArgs.find("-o_fastq2") == cmdArgs.end()){ - cmdArgs["-o_fastq2"] = additionalFileName(cmdArgs["-o_fastq"]); - } - } - bAdditionalOutput=true; - } - - - - -} - -//copy of filter, but leave stat vals empty -Filters::Filters(shared_ptr of, int BCnumber, bool takeAll) : - PrimerL(0,""), PrimerR(0,""), - PrimerL_RC(0,""), PrimerR_RC(0,""), - PrimerIdx(of->PrimerIdx), - Barcode(0), revBarcode(0), Barcode2(0), revBarcode2(0), - HeadSmplID(0), hetPrimer(2, vector(0)), colStats(2), - FastaF(of->FastaF), QualF(of->QualF), FastqF(of->FastqF), - MIDfqF(of->MIDfqF), derepMinNum(of->derepMinNum), - lMD(NULL), RepStat(2,NULL), RepStatAddition(2,NULL), - tAdapter(of->tAdapter),tAdapterLength(of->tAdapterLength), - bDoAdapter(of->bDoAdapter),bDoMultiplexing(of->bDoMultiplexing), - bDoBarcode(of->bDoBarcode), bDoBarcode2(of->bDoBarcode2), - bDoHeadSmplID(of->bDoHeadSmplID), - bBarcodeSameSize(of->bBarcodeSameSize), - bOneFileSample(of->bOneFileSample), curBCnumber(BCnumber), BCoffset(0), - bAdditionalOutput(of->bAdditionalOutput), b2ndRDBcPrimCk(of->b2ndRDBcPrimCk), - bRevRdCk(of->bRevRdCk), bChkRdPrs(of->bChkRdPrs), - min_l(of->min_l), alt_min_l(of->alt_min_l), min_l_p(of->min_l_p), alt_min_l_p(of->alt_min_l_p), - maxReadLength(0), norm2fiveNTs(of->norm2fiveNTs), - max_l(of->max_l), min_q(of->min_q), alt_min_q(of->alt_min_q), - BcutPrimer(of->BcutPrimer),alt_BcutPrimer(of->alt_BcutPrimer), - bPrimerR(of->bPrimerR), - bRequireRevPrim(of->bRequireRevPrim),alt_bRequireRevPrim(of->alt_bRequireRevPrim), - bRequireFwdPrim(of->bRequireFwdPrim),alt_bRequireFwdPrim(of->alt_bRequireFwdPrim), - BcutTag(of->BcutTag), - bCompletePairs(of->bCompletePairs), bShortAmplicons(of->bShortAmplicons), - MinTagLen(of->MinTagLen), MinTagLen2(of->MinTagLen2), MaxTagLen(of->MaxTagLen), MaxTagLen2(of->MaxTagLen2), MinPrimLen(of->MinPrimLen), maxHomonucleotide(of->maxHomonucleotide), - PrimerErrs(of->PrimerErrs),alt_PrimerErrs(of->alt_PrimerErrs),TagErrs(of->TagErrs), - MaxAmb(of->MaxAmb), alt_MaxAmb(of->alt_MaxAmb), - FQWwidth(of->FQWwidth),EWwidth(of->EWwidth), - RevPrimSeedL(of->RevPrimSeedL), - b_BinFilBothPairs(of->b_BinFilBothPairs), - BinFilErr(of->BinFilErr), BinFilP(of->BinFilP), - FQWthr(of->FQWthr),EWthr(of->EWthr), - alt_FQWthr(of->alt_FQWthr),alt_EWthr(of->alt_EWthr), - PEheaderVerWr(of->PEheaderVerWr),TrimStartNTs(of->TrimStartNTs), - TruncSeq(of->TruncSeq), - iniSpacer(of->iniSpacer),userReqFastqVer(of->userReqFastqVer), - userReqFastqOutVer(of->userReqFastqOutVer),maxAccumQP(of->maxAccumQP), - alt_maxAccumQP(of->alt_maxAccumQP), - //BChit, BCrevhit initialize to 0 - new set, new luck - pairedSeq(of->pairedSeq), - revConstellationN(0), - BCdFWDREV(of->BCdFWDREV), - restartSet(false), - b_optiClusterSeq(of->b_optiClusterSeq), b_subselectionReads(of->b_subselectionReads), - b_doQualFilter(of->b_doQualFilter), - b_doFilter(of->b_doFilter), - bDoDereplicate(of->bDoDereplicate), - bDoCombiSamples(of->bDoCombiSamples), - bDoDemultiplexIntoFiles(of->bDoDemultiplexIntoFiles), - maxReadsPerOFile(of->maxReadsPerOFile), - ReadsWritten(of->ReadsWritten), OFileIncre(of->OFileIncre), - Barcode_len(0), Barcode2_len(0) -// dPDS(NULL), dHDS(NULL) - { - RepStat[0] = make_shared(of->RepStat[0]->bMedianCalcs); - RepStat[1] = make_shared(of->RepStat[1]->bMedianCalcs); - BCdFWDREV[0].fix(); BCdFWDREV[1].fix(); - RepStatAddition[0] = make_shared(of->RepStatAddition[0]->bMedianCalcs); - RepStatAddition[1] = make_shared(of->RepStatAddition[1]->bMedianCalcs); - PreFiltP1 = make_shared(of->PreFiltP2->bMedianCalcs); - PreFiltP2 = make_shared(of->PreFiltP2->bMedianCalcs); - colStats[0] = collectstats(); colStats[1] = collectstats(); - if (bAdditionalOutput) { - statAddition = collectstats(); - } - if (takeAll) { - this->allResize((uint)of->PrimerIdx.size()); - PrimerIdxRev = of->PrimerIdxRev; - PrimerIdx = of->PrimerIdx; - Barcode = of->Barcode; - Barcode2 = of->Barcode2; - SampleID = of->SampleID; - SampleID_Combi = of->SampleID_Combi; - HeadSmplID = of->HeadSmplID; - PrimerL = of->PrimerL; - PrimerR = of->PrimerR; - PrimerL_RC = of->PrimerL_RC; - PrimerR_RC = of->PrimerR_RC; - hetPrimer = of->hetPrimer; - lMD = of->lMD; - Barcode_len = of->Barcode_len; - Barcode2_len = of->Barcode2_len; - demultiSinglFiles = of->demultiSinglFiles; - demultiSinglFilesF = of->demultiSinglFilesF; - BarcodePreStats(); - - } - -} - -Filters::~Filters() { -#ifdef DEBUG - cerr << "Deleting filter .. " ; -#endif - -// for (size_t i = 0; i < 2; i++) { delete RepStat[i]; delete RepStatAddition[i]; } -// delete PreFiltP1; delete PreFiltP2; -// delete dPDS; delete dHDS; -#ifdef DEBUG - cerr << "Done" << endl; -#endif -} - -//simulates that in mapping file links to sequence file was given. -bool Filters::setcmdArgsFiles(OptContainer& cmdArgs){ - - if (FastqF.size()==0 && QualF.size()==0 && FastaF.size() > 0){ - //fasta entry but no qual entries - - string path=""; - if (cmdArgs.find("-i_path") != cmdArgs.end() && cmdArgs["-i_path"].length() > 2){ - path=cmdArgs["-i_path"] + string("/"); - } - - QualF.resize(FastaF.size()); - for (unsigned int i=0; i< FastaF.size(); i++){ - string newQ = FastaF[i]; - int pos = (int) newQ.find_last_of("."); - newQ = newQ.substr(0,pos); - newQ += string(".qual"); - fstream fin; - string fullQ = path + newQ; - fin.open(fullQ.c_str(),ios::in); - if( fin.is_open() ) { - cerr<<"Using quality file: "<\n"; - newQ = ""; - //fin.close();return false; - } - fin.close(); - QualF[i] = newQ; - } - } - - int fileSiz = (int)Barcode.size(); - //instead of max: - if (!bDoMultiplexing){fileSiz=1;} - - if (FastaF.size()==0 && FastqF.size()==0){ - //set up fasta/fastq vector specific to corresponding BC (that should be in this file) - if (cmdArgs.find("-i_fastq") == cmdArgs.end()){ - FastaF.resize(fileSiz); - QualF.resize(fileSiz); - for (unsigned int i=0; i< FastaF.size(); i++){ - FastaF[i] = cmdArgs["-i_fna"]; - QualF[i] = cmdArgs["-i_qual"]; - } - } else {//fastq input - vector fqTmp (1,cmdArgs["-i_fastq"]); - if (cmdArgs["-i_fastq"].find(";") != string::npos) {//";" denotes several files - if (fileSiz == 1) {//no BC, - fqTmp = splitByCommas(cmdArgs["-i_fastq"], ';'); - this->allResize((uint) fqTmp.size()); - fileSiz = (int) fqTmp.size(); - cerr << "Detected " << fileSiz << " input files (pairs)." << endl; - FastqF = fqTmp; - } else { - cerr << "Fastq string contains symbol \";\". Not allowed in input string"; exit(32); - } - } else { - FastqF.resize(fileSiz, cmdArgs["-i_fastq"]); - } - } - } - - - if (MIDfqF.size() == 0) - if (cmdArgs.find("-i_MID_fastq") != cmdArgs.end()) { - MIDfqF.resize(fileSiz, ""); - for (unsigned int i = 0; i < MIDfqF.size(); i++) { - MIDfqF[i] = cmdArgs["-i_MID_fastq"]; - } - } - if (cmdArgs.find("-o_demultiplex") != cmdArgs.end()) { - generateDemultiOutFiles(cmdArgs["-o_demultiplex"]); - } - - - if (cmdArgs["-o_dereplicate"] != "") { - //check if file could exist - ofstream temp; - temp.open(cmdArgs["-o_dereplicate"].c_str(), ios::out); - if (!temp) { cerr << "Could not open outstream to dereplicated sequences:\n" << cmdArgs["- o_dereplicate"] << endl; exit(78); } - temp.close(); - bDoDereplicate = true; - } - - return true; -} - - -void Filters::generateDemultiOutFiles( string path) { - //fill in demultiSinglFiles vector - vector empVec(2, NULL); - vector empVec2(2, ""); - - struct stat info; - if (stat(path.c_str(), &info) != 0) { - cerr << "Output path for demultiplexed files does not exist, please create this directory:\n" << path << endl; - exit(833); - }else if (info.st_mode & S_IFDIR) { // S_ISDIR() doesn't exist on my windows - cerr<<"Writing demultiplexed files to: "< maxFileStreams) {openOstreams = false;} - if (pairedSeq == 1|| pairedSeq == -1) { - string nfile = path + SampleID[i] + ".fq"; - if (openOstreams) { demultiSinglFiles[i][0] = new ofbufstream(nfile.c_str(), ios::out); } - demultiSinglFilesF[i][0] = nfile; - ostrCnt++; - } - else { - string nfile = path + SampleID[i] + ".1.fq"; - if (openOstreams) { demultiSinglFiles[i][0] = new ofbufstream(nfile.c_str(), ios::out); } - demultiSinglFilesF[i][0] = nfile; - nfile = path + SampleID[i] + ".2.fq"; - if (openOstreams) { demultiSinglFiles[i][1] = new ofbufstream(nfile.c_str(), ios::out); } - demultiSinglFilesF[i][1] = nfile; - ostrCnt += 2; - } - } - -} -//only does BC 1 -void Filters::reverseTS_all_BC(){ -// for (int i=0; i d, int pair) { - if (pair <= 0) { - PreFiltP1->addDNAStats(d); - } else if (pair == 1) { - PreFiltP2->addDNAStats(d); - } - updateMaxSeqL(d->length()); -} -void Filters::updateMaxSeqL(int x) { - if (x < maxReadLength) { return; } - maxReadLength = x; - if (min_l_p != -1.f) { - min_l = (int)((float)maxReadLength * min_l_p); - } -} -void Filters::setSeqLength(float minL, int maxL) { - if (minL>1.f) { - min_l = (int)minL; - min_l_p = -1.f; - } else { - min_l = -1; - min_l_p = minL; - } - max_l = maxL; -} - - -//ever_best is the best %id that was ever observed for this cluster match -bool Filters::betterSeed(shared_ptr d1, shared_ptr d2, shared_ptr ref, shared_ptr ref2, float ever_best, - uint bestL, int usePair, bool checkBC) { - float d1pid(d1->getTempFloat()), refpid(ref->getTempFloat()); - int TagIdx(0); - if (checkBC) { - TagIdx = -2; - } - //0.2% difference is still ok, but within 0.5% of the best found seed (prevent detoriating sequence match) - //float blen = (float)ref->length() + (float)d1->length(); - uint curL = d1->length(); - if (d2 != NULL) { curL += d2->length(); } - if (float(curL) / float(bestL) < BestLengthRatio) { return false; } - if (d1pidlength() / ref->length() < RefLengthRatio) { return false; } - - //checks if the new DNA has a better overall quality - //1 added to qual, in case no Qual DNA is used - float thScore = (1+d1->getAvgQual())*(d1pid ) * log((float)d1->length() ); - float rScore = (1+ref->getAvgQual())*(refpid ) * log((float)ref->length() ); - if (thScore > rScore){ - //also check for stable lowest score - if (d1->minQual() > ref->minQual() - MinQualDiff && (d2 == NULL || ref2 == NULL)) { return true; } - } - if (d2 == NULL || ref2 == NULL) { - return false; - } - //*** DNA2 - //second pair likely to be of worse qual, but only direct comparison relevant here - - check(d2, true, 1, TagIdx); - //at least 90% length of "good" hit - if (d2->length() / ref2->length() < RefLengthRatio) { return false; } - - //checks if the new DNA has a better overall quality - //weigh with average id to OTU seed - thScore +=(1+ d2->getAvgQual()) * log((float)d2->length()) * 97; - rScore += (1+ref2->getAvgQual()) * log((float)ref2->length()) * 97; - if (thScore > rScore) { - return true; - } - - return false; -} -bool Filters::check(shared_ptr d, bool doSeeding, int pairPre, - int &tagIdx) { - //bool qualWinTrim(false);// , TecAdap(false); //RevPrimFound(false), bool AccErrTrim(false); - int pair = max(0, pairPre);//corrects for -1 (undefined pair) to set to 0 - unsigned int hindrance = 0; - - //sTotalPlus(pair); - if (check_length(d->length())) { - d->QualCtrl.minL = true; //sMinLength(pair); - return false; - } - //remove technical adapter - if (pair == -1) { - if (bDoAdapter){ - remove_adapter(d); - } - } - - //BC already detected (e.g. MID)? - if (tagIdx == -2){ - tagIdx = cutTag(d, pair == 0); //barcode 2nd part - } - if ((bDoBarcode2||bDoBarcode)&& tagIdx < 0) { - d->QualCtrl.TagFail = true; d->failed(); return false;//sTagFail(pair); - } - - - //cerr<failed(); return false; - } - } - else if (bShortAmplicons) {//pair == 1, check for fwd primer in pair 2 (rev-compl) - cutPrimer(d, PrimerIdx[tagIdx], true,pair); - } - } - - - if (check_length(d->length())){ - d->QualCtrl.minL = true; //sMinLength(pair); - return false; - } - if (max_l!=0 && d->length()-hindrance > max_l){ - d->QualCtrl.maxL = true; //sMaxLength(pair); - return false; - } - - - - //rev primer is the first that needs to be looked for - //makes it slower, as higher chance for low qual and this routine is costly.. however more important to get good lock on rev primer - if ((pair != 0 || bShortAmplicons) && bPrimerR) { - //removal of reverse primer - bool revCheck = pair == -1 || pair == 0;//1:false for RC, else always a reverse check - cutPrimerRev(d, PrimerIdxRev[tagIdx], revCheck); - if (d->getRevPrimCut()) { - //RevPrimFound = true; - d->QualCtrl.PrimerRevFail = false; - //check length - if (check_lengthXtra(d, hindrance)) { - d->QualCtrl.minL = true; //sMinLength(pair); - d->failed(); return false; - } - } else {//stats, but only for 2nd pair - d->QualCtrl.PrimerRevFail = true; //sRevPrimerFail(pair); - if (pair == 1 && bRequireRevPrim) {//failed to find reverse primer - return false; - } - } - } - - - if (doSeeding){ - //cut off low qual, hard limits - //int TWwidth = 10; float TWthr = 20; - d->qualWinPos(EWwidth, EWthr);//) { d->QualCtrl.Trimmed = true; }// sTrimmed(pair); } - //SEED is needed for building trees, accumulated error not interesting - //double tmpAccumQP = 2.; - //if (!d->qualAccumTrim(tmpAccumQP)){ colStats.AccErrTrimmed++; } -// if (Trim){ colStats.Trimmed++; } -// if (AccErrTrim){ colStats.AccErrTrimmed++; } - - return true; - } - - - //if seq needs to be cut, than here - if (TruncSeq>0){d->cutSeq(TruncSeq,-1,true); - if ( check_length(d->length()) ){ - d->QualCtrl.minL = true; //sMinLength(pair); - return false; - } - } - if (b_doQualFilter) { - //second cut off low qual - d->qualWinPos(EWwidth, EWthr); - //cut off accumulation error larger than maxAccumQP - d->qualAccumTrim(maxAccumQP); - //if (check_length(d->length(),hindrance) ){sMinLength(); return false;} - if (check_length(d->length())) { - d->QualCtrl.minLqualTrim = true; return false;//sMinQTrim(pair); - } - int rea = 2; - if ((min_q > 0 || FQWthr > 0) && d->qualWinfloat(FQWwidth, FQWthr, rea) < min_q) { - d->QualCtrl.AvgQual = true;//sAvgQual(pair); - return false; - } - if (rea == 1) { - d->QualCtrl.QualWin = true; //sQualWin(pair); - return false; - } - //binomial filter here - if (b_BinFilBothPairs || pair != 1 ){ - float ExpErr = d->binomialFilter((int)BinFilErr, BinFilP); - if (ExpErr > BinFilErr){ - d->QualCtrl.BinomialErr = true; - //sBinomError(pair, ExpErr); - d->failed(); return false; - } - } - } - - if (MaxAmb!=-1 && d->numACGT() > MaxAmb){ - d->QualCtrl.MaxAmb = true;//sMaxAmbig(pair); - return false; - } - if (maxHomonucleotide!=0 && !d->HomoNTRuns(maxHomonucleotide)){ - d->QualCtrl.HomoNT = true;// sHomoNT(pair); - return false; - } - - //cerr<getID()<0){ - if (d->length()-TrimStartNTs > max_l){//length check - d->QualCtrl.maxL = true; //sMaxLength(pair); - return false; - } - //remove start NTs - d->cutSeq(0,TrimStartNTs); - - } - //MUTEX - //colStats[pair].totalRejected--; - //if (qualWinTrim || AccErrTrim) {d->QualCtrl.Trimmed = true;}//sTrimmed(pair); } - d->setPassed(true); - return true; - } - d->setPassed(true); - - /*if (qualWinTrim || AccErrTrim) { - d->QualCtrl.Trimmed = true;//sTrimmed(pair); - if (AccErrTrim) {d->QualCtrl.AccErrTrimmed = true;//colStats[pair].AccErrTrimmed++; } - if (qualWinTrim) { d->QualCtrl.QWinTrimmed = true; }//colStats[pair].QWinTrimmed++; - } - }*/ - //if (TecAdap) { colStats[pair].adapterRem++; } - //colStats[pair].totalRejected--; - - //keep control over passed / not as close as possible to source - return true; -} -//DNA qual check, and some extra parameters -bool Filters::checkXtra(shared_ptr d, int pairPre, int &tagIdx) { - - //bool qualWinTrim(false), AccErrTrim(false); // RevPrimFound(false),TecAdap(false), - unsigned int hindrance = 0; - int pair = max(0, pairPre);//corrects for -1 (undefined pair) to set to 0 - -// sTotalPlus(pair); -// sTotalPlusXtra(pair); - //remove technical adapter - if (pairPre == -1) { - if (bDoAdapter){ - remove_adapter(d); - } - } - //set in outer routines that check mid (-1) or needs to be checked here(-2) - if (tagIdx == -2){ - if ( bDoBarcode2 && pair == 1 ) { - tagIdx = cutTag(d, false); //barcode 2nd part - } else if(bDoBarcode && pair == 0) { - tagIdx = cutTag(d,true); //barcode - } - else { - tagIdx = 0; - } - } - if ((bDoBarcode || bDoBarcode2) && tagIdx < 0) { d->QualCtrl.TagFail = true;return false; }// sTagFail(pair); - - - - - //cerr<failed(); return false; - } - } - else if (bShortAmplicons) {//pair == 1, check for fwd primer in pair 2 (rev-compl) - cutPrimer(d, PrimerIdx[tagIdx], true,pair); - } - } - if (check_lengthXtra(d)){ - d->QualCtrl.minL = true;//sMinLength(pair); - d->failed(); return false; - } - if (max_l!=0 && d->length()-hindrance > max_l){ - d->QualCtrl.maxL = true; //sMaxLength(pair); - d->failed(); return false; - } - - - //rev primer is the first that needs to be looked for - //makes it slower, as higher chance for low qual and this routine is costly.. however more important to get good lock on rev primer - if ((pair != 0 || bShortAmplicons) && bPrimerR) { - //removal of reverse primer - //bool revPrm(false); - bool revCheck = pair == -1 || pair == 0;//1:false for RC, else always a reverse check - cutPrimerRev(d, PrimerIdxRev[tagIdx], revCheck); - if (d->getRevPrimCut()) { - //RevPrimFound = true; - d->QualCtrl.PrimerRevFail = false; - //check length - if (check_lengthXtra(d, hindrance)) { - d->QualCtrl.minL = true; //sMinLength(pair); - d->failed(); return false; - } - } else {//stats, but only for 2nd pair - //sRevPrimerFail(pair); - d->QualCtrl.PrimerRevFail = true; // maybe replace later with routine that checks for FtsDetected - if (pair != 0 && bRequireRevPrim) {//failed to find reverse primer - if (alt_bRequireRevPrim) { - d->failed(); return false; - } else { - //RevPrimFound = false; - d->QualCtrl.PrimerRevFail = false; - d->setMidQual(true); - } - } - } - } - - //if seq needs to be cut, than here - if (TruncSeq>0){d->cutSeq(TruncSeq,-1,true); - if ( check_lengthXtra(d) ){ - //d->QualCtrl.minL = true; //sMinLength(pair); - d->failed(); return false; - } - } - - if (b_doQualFilter) { - //second cut off low qual - d->qualWinPos(EWwidth, EWthr); // { qualWinTrim = true; } - //cut off accumulation error larger than maxAccumQP - if (maxAccumQP != -1.0) { - int cP = d->qualAccumulate(maxAccumQP); - if (check_lengthXtra(d, 0, cP)) { - d->isMidQual(); d->QualCtrl.minLqualTrim = true;//sMinQTrim(pair); - cP = d->qualAccumulate(alt_maxAccumQP); - if (check_lengthXtra(d, 0, cP)) {//check if passes alt - d->failed(); return false; - } - } else { - d->qualAccumTrim(maxAccumQP);//) { AccErrTrim = true; } - } - } - - - int rea = 2, rea2 = 2; - if ((min_q > 0 || FQWthr > 0) && d->qualWinfloat(FQWwidth, FQWthr, rea) < min_q) { - d->QualCtrl.AvgQual = true; //sAvgQual(pair); - if ((alt_min_q > 0 || alt_FQWthr > 0) && d->qualWinfloat(FQWwidth, alt_FQWthr, rea2) < alt_min_q) { - d->QualCtrl.AvgQual= true; //statAddition.AvgQual++; - d->failed(); return false; - } else { - d->QualCtrl.AvgQual = false; - d->setMidQual(true); - } - } - if (rea == 1) { - d->QualCtrl.QualWin = true; //sQualWin(pair); - if (rea2 == 1) { - //statAddition.QualWin++; - //d->QualCtrl.QualWin = true; - d->failed(); return false; - } - } - if (b_BinFilBothPairs || pair == 0){ - float ExpErr = d->binomialFilter((int)BinFilErr, BinFilP); - if (ExpErr > BinFilErr){ - d->QualCtrl.BinomialErr = true; - //sBinomError(pair, ExpErr); - d->failed(); return false; - } - } - } - int ambNTs = d->numACGT(); - if (MaxAmb!=-1 && ambNTs > MaxAmb){ -// sMaxAmbig(pair); - d->QualCtrl.MaxAmb = true; - if (alt_MaxAmb!=-1 && ambNTs>= alt_MaxAmb){ - d->QualCtrl.MaxAmb = true; //statAddition.MaxAmb++; - d->failed(); return false; - } - } - if (maxHomonucleotide!=0 && !d->HomoNTRuns(maxHomonucleotide)){ - d->QualCtrl.HomoNT = true;//sHomoNT(pair); - d->failed(); return false; - } - - //cerr<getID()<0){ - if (d->length()-TrimStartNTs > max_l){//length check - d->QualCtrl.maxL = true; //sMaxLength(pair); - d->failed(); - return false; - } - //remove start NTs - d->cutSeq(0,TrimStartNTs); - - } - //colStats[ pair].totalRejected--; - //if (qualWinTrim || AccErrTrim) { colStats[pair].Trimmed++; } - d->setPassed(true); - return true; - } - - if (!d->isMidQual()) { - d->setPassed(true); - } - - /* if (pairPre <= 0) { - statAddition.totalRejected--; - if (qualWinTrim || AccErrTrim) { - statAddition.Trimmed++; - if (AccErrTrim) { statAddition.AccErrTrimmed++; } - if (qualWinTrim) { statAddition.QWinTrimmed++; } - } - //if (TecAdap) { statAddition.adapterRem++; } - } - } else { - if (qualWinTrim || AccErrTrim) { - colStats[pair].Trimmed++; - if (AccErrTrim) { colStats[pair].AccErrTrimmed++; } - if (qualWinTrim) { colStats[pair].QWinTrimmed++; } - } - //if (TecAdap) { colStats[pair].adapterRem++; } - //keep control over passed / not as close as possible to source - colStats[pair].totalRejected--; - - }*/ - - return true; -} -void Filters::noMapMode(OptContainer& cmdArgs){ - string noMapTxt = "sdm run in No Map Mode."; - if (cmdArgs.find("-paired") != cmdArgs.end() && (cmdArgs["-paired"]=="2" || cmdArgs["-paired"]=="2")){ - pairedSeq = 2; //fakeEssentials(); - noMapTxt += " Using paired end sequencing files."; - } - - BcutPrimer = false; bDoBarcode = false; bDoBarcode2 = false; - bDoAdapter=false;bDoMultiplexing=false; - bDoHeadSmplID=false; - fakeEssentials(); - MinTagLen = 0; MinTagLen2 = 0; MaxTagLen = 0; MaxTagLen2 = 0; MinPrimLen = 0; - cerr< emptVec(2, NULL); - vector emptVec2(2, ""); - demultiSinglFiles.resize(x, emptVec); - demultiSinglFilesF.resize(x, emptVec2); -} - -bool Filters::remove_adapter(shared_ptr d){ //technical adapter - //allows for 0 errors, no shifts - const string& se = d->getSeq(); - for (unsigned int i=0;icutSeq(0,tAdapterLength); - d->setTA_cut(true); - return true; -} -//only identifies based on dual BCding -void Filters::dblBCeval(int& tagIdx, int& tagIdx2, string presentBC, shared_ptr tdn, shared_ptr tdn2) { - //bool BCfail = false;// , BCfail2 = false; - if ( tagIdx < 0 || tagIdx2 < 0 || !tdn->getBarcodeDetected() || !tdn2->getBarcodeDetected()) { - tagIdx = -1; tagIdx2 = -1; - if (tdn != NULL) { - tdn->setPassed(false); /*BCfail = true; */ - tdn->setBCnumber(tagIdx, BCoffset); tdn->setMidQual(false); - } - if (tdn2 != NULL) { tdn2->setPassed(false); tdn2->setMidQual(false); tdn2->setBCnumber(tagIdx2, BCoffset);} - - colStats[0].dblTagFail++; - return; - } - string BC1 = Barcode[tagIdx]; - string BC2 = Barcode2[tagIdx2]; - bool hit(false); - //this routine finds two matching barcodes (as several combinations are possible) - for ( uint i = 0; i < Barcode.size(); i++ ) { - if ( Barcode[i] == BC1 && Barcode2[i] == BC2 ) { - tagIdx = i; tagIdx2 = i; hit = true; break; - } - } - - if ( !hit ) { - //no BC, useless - tagIdx = -1; tagIdx2 = -1; - if (tdn != NULL) { tdn->setPassed(false); tdn->setMidQual(false); tdn->setBCnumber(tagIdx, BCoffset); } - if (tdn2 != NULL) { tdn2->setPassed(false); tdn2->setMidQual(false); tdn2->setBCnumber(tagIdx2, BCoffset); } - return; - } - presentBC = BC1 + "|" + BC2; - //add new BC info to DNA - //also reset BC in DNA - if ( tdn != NULL ) { - BCintoHead(tagIdx, tdn, presentBC, -1, false, true); - //already done in BCintoHead - } - if ( tdn2 != NULL ) { - BCintoHead(tagIdx2, tdn2, presentBC, -1, true, true); - } -} - -//cuts & identifies - version is just for mid sequences -int Filters::cutTag(shared_ptr d, string&presentBC, int& c_err, bool isPair1) { - if (bDoHeadSmplID) { - for (unsigned int i = 0; igetOldID().find(HeadSmplID[i]); - if (pos != string::npos) { - SampleIntoHead(i, d, pos); - return i; - } - } - return -1; - } - /*BCdecide & locBCD(BCdFWD); - if ( !isPair1 ) { - locBCD = BCdREV; - }*/ - int start(-1), stop(-1); - int idx(-1); - int scanRegion = 4; //dna region to scan for Tag Seq - if (!d->getTA_cut() && isPair1) {//no technical adapter found / given by user: scan wider region for barcode - scanRegion = 14; //arbitary value - } - if (d->isMIDseq()) { - if (d->length()length() - MinTagLen + 1; - } - - scanBC(d, start, stop, idx, c_err, scanRegion, presentBC, isPair1); - if (!BCdFWDREV[!isPair1].b_BCdirFix) { - if (start == -1) {//check reverse transcription - //d->reverseTranscribe(); - scanBC_rev(d, start, stop, idx, c_err, scanRegion, presentBC, isPair1); - if (start != -1) { - BCdFWDREV[!isPair1].BCrevhit++; - } - } - else { - BCdFWDREV[!isPair1].BChit++; - } - //check if BC direction can be fixed - if (BCdFWDREV[!isPair1].BCrevhit + BCdFWDREV[!isPair1].BChit > DNAinMemory) { - if (!eval_reversingBC(isPair1)) { return -1; } - } - } - if (start != -1) { - if (BcutTag && !d->isMIDseq()) { - //remove tag from DNA - d->cutSeq(start, stop); - d->setBarcodeCut(); - } - } - else { - idx = -1; - } - return idx; -} -int Filters::findTag(shared_ptr d, string&presentBC, int& c_err, bool isPair1) { - if (bDoHeadSmplID) { - for (unsigned int i = 0; igetOldID().find(HeadSmplID[i]); - if (pos != string::npos) { - SampleIntoHead(i, d, pos); - return i; - } - } - return -1; - } - /*BCdecide & locBCD(BCdFWD); - if ( !isPair1 ) { - locBCD = BCdREV; - }*/ - int start(-1), stop(-1); - int idx(-1); - int scanRegion = 14; //dna region to scan for Tag Seq - - scanBC(d, start, stop, idx, c_err, scanRegion, presentBC, isPair1); - if (!BCdFWDREV[!isPair1].b_BCdirFix) { - if (start == -1) {//check reverse transcription - //d->reverseTranscribe(); - scanBC_rev(d, start, stop, idx, c_err, scanRegion, presentBC, isPair1); - if (start != -1) { - BCdFWDREV[!isPair1].BCrevhit++; - } - } - else { - BCdFWDREV[!isPair1].BChit++; - } - //check if BC direction can be fixed - if (BCdFWDREV[!isPair1].BCrevhit + BCdFWDREV[!isPair1].BChit > DNAinMemory) { - if (!eval_reversingBC(isPair1)) { return -1; } - } - } - if (start == -1) { - idx = -1; - } - return idx; -} -int Filters::cutTag(shared_ptr d, bool isPair1) { - - if (d->length() < MinTagLen ) { return -1; } - if ((isPair1 && !bDoBarcode)|| - (!isPair1 && !bDoBarcode2)){ - d->setBCnumber(0, BCoffset); - return BCoffset; //not failed, just not requested - } - /*BCdecide & locBCD(BCdFWD); - if ( !isPair1 ) { - locBCD = BCdREV; - }*/ - - int idx(-1); - if (bDoHeadSmplID){ - for (unsigned int i=0; igetOldID().find(HeadSmplID[i]); - if (pos != string::npos){ - if ( !bDoBarcode2 ) { SampleIntoHead(i, d, pos); }//this has to be done AFTER two BCs are read (on a higher lvl) - else {d->setBCnumber(i, BCoffset);} - return i; - } - } - return idx; - } - else if (bOneFileSample){ - d->setBCnumber(0,this->currentBCnumber()); - BCintoHead(0, d, "FileName", isPair1, false); - return 0; - } - int start(-1),stop(-1); - string presentBC(""); int c_err(0); - int scanRegion=4; //dna region to scan for Tag Seq - if (!d->getTA_cut() && isPair1){//no technical adapter found / given by user: scan wider region for barcode - scanRegion = 14; //arbitary value - } - if (d->isMIDseq()){ - scanRegion = d->length() - MinTagLen+1; - } - scanBC(d, start, stop, idx, c_err, scanRegion, presentBC, isPair1); - if ( !BCdFWDREV[!isPair1].b_BCdirFix ) { - if (start == -1){//check reverse transcription - //d->reverseTranscribe(); - scanBC_rev(d, start, stop, idx, c_err, scanRegion, presentBC, isPair1); - if (start!=-1){ - BCdFWDREV[!isPair1].BCrevhit++; - } - } else { - BCdFWDREV[!isPair1].BChit++; - } - //check if BC direction can be fixed - if ( BCdFWDREV[!isPair1].BCrevhit + BCdFWDREV[!isPair1].BChit > 5000 ) { - eval_reversingBC(isPair1);//){return -1;} - } - } - if (start < 0) { - return (-1); - } - d->setBCnumber(idx, BCoffset); - - if (BcutTag && !d->isMIDseq()) { - //remove tag from DNA - d->cutSeq(start, stop); - d->setBarcodeCut(); - BCintoHead(idx, d, presentBC, c_err, isPair1); - } - return idx; -} - -void Filters::BCintoHead(int idx, shared_ptr d,const string presentBC, - const int c_err, bool pair1, bool atEnd){ - vector * locBC = (!pair1) ? &Barcode2 : &Barcode; - string on (d->getID()); - string spacee (" "); - //keep only until first space - //remove">" - if (!atEnd){ - on = on.substr(0,on.find_first_of(' ',0)); - } - - string nID ( SampleID[idx]); - if (bDoCombiSamples){ - nID = SampleID_Combi[idx]; - } - nID += iniSpacer + - on + spacee + string("orig_bc=") + (*locBC)[idx]; - if (c_err > 0 || atEnd) {//atEnd: dbl barcode - //convert c_err to s_c_err; - string s_c_err; stringstream conve; - conve << c_err; - s_c_err = conve.str(); - nID += spacee + string("new_bc=") + presentBC + - spacee + string("bc_diffs=") + s_c_err; - } - d->setNewID(nID); - d->setBCnumber(idx, BCoffset); -} - -void Filters::SampleIntoHead(const int idx, shared_ptr d, const size_t pos){ - string on = d->getID(),spacee=" "; - size_t pos2 = on.find_first_of(" ",pos); - string on2 = on.substr(0,pos)+on.substr(pos2+1); - string on3 = on.substr(pos,pos2); - - string nID(SampleID[idx]); - if (bDoCombiSamples){ - nID = SampleID_Combi[idx]; - } - - nID += iniSpacer + - on2 + spacee + string("orig_hdPart=")+on3; - d->setNewID(nID); - d->setBCnumber(idx, BCoffset); -} -void Filters::countBCdetected(int BC, int Pair, bool MidQ) { - if (!bDoMultiplexing) { return; } - if (BC - BCoffset < 0) { - cerr << "BC below 0("<< BC - BCoffset <<") pair "<< Pair<<" in countBCdetected\n"; exit(132); - } - if (!MidQ) { - if (Pair < 0) { Pair = 0; } - colStats[Pair].BarcodeDetected[BC - BCoffset]++; - } - else { - if (Pair <= 0) { - statAddition.BarcodeDetected[BC - BCoffset]++; - } - } -} -bool Filters::eval_reversingBC(bool fwd){ - if ( !fwd && !bDoBarcode2 ) { - return true; - } - /*BCdecide lbcd(BCdFWD); - if ( !fwd ) { - lbcd = BCdREV; - }*/ - if ( BCdFWDREV[!fwd].b_BCdirFix ) { return true; } - BCdFWDREV[!fwd].b_BCdirFix = true; lMD->setBCfixed(true, fwd); - if ( BCdFWDREV[!fwd].BCrevhit> BCdFWDREV[!fwd].BChit * 8 ) {//use reversed BC .. - BCdFWDREV[!fwd].reversedBCs = true; - if ( fwd ) { - reverseTS_all_BC(); - } else { - reverseTS_all_BC2(); - } - if ( BCdFWDREV[!fwd].BChit > 0 ) { - restartSet=true; - return false; - } - } else if ( BCdFWDREV[!fwd].BCrevhit>0 ) { - restartSet=true; - return false; - } - return true; -} -void Filters::scanBC_rev(shared_ptr d,int& start,int& stop,int& idx,int c_err, - int scanRegion,string & presentBC, - bool fwdStrand) { - vector emptyV(0), emptyV2(0); - vector& locRevBC(emptyV2); - vector& locBC(emptyV); - if ( !fwdStrand ) { - locRevBC = revBarcode2; - locBC = Barcode2; - } else { - locRevBC = revBarcode; - locBC = Barcode; - } - int BCs = (int) locRevBC.size(); - if (BCs==0){ - locRevBC = locBC; - BCs = (int)locRevBC.size(); - for (int i=0; i< BCs; i++){ - reverseTS(locRevBC[i]); - } - if ( !fwdStrand ) {//copy over BC - revBarcode2 = locRevBC; - } else { - revBarcode = locRevBC; - } - } - //check each possible BC for a match - if (TagErrs == 0){ - for (idx=0; idx< BCs; idx++){ - start = d->matchSeq_tot(locRevBC[idx],0,scanRegion,c_err); - if (start!=-1){ - presentBC = locRevBC[idx]; - stop = start+ (int)locRevBC[idx].length(); - break; - } - } - } else { - vector stars(0),idxses(0); - bool zeroErr = false; - //this version tries all BC's and if there are more than one possible match, will reject all matches - for (idx=0; idx< BCs; idx++){ - start = d->matchSeq_tot(locRevBC[idx],TagErrs,scanRegion,c_err); - if (start!=-1){ - if (c_err==0){ - stop = start+ (int) locRevBC[idx].length(); - presentBC = locRevBC[idx]; - zeroErr = true; - break; - } - stars.push_back(start); - idxses.push_back(idx); - } - } - if (!zeroErr && stars.size()>0){ - //int pair = (int)!fwdStrand;//d->getReadMatePos(); - if (stars.size() > 1){//too many matches, thus true seq can't be found - //currently only have only one BC, could be changed in future - //sTagNotCorrected(pair); - d->QualCtrl.fail_correct_BC = true; - idx=-1; start = -1; - return; - } - d->QualCtrl.suc_correct_BC = true; - //sTagCorrected(pair);// colStats.suc_correct_BC++; - start = stars[0]; - idx = idxses[0]; - stop = start+(int)locRevBC[idx].length(); - presentBC = d->getSubSeq(start,stop); - } - - } -} -void Filters::scanBC(shared_ptr d, int& start, int& stop, int& idx, int c_err, - int scanRegion, string & presentBC, bool fwdStrand) { - //check each possible BC for a match - //TODO: check for BC using suffix tree - vector < string > emptyV(0); - //vector * locBC = (!fwdStrand) ? &Barcode2 : &Barcode; - vector * locBClen = (!fwdStrand) ? &Barcode2_len : &Barcode_len; - //int pair = d->getReadMatePos(); - //int pair = int( !fwdStrand); - - BarcodeList & locBCL(emptyBCs); - uint MTL (MaxTagLen); - uint ITL (MinTagLen); - if ( !fwdStrand ) { - locBCL = BCList2; - MTL = MaxTagLen2; - ITL = MinTagLen2; - } else { - locBCL = BCList; - } - - for (start = 0; start < scanRegion; start++){ - string test = d->getSubSeq(start, MTL); - std::unordered_map::iterator itFnd = locBCL.find(test); - if ( itFnd != locBCL.end() ) { - idx = (*itFnd).second; - stop = start + (int)(*locBClen)[idx]; - presentBC = test; // (*locBC)[idx]; - return; - } - } - start = -1; - if ( TagErrs != 0 || ITL != MTL ) { - vector stars(0), idxses(0); - bool zeroErr = false; - - //this version tries all BC's and if there are more than one possible match, will reject all matches - for (auto jx = locBCL.begin(); jx != locBCL.end();jx++) { - start = d->matchSeq_tot((*jx).first, TagErrs, scanRegion, c_err); - if (start!=-1){ - idx = (*jx).second; - if (c_err==0){ - stop = start + (int)(*locBClen)[idx]; - presentBC = d->getSubSeq(start, MTL); // (*locBC)[idx]; - zeroErr = true; - break; - } - stars.push_back(start); - idxses.push_back(idx); - } - } - if (!zeroErr && stars.size()>0){ - if (stars.size() > 1){//too many matches, thus true seq can't be found - //sTagNotCorrected(pair); - d->QualCtrl.fail_correct_BC = true; - idx = -1; start = -1; - return; - } - d->QualCtrl.suc_correct_BC = true; - //sTagCorrected(pair);// colStats.suc_correct_BC++; - - start = stars[0]; - idx = idxses[0]; - stop = start + (int)(*locBClen)[idx]; - presentBC = d->getSubSeq(start,stop); - } - - } -} -//cuts primers, tags -bool Filters::cutPrimer(shared_ptr d,int primerID,bool RC,int pair){ - //only adapted to singular BC - if (PrimerL[0].length()==0){return true;} - int start(-1) ,stop(-1); - int tolerance(30), startSearch(0); - if (!d->getBarcodeCut() && MaxTagLen > 0) { tolerance = MaxTagLen + 4; - } else { tolerance = 22; }//in this case nothing is known about 5' end - - if (!BcutTag){ - //Tag was not cut out of Seq, take this into account - startSearch = MinTagLen-2; - tolerance += (MaxTagLen-MinTagLen)+4; - } - if (!RC) { - start = d->matchSeq(PrimerL[primerID], PrimerErrs, tolerance, startSearch); - stop = start + (int)PrimerL[primerID].length(); - } else { - int QS = d->length();int limit = max(QS >> 1, QS - 150); stop = QS; - start = d->matchSeqRev(PrimerL_RC[primerID], PrimerErrs, limit, startSearch); - } - if (start == -1){//failed to match primer - d->QualCtrl.PrimerFail = true; - //sPrimerFail(pair);// max(0, (int)d->getReadMatePos())); - if (alt_PrimerErrs!= 0 && PrimerErrs < alt_PrimerErrs){ - if (!RC) { - start = d->matchSeq(PrimerL[primerID], alt_PrimerErrs, tolerance, startSearch); - stop = start + (int)PrimerL[primerID].length(); - } else { - int QS = d->length(); int limit = max(QS >> 1, QS - 150); stop = QS; - start = d->matchSeqRev(PrimerL_RC[primerID], alt_PrimerErrs, limit, startSearch); - } - } - if (start== -1){ - //statAddition.PrimerFail++; - return false; - }else if (pair!=1){ //2nd read shouldnt be affected by fwd primer (but still checked in short read mode) - d->setMidQual(true); - } - } - if (!BcutPrimer){ - if ( !RC ) { d->cutSeq(0, start); } - else { d->cutSeq(stop,-1); } - return true; - } - - //remove the primer, if confimed before - if ( !RC ) { d->cutSeq(0, stop); } - else { d->cutSeq(start, -1); } - d->setFwdPrimCut(); - return true; -} -bool Filters::findPrimer(shared_ptr d, int primerID, bool RC, int pair) { - //only adapted to singular BC - if (PrimerL[0].length() == 0) { return true; } - int start(-1);// , - //int stop(-1); - int tolerance(22), startSearch(0); - if (!d->getBarcodeCut() && MaxTagLen > 0) { - tolerance = MaxTagLen + 4; - startSearch = MinTagLen-4; - } - else { tolerance = 16; }//in this case nothing is known about 5' end - if (!RC) { - start = d->matchSeq(PrimerL[primerID], PrimerErrs, tolerance, startSearch); - //stop = start + (int)PrimerL[primerID].length(); - } - else { - int QS = d->length(); int limit = max(QS >> 1, QS - 150); //stop = QS; - start = d->matchSeqRev(PrimerL_RC[primerID], PrimerErrs, limit, startSearch); - } - if (start == -1) {//failed to match primer - return false; - } - return true; -} -bool Filters::cutPrimerRev(shared_ptr d,int primerID,bool RC){ - //const string& se = d->getSeq(); - int start(-1) ,stop(d->length()); - int QS = d->length(); - int limit=max(QS>>1,QS-150); - //int limit = QS>>1; - - if (!RC) { - start = d->matchSeq(PrimerR[primerID] , PrimerErrs, 15,0); - stop = start + (int)PrimerR[primerID].length(); - } else { - start = d->matchSeqRev(PrimerR_RC[primerID] , PrimerErrs, limit); - } - - - - if (start == -1){//failed to match primer - return false; - } - - if ( !BcutPrimer ) { //found it, but no cut - if ( !RC ) { d->cutSeq(0, start); } - else { d->cutSeq(stop,-1); } - return true; - } - - //remove the primer, if confimed before - if ( !RC ) { - d->cutSeq(0, stop);//start everything in front has to be removed - } else { - d->cutSeq(start, -1); // everything in the end has to be removed - } - //string neSe = se.substr(0,start) + se.substr(stop); - d->setRevPrimCut(); - - return true; -} -bool Filters::readMap(OptContainer& cmdArgs){ - - if (cmdArgs.find("-map") == cmdArgs.end()){ - this->noMapMode(cmdArgs); - return true; - } - - string MapF = cmdArgs["-map"]; - - string path = ""; bool pathMode = false; - if (cmdArgs.find("-i_path") != cmdArgs.end() && cmdArgs["-i_path"].length() > 2){ - path=cmdArgs["-i_path"] + string("/"); - pathMode = true;//check later if mapping file contains fasta/fastq - } - - MinTagLen = 100000; MinTagLen2 = 1000000; MaxTagLen = 0; MaxTagLen2 = 0; MinPrimLen = 100000; - string line; - ifstream in(MapF.c_str()); - if (!in){ - cerr<<"Could not find "< terms(15);terms[0]="SampleID"; - terms[1] = "BarcodeSequence"; terms[2]="LinkerPrimerSequence"; - terms[3] = "ReversePrimer"; terms[4] = "fastqFile"; - terms[5] = "fnaFile"; terms[6] = "qualFile"; - terms[7] = "SampleIDinHead"; terms[8] = "MIDfqFile"; - terms[9] = "CombineSamples"; terms[10] = "ForwardPrimer"; - terms[11] = "Barcode2ndPair"; - terms[12] = "HetSpacerFwd"; terms[13] = "HetSpacerRev"; - terms[14] = "derepMin"; - bool hetOneSide = false; - - vector termIdx(terms.size(),-1); - - while(!safeGetline(in,line).eof()) { -// while(getline(in,line,'\n')) { - if(cnt!=0 && line.substr(0,1) == "#"){continue;} - if (line.length()<10){continue;} - if (cnt==0){ - line = line.substr(1); - } - - string segments; - stringstream ss; - ss << line; - int tbcnt=0; - - //cmdArgs["-i_MID_fastq"] - while (getline(ss,segments,'\t')) { - trim(segments); - if (cnt==0){ //search for header - //Primer, BarcodeSequence, LinkerPrimerSequence - //PrLCol(-1), PrRCol(-1), BCCol(-1), SIDCol(-1); - for (unsigned int i=0; iBarcodePreStats(); - - return true; -} -void Filters::decideHeadBC(){ - bDoMultiplexing=true; - if (HeadSmplID[0].length()>0 && Barcode[0].length()==0){ - bDoBarcode = true; bDoHeadSmplID = true; MinPrimLen = 0; return; - } else if ( HeadSmplID[0].length() == 0 && Barcode[0].length() > 0 && Barcode2[0].length() > 0 ) { - bDoBarcode = true; bDoHeadSmplID = false; bDoBarcode2 = true; return; - } else if ( HeadSmplID[0].length() == 0 && Barcode[0].length() > 0 ) { - bDoBarcode = true; bDoHeadSmplID = false; bDoBarcode2 = false; return; - } else if ( HeadSmplID[0].length() == 0 && Barcode[0].length() == 0 ) { - //simply check if each filename is different - if (FastqF.size() > 0 ){ - for (uint i = 0; i 0){ - for (uint i = 0; i < FastaF.size(); i++){ - for (uint j = i + 1; j < FastaF.size(); j++){ - if (FastaF[i] == FastaF[j]){ - cerr << "File names " << i << " and " << j << " are equal - no identification by filename possible.\n Aborting..\n"; exit(55); - } - } - } - - bOneFileSample = true; bDoBarcode = true; bDoHeadSmplID = false; - return; - } - } - - bDoMultiplexing = false; - cerr << "No Barcode and no ID in header defined.. aborting\n"; - exit(53); - -} - - -void Filters::checkDoubleBarcode(){ - if (!bDoBarcode || bDoHeadSmplID || Barcode.size()==0){ return; } - vector> doubles(0); vector empty(2, 0); - if ( bDoBarcode2 ) { - if ( Barcode.size() != Barcode2.size() ) { - cerr << "Unequal Barcode vector sizes in dual barcoding controls. Exiting.."; exit(45); - } - for ( unsigned int i = 0; i < Barcode.size(); i++ ) { - for ( unsigned int j = i + 1; j < Barcode.size(); j++ ) { - if ( strcmp(Barcode[i].c_str(), Barcode[j].c_str()) == 0 && strcmp(Barcode2[i].c_str(), Barcode2[j].c_str()) == 0 ) { - empty[0] = i; empty[1] = j ; doubles.push_back(empty); - } - } - } - if (doubles.size() > 0){ - for (uint x = 0; x < doubles.size(); x++){ - int i = doubles[x][0]; int j = doubles[x][1]; - cerr << "Duplicate dual Barcode detected: Barcode1 " << i + 1 << " (" << Barcode[i] << ") and " << j + 1 << " (" << Barcode[j] << ") as well as Barcode1 " << i + 1 << " (" << Barcode2[i] << ") and " << j + 1 << " (" << Barcode2[j] << ") are equal.\n"; - } - exit(8); - } - } - else { - for ( unsigned int i = 0; i < Barcode.size(); i++ ) { - for ( unsigned int j = i + 1; j < Barcode.size(); j++ ) { - if ( strcmp(Barcode[i].c_str(), Barcode[j].c_str()) == 0 ) { - empty[0] = i; empty[1] = j; doubles.push_back(empty); - } - } - } - if (doubles.size() > 0){ - for (uint x = 0; x < doubles.size(); x++){ - int i = doubles[x][0]; int j = doubles[x][1]; - cerr << "Duplicate Barcode detected: Barcode " << i + 1 << " (" << Barcode[i] << ") and " << j + 1 << " (" << Barcode[j] << ") are equal.\n"; - } - exit(8); - } - - } -} -void Filters::checkDoubleSampleIDHead(){ - if (!bDoHeadSmplID){return;} - vector> doubles(0); vector empty(2, 0); - for (unsigned int i=0; i 0){ - for (uint x = 0; x < doubles.size(); x++){ - int i = doubles[x][0]; int j = doubles[x][1]; - cerr << "Duplicate Header2split detected: pattern " << i + 1 << " and " << j + 1 << " are equal.\n"; - } - exit(8); - } - -} - -void Filters::checDoubleSampleID(){ - vector> doubles(0); vector empty(2, 0); - for (unsigned int i = 0; i 0){ - for (uint x = 0; x < doubles.size(); x++){ - int i = doubles[x][0]; int j = doubles[x][1]; - cerr << "Duplicate SampleID detected: SampleID " << i + 1 << " and " << j + 1 << " are equal.\n"; - } - exit(8); - } - if (!bDoCombiSamples){ - return; - } - bDoCombiSamples = false; - if (SampleID_Combi.size() <= 1){ - return; - } - - string prevCSID = SampleID_Combi[0]; - for (unsigned int i = 1; i < SampleID_Combi.size(); i++){ - if (SampleID_Combi[i] != prevCSID){ - bDoCombiSamples = true; - } - if (SampleID_Combi[i] == ""){ - SampleID_Combi[i] = SampleID[i]; - } - } -} - -void Filters::BarcodePreStats(){ - MinTagLen=100000;MaxTagLen=0; - for (unsigned int i=0; iMaxTagLen) MaxTagLen= (unsigned int) Barcode[i].length(); - //initialize Barcodes - BCList[Barcode[i]] = i; - Barcode_len[i] = (int) Barcode[i].length(); - } - if (MinTagLen == MaxTagLen){ - bBarcodeSameSize = true; - } - MinTagLen2 = 100000; MaxTagLen2 = 0; - for ( unsigned int i = 0; i < Barcode2.size(); i++ ) { - //create index - BCList2[Barcode2[i]] = i; - if ( Barcode2[i].length()MaxTagLen2 ) MaxTagLen2 = (unsigned int)Barcode2[i].length(); - Barcode2_len[i] = (int)Barcode2[i].length(); - } - //unique_ptr dPDS = make_unique(Barcode, Barcode2) - dPDS = make_shared(Barcode, Barcode2); - dHDS = make_shared(hetPrimer[0], hetPrimer[1]); - //fix empty last column specifically for derepMinNum - if (derepMinNum.size() > 0) { - derepMinNum.resize(Barcode.size(), -1); - } - -} -void Filters::resetStats(){ - statAddition.reset(); - PreFiltP1->reset(); PreFiltP2->reset(); - dPDS->reset(); dHDS->reset(); - for (size_t i = 0; i < 2; i++) { - RepStat[i]->reset(); RepStatAddition[i]->reset(); colStats[i].reset(); - } - -} - -void Filters::failedStats2(shared_ptr d,int pair){ - int pa = max(pair, 0); - if (bDoMultiplexing){ - int idx = d->getBCnumber() - BCoffset; - if ( bOneFileSample ) { - colStats[pa].BarcodeDetectedFail[0]++; - } else if (idx >= 0) { - - -#ifdef DEBUG - if (pa < 0 || pa>1) { cerr << "Pair in failedStats2 set to:" << pa << endl; } - if (idx >= (int)colStats[pa].BarcodeDetectedFail.size()) { - cerr << "idx in failedStats2 too big:" << idx << endl; - } -#endif // DEBUG - colStats[pa].BarcodeDetectedFail[idx]++; - } - } - -} -void Filters::prepStats() { - float remSeqs = float(colStats[0].total - colStats[0].totalRejected); - RepStat[0]->calcSummaryStats(remSeqs, min_l, min_q); - RepStat[1]->calcSummaryStats(remSeqs, min_l, min_q); - if (bAdditionalOutput) { - remSeqs = float(statAddition.total - statAddition.totalRejected); - RepStatAddition[0]->calcSummaryStats(remSeqs, min_l, min_q); - RepStatAddition[1]->calcSummaryStats(remSeqs, min_l, min_q); - } - PreFiltP1->calcSummaryStats(1, min_l, min_q); - PreFiltP2->calcSummaryStats(1, min_l, min_q); -} - - -void Filters::addPrimerL(string segments, int cnt){ - int used = -1; - for (unsigned int i=0; iaddPrimerL(segments,cnt); - break; - case 3: // right primer - trim(segments); - transform(segments.begin(), segments.end(),segments.begin(), ::toupper); - this->addPrimerR(segments,cnt); - break; - - case 0: //ID - trim(segments); - SampleID[cnt] = segments; - break; - - case 1: //Barcode - trim(segments); - transform(segments.begin(), segments.end(), segments.begin(), ::toupper); - Barcode[cnt] = segments; - break; - case 11: //Barcode rev - trim(segments); - transform(segments.begin(), segments.end(), segments.begin(), ::toupper); - Barcode2[cnt] = segments; - break; - case 7: //sample id in head - trim(segments); - HeadSmplID[cnt] = segments; - break; - case 8://mid xtra fq - trim(segments); - MIDfqF.push_back(segments); - break; - case 9://combine samples - trim(segments); - SampleID_Combi[cnt] = segments; - break; - case 14://demultiplex num - if (segments.length() == 0) { - derepMinNum.push_back(-1); - }else if (!is_digits(segments)){ - cerr << "Wrong map entry \"" << segments << "\". For header derepMin only number can be used.\n"; exit(313); - } else { - int nint = atoi(segments.c_str()); - derepMinNum.push_back( nint); - } - break; - case 12://het primer fw - if (!doHetPrimerExplicit){break;} - hetPrimer[0][cnt] = segments; - case 13://het primer rv - if (!doHetPrimerExplicit){ break; } - hetPrimer[1][cnt] = segments; - } - - if (k==6 || k==5){//a qual pushback was "" (was empty); replace - if (QualF.size() == FastaF.size() && QualF.back()==""){ - string newQ = FastaF.back(); - size_t pos = newQ.find_last_of("."); - newQ = newQ.substr(0,pos); - newQ += string(".qual"); - QualF.back() = newQ; - } - } - -} - -void Filters::printHisto(ostream& give,int which, int set){ - bool p2stat = pairedSeq > 1 ; - - if (set == 0) { - vector colStats(RepStat[0]->get_rstat_Vmed(which)); - vector ra( RepStat[0]->getVrange(which) ); - - if (which == 1) { - give << "Qual\tFilterObs" << endl; - } else { - give << "Length\tFilterObs" << endl; - } - for (size_t i = ra[0]; i < ra[1] ; i++) { - give << i << "\t" << colStats[i] << endl; - } - } else if (set == 1) { - vector ra(2,0), tra; - vector stat; - vector skips(6, false); - vector> matHist; - if (which == 1) { give << "#Qual\t"; - } else { give << "#Length\t"; } - if (p2stat && b_doFilter) { give << "FilteredP1\tFilteredP2\t"; } - else if ( b_doFilter){ give << "Filtered\t"; skips[1] = true; } - else { skips[1] = true; skips[0] = true; } - - if ( bAdditionalOutput && b_doFilter ) { - if (p2stat) { give << "AddFilterP1\tAddFilterP2\t"; } - else { give << "AddFilter\t"; skips[3] = true; } - } else { - skips[2] = true; skips[3] = true; - } - if (p2stat) { give << "RawReadsP1\tRawReadsP2"; } - else { give << "RawReads"; skips[5] = true; } - give << endl; - ra = RepStat[0]->getVrange(which); - if (p2stat) { tra = RepStat[1]->getVrange(which); ra[0] = min(tra[0], ra[0]); ra[1] = max(tra[1], ra[1]); } - if (!skips[2]) { - tra = RepStatAddition[0]->getVrange(which); ra[0] = min(tra[0], ra[0]); ra[1] = max(tra[1], ra[1]); - if (p2stat) { tra = RepStatAddition[1]->getVrange(which); ra[0] = min(tra[0], ra[0]); ra[1] = max(tra[1], ra[1]); } - } - tra = PreFiltP1->getVrange(which); ra[0] = min(tra[0], ra[0]); ra[1] = max(tra[1], ra[1]); - if (p2stat) { tra = PreFiltP2->getVrange(which); ra[0] = min(tra[0], ra[0]); ra[1] = max(tra[1], ra[1]); } - vector empt(ra[1], 0); - matHist = vector>(6, empt); - for (size_t kk = 0; kk < 6; kk++) { - if (skips[kk]) { continue; } - switch (kk) { - case 0: stat = RepStat[0]->get_rstat_Vmed(which); break; - case 1: stat = RepStat[1]->get_rstat_Vmed(which); break; - case 2: stat = RepStatAddition[0]->get_rstat_Vmed(which); break; - case 3: stat = RepStatAddition[1]->get_rstat_Vmed(which); break; - case 4: stat = PreFiltP1->get_rstat_Vmed(which); break; - case 5: stat = PreFiltP2->get_rstat_Vmed(which); break; - } - for (size_t i = 0; i < stat.size(); i++) { - if (i>=ra[1]) {break;} - matHist[kk][i] = stat[i]; - } - } - for (size_t i = ra[0]; i < ra[1] ; i++) { - give << i ; - for (size_t kk = 0; kk < matHist.size(); kk++) { - if (skips[kk]) { continue; } - give << "\t" << matHist[kk][i]; - } - give << endl; - } - - } -} - -vector Filters::combiSmplConvergeVec(const vector& inNames){ - vector retV(inNames.size(), -1); - unordered_map smpl2combi; - unordered_map::iterator s2cIT; - int cntGrps(-1); - for (size_t i = 0; i < SampleID_Combi.size(); i++){ - s2cIT = combiMapCollectGrp.find(SampleID_Combi[i]); - if (s2cIT == combiMapCollectGrp.end()){ - cntGrps++; - combiMapCollectGrp[SampleID_Combi[i]] = cntGrps; - } - smpl2combi[SampleID[i]] = combiMapCollectGrp[SampleID_Combi[i]]; - } - for (size_t i = 0; i < inNames.size(); i++){ - s2cIT = smpl2combi.find(inNames[i]); - if (s2cIT == smpl2combi.end()){ - cerr << "Can't find SampleID " << inNames[i] << " in reference Sample Names\n"; exit(113); - } - retV[i] = s2cIT->second; - } - return retV; -} - - -string Filters::shortStats( const string & file) { - collectstats& cst = colStats[0]; - string ret(""); - if (file != ""){ - ret+= file + "\n"; - } - if (pairedSeq > 1) { - ret+= "Pair 1: "; - } - char buffer[50]; - float tmp = (100.f*float(cst.total - cst.totalRejected) / (float)cst.total); - sprintf(buffer, "%.3f%% of %d", tmp, cst.total); ret += buffer; - sprintf(buffer," reads accepted (%.3f%% end-trimmed)\n", (100.f* float(cst.total - cst.Trimmed) / (float)cst.total)); ret += buffer; - - if (pairedSeq > 1) { - collectstats& cst = colStats[1]; - sprintf(buffer,"Pair 2: %.3f%% of %d", (100.f*float(cst.total - cst.totalRejected) / (float)cst.total), cst.total); ret += buffer; - sprintf(buffer," reads accepted (%.3f%% end - trimmed)\n", (100.f* float(cst.total - cst.Trimmed) / (float)cst.total)); ret += buffer; - } - return ret; -} -void Filters::printGC(ostream& os,int Npair) { - os << "Subset\t\tOccurence\t\tAvg.Quality\n"; - os << "\tA\tT\tG\tC\tA\tT\tG\tC\n"; - os << "R1 pre-filter"; - PreFiltP1->printGCstats(os); - if ( Npair > 1 ) { - os << "R2 pre-filter"; - PreFiltP2->printGCstats(os); - } - if ( !b_doFilter ) {return;} - os << "R1 filtered"; - RepStat[0]->printGCstats(os); - if ( Npair > 1 ) { - os << "R2 filtered"; - RepStat[1]->printGCstats(os); - } -} -void Filters::write2Demulti(shared_ptr d, int p, int fqOvr) { - if (!this->Demulti2Fls()) { - return; - } - int idx = d->getBCnumber() - this->getBCoffset(); //correct for BC offset as well.. - if (idx < 0 || !d->isPassed()) { - return; - } - d->prepareWrite(fqOvr); - ofbufstream * tar = (demultiSinglFiles[idx][p]); - if (tar == NULL) { - ofstream tar2; - tar2.open(demultiSinglFilesF[idx][p].c_str(), ios::app); - d->writeFastQ(tar2,false); - tar2.close(); - } - else { - d->writeFastQ(*(tar),false); - } -} -void Filters::printStats(ostream& give, string file, string outf, bool main) { - //TODO switch min_l to min_l_add - collectstats& cst = colStats[0]; - collectstats& cst2 = colStats[1]; - if (cst.total != cst.total2) { - cerr << "Unequal read numbers recorded " << cst.total << "," << cst.total2 << endl; - } - if (!main) { - cst = statAddition; - } - bool p2stat = pairedSeq > 1 && main; - give << "sdm " << sdm_version << " " << sdm_status << endl; - if (file.length()>0){ - give<<"Input File: "< 0) { - give<<"Output File: "<1) { - give <<"Singletons among these: " << intwithcommas((int)cst.singleton) << "; " << intwithcommas((int)cst2.singleton) << endl; - } - give << "Bad Reads recovered with dereplication: " << intwithcommas((int)cst.DerepAddBadSeq) << endl; - - if ( bShortAmplicons ) { - give << "Short amplicon mode.\n"; - } - - if ( checkBC2ndRd() ) { - give << "Looked for switched read pairs (" << intwithcommas(revConstellationN) << " detected)" << endl; - } - if (main) { - RepStat[0]->printStats2(give, remSeqs,0); - RepStat[1]->printStats2(give, remSeqs,1); - } else { - RepStatAddition[0]->printStats2(give,remSeqs,0); - } - - give << "Trimmed due to:\n"; - //EWwidth, EWthr no stat for this so far - float dval = (float)EWthr; if (!main) { dval = (float)alt_EWthr; } - int Xval = EWwidth; - if (EWthr > 0) { - give << " > " << EWthr << " avg qual in " << Xval << " bp windows : " << spaceX(10 - digitsFlt(dval)) << intwithcommas(cst.QWinTrimmed); - if (p2stat) { give << "; " << intwithcommas((int)cst2.QWinTrimmed); } give << endl; - } - dval = (float)maxAccumQP; if (!main) { dval = (float)alt_maxAccumQP; } - if (maxAccumQP>0.0) { - give << " > (" << dval << ") acc. errors, trimmed seqs : " << spaceX(8 - digitsFlt(dval)) << intwithcommas((int)cst.AccErrTrimmed); - if (p2stat) { give << "; " << intwithcommas((int)cst2.AccErrTrimmed); } give << endl; - } - - give << "Rejected due to:\n"; - float val = (float)min_l; - if (val == -1.f) {val = min_l_p;} - if (!main){ val = (float)alt_min_l; } - - give << " < min Seq length (" << val << ") : " << spaceX(18 - digitsFlt(val)) << intwithcommas((int)cst.minL); - if (p2stat) { give << "; " << intwithcommas((int)cst2.minL); } give << endl; - if (cst.minLqualTrim>0){//this is failed because seq was too short after trimming - give << " -after Quality trimming : " << spaceX(10) << intwithcommas((int)cst.minLqualTrim); - if (p2stat) { give << "; " << intwithcommas((int)cst2.minLqualTrim); } give << endl; - } - float valf = min_q; if (!main){ valf = alt_min_q; } - give << " < avg Quality (" << valf << ") : " << spaceX(21 - digitsInt((int)min_q)) << intwithcommas((int)cst.AvgQual); - if (p2stat) { give << "; " << intwithcommas((int)cst2.AvgQual); } give << endl; - give << " < window (" << FQWwidth << " nt) avg. Quality (" << FQWthr << ") : " << spaceX(5 - digitsInt(FQWwidth)) << intwithcommas((int)cst.QualWin); - if (p2stat) { give << "; " << intwithcommas((int)cst2.QualWin); } give << endl; - give << " > max Seq length (" << max_l << ") : " << spaceX(18 - digitsInt(max_l)) << intwithcommas((int)cst.maxL); - if (p2stat) { give << "; " << intwithcommas((int)cst2.maxL); } give << endl; - give << " > (" << maxHomonucleotide << ") homo-nt run : " << spaceX(21 - digitsInt(maxHomonucleotide)) << intwithcommas((int)cst.HomoNT); - if (p2stat) { give << "; " << intwithcommas((int)cst2.HomoNT); } give << endl; - int val2 = MaxAmb; if (!main){ val2 = alt_MaxAmb; } - give << " > (" << val2 << ") amb. Bases : " << spaceX(22 - digitsInt(val2)) << intwithcommas((int)cst.MaxAmb); - if (p2stat) { give << "; " << intwithcommas((int)cst2.MaxAmb); } give << endl; - if (BinFilP >= 0.f){ - give << " > (" << BinFilErr << ") binomial est. errors : " << spaceX(13 - digitsFlt(BinFilErr)) << intwithcommas((int)cst.BinomialErr); - if (p2stat) { give << "; " << intwithcommas((int)cst2.BinomialErr); } give << endl; - } - if ((bDoAdapter && tAdapter != "") || (bDoMultiplexing || cst.PrimerFail > 0) || ((!main && alt_bRequireFwdPrim) || bRequireFwdPrim) - || bPrimerR) { - give << "Specific sequence searches:\n"; - } - if (bDoAdapter && tAdapter!= "") { - give << " -removed adapter (" << tAdapter << ") : " << spaceX(18 - (uint)tAdapter.length()) << intwithcommas((int)cst.adapterRem); - if (p2stat) { give << "; " << intwithcommas((int)cst2.adapterRem); } give << endl; - } - if ( (bDoMultiplexing || cst.PrimerFail>0) || ((!main && alt_bRequireFwdPrim) || bRequireFwdPrim) ){ - give << " -With fwd Primer remaining (<= " << PrimerErrs << " mismatches"; - if ((!main && alt_bRequireFwdPrim) || bRequireFwdPrim){ - give << ", required) : "; - give << spaceX(1 - digitsInt(PrimerErrs)); - } - else { - give <<") : "<< spaceX(11 - digitsInt(PrimerErrs)); - } - give << intwithcommas((int)cst.PrimerFail); - if ( p2stat ) { give << "; " << intwithcommas((int)cst2.PrimerFail) << endl; } - else { give << endl; } - } - if (bPrimerR){ - give<<" -With rev Primer remaining (<= "< 0 || doubleBarcodes())) { give << "; " << intwithcommas((int)cst2.TagFail); give << " (" << intwithcommas((int)cst.dblTagFail) << " pairs failed)"; } - give << endl; - - if (TagErrs>0){ - give << " -corrected barcodes: " << spaceX(18) << intwithcommas((int)cst.suc_correct_BC); - if (p2stat){give << "; " << intwithcommas((int)cst2.suc_correct_BC); } - give << endl; - //<< ", failed to correct barcode: " << spaceX(5 - digitsInt(FQWwidth)) << intwithcommas((int)cst.fail_correct_BC) << endl; - } - - if ( bDoBarcode2 ) { - give << " -used dual index barcodes"; - if ( BCdFWDREV[0].reversedBCs || BCdFWDREV[1].reversedBCs ) { - give << " (reversed "; - if ( BCdFWDREV[1].reversedBCs && BCdFWDREV[0].reversedBCs ) { - give << " fwd & rev"; - } else if ( BCdFWDREV[0].reversedBCs ) { - give << " fwd"; - } else if ( BCdFWDREV[1].reversedBCs ) { - give << " rev"; - } - give << " BCs)" << endl; - } - - } else if ( BCdFWDREV[0].reversedBCs ) { - give << " -reversed all barcodes" << endl; - } - give << endl << "SampleID"; - if (bDoCombiSamples){ - give << "\tSampleGroup"; - } - give << "\tBarcode"; - if ( bDoBarcode2 ) {give << "\tBarcode2";} - give << "\tInstances\n"; - for (unsigned int i =0; i 1 ; - give << std::setprecision(3); - give << "SampleID"; - if ( bDoCombiSamples ) { - give << "\tSampleGroup"; - } - if ( bDoHeadSmplID ) { - give << "\tSampleID"; - - } else { - if ( bDoBarcode2 ) { give << "\tBarcode\tBarcode2"; } - else { give << "\tBarcode"; } - } - if ( p2stat ) { - give << "\tRead1Accepted\tRead1Filtered\tRead1PassedFrac\tRead2Accepted\tRead2Filtered\tRead2PassedFrac\n"; - } else { - give << "\tReadsAccepted\tReadsFailed\tPassed%\n"; - } - for ( unsigned int i = 0; i 0 ) { - give << float(cst.BarcodeDetected[i]) / totSum << "\t"; - } else {give << "NA\t"; } - - give << cst2.BarcodeDetected[i] << "\t" << cst2.BarcodeDetectedFail[i] <<"\t" ; - totSum = (float(cst2.BarcodeDetected[i]) + float(cst2.BarcodeDetectedFail[i])); - if ( totSum > 0 ) { - give << float(cst2.BarcodeDetected[i]) / totSum << ""; - } else { give << "NA"; } - give< 0 ) { - give << float(cst.BarcodeDetected[i]) / totSum << ""; - } else { give << "NA"; } - give<< endl; - } - } - - - -} - -void ReportStats::calcSummaryStats(float remSeqs, unsigned int min_l, float min_q){ - if (remSeqs == 0){ return; } - if (bMedianCalcs){ - rstat_Smed = (int) calc_median(rstat_VSmed,0.5f); - rstat_Qmed = (int) calc_median(rstat_VQmed,0.5f); - USQS=0.f; - } - RSQS = ( ( (float(rstat_NTs)/remSeqs)/(float)min_l ) + - ( (float(rstat_qualSum)/remSeqs) / min_q) ) / 2.f; -} - -void Filters::addStats(shared_ptr fil, vector& idx){ - colStats[0].addStats(fil->colStats[0], idx); colStats[1].addStats(fil->colStats[1], idx); - RepStat[0]->addStats(fil->RepStat[0]); RepStat[1]->addStats(fil->RepStat[1]); - if ( bAdditionalOutput){ - statAddition.addStats(fil->statAddition,idx); - RepStatAddition[0]->addStats(fil->RepStatAddition[0]); - RepStatAddition[1]->addStats(fil->RepStatAddition[1]); - } - PreFiltP1->addStats(fil->PreFiltP1); - PreFiltP2->addStats(fil->PreFiltP2); - maxReadsPerOFile = fil->maxReadsPerOFile; - ReadsWritten = fil->ReadsWritten;//the idea here is to have a number of reads in CURRENT file, not total reads - OFileIncre = fil->OFileIncre; - revConstellationN += fil->revConstellationN; -} - -void collectstats::addStats(collectstats& cs, vector& idx){ - if (BarcodeDetected.size() > (uint)10000){ - cerr<<"Unrealistic number of barcodes (>10000) in addStats\n"; exit(79);} - int BCS = (int)BarcodeDetected.size(); - for (unsigned int i=0;i= BCS){ return; } - //assert(idx[i] < BCS); - BarcodeDetected[idx[i]] += cs.BarcodeDetected[i]; - BarcodeDetectedFail[idx[i]] += cs.BarcodeDetectedFail[i]; - } - maxL += cs.maxL; PrimerFail += cs.PrimerFail ; - AvgQual += cs.AvgQual; HomoNT += cs.HomoNT; - PrimerRevFail += cs.PrimerRevFail; - minL += cs.minL ; minLqualTrim+= cs.minLqualTrim; TagFail += cs.TagFail; - MaxAmb += cs.MaxAmb ; QualWin += cs.QualWin; - Trimmed += cs.Trimmed ; AccErrTrimmed+= cs.AccErrTrimmed; total += cs.total; - QWinTrimmed += cs.QWinTrimmed; - totalRejected += cs.totalRejected; - fail_correct_BC += cs.fail_correct_BC; suc_correct_BC += cs.suc_correct_BC ; - failedDNAread += cs.failedDNAread; adapterRem += cs.adapterRem ; - RevPrimFound += cs.RevPrimFound; - singleton += cs.singleton; - BinomialErr += cs.BinomialErr; - dblTagFail += cs.dblTagFail; - DerepAddBadSeq += cs.DerepAddBadSeq; - total2 += cs.total2; totalSuccess += cs.totalSuccess; -} -void collectstats::reset(){ - singleton=0; - size_t BCsiz = BarcodeDetected.size(); - for (unsigned int i=0; i d){ - //pretty fast - addMeanStats(d->length(),(int) d->getAvgQual(), (float)d->getAccumError()); - //NT specific quality scores - d->NTspecQualScores(QperNT, NTcounts); - //more memory intensive - if (bMedianCalcs){ - //quali - uint avq = (uint) (d->getAvgQual()+0.5f); - //if (avq >= 10000){cerr << d->getID() << "high q: "<< d->getAvgQual()<length() >= 10000){cerr << d->getID() << "high l: "<< d->length()<length() ,rstat_VSmed); - } -} -ReportStats::ReportStats(bool MedianDo): - bMedianCalcs(MedianDo),rstat_totReads(0),rstat_NTs(0),rstat_qualSum(0), - rstat_Qmed(0),rstat_Smed(0),RSQS(0.f),USQS(0.f),rstat_accumError(0.f), - QperNT(1000,0), NTcounts(1000,0), - rstat_VQmed(0), rstat_VSmed(0) -{} -void ReportStats::reset() { - rstat_totReads = 0; rstat_NTs = 0; rstat_qualSum=0; - rstat_Qmed = 0; rstat_Smed = 0; - RSQS = 0.f; USQS = 0.f; rstat_accumError = 0.f; - QperNT.resize(1000,0); NTcounts.resize(1000,0); - std::fill(QperNT.begin(), QperNT.end(), 0); - std::fill(NTcounts.begin(), NTcounts.end(), 0); - rstat_VQmed.resize(0); rstat_VSmed.resize(0); -} -unsigned int ReportStats::lowest(const vector& in){ - for (uint i=0;i<(uint)in.size();i++){ - if (in[i]>0){return i;} - } - return(0); -} -unsigned int ReportStats::highest(const vector& in){ - if (in.size()==0){return 0;} - for (uint i=(uint)in.size()-1;i>=0;i--){ - if (in[i]>0){return i;} - } - return 0; -} -void ReportStats::printGCstats(ostream& give) { - //NT_POS['A'] = 0; NT_POS['T'] = 1; NT_POS['G'] = 2; NT_POS['C'] = 3; NT_POS['N'] = 4; - vector NTs(6, "X"); NTs[0] = "A"; NTs[1] = "T"; - NTs[2] = "G"; NTs[3] = "C"; NTs[4] = "N"; - for ( uint i = 0; i < 4; i++ ) { - give << "\t" << NTcounts[i]; - } - //give << endl; - for ( uint i = 0; i < 4; i++ ) { - give << "\t" << float(QperNT[i]) / float(NTcounts[i]); - } - give << endl; -} -void ReportStats::printStats2(ostream& give, float remSeqs,int pair){ - if ( pair == 1 ) { - return;//deactivate for now - } - if ( pair == 0 ) { - if ( bMedianCalcs ) { - unsigned int minS = lowest(rstat_VSmed); - unsigned int maxS = highest(rstat_VSmed); - unsigned int minQ = lowest(rstat_VQmed); - unsigned int maxQ = highest(rstat_VQmed); - give << "Min/Avg/Max stats Pair 1";// -RSQS : "< fil){ - //report stats: - rstat_NTs += fil->rstat_NTs; rstat_totReads += fil->rstat_totReads; - rstat_qualSum += fil->rstat_qualSum; - rstat_accumError += fil->rstat_accumError; - for ( uint i = 0; i < 6; i++ ) { - QperNT[i] += fil->QperNT[i]; - NTcounts[i] += fil->NTcounts[i]; - } - if (bMedianCalcs){ - //vectors for median calcs - if (rstat_VQmed.size() < fil->rstat_VQmed.size()){ - rstat_VQmed.resize(fil->rstat_VQmed.size(),0); - assert(rstat_VQmed.size() < 10000); - } - for (unsigned int i=0; irstat_VQmed.size(); i++){ - rstat_VQmed[i] += fil->rstat_VQmed[i]; - } - if (rstat_VSmed.size() < fil->rstat_VSmed.size()){ - rstat_VSmed.resize(fil->rstat_VSmed.size(),0); - //times change.. wrong assert here - //assert(rstat_VSmed.size() < 10000); - } - for (unsigned int i=0; irstat_VSmed.size(); i++){ - rstat_VSmed[i] += fil->rstat_VSmed[i]; - } - } -} -//calculate median value from data stored as histogram-vector -// for median use perc = 0.5f -float ReportStats::calc_median(vector& in, float perc){ - unsigned int sum = 0; - for (unsigned int i=0; i= tar){ - return (float) i; - } - } - - return 0.f; -} -void ReportStats::add_median2histo(vector& in, vector& histo) -{ - unsigned int max = *max_element(in.begin(),in.end()); - if (max> histo.size()){ - if (max > 10000){cerr<<"max bigger 10000.\n"; exit(77);} - histo.resize(max,0); - } - for (unsigned int i=0; i ReportStats::getVrange(int which) { - if (which == 1) { - return medVrange(rstat_VQmed); - } else { - return medVrange(rstat_VSmed); - } -} -vector ReportStats::medVrange(const vector x) { - vector ret(2, 0); ret[1] = 0; bool empty = true; - for (size_t i = 0; i < x.size(); i++) { - if (x[i]>0) { - if (empty) { ret[0] = i; empty = false; } - ret[1] = i; - } - } - ret[1]++; - return ret; -} - -void ReportStats::add_median2histo(unsigned int in, vector& histo) -{ - if (in >= histo.size()){ - //int oldS=histo.size(); - histo.resize(in+3,0); - assert(in < 1e6); - //explicit ini - /*for (int i=oldS; i>::iterator iterator = unusedID.begin(); iterator != unusedID.end(); iterator++) { -// delete (*iterator).second; -// } - for (uint i = 0; i= 8 && upVer < 9) { - UPARSE8up = true; UpUcFnd = true; - } else if (upVer >= 9 && upVer <11) { - UPARSE8up = true; UPARSE9up = true; UpUcFnd = true; - } else if (upVer >= 11) { - UPARSE8up = true; UPARSE9up = true; UPARSE11up = true; UpUcFnd = true; - otuTerm = "otu"; - } - } -} - - -void UClinks::readDerepInfo(string dereM) { - int cnt(0); - mapdere.open(dereM.c_str(), ios::in); - if (!mapdere) { - cerr << "Can't open " << dereM << ". \nAborting\n"; exit(54); - } - b_derepAvailable = true; - string line(""); - //only read header - while (!safeGetline(mapdere, line).eof()) { - // while(getline(in,line,'\n')) { - if (line.length() < 3) { continue; } - string segments; - stringstream ss; - ss << line; - int tbcnt = -1; - - //cmdArgs["-i_MID_fastq"] - while (getline(ss, segments, '\t')) { - tbcnt++; - if (cnt == 0) { //search for header - if (tbcnt == 0) { - if (segments != "#SMPLS") { cerr << "First line in dereplicate map has to start with #SMPLS, corrupted file.\n" << dereM << endl; exit(98); } - continue; - } - vector spl = header_string_split(segments, ":"); - SmplIDs[spl[1]] = stoi(spl[0]); - } else {//actual derep info per sorted line - cerr << "wrong mapping reading\n"; exit(64); - } - } - if (cnt == 0) { - OTUmat.resize(SmplIDs.size(), vector(clusCnt + 1, 0)); - } - break; - } - -} - -//read in dereplicated info feom derp.map -void UClinks::oneDerepLine(shared_ptr d) { - if (MAPread){ - return; - } - string line(""); - while (!safeGetline(mapdere, line).eof()) { - //if (line.length() < 3) { continue; } - string segments; - //stringstream ss; - //string head(""); - //ss << line; - int tbcnt = -1; - int curCnt(0),curCnt2(0); - size_t strpos(line.find_first_of('\t')), lastpos(0); - while (strpos != string::npos){ - //while (getline(ss, segments, '\t')) { - segments = line.substr(lastpos, strpos - lastpos); - lastpos = strpos+1; - strpos = line.find_first_of('\t', lastpos); - tbcnt++; - if (tbcnt == 0) { - if (!d->sameHead(segments)) { - cerr << segments << " is not " << d->getID() << endl; - } - size_t idx = segments.find(";size="); - if (idx != string::npos) { - curCnt = atoi(segments.substr(idx + 6, segments.find(";", idx + 5) - (idx + 6)).c_str()); - //curCnt - } - continue; - } - size_t spl = segments.find(":"); int occur = stoi(segments.substr(spl+1)); - d->setOccurence(stoi(segments.substr(0,spl)), occur); - curCnt2 += occur; - //string tmp = segments.substr(0, spl); - } - //last round - segments = line.substr(lastpos); - size_t spl = segments.find(":"); int occur = stoi(segments.substr(spl + 1)); - d->setOccurence(stoi(segments.substr(0, spl)), occur); - curCnt2 += occur; - - if (curCnt2 != curCnt) { - cerr << "ERROR: Mapping file unique abundance reconstruction: failed to find correct count (" << curCnt2 << " vs " << curCnt<<"):\n" << line << endl; - exit(67); - } - break; - } - - if (mapdere.eof()) {// end of file - mapdere.close(); - MAPread = true; - } -} - -//finishes reading the map, loading the sequences as DNAunique into mem (no DNA, mind) -void UClinks::finishMAPfile(){ - if (MAPread){ - return; - } - string line(""); - bool hardAdd = false; - - //go through ucl file line by line - while (!safeGetline(mapdere, line).eof()) { - string segments; - int tbcnt = -1; - int curCnt(0), curCnt2(0); - size_t strpos(line.find_first_of('\t')), lastpos(strpos + 1); - segments = line.substr(0, strpos); - //1st setup empty DNA for each remaining line - shared_ptr d = make_shared("", segments); - d->seal(); - size_t idx = segments.find(";size="); - if (idx != string::npos) { - curCnt = atoi(segments.substr(idx + 6, segments.find(";", idx + 5) - (idx + 6)).c_str()); - } - strpos = line.find_first_of('\t', lastpos); - - while (strpos != string::npos){ - segments = line.substr(lastpos, strpos - lastpos); - lastpos = strpos + 1; - strpos = line.find_first_of('\t', lastpos); - tbcnt++; - size_t spl = segments.find(":"); int occur = stoi(segments.substr(spl + 1)); - d->setOccurence(stoi(segments.substr(0, spl)), occur); - curCnt2 += occur; - //string tmp = segments.substr(0, spl); - } - //last round - segments = line.substr(lastpos); - size_t spl = segments.find(":"); int occur = stoi(segments.substr(spl + 1)); - d->setOccurence(stoi(segments.substr(0, spl)), occur); - curCnt2 += occur; - - if (curCnt2 != curCnt) { - cerr << "ERROR: Mapping file unique abundance reconstruction: failed to find correct count (" << curCnt2 << " vs " << curCnt << "):\n" << line << endl; - exit(67); - } - - //add d to oldDNA - string curID = d->getID(); - curID = curID.substr(0, curID.find_first_of(' ')); - unusedID[curID] = DNAunusedPos; - if (hardAdd){ - oldDNA.push_back(d); oldDNAid.push_back(curID); - DNAunusedPos++; - } else { - oldDNA[DNAunusedPos] = d; - oldDNAid[DNAunusedPos] = curID; - DNAunusedPos++; - if (DNAunusedPos >= maxOldDNAvec){ - hardAdd = true; - } - } - } - - if (mapdere.eof()) {// end of file - mapdere.close(); - MAPread = true; - } - oldDNA2.resize(oldDNA.size(), NULL); - -} - -//functions selects optimal read to represent OTU -void UClinks::findSeq2UCinstruction(shared_ptr IS, bool readFQ, - shared_ptr fil){ - if (UCread){return;} - - if (IS->hasMIDseqs()){ - CurSetPair = 0; - } else { - CurSetPair = -1; - } -#ifdef DEBUG - cerr << "UC seed extension" << endl; -#endif - - DNAidmapsIT fndDNAinOld; - std::list::iterator listIT; - shared_ptr match(NULL); shared_ptr match2(NULL); - bool cont(true),cont2(true); - string segs(""); - string segs2; - float perID; - vector curCLID(0,0); - bool sync(false); // syncing of 2 read pairs; not implemented for this function yet - - while ( getUCFlineInfo(segs, segs2, perID, curCLID, !b_derepAvailable) ) { - //int subcnt = 0; - if ( curCLID.size() == 0 ) { continue; } - if ( uclInOldDNA(segs, curCLID, perID, fil) ) { - curCLID.resize(0); - continue; - } - - while(cont){ - - shared_ptr tmpDNA = IS->getDNA(cont, 0, sync); - if (tmpDNA == NULL) { break; }//signal that at end of file - match.reset( new DNAunique(tmpDNA, -1)); - //delete tmpDNA; - oneDerepLine(match); - match2 = IS->getDNA(cont2, 1, sync); - string curID = match->getID(); - curID = curID.substr(0,curID.find_first_of(' ')); - //check if tdn1 a) matches ID b) is better - if (curID != segs){ - //block to store unused DNA & find this id in this block - unusedID[curID] = DNAunusedPos; - if (oldDNAid[DNAunusedPos]!=""){//delete old DNA at position - fndDNAinOld = unusedID.find(oldDNAid[DNAunusedPos]); unusedID.erase(fndDNAinOld); - //delete oldDNA[DNAunusedPos]; delete oldDNA2[DNAunusedPos]; - } - oldDNA[DNAunusedPos] = match; oldDNA2[DNAunusedPos] = match2; - oldDNAid[DNAunusedPos] = curID; - DNAunusedPos++; - if (DNAunusedPos>= maxOldDNAvec){ DNAunusedPos=0; } - } else { - //assign % identity score to DNA object -#ifdef DEBUG - cerr << "UC Hit"; -#endif - match->setTempFloat(perID); - besterDNA(curCLID, match, match2, fil); - curCLID.resize(0); - break; - } - - //if (tdn!=NULL && ch1 != tdn->isPassed()){cerr<<"isPassed is != ch1! Aborting..\n";exit(12);} - } - } -#ifdef DEBUG - cerr << "UC seed initialized" << endl; -#endif - -} - -void UClinks::finishUCfile(shared_ptr fil, string addUC, bool bSmplHd){ - if (UCread){ - addUCdo(addUC,bSmplHd); - UCread = true; - return; - } - string segs; - string segs2; - float perID(0.f); - vector curCLID(0); - - while ( getUCFlineInfo(segs, segs2, perID, curCLID, !b_derepAvailable) ) { - if ( uclInOldDNA(segs, curCLID, perID, fil) ) { - curCLID.resize(0); - continue; - } - cerr << segs << " "; - } - - addUCdo(addUC, bSmplHd); - UCread = true; -} -void UClinks::addUCdo(string addUC,bool SmplHd) { - if (addUC == "") { - return; - } - string segs; - string segs2; - float perID; - vector curCLID(0); - ucf.open(addUC.c_str(), ios::in); - if (!ucf) { - UCread = true; - cerr << "Could not find additional uc file: " << addUC << endl; - } else { - std::cerr << "Reading " << addUC << endl; - } - UCread = false; - if (SmplHd){ - while (getUCFlineInfo(segs, segs2, perID, curCLID, SmplHd)) { curCLID.resize(0); } - }else {//complicated.. - int cnt(0); - while (getUCFlineInfo(segs, segs2, perID, curCLID, SmplHd)) { - //find segs in remaining dereps - cnt++; //if (cnt < 61403) { continue; } - if (curCLID.size() == 0) { continue; } - if (uclInOldDNA_simple(segs, curCLID)) { - curCLID.resize(0); - continue; - } - - curCLID.resize(0); - } - } - -} - -//used for specific situation where only empty DNA string was read with derep info attached -bool UClinks::uclInOldDNA_simple(const string& segs,const vector& curCLID) { - //check if DNA is in group of unmatched DNA's ? - if (segs == ""){ return false; }//empty ucl line - DNAidmapsIT unusedIT = unusedID.find(segs); - if (unusedIT != unusedID.end()){// found something - int mID = (*unusedIT).second; - if (mID >= (int)oldDNA.size()) { cerr << "SEC MID too high\n"; exit(55); } - if (oldDNA[mID] == NULL){ return false; } - //give sequence a chance to be selected - matrixUnit matchSiz = (matrixUnit)curCLID.size(); - for (int k = 0; k < matchSiz; k++) { - add2OTUmat(oldDNA[mID], curCLID[k], matchSiz); - } - //was matched once to an OTU seed. Even if several later matches, doesn't mater - delete - - unusedID.erase(unusedIT); -// delete oldDNA[mID]; if (oldDNA2[mID] != NULL){ delete oldDNA2[mID]; } - - oldDNA[mID] = NULL; - oldDNA2[mID] = NULL; - oldDNAid[mID] = ""; - return true; - } - return false; -} -bool UClinks::uclInOldDNA(const string& segs,const vector& curCLID, float perID, - shared_ptr fil) { - //check if DNA is in group of unmatched DNA's ? - if (segs == ""){ return false; }//empty ucl line - DNAidmapsIT unusedIT = unusedID.find(segs); - if (unusedIT != unusedID.end()){// found something - //shared_ptr fndDNA = (*unusedIT).second; - //fndDNA->setTempFloat(perID); - //besterDNA(curCLID, fndDNA, fil); - //unusedID.erase(unusedIT); - - int mID = (*unusedIT).second; - //give sequence a chance to be selected - oldDNA[mID]->setTempFloat(perID); - besterDNA(curCLID, oldDNA[mID], oldDNA2[mID], fil); - //remove all trace - unusedID.erase(unusedIT); - oldDNA[mID] = NULL; - oldDNA2[mID] = NULL; - oldDNAid[mID] = ""; - return true; - } - return false; -} - -bool UClinks::getUCFlineInfo(string& segs, string& segs2,float& perID, - vector& curCLID, bool addFromHDstring) { - //reads UC file line by line - //can also be used to delineate UC's - - - if (UCread){return false;} - //close all file streams - if (ucf.eof()){ - UCread=true; - ucf.close(); - return false; - } - string line; - std::unordered_map::iterator itCL; - while (getline(ucf, line, '\n')) { - // cerr<= 3) { - UPARSE9up = true; - cerr << "Switching to Uparse 10+ style map file.\n"; - } - } - - - UpUcFnd = true; - } - - stringstream ss; - ss << line; - bool chimera = false; - vectortarsV; //saves hits to OTUs - //2 ways to get to a) hit info b) query & otu - if (!UPARSE8up){ - if ( (line.substr(0, 1) != "H")) { - continue; - } - - for (uint i = 0; i < 4; i++){//jump to pos X - getline(ss, segs, '\t'); - } - perID = (float)atof(segs.c_str()); - for (uint i = 0; i < 5; i++){//jump to pos X - getline(ss, segs, '\t'); - } - getline(ss, segs2, '\t'); - - - - } else if (!UPARSE9up){ // - //query first entry - string tmp; - getline(ss, segs, '\t');//0 - getline(ss, tmp, '\t');//1 - //should be "match" - if ( tmp == "chimera") { - if ( !doChimeraCnt ) {continue;} - chimera = true; }// segs = ""; continue;} - if (tmp == "otu"){ - segs2 = segs; - perID = 100.f; - } else { - getline(ss, tmp, '\t');//2 - perID = (float)atof(tmp.c_str()); - //indicator if hit - //OTU last entry - getline(ss, segs2, '\t');//3 - getline(ss, segs2, '\t');//4 - } - } else { //UP9, uparse 10 - //query first entry - string tmp; - getline(ss, segs, '\t');//0 - getline(ss, tmp, '\t');//1 - //should be "match" - if (tmp == "chimera" ) { - continue; - } else if ( tmp == "noisy_chimera" || tmp == "good_chimera") { //tmp == "perfect_chimera" || - if (!doChimeraCnt) { continue; } - chimera = true; - }else if (tmp == "perfect_chimera") { - removeSizeStr(segs); - perfectChims.insert(segs); - continue; - }// segs = ""; continue;} - if (tmp.substr(0,3) == otuTerm ){ - segs2 = segs; - perID = 100.f; - } - else {//match or perfect_chimera case - getline(ss, tmp, '\t');//2 - //dqt=1;top=GZV0ATA01ANJXZ;size=14;(99.6%); - size_t p1(tmp.find("top=")+4);//4 - size_t p2(tmp.find(";(", p1)+2); - size_t p3(tmp.find("%);", p2)); - segs2 = tmp.substr(p1, p2 - p1-1); - //string xx = tmp.substr(p2, p3 - p2); - perID = (float)atof(tmp.substr(p2,p3-p2).c_str()); - if (false && chimera) {//just use up9 top hit -// p1(tmp.find(";top=") + 5,p2); -// p2(tmp.find(";(", p1) + 2); - - } - } - } - //remove spaces - segs = segs.substr(0,segs.find_first_of(' ')); - //also remove sample identifier in string - string smplID = ""; - removeSampleID(segs, SEP, smplID); - - if ( chimera && UPARSE8up) { - tarsV = splitByComma(segs2, false, '+'); - for ( uint kk = 0; kk < tarsV.size(); kk++ ) { - tarsV[kk] = tarsV[kk].substr(0,tarsV[kk].find_last_of("(")); - } - } else if (!chimera){ - tarsV.push_back(segs2); - } - matrixUnit splChim = (matrixUnit)tarsV.size(); - curCLID.resize(tarsV.size()); - - for ( uint kk = 0; kk < tarsV.size(); kk++ ) { - //goes over all chimeric hits - string oriClKey (tarsV[kk]); - segs2 = oriClKey; - //remove ;size= argument - removeSizeStr(segs2); - removeSampleID(segs2, SEP); - //identifiers are ready, no identify first which cluster this is - //curCLID = -1; - - //find the cluster in the list of registered clusters - itCL = seq2CI.find(segs2); - - if ( itCL == seq2CI.end() ) {//new OTU? - if (OTUnumFixed) { - //cerr << "XX\n"; - if (perfectChims.find(segs2) == perfectChims.end()) { - cerr << "Unkown OTU entry found:" << segs2 << endl; - } - } else { - //not found in known clusters.. create entry - bestDNA.push_back(NULL); - bestDNA2.push_back(NULL); - oriKey.push_back(oriClKey); - bestPID.push_back(0.f); - bestLEN.push_back(0); - clusCnt = (int)bestDNA.size() - 1; - curCLID[kk] = clusCnt; - seq2CI[segs2] = clusCnt; - } - } else {//cluster exists - curCLID[kk] = ((*itCL).second); - } -#ifdef matrix_sum - if ( addFromHDstring ) { - //cerr << segs << "\t" << segs2 << endl; - add2OTUmat(smplID, (*itCL).second, splChim); - } -#endif - } - return true; - } - return true; -} -void UClinks::writeOTUmatrix(string outf,shared_ptr fil) { - cerr << "Writing OTU matrix to " << outf << endl; - this->setOTUnms(); - ofstream MA; - MA.open(outf); - //first write all sample IDs - std::unordered_map::iterator OTUid; - MA << "OTU"; - matrixUnit totalCnts = 0; - /*const vector& smpls_t = fil->SampleID; - - if (!unregistered_samples) { - smpls = smpls_t; - } else {*/ - vector smpls(SmplIDs.size(),""); - unordered_map::iterator smplNit; - for (smplNit = SmplIDs.begin(); smplNit != SmplIDs.end(); smplNit++) { - smpls[smplNit->second] = smplNit->first; - } - //} - for (size_t i = 0; i < smpls.size(); i++) { - MA<<"\t"<second]; - int rowI = OTUid->second; - for (size_t i = 0; i < smpls.size(); i++) { -// for (smplNit = SmplIDs.begin(); smplNit != SmplIDs.end(); smplNit++) { - matrixUnit val = OTUmat[ SmplIDs[smpls[i]] ][rowI]; - totalCnts += val; - //char num[24];sprintf(num, "\t%.2f", val); - //MA << num; - if ( isnan(val) ) { - MA << "\t0"; - } else { - MA << "\t"<::iterator OTUid; - for (OTUid = seq2CI.begin(); OTUid != seq2CI.end(); OTUid++) { - string newlySetID = "OTU_" + itos(cnt); - /*if (newlySetID == "OTU_7711"){ - int x = 0; cerr << "7711 ID " << OTUid->second; if (bestDNA[OTUid->second] == NULL){ cerr << "no DNA\n"; } - else { cerr << "YES\n"; } - }*/ - oriKey[OTUid->second] = newlySetID; - cnt++; - } - -} -void UClinks::add2OTUmat(const string& smplID, int curCLID, matrixUnit spl) { - - std::unordered_map::iterator smplNit = SmplIDs.find(smplID); - if (smplNit == SmplIDs.end()) {//create entry & expand matrix - SmplIDs[smplID] = (int) OTUmat.size(); - OTUmat.push_back(vector(clusCnt + 1, (matrixUnit)0)); - smplNit = SmplIDs.find(smplID); - cerr << "New Sample ID in uc file detected, that is not present in map: " << smplID<< endl; - unregistered_samples = true; - } - //easy, now add in - if ( spl <1 ) { spl = 1; } - OTUmat[(*smplNit).second][curCLID] += (matrixUnit)1 / spl; - -} -void UClinks::add2OTUmat(shared_ptr d, int curCLID, matrixUnit rep) { - if (d == NULL) { cerr << " add2OTUmat::d is NULL\n"; exit(85); } - unordered_map map = d->getDerepMap(); - if ( rep <1 ) { rep = 1; } - for (auto iterator = map.begin(); iterator != map.end(); iterator++) { - OTUmat[iterator->first][curCLID] += (matrixUnit)iterator->second / rep; - } -} - -void UClinks::setupDefSeeds(shared_ptr FA, shared_ptr fil) { - bool contRead = true; bool sync(false); - while (contRead) { - shared_ptr tmpDNA = FA->getDNA(contRead, 0,sync); - if (tmpDNA == NULL) { break; } - shared_ptr tmp = make_shared(tmpDNA, -1); -// delete tmpDNA; - //second pair - //shared_ptr tmp2 = FA->getDNA(contRead, 1); - string segs2 = tmp->getID(); - string oriClKey = segs2; - //remove ;size= argument - size_t idx = segs2.find(";size="); - segs2 = segs2.substr(0, idx); - //also remove sample identifier in string - removeSampleID(segs2, SEP); - - //cluster should not exist, test - std::unordered_map::iterator itCL; - itCL = seq2CI.find(segs2); - - if (itCL == seq2CI.end()) { - //not found in known clusters.. create entry - bestDNA.push_back(tmp); - bestDNA2.push_back(NULL); - oriKey.push_back(oriClKey); - bestPID.push_back(100.f); - bestLEN.push_back(tmp->length()); - clusCnt = (int)bestDNA.size() - 1; - seq2CI[segs2] = clusCnt; - } else {//cluster exists - cerr << "Found double ID " << segs2 << endl << "Aborting.." << endl; - exit(74); - } - } - //std::map::iterator smplNit; - if (derepMapFile != "") { - readDerepInfo(derepMapFile); - } else { - //TODO: fix this smplnew - const vector& smpls = fil->SampleID; - for (uint i = 0; i < smpls.size(); i++) { - SmplIDs[smpls[i]] = (int)OTUmat.size(); - if (i != OTUmat.size()) { cerr << "Err in setupDefSeeds\n"; exit(453); } - OTUmat.push_back(vector(clusCnt + 1, (matrixUnit)0)); - } - } -} - -void UClinks::addDefSeeds(shared_ptr FA, shared_ptr fil) { - bool contRead = true; - int addCnt = 0; bool sync(false); - while (contRead) { - shared_ptr tmpDNA = FA->getDNA(contRead, 0, sync); - if (tmpDNA == NULL) { break; } - shared_ptr tmp = make_shared(tmpDNA, -1); - //delete tmpDNA; - //second pair - //shared_ptr tmp2 = FA->getDNA(contRead, 1); - string segs2 = tmp->getID(); - string oriClKey = segs2; - //remove ;size= argument - size_t idx = segs2.find(";size="); - segs2 = segs2.substr(0, idx); - //also remove sample identifier in string - removeSampleID(segs2, SEP); - - //cluster should not exist, test - std::unordered_map::iterator itCL; - itCL = seq2CI.find(segs2); - - if (itCL == seq2CI.end()) { - //not found in known clusters.. create entry - bestDNA.push_back(tmp); - bestDNA2.push_back(NULL); - oriKey.push_back(oriClKey); - bestPID.push_back(100.f); - bestLEN.push_back(tmp->length()); - clusCnt = (int)bestDNA.size() - 1; - seq2CI[segs2] = clusCnt; - } - else {//cluster exists - cerr << "Found double ID " << segs2 << endl << "Aborting.." << endl; - exit(74); - } - addCnt++; - } - int siz = (int)bestDNA.size(); - //resize complete matrix to take these up - - for (uint i = 0; i < OTUmat.size(); i++){ - OTUmat[i].resize(siz, (matrixUnit)0); - } -} - -void UClinks::besterDNA(const vector curCLIDpre, shared_ptr tdn1, shared_ptr tdn2, shared_ptr fil) { - bool checkBC = true; - int TagIdx(-2); - if (tdn2 != NULL) {//fix for pairs assuming midSeqs - checkBC = false; //TagIdx = 0; - } - matrixUnit matchSiz = (matrixUnit)curCLIDpre.size(); - if ( matchSiz>1 ) { - for ( int k = 0; k < matchSiz; k++ ) { - add2OTUmat(tdn1, curCLIDpre[k], matchSiz); - } - return; - } - int curCLID = (int) curCLIDpre[0]; - add2OTUmat(tdn1, curCLID,1); - if (SeedsAreWritten){ - //delete tdn1; if (tdn2 != NULL){ delete tdn2; } - return; - } - //general routine, matching tdn found - if (bestDNA[curCLID] == NULL) { //just fill with current DNA - if (tdn2 == NULL) { - if ( fil->doReversePrimers() && !fil->check(tdn1, true, CurSetPair, TagIdx) ) { - return;//delete tdn1; - } - bestDNA[curCLID] = tdn1; - bestPID[curCLID] = tdn1->getTempFloat(); - bestLEN[curCLID] = tdn1->length(); - } else { - if (fil->doReversePrimers() && !fil->check(tdn1, true, CurSetPair, TagIdx)) { - return;//delete tdn2; delete tdn1; - } - bestDNA[curCLID] = tdn1; - bestPID[curCLID] = tdn1->getTempFloat(); - bestLEN[curCLID] = tdn1->length() + tdn2->length(); - fil->check(tdn2, true, 1, TagIdx); - bestDNA2[curCLID] = tdn2; - } - }//already a candidate sequence? check who is better.. - else if ( - fil->betterSeed(tdn1, tdn2, bestDNA[curCLID], bestDNA2[curCLID], bestPID[curCLID], bestLEN[curCLID], CurSetPair, checkBC) - ){ -// delete bestDNA[curCLID]; - bestDNA[curCLID] = tdn1; - if (tdn2 != NULL) { -// if (bestDNA2[curCLID] != NULL) {delete bestDNA2[curCLID]; } - bestDNA2[curCLID] = tdn2; - } - if (bestPID[curCLID] < tdn1->getTempFloat()){ - bestPID[curCLID] = tdn1->getTempFloat(); - } - uint curL = tdn1->length(); - if (tdn2 != NULL) { curL += tdn2->length(); } - if (bestLEN[curCLID] < curL){ - bestLEN[curCLID] = curL; - } - } else { -// delete tdn1;delete tdn2; - } -} - -void UClinks::removeSampleID(string& w, const string &SEP) { - size_t pos = w.find(SEP); - if (pos != std::string::npos) { - w = w.substr(pos + SEP.length()); - } -} -void UClinks::removeSampleID(string& w, const string &SEP, string & SMplID) { - size_t pos = w.find(SEP); - if (pos != std::string::npos) { - SMplID = w.substr(0,pos); - w = w.substr(pos + SEP.length()); - } -} -void UClinks::removeSizeStr(string& w) { - size_t idx = w.find(";size="); - /*#ifdef matrix_sum //not required, sanity check that is not really working out with usearch mappings - int OTUsize = atoi( segs2.substr(idx+6).substr(0,-1).c_str() ); - #endif*/ - w = w.substr(0, idx); -} - -void UClinks::writeNewSeeds(shared_ptr MD, shared_ptr fil, bool refSeeds, bool printLnk) { - if (!RefDBmode && refSeeds){ return; } - //ofstream O(outf.c_str()); - int paired = fil->isPaired(); - //MD->printStorage(); - ofstream links; - uint st (0), to ((uint) oriKey.size()); - //in case of refDB mode, need to start from point where refDBs were added in.. - if (refSeeds && RefDBmode && RefDBotuStart >= 0){ - cerr << "Writing ref DB sequences.."; - st = RefDBotuStart; - //and also write a link between OTU_name and refSeq - if (printLnk){ - string refLinkF = MD->leadOutFile() + ".lnks"; - links.open(refLinkF.c_str(), ios::out); - } - }else if (!refSeeds && RefDBmode && RefDBotuStart >= 0){ - cerr << "Writing new OTU seeds.."; - to = RefDBotuStart; - } - shared_ptr d; - for (uint i = st; i < to; i++) { - //if (i == 6628){ int x = 0; } - //if check DNA was done before (remove rev primer), do it now - if (bestDNA[i] == NULL) { - MD->writeAllStoredDNA(); - cerr << "No seed sequence found for DNA " << oriKey[i] << ". Aborting..\n"; // " << bestDNA[i]->getOldID()<<"( - //continue; - exit(54); - } - //fil->check(bestDNA[i],true); - string newH = oriKey[i]; - d = bestDNA[i]; - if (printLnk){ - string oriH = d->getIDshort(); removeSizeStr(oriH); - links << newH << "\t" << oriH << endl; - } - //replace ID to allow for later linkup with cluster - if (paired == 2) { - d->setNewID(newH + ".1"); - d->setPassed(true); - if (bestDNA2[i] != NULL) { - MD->saveForWrite(d, 1); - bestDNA2[i]->setPassed(true); - bestDNA2[i]->setNewID(newH + ".2"); - MD->saveForWrite(bestDNA2[i], 2); - } else { - MD->saveForWrite(d, 3); - } - } else { - d->setNewID(newH); - d->setPassed(true); - MD->saveForWrite(d, 1); - } - } - MD->writeAllStoredDNA(); - SeedsAreWritten = true; - if (printLnk){ links.close(); } - cerr << "Done" << endl; -} -void UClinks::printStats(ostream& os){ - if (oriKey.size() == 0) { - os << "No OTU's to re-seed\n"; - return; - } - uint numCls = 0; - float avgQ(0.f), minQ(10000.f), maxQ(0.f), - avgA(0.f), minA(10000.f), maxA(0.f), - avgS(0.f), minS(10000.f), maxS(0.f); - uint avgL = 0, minL(10000), maxL(0); - - //uint div(0);// = (uint)oriKey.size(); - vector lengths; - vector quals,accums,sims; - //oriKey.size() - not if there are only refs - uint to((uint)oriKey.size()); - if (RefDBmode && RefDBotuStart >= 0){ - to = RefDBotuStart; - } - - for (uint i = 0; i < to; i++){ - if (bestDNA[i] == NULL){ continue; } - float curQ = bestDNA[i]->getAvgQual(); - if (bestDNA2[i] != NULL) { - curQ += bestDNA2[i]->getAvgQual(); curQ /= 2.f; - } - uint curL = (uint) bestDNA[i]->length(); - if (bestDNA2[i] != NULL) { - curL += bestDNA2[i]->length(); - } - - if (curL < minL){ minL = curL; } - if (curL > maxL){ maxL = curL; } - avgL += curL; - lengths.push_back(curL); - - if (curQ < 1) {//no new Seed found, default seed - continue; - } - float sc = bestDNA[i]->getTempFloat(); - - if (curQ < minQ){ minQ = curQ; } - if (curQ > maxQ){ maxQ = curQ; } - avgQ += curQ; - - float curA = (float)bestDNA[i]->getAccumError(); - if (bestDNA2[i] != NULL) { - curA += (float) bestDNA2[i]->getAccumError(); - } - - quals.push_back(curQ); - accums.push_back(curA); - if (curA < minA){ minA = curA; } - if (curA > maxA){ maxA = curA; } - avgA += curA; - - sims.push_back(sc); - if (sc < minS){ minS = sc; } - if (sc > maxS){ maxS = sc; } - avgS += sc; - numCls++; - } - os << "Found " << numCls << " seeds of " << oriKey.size() << " OTU's in " << uclines << " mappings." << endl; - - //sort vectors - std::sort(lengths.begin(), lengths.end()); - std::sort(quals.begin(), quals.end()); - std::sort(accums.begin(), accums.end()); - std::sort(sims.begin(), sims.end()); - os << "Stats of Seed sequences (0th/10th/50th/90th/100th) percentile:\n"; - if (lengths.size() > 0) { os << "\n - Seq Length : " << minL << "/" << calc_median2(lengths, 0.1f) << "/" << calc_median2(lengths, 0.5f) << "/" << calc_median2(lengths, 0.9f) << "/" << maxL; } - if (quals.size() > 0) { os << "\n - Quality : " << minQ << "/" << calc_median2(quals, 0.1f) << "/" << calc_median2(quals, 0.5f) << "/" << calc_median2(quals, 0.9f) << "/" << maxQ; } - if (accums.size() > 0) { os << "\n - Accum. Error : " << minA << "/" << calc_median2(accums, 0.1f) << "/" << calc_median2(accums, 0.5f) << "/" << calc_median2(accums, 0.9f) << "/" << maxA; } - if (sims.size() > 0) { os << "\n - Sim2Consensus: " << minS << "/" << calc_median2(sims, 0.1f) << "/" << calc_median2(sims, 0.5f) << "/" << calc_median2(sims, 0.9f) << "/" << maxS; } - os << endl; -} diff --git a/configs/sdm_src/containers.h b/configs/sdm_src/containers.h deleted file mode 100644 index 76c2f8f..0000000 --- a/configs/sdm_src/containers.h +++ /dev/null @@ -1,824 +0,0 @@ -/* sdm: simple demultiplexer -Copyright (C) 2013 Falk Hildebrand - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -*/ - - -#ifndef _containers_h -#define _containers_h - -#include "InputStream.h" - - -//#include - - - - -//definitions - -struct ltstr -{ - bool operator()( std::string s1, std::string s2) - { - return strcmp(s1.c_str(), s2.c_str()) < 0; - } -}; -typedef std::map OptContainer; -typedef std::unordered_map ClusterIdx; -typedef std::unordered_map BarcodeList;//links directly to entry number in Barcode vector -//typedef std::map ClusterIdx; -//used in UCF file -typedef std::unordered_map::iterator DNAidmapsIT; -typedef std::unordered_map DNAidmaps; - -#ifdef KHASH -typedef khset_t HashDNA; -typedef khset_t::iterator HashDNAIT; -#else - -/*size_t DNAHasher2(shared_ptr k) { - return ((hash()(k->getSeqPseudo())) >> 1); -}*/ - -class DNAHasher3 { -public: - size_t operator() (const shared_ptr k) const { - return ((hash()(k->getSeqPseudo())) >> 1); - } -}; - -class DNAequal { -public: - bool operator()(const shared_ptr val1, const shared_ptr val2) const { - return val1->getSeqPseudo() == val2->getSeqPseudo(); - } -}; - -//typedef std::unordered_set, function> HashDNA; -typedef std::unordered_set, DNAHasher3, DNAequal> HashDNA; -//typedef std::unordered_map::iterator HashDNAIT; -#endif - - -void trim(std::string& s);//removes white spaces from string -bool is_digits(const std::string &str); - -const std::string SingletonFileDescr = ".singl"; -const std::string DEFAULT_BarcodeNameSep = "__"; -const std::string DEFAULT_output_qual_offset = "33"; //61 or 33 -const std::string DEFAULT_ignore_IO_errors = "0"; //61 or 33 - - - -bool betterPreSeed(shared_ptr d1, shared_ptr d2, shared_ptr ref); - - //functions -string additionalFileName(const string& in); -string additionalFileName2(const string& in); -inline string getFileNoPath(string & s) { - size_t pos = s.find_last_of("/"); - size_t pos2 = s.find_last_of("\\"); - if (pos == string::npos || pos2 > pos) { - pos = pos2; - } - if (pos == string::npos) { - return s; - } - return s.substr(pos+1); -} -inline string removeFileEnding(string & s) { - size_t pos = s.find_last_of("."); - if (pos == string::npos) { - return s; - } - return s.substr(0,pos); -} - -inline int FastqVerMod(int x){ - if (x==1){ - return 33; - } else if (x==2){ - return 64; - } - return 59; -} - - -class MultiDNA; - - - - -struct collectstats{ - unsigned int maxL, PrimerFail,AvgQual, HomoNT; - unsigned int PrimerRevFail; //Number of sequences, where RevPrimer was detected (and removed) - unsigned int minL,minLqualTrim, TagFail, MaxAmb, QualWin; - unsigned int Trimmed, AccErrTrimmed, QWinTrimmed, total, totalRejected; - unsigned int fail_correct_BC, suc_correct_BC,failedDNAread; - unsigned int adapterRem, RevPrimFound; - uint total2, totalSuccess; - uint DerepAddBadSeq; - //binomial error model - unsigned int BinomialErr; - uint dblTagFail; - //recovered singletons within pairs - unsigned int singleton; - vector BarcodeDetected; - vector BarcodeDetectedFail; - collectstats() : maxL(0), PrimerFail(0), AvgQual(0), HomoNT(0), - PrimerRevFail(0),minL(0),minLqualTrim(0), - TagFail(0), MaxAmb(0), QualWin(0), - Trimmed(0),AccErrTrimmed(0), QWinTrimmed(0), - total(0), totalRejected(0), - fail_correct_BC(0), suc_correct_BC(0), - failedDNAread(0), adapterRem(0), RevPrimFound(0), - total2(0), totalSuccess(0), - DerepAddBadSeq(0), BinomialErr(0), - dblTagFail(0), - singleton(0), BarcodeDetected(0), BarcodeDetectedFail(0) {} - void addStats(collectstats&, vector& idx); - void reset(); - -}; - - -//reported stats on sequence properties -class ReportStats{ -public: - ReportStats(bool MedianDo); - void reset(); - void addDNAStats(shared_ptr d); - void calcSummaryStats(float remSeqs, unsigned int min_l, float min_q); - float calc_median(vector& in, float perc); - void add_median2histo(vector& in, vector& histo); - void add_median2histo(unsigned int in, vector& histo); - void addMeanStats(unsigned int NT, unsigned int Qsum, float AccErr){ - rstat_NTs+=NT;rstat_totReads++;rstat_qualSum +=Qsum;rstat_accumError+=AccErr; - } - unsigned int lowest(const vector& in); - unsigned int highest(const vector& in); - void printStats2(ostream& give, float remSeqs,int pair); - void printGCstats(ostream& give); - void addStats(shared_ptr); - bool bMedianCalcs; - const vector &get_rstat_Vmed(int x) { - if (x == 1) { return rstat_VQmed; } else { return rstat_VSmed; } - } - //const vector &get_rstat_VSmed(){return rstat_VSmed;} - vector getVrange(int which); -protected: - - //median - vector medVrange(const vector); - unsigned int rstat_totReads,rstat_NTs, rstat_qualSum, rstat_Qmed, rstat_Smed; - //means, Relative Sample Quality Score (RSQS), Unifying Sample Quality Score (USQS) - float RSQS, USQS; - float rstat_accumError; - vector QperNT, NTcounts; - float GCcontent() { return float(NTcounts[2] + NTcounts[3]) / float(NTcounts[0] + NTcounts[1] + NTcounts[2] + NTcounts[3]); } - - //bin based median calculation's - vector rstat_VQmed, rstat_VSmed; -}; - -class dualPrimerDistrStats{ -public: - dualPrimerDistrStats(){} - dualPrimerDistrStats(const vector&, const vector&); - ~dualPrimerDistrStats(){} - void reset(){} -}; - -/*class dualHetSpacDistrStats{ -public: - dualHetSpacDistrStats(){} - dualHetSpacDistrStats(const vector&, const vector&); - ~dualHetSpacDistrStats(){} - void reset(){} -}; -*/ - -//filters a fasta file for certain reads -class ReadSubset{ -public: - ReadSubset(const string,const string); - ~ReadSubset(){} - bool multiFile() { if (outFiles.size() > 1) { return true; } return false; } - vector getOFiles() { return outFiles; } - void findMatches(shared_ptr, shared_ptr,bool mocatFix); - void setRemainingFilepipe(int j) { RemainderStrPos = j; } -private: - //-1 deactivates - int RemainderStrPos; - unordered_map Targets; - vector newHD, outFiles; - vector outFilesIdx; -}; - - -//class Filters does the main demultiplexing of raw DNA/QUAL data -class Filters{ -public: - Filters(OptContainer&); - Filters(shared_ptr of, int, bool = false); - ~Filters(); - void close_outFiles_demulti(){ - for (size_t i = 0; i < demultiSinglFiles.size(); i++) { - if (demultiSinglFiles[i][0] != NULL) { delete demultiSinglFiles[i][0]; } if (demultiSinglFiles[i][1] != NULL) {delete demultiSinglFiles[i][1]; } - } - } - //pair:-1: no Pair-Seq, 0,1=pair 1/2 (assumes MID BC) - //doSeeding: extract longes Seed //false, -1, -2 - bool check(shared_ptr in, bool doSeeding, int pair, int &tagIdx);// , bool checkBC = true); - bool checkXtra(shared_ptr in, int pair, int &tagIdx ); - //vector check_pairs(shared_ptrp1, shared_ptrp2, shared_ptrmid, vector, bool changePHead); - void setSeqLength(float minL, int maxL); - void setMaxAmb(int x) { MaxAmb = x; }; - void setAvgMinQual(float x) { min_q = x; }; - bool readMap(OptContainer&); - void setPrimerErrs(int x) { PrimerErrs = x; } - void setTagErrs(int x) { TagErrs = x; } - void removePrimer(bool x) { BcutPrimer = x; } - void removeTag(bool x) { BcutTag = x; } - void setMaxHomo(int x) { maxHomonucleotide = x; } - void checkDoubleBarcode(); - void checDoubleSampleID(); - void checkDoubleSampleIDHead(); - - //complete: filter whole sequence if any window below threshhold - void setFloatingQWin(int width, float aveQ) { FQWwidth = width; FQWthr = aveQ; }; - //partial: cut end of Seq that is below the window threshold - void setFloatingEWin(int width, float aveQ) { EWwidth = width; EWthr = aveQ; }; - bool setcmdArgsFiles(OptContainer&); - bool remove_adapter(shared_ptr); - vector getFastaFiles() { return FastaF; } - vector getQualFiles() { return QualF; } - vector getFastqFiles() { return FastqF; } - vector getMIDfqFiles() { return MIDfqF; } - void restartFileSet(bool b) { restartSet = b; } - void setBCfixed(bool b, bool fwd) { - if ( fwd ) { BCdFWDREV[fwd].b_BCdirFix = b; } else { BCdFWDREV[fwd].b_BCdirFix = b; } - } - bool eval_reversingBC(bool); - bool haveToRestartSet() { if (restartSet) { restartSet = false; return true; }return false; } - - bool doOptimalClusterSeq() { return b_optiClusterSeq; } - bool doSubselReads() { return b_subselectionReads; } - void statAddDerepBadSeq(int BC){ //seq did not pass qual filter, but could be dereplicated - colStats[0].BarcodeDetected[BC - BCoffset]++; - //colStats[0].BarcodeDetectedFail[BC - BCoffset]--; - colStats[0].DerepAddBadSeq++; - } - void countBCdetected(int BC, int Pair, bool MidQ); - - void allResize(unsigned int x); - void addPrimerL(string, int); - void addPrimerR(string, int); - void BarcodePreStats(); - void resetStats(); - //idxG needs to be BCoffset free, BC from shared_ptr needs to have BCoffset added - void failedStats2(shared_ptr d,int); - - void BCintoHead(int idx, shared_ptr d, const string, const int, bool, bool = false); - void setBCdna(int idx, shared_ptr d){ d->setBCnumber(idx, BCoffset); } - void SampleIntoHead(const int idx, shared_ptr d, const size_t pos); - void setMultiDNA(shared_ptr m) { lMD = m; } - //stats... probably mutexed functions - bool doReversePrimers() { return bPrimerR; } - void preFilterSeqStat(shared_ptr d,int pair); - inline void updateMaxSeqL(int x); - bool betterSeed(shared_ptr, shared_ptr, shared_ptr, shared_ptr, float, uint, int,bool); - bool secondaryOutput(){return bAdditionalOutput;} - inline bool checkBC2ndRd() { return b2ndRDBcPrimCk; } - inline bool checkRevRd() { return bRevRdCk; } - bool synRdPairs() { return bChkRdPrs; } - int writtenReads(){return ReadsWritten;} - int maxReadsOutput(){return maxReadsPerOFile;} - void setWrittenReads(int x){ReadsWritten=x;} - int getFileIncrementor(){return OFileIncre;} - void incrementFileIncrementor(){ OFileIncre++; ReadsWritten = 0; }// - void setBCoffset(int x) { BCoffset = x; } - inline int getBCoffset() { return BCoffset; } - //if no qual file present, than deactivate qual filter - void deactivateQualFilter() { b_doQualFilter = false; } - //output file - int getuserReqFastqOutVer(void){ return userReqFastqOutVer; } - //input file - int getuserReqFastqVer(void){ return userReqFastqVer; } - int & isPaired(){ return pairedSeq; } - int FQheadV(){ return PEheaderVerWr; } - inline bool consistentPairs(){ return bCompletePairs; } - bool doDemultiplex(){ return bDoMultiplexing; } - bool doDereplicate() { return bDoDereplicate; } - bool doFilterAtAll() { return b_doFilter; } - - //************************* - //DNA statistic collection - void prepStats(); - void revConstellationCnts(int x) { revConstellationN += x; }//number of read pairs, where pair1/2 are changed (mo) - void addDNAtoCStats(shared_ptr d,int); - //void sPrimerFail(int pair) { colStats[pair].PrimerFail++; } - //void sAvgQual(int pair) { colStats[ pair].AvgQual++; } - //void sQualWin(int pair) { colStats[pair].QualWin++; } - //void sBinomError(int pair,float Err) { colStats[pair].BinomialErr++; }//maybe collect later info on expected error? - //void sMaxAmbig(int pair) { colStats[pair].MaxAmb++; } - //void sHomoNT(int pair) {colStats[pair].HomoNT++; } - //void sReversePrimerFnd(int pair) { colStats[ pair].RevPrimFound++; } - //void sRevPrimerFail(int pair) { colStats[ pair].PrimerRevFail++; } - //void sMinLength(int pair) { colStats[pair].minL++; } - //void sMinQTrim(int pair) {colStats[pair].minLqualTrim++; } - //void sMaxLength(int pair) { colStats[ pair].maxL++; } - //void sTagFail(int pair) { colStats[pair].TagFail++; } - //void sTagCorrected(int pair) { colStats[ pair].suc_correct_BC++; } - //void sTagNotCorrected(int pair) { colStats[pair].fail_correct_BC++; } - void sTotalPlus(int pair) { colStats[pair].total++; //colStats[pair].totalRejected++; -// if (secondaryOutput()) {sTotalPlusXtra(pair);} - } - //void sTotalMinus(int pair) { colStats[pair].total--; colStats[pair].totalRejected--; } - //void sTotalPlusXtra(int pair) { if (pair > 1) { return; } statAddition.total++; statAddition.totalRejected++;} - void addStats(shared_ptr, vector& idx); - void DNAstatLQ(shared_ptr d, int pair,bool Add) { - if (Add) { - RepStatAddition[pair]->addDNAStats(d); - } else { - RepStat[pair]->addDNAStats(d); - } - } - - //void sTrimmed(int pair) { colStats[pair].Trimmed++; } - void printStats(ostream&, string, string, bool); - void printGC(ostream&,int); - string shortStats(const string &); - void SmplSpecStats(ostream&); - void printHisto(ostream&, int which, int set = 1);//which: 1=qual //set:0 only filter, 1 all available - bool combineSamples(){ return bDoCombiSamples; } - //return a vector that says entry x (from invec) corresponds to group y - vector combiSmplConvergeVec(const vector&); -//public version of BC finder.. - int cutTag(shared_ptr d, string&, int&,bool);//returns id, important for cutPrimer() - int findTag(shared_ptr d, string&, int&, bool);//returns id, important for cutPrimer() - inline bool doubleBarcodes() { return bDoBarcode2; } - inline bool doBarcodes() { return bDoBarcode; } - - void dblBCeval(int& tagIdx, int& tagIdx2, string presentBC, shared_ptr tdn, shared_ptr tdn2); - vector getDrerepSampleSpecifity() {return derepMinNum; } - bool Demulti2Fls() {return bDoDemultiplexIntoFiles;} - void write2Demulti(shared_ptr,int, int fqOvr); - bool findPrimer(shared_ptr d, int primerID, bool, int); - - //-1= no HIT; -5=reverse hits - int cutTag(shared_ptr d, bool);//returns id, important for cutPrimer() - - - //public vars ************************************************* - //check for heterogenity primers (can be useful for chimera estimation) - bool doHetPrimerExplicit; - vector PrimerL; vector PrimerR; - vector PrimerL_RC; vector PrimerR_RC; - vector PrimerIdx; //one entry per barcode / links to PrimerL - vector PrimerIdxRev; //one entry per barcode / links to PrimerR - vector Barcode, revBarcode, Barcode2, revBarcode2; - vector SampleID, SampleID_Combi,HeadSmplID; - vector> hetPrimer; - //demultiplex files into these - vector> demultiSinglFiles; - vector> demultiSinglFilesF; - //collect statistics of filter reasons - vector colStats; - collectstats statAddition; // stats for additional reads to be output - //combiner of samples map to collect the group number - unordered_map combiMapCollectGrp; - int getXreads() { return Xreads; } - int totalAccepts() { //just plain number of successes.. - if (pairedSeq > 1) { - return colStats[0].totalSuccess + colStats[1].totalSuccess; - } - return colStats[0].totalSuccess; - } -protected: - bool check_lengthXtra(shared_ptr d, int hindrance=0, int leng=-1){ - if (min_l <= 0 &&alt_min_l <= 0 ){return false;} - if (leng==-1){ - leng = d->length(); - } - if (leng-hindrance < min_l){ - if (leng-hindrance >= alt_min_l){ - d->setMidQual(true); - d->QualCtrl.minL = false; - return false; - } - //statAddition.minL++; - d->QualCtrl.minL = true; - return true; - } - return false; - } - bool check_length( int leng, int hindrance=0){ - if (min_l==0){return false;} - return leng-hindrance < min_l; - } - bool cutPrimer(shared_ptr d, int primerID, bool, int); - bool cutPrimerRev(shared_ptr d,int primerID,bool); - - inline void scanBC(shared_ptr d,int& start,int& stop,int& idx,int c_err, int scanRegion, - string & presentBC, bool fwdStrand); - inline void scanBC_rev(shared_ptr d,int& start,int& stop,int& idx,int c_err, int scanRegion, - string & presentBC, bool fwdStrand); - - void extractMap(int k, int cnt, int tbcnt, string & segments, bool); - void fakeEssentials(void); - void noMapMode(OptContainer& cmdArgs); - void reverseTS_all_BC(); - void reverseTS_all_BC2(); - - void decideHeadBC(); - int currentBCnumber() { return curBCnumber; }//only used in "one sample per file" cases - - void generateDemultiOutFiles( string); - - - vector FastaF, QualF, FastqF, MIDfqF; - vector derepMinNum; - shared_ptr lMD; - vector> RepStat;//2 entries (2 pairs) - vector> RepStatAddition; - shared_ptr PreFiltP1; shared_ptr PreFiltP2; - - //technical adapter removal - string tAdapter; - unsigned int tAdapterLength; - //do adapter removal? Do Barcode checking? - bool bDoAdapter, bDoMultiplexing; - //which kind of barcoding? - bool bDoBarcode, bDoBarcode2, bDoHeadSmplID; - bool bBarcodeSameSize; - - //related to "one sample per file" - bool bOneFileSample; - int curBCnumber,BCoffset; - - //do additional 2nd output file using different filter options - bool bAdditionalOutput; - //check if reverse primer + rev BC are present (on 2nd read) - bool b2ndRDBcPrimCk; - //check if reads have been reversed - bool bRevRdCk; - //check if read pairs are correctly synced - bool bChkRdPrs; - //specialized function for LotuS, which doesn't need all the huge output files.. - //BCs are in mid file - //bool bHasMidSeq; - - //filter related - int min_l, alt_min_l; - float min_l_p, alt_min_l_p; - int maxReadLength; - bool norm2fiveNTs; //change IUPAC code to 5 bases (ACTGN) - uint max_l; - float min_q,alt_min_q; - bool BcutPrimer,alt_BcutPrimer,bPrimerR;//cut Primers from seq? - bool bRequireRevPrim,alt_bRequireRevPrim; // reject seq if reverse primer not found - bool bRequireFwdPrim,alt_bRequireFwdPrim; - bool BcutTag;//cut Tag from seq? - bool bCompletePairs;//if paired seq, only accept complete pairs - bool bShortAmplicons; - //MinTagLen is Barcode length - unsigned int MinTagLen, MinTagLen2, MaxTagLen, MaxTagLen2, MinPrimLen, maxHomonucleotide; - int PrimerErrs,alt_PrimerErrs,TagErrs,MaxAmb, alt_MaxAmb;//allowed max errs per Primer, Tag; max Ambigous Chars(not ACGT) - int FQWwidth, EWwidth; //Floating window width for avg quality - int RevPrimSeedL; // seed length of primer that will be searched for - bool b_BinFilBothPairs; - float BinFilErr, BinFilP; //binomal filter parameters - float FQWthr, EWthr, alt_FQWthr, alt_EWthr; //Floating window avg quality under which seq is kicked - int PEheaderVerWr;//correct PE header format (0/1/2) this is to accomodate the illumina miSeq paired end annotations 2="@XXX 1:0:4" insteand of 1="@XXX/1". 0=don't change or no PE seq. - int TrimStartNTs;//remove start NT. -1 indicates auto check for GC infrequencies - int TruncSeq;//remove trailing NT's after this seq length (length after removal of adapters, primers, Barcodes) - string iniSpacer; // spacer in fasta file name after barcoding - int userReqFastqVer;//either 1 (33), 2(59) or 3 (62) - int userReqFastqOutVer; - double maxAccumQP,alt_maxAccumQP; - - //paired end sequencing related - int pairedSeq; //1= single read, 2= PE, 3= PE + 1 file with barcodes - int revConstellationN;//number of read pairs, where pair1/2 are changed (mo) - - - //flow control bools - struct BCdecide - { - int BChit, BCrevhit; - bool b_BCdirFix, reversedBCs; - BCdecide(): BChit(0), BCrevhit(0), b_BCdirFix(false), reversedBCs(false){} - void reset() { BChit = 0; BCrevhit = 0; b_BCdirFix = false; reversedBCs = false; } - void fix() { BChit = 0; BCrevhit = 0; b_BCdirFix = true; reversedBCs = false; } - }; - vector BCdFWDREV; - int Xreads;//just prints the first X reads for experiment (read pairs being counted as 2) - bool restartSet;//start from beginning, i.e. wrong BC direction - bool b_optiClusterSeq;//SEED extension - bool b_subselectionReads;//filter out a specific set of reads - bool b_doQualFilter;//qulity file provided? Then no qual filter - bool b_doFilter; //option file not provided? just crunch files through, but careful about demultiplexing.. - - bool bDoDereplicate; - bool bDoCombiSamples; - //demultiplexing into files - bool bDoDemultiplexIntoFiles; - - //controls output file size - int maxReadsPerOFile,ReadsWritten,OFileIncre; - //needed to pass by ref - BarcodeList emptyBCs; - //map with barcodes.. faster matching (?) - BarcodeList BCList, BCList2; - vector Barcode_len, Barcode2_len; - - //double BC / het spacer collect stats - shared_ptr dPDS; - shared_ptr dHDS; -}; - - -bool DNAuPointerCompare(shared_ptr l, shared_ptr r); - -class Dereplicate{ -public: - Dereplicate(OptContainer&); - ~Dereplicate() { -// for (size_t i = 0; i < Dnas.size(); i++) { delete Dnas[i]; } - } - int getHighestBCoffset() { return (int)BCN2SmplID.size(); } - //bool addDNA(shared_ptr d); - bool addDNA( shared_ptr d,shared_ptr d2,bool& added); - string writeDereplDNA(shared_ptr); - void writeLog(string logF, string rep) { - ofstream logx; - string logPS = logF.substr(0, logF.length() - 4) + "dereplication.log"; - logx.open(logPS.c_str(), ios_base::out); - logx << "Dereplication log:\n"< fil); - void reset(); -private: - //is the exact derep string fullfilled? - inline bool pass_deprep_conditions(shared_ptr); - - //vector> Dnas; - //vector> DNApair; - vector BCN2SmplID; - HashDNA Tracker; - //vector Counts; - string outfile; - bool b_usearch_fmt, b_singleLine; - bool b_pairedInput; - vector minCopies; - size_t minCopiesSiz; - string minCopiesStr; - int totSize; - int tmpCnt; - int curBCoffset; -}; - - -class UClinks{ -public: - UClinks(OptContainer& ); - ~UClinks(); - void findSeq2UCinstruction(shared_ptr,bool, shared_ptr ); - void writeNewSeeds(shared_ptr, shared_ptr,bool, bool=false); - void printStats(ostream&); - void finishUCfile(shared_ptr, string, bool); - void finishMAPfile(); - void setupDefSeeds(shared_ptr FA, shared_ptr fil); - //to add "high qual" ref sequences - void addDefSeeds(shared_ptr FA, shared_ptr fil); - void pairedSeqsMerged(shared_ptr fil){ pairsMerge = true; fil->setFloatingEWin(0, 0.f); } - void writeOTUmatrix(string, shared_ptr fil); - void resetInputUcUp(){ UpUcFnd = false; } - void set2UC(){ UPARSE8up = false; } - void setRefMode(){ RefDBmode = true; RefDBotuStart = (int)oriKey.size(); }//from now on only count adds or ref DB seqs -private: - void addUCdo(string,bool ); - void add2OTUmat(const string&, int, matrixUnit); - void add2OTUmat(shared_ptr, int, matrixUnit); - bool uclInOldDNA(const string&, const vector&, float, shared_ptr); - bool uclInOldDNA_simple(const string&, const vector&); - bool getUCFlineInfo(string&, string&, float&, vector&, bool addFromHDstring = false); - void besterDNA(const vector curCLID, shared_ptr tdn1, shared_ptr tdn2, shared_ptr); - void setOTUnms(); - - inline void removeSizeStr(string&); - inline void removeSampleID(string&, const string &); - inline void removeSampleID(string&, const string &, string&); - void readDerepInfo(string); - void oneDerepLine(shared_ptr); - - //pair: important to keep track whether to remove BC etc.: -1 to remove BC (454); 0 not to (MID miSeq) - int CurSetPair; - //store not matched DNA and keep track - uint maxOldDNAvec; - vector oldDNAid; - vector> oldDNA; - vector> oldDNA2; - DNAidmaps unusedID; - //std::list oldestID; - uint DNAunusedPos; - string derepMapFile; - - //search terms: "otu" "chimera" "chimera" - //string otu_term, chimera_term, chimera_term_noise; - - ClusterIdx seq2CI; - vector> bestDNA; - vector> bestDNA2; - vector oriKey; - vector bestPID; - vector bestLEN; - int clusCnt, uclines; - string SEP; - ifstream ucf, mapdere; - bool UCread,pairsMerge,MAPread; - bool b_derepAvailable;//has sdm been run in demultiplexer mode? - bool UPARSE8up, UPARSE9up, UPARSE11up, UpUcFnd; - string otuTerm; - bool RefDBmode; - int RefDBotuStart; - bool SeedsAreWritten; - //count matrix related - vector < vector > OTUmat; - unordered_map SmplIDs; - unordered_set perfectChims; - bool unregistered_samples; - bool doChimeraCnt; - bool OTUnumFixed; // can new OTUs be added, after inital reading of DNA OTU.fna? -}; - - -//writes successful demultis and stores unsuccessful matches of fna/qual for later matching -class MultiDNA{ -public: - //wrStatus controls if this appends or overwrites output - MultiDNA(shared_ptr filter, OptContainer& cmdArgs, - std::ios_base::openmode wrStatus, shared_ptr, - string fileExt = "",int=-1); - ~MultiDNA(); - // void threadAnalyzeDNA(shared_ptr); - void setFastQWrite(bool x) { BWriteFastQ = x; BWriteQual = !x; } - void setQualWrite(bool x) { BWriteQual = x; } - void addNoHeadDNA(shared_ptr d) { DNAsNoHead.push_back(d); } - //-1,-1,-2 - void analyzeDNA(shared_ptr d, int FilterUse, int pair, int &idx); - void writeAllStoredDNA(); - vector analyzeDNA(shared_ptr p1, shared_ptr p2, shared_ptr mid, bool changePHead, int = -1); - - //void writeAndDel(shared_ptr d, int p=1) { writeAndDel(d.get(), p); } - void writeAndDel(shared_ptr d, int Pair = 1);//1=pair1;2=pair2;3=singleton1,4=singl2 - //Function specifically if several output files are required - void writeSelectiveStream(shared_ptr d, int Pair, int FS);//1=pair1;2=pair2;3=singl1,4=singl2 ;; FS: different multi FileStreams to be used - - //pretty final bool, aborts all, so careful with this - bool saveForWrite(shared_ptr d, int Pair = 1);//1=pair1;2=pair2;3=singleton - shared_ptr getFilters(int w = -1) { if (w == -1) { return MFil; } else { return subFilter[w]; } } - int isPEseq() { return pairedSeq; } - void closeOutStreams(bool wr = true); - //ofstream::app, ios_base::out - void openOutStreams(OptContainer& cmdArgs, int, std::ios_base::openmode, string = "",int=-1); - void openSeveralOutstreams(OptContainer& cmdArgs, shared_ptr, std::ios_base::openmode); - string leadOutFile() { return leadingOutf; } - //void setfastQver(int x){fastQver = x;} - //void setfastQoutVer(int x){fastQoutVer = x;} - - bool checkFastqHeadVersion(shared_ptr d, bool = false); - //int getFastqMod(){return MFil->FastqModifier();} - int getFastqVer() { return fastQver; } - int getfastQoutVer() { return fastQoutVer; } - bool haveToRestartSet() { return MFil->haveToRestartSet(); } - void resetOutFilesAndFilter();//MD->closeOutStreams(); - void setBCfixed(bool b,bool fwd) { MFil->setBCfixed(b,fwd); write2File = b; } - void setSubfilters(int num); - void mergeSubFilters(); - void activateWrite2File() { write2File = true; } - void createWriteThread() { writeThreadStatus = 1; } - void setOneLinerFastaFmt(bool b) { b_oneLinerFasta = b; } - void printStorage() { cerr << "Size of MD DNA P1:" << DNAsP1.size() << " P2: " << DNAsP2.size() << endl; } - void revConstellationCnts(int x) { MFil->revConstellationCnts(x); } - //dereplication of DNA seqs - void attachDereplicator(shared_ptr de); - //void depPrep(shared_ptr); - void depPrep(shared_ptr,shared_ptr); - //debug function to look closer at nonBC reads - void writeNonBCReads(shared_ptr d, shared_ptr d2); - void setReadLimit(int x) { maxRdsOut = x; } - -private: - void setwriteMode(std::ios_base::openmode wm) { wrMode = wm;} - inline void setFilePos(ofstream& str,streamoff& pos){ - str.seekp(0,ios_base::end); pos = str.tellp(); - } - //wh: 0=fastq; 1=fna; 2=qual - inline void openOFstream(const string opOF, std::ios_base::openmode wrMode, int p1, int p2, string errMsg, bool, int); - inline void openOFstreamFQ(const string opOF, std::ios_base::openmode wrMode, int p1, int p2, string errMsg,bool=false ); - inline void openOFstreamFNA(const string opOF, std::ios_base::openmode wrMode, int p1, int p2, string errMsg,bool=false); - inline void openOFstreamQL(const string opOF, std::ios_base::openmode wrMode, int p1, int p2, string errMsg,bool=false); - void openNoBCoutstrean(const string); - - -// void resetOutStreams(); - void delAllDNAvectors(); - void writeAllStoredDNA2(); - void writeAllStoredDNA2t(); - void incrementOutputFile(); - - //contains min seq pars & Barcodes etc. - shared_ptr MFil; - //UClinks* optim; - //for threaded statisitics counting - vector> subFilter; - //contains DNA sequences (that failed to have matching Q and vice versa - vector> DNAsP1; - vector> DNAsP2,DNAsS1,DNAsS2,DNAsNoHead; - vector> DNAsP1_alt,DNAsP2_alt,DNAsS1_alt,DNAsS2_alt; - vector IDs; - //controls how memory DNA is written to out file - int suppressOutWrite;//0=all normal, 1=skip mainfile, 2=skip addfile, 3=skip both - bool write2File; - bool mem_used; - int DNAinMem, writeThreadStatus; -#ifdef _THREADED - std::thread wrThread; - std::mutex mutex; - vector threads; -#endif - int fastQver; //33, 62 or 59 - int fastQoutVer; //33, 62 or 59 - //write out quality file, is the input paired End sequenced - bool BWriteQual, BWriteFastQ; - bool b_multiOutStream; - int pairedSeq; //1=single, 2=PE, 3=PE+MID - bool b_changeFQheadVer; // T/F 0=no PE, 1= XX/1, 2=XX 1:0:3 - bool b_oneLinerFasta; // write one line per sequence? - bool b_doDereplicate; - bool b_writePassed; - bool b_writeMidPass; - - //asynchronous threads - //std::vector> threads; - int Nthrds,thrdsCnt; bool thrdsActive; - //controls output file size - int maxReadsPerOFile,ReadsWritten; - int maxRdsOut; - bool stopAll;//red button, just stop all - string leadingOutf; - OptContainer locCmdArgs; - shared_ptr Derepl; - int cntDerep; - - //abstraction to real file type - //0,1,2,3 refers to pairs (0,1) & singletons (2,3) - //0=high qual, 1=mid qual - std::ios_base::openmode wrMode; - vector> sFile, qFile, fqFile; - vector> sFileStr, qFileStr, fqFileStr; - vector fqNoBCFile; - uint totalFileStrms; - - //future derepThread; - //vector sFile_alt, qFile_alt, fqFile_alt; - /*ofstream qFile, sFile, fqFile; - ofstream qFile_alt, sFile_alt, fqFile_alt; - ofstream qFile2, sFile2, fqFile2;//second pair - ofstream qFile2_alt, sFile2_alt, fqFile2_alt; - ofstream qFileS, sFileS, fqFileS;//singleton - ofstream qFileS_alt, sFileS_alt, fqFileS_alt; - ofstream qFileS2, sFileS2, fqFileS2;//singleton - ofstream qFileS2_alt, sFileS2_alt, fqFileS2_alt; - */ - -// streamoff qFilePos, sFilePos, fqFilePos; -// streamoff qFile2Pos, sFile2Pos, fqFile2Pos;//second pair -// streamoff qFileSPos, sFileSPos, fqFileSPos;//singleton -// streamoff qFileS2Pos, sFileS2Pos, fqFileS2Pos;//singleton - -}; - -//fwd declarations -//bool read_fasta_entry(ifstream&fna,ifstream&qual,shared_ptr in,shared_ptr,int&); -//shared_ptr read_fastq_entry(ifstream & fna,int fastQver, int &minQScore, -// long& pos); - -#endif \ No newline at end of file diff --git a/configs/sdm_src/containers.o b/configs/sdm_src/containers.o deleted file mode 100644 index 5676140..0000000 Binary files a/configs/sdm_src/containers.o and /dev/null differ diff --git a/configs/sdm_src/gzstream.h b/configs/sdm_src/gzstream.h deleted file mode 100644 index 1afa208..0000000 --- a/configs/sdm_src/gzstream.h +++ /dev/null @@ -1,221 +0,0 @@ -// ============================================================================ -// gzstream, C++ iostream classes wrapping the zlib compression library. -// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// ============================================================================ -// -// File : gzstream.h -// Revision : $Revision: 1.5 $ -// Revision_date : $Date: 2002/04/26 23:30:15 $ -// Author(s) : Deepak Bandyopadhyay, Lutz Kettner -// -// Standard streambuf implementation following Nicolai Josuttis, "The -// Standard C++ Library". -// ============================================================================ - -#ifndef GZSTREAM_H -#define GZSTREAM_H 1 - -// standard C++ with new header file names and std:: namespace -#include -#include -#include -#include - -#ifdef GZSTREAM_NAMESPACE -namespace GZSTREAM_NAMESPACE { -#endif - - // ---------------------------------------------------------------------------- - // Internal classes to implement gzstream. See below for user classes. - // ---------------------------------------------------------------------------- - - class gzstreambuf : public std::streambuf { - private: - static const int bufferSize = 47 + 256; // size of data buff - // totals 512 bytes under g++ for igzstream at the end. - - gzFile file; // file handle for compressed file - char buffer[bufferSize]; // data buffer - char opened; // open/close state of stream - int mode; // I/O mode - - - int flush_buffer() { - // Separate the writing of the buffer from overflow() and - // sync() operation. - int w = pptr() - pbase(); - if (gzwrite(file, pbase(), w) != w) - return EOF; - pbump(-w); - return w; - } - public: - gzstreambuf() : opened(0) { - setp(buffer, buffer + (bufferSize - 1)); - setg(buffer + 4, // beginning of putback area - buffer + 4, // read position - buffer + 4); // end position - // ASSERT: both input & output capabilities will not be used together - } - int is_open() { return opened; } - - ~gzstreambuf() { close(); } - gzstreambuf* open(const char* name, int open_mode) { - if (is_open()) - return (gzstreambuf*)0; - mode = open_mode; - // no append nor read/write mode - if ((mode & std::ios::ate) - || ((mode & std::ios::in) && (mode & std::ios::out) && (mode & std::ios::app))) - return (gzstreambuf*)0; - char fmode[10]; - char* fmodeptr = fmode; - if (mode & std::ios::in) - *fmodeptr++ = 'r'; - else if (mode & std::ios::out) - *fmodeptr++ = 'w'; - else if (mode & std::ios::app) - *fmodeptr++ = 'a'; - *fmodeptr++ = 'b'; - *fmodeptr = '\0'; - file = gzopen(name, fmode); - if (file == 0) - return (gzstreambuf*)0; - opened = 1; - return this; - } - - gzstreambuf * close() { - if (is_open()) { - sync(); - opened = 0; - if (gzclose(file) == Z_OK) - return this; - } - return (gzstreambuf*)0; - } - - virtual int underflow() { // used for input buffer only - if (gptr() && (gptr() < egptr())) - return *reinterpret_cast(gptr()); - - if (!(mode & std::ios::in) || !opened) - return EOF; - // Josuttis' implementation of inbuf - int n_putback = gptr() - eback(); - if (n_putback > 4) - n_putback = 4; - memcpy(buffer + (4 - n_putback), gptr() - n_putback, n_putback); - - int num = gzread(file, buffer + 4, bufferSize - 4); - if (num <= 0) // ERROR or EOF - return EOF; - - // reset buffer pointers - setg(buffer + (4 - n_putback), // beginning of putback area - buffer + 4, // read position - buffer + 4 + num); // end of buffer - - // return next character - return *reinterpret_cast(gptr()); - } - - virtual int overflow(int c = EOF) { // used for output buffer only - if (!(mode & std::ios::out) || !opened) - return EOF; - if (c != EOF) { - *pptr() = c; - pbump(1); - } - if (flush_buffer() == EOF) - return EOF; - return c; - } - - virtual int sync() { - // Changed to use flush_buffer() instead of overflow( EOF) - // which caused improper behavior with std::endl and flush(), - // bug reported by Vincent Ricard. - if (pptr() && pptr() > pbase()) { - if (flush_buffer() == EOF) - return -1; - } - return 0; - } - }; - - class gzstreambase : virtual public std::ios { - protected: - gzstreambuf buf; - public: - - gzstreambase() { init(&buf); } - gzstreambase(const char* name, int mode) { - init(&buf); - open(name, mode); - } - ~gzstreambase() { - buf.close(); - } - void open(const char* name, int open_mode) { - if (!buf.open(name, open_mode)) - clear(rdstate() | std::ios::badbit); - } - - void close() { - if (buf.is_open()) - if (!buf.close()) - clear(rdstate() | std::ios::badbit); - } - gzstreambuf* rdbuf() { return &buf; } - }; - - // ---------------------------------------------------------------------------- - // User classes. Use igzstream and ogzstream analogously to ifstream and - // ofstream respectively. They read and write files based on the gz* - // function interface of the zlib. Files are compatible with gzip compression. - // ---------------------------------------------------------------------------- - - class igzstream : public gzstreambase, public std::istream { - public: - igzstream() : std::istream(&buf) {} - igzstream(const char* name, int open_mode = std::ios::in) - : gzstreambase(name, open_mode), std::istream(&buf) {} - gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } - void open(const char* name, int open_mode = std::ios::in) { - gzstreambase::open(name, open_mode); - } - }; - - class ogzstream : public gzstreambase, public std::ostream { - public: - ogzstream() : std::ostream(&buf) {} - ogzstream(const char* name, int mode = std::ios::out) - : gzstreambase(name, mode), std::ostream(&buf) {} - gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } - void open(const char* name, int open_mode = std::ios::out) { - gzstreambase::open(name, open_mode); - } - }; - -#ifdef GZSTREAM_NAMESPACE -} // namespace GZSTREAM_NAMESPACE -#endif - -#endif // GZSTREAM_H - // ============================================================================ - // EOF // \ No newline at end of file