diff --git a/configs/sdm_src/DNAconsts.cpp b/configs/sdm_src/DNAconsts.cpp
deleted file mode 100644
index b81ada1..0000000
--- a/configs/sdm_src/DNAconsts.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/* sdm: simple demultiplexer
-Copyright (C) 2013 Falk Hildebrand
-email: Falk.Hildebrand@gmail.com
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program. If not, see .
-*/
-
-#include "DNAconsts.h"
-char DNA_trans[256];
-short DNA_amb[256];//to count amb chars
-short DNA_IUPAC[256 * 256];
-short NT_POS[256];
-
-
-void ini_DNAconstants(){
-//static char* DNA_trans[256] = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX";
- //DNA_trans.resize(256,'X');
- for (int i = 0; i<256; i++){ DNA_trans[i] = 'N'; }
- DNA_trans['A'] = 'T'; DNA_trans['T'] = 'A';
- DNA_trans['C'] = 'G'; DNA_trans['G'] = 'C';
- DNA_trans['a'] = 'T'; DNA_trans['t'] = 'A';
- DNA_trans['c'] = 'G'; DNA_trans['g'] = 'C';
- for (int i = 0; i<256; i++){ DNA_amb[i] = 0; }
- DNA_amb['A'] = 1; DNA_amb['T'] = 1;
- DNA_amb['C'] = 1; DNA_amb['G'] = 1;
- DNA_amb['a'] = 1; DNA_amb['t'] = 1;
- DNA_amb['c'] = 1; DNA_amb['g'] = 1;
-
-
- for ( int i = 0; i < 256 ; i++ ) { NT_POS[i] = 5; }
- NT_POS['A'] = 0; NT_POS['T'] = 1; NT_POS['G'] = 2; NT_POS['C'] = 3;
- NT_POS['N'] = 4;
- NT_POS['a'] = 0; NT_POS['t'] = 1; NT_POS['g'] = 2; NT_POS['c'] = 3;
- NT_POS['n'] = 4;
- for ( int i = 0; i < 256 * 256; i++ ) { DNA_IUPAC[i] = 1; }
- for ( int i = 0; i<14; i++ ) {//first: N is always a hit
- DNA_IUPAC['N'+256*DNA_SPACE[i]]= 0;
- DNA_IUPAC[256*'N'+DNA_SPACE[i]]= 0;
- }
- for ( int i = 0; i<5; i++ ) {//first: N is always a hit
- DNA_IUPAC['B'+256*DNA_SPACE[i]]=0;
- DNA_IUPAC[256*'B'+DNA_SPACE[i]]=0;
- DNA_IUPAC['H'+256*DNA_SPACE[i]]=0;
- DNA_IUPAC[256*'H'+DNA_SPACE[i]]=0;
- DNA_IUPAC['D'+256*DNA_SPACE[i]]=0;
- DNA_IUPAC[256*'D'+DNA_SPACE[i]]=0;
- DNA_IUPAC['V'+256*DNA_SPACE[i]]=0;
- DNA_IUPAC[256*'V'+DNA_SPACE[i]]=0;
- }
-
- DNA_IUPAC['B'+256*'A']=1;DNA_IUPAC[256*'B'+'A']=1;
- DNA_IUPAC[('D'+256*'C')]=1;DNA_IUPAC[256*'D'+'C']=1;
- DNA_IUPAC['H'+256*'G']=1;DNA_IUPAC[256*'H'+'G']=1;
- DNA_IUPAC['V'+256*'T']=1;DNA_IUPAC[256*'V'+'T']=1;
-
-
-
- DNA_IUPAC['R'+256*'A']=0;DNA_IUPAC[256*'R'+'A']=0;
- DNA_IUPAC['R'+256*'G']=0;DNA_IUPAC[256*'R'+'G']=0;
- DNA_IUPAC['M'+256*'C']=0;DNA_IUPAC[256*'M'+'C']=0;
- DNA_IUPAC['M'+256*'A']=0;DNA_IUPAC[256*'M'+'A']=0;
- DNA_IUPAC['Y'+256*'C']=0;DNA_IUPAC[256*'Y'+'C']=0;
- DNA_IUPAC['Y'+256*'T']=0;DNA_IUPAC[256*'Y'+'T']=0;
- DNA_IUPAC['K'+256*'G']=0;DNA_IUPAC[256*'K'+'G']=0;
- DNA_IUPAC['K'+256*'T']=0;DNA_IUPAC[256*'K'+'T']=0;
- DNA_IUPAC['W'+256*'A']=0;DNA_IUPAC[256*'W'+'A']=0;
- DNA_IUPAC['W'+256*'T']=0;DNA_IUPAC[256*'W'+'T']=0;
- DNA_IUPAC['S'+256*'C']=0;DNA_IUPAC[256*'S'+'C']=0;
- DNA_IUPAC['S'+256*'G']=0;DNA_IUPAC[256*'S'+'G']=0;
-
- DNA_IUPAC['A'+256*'A']=0;DNA_IUPAC[256*'A'+'A']=0;
- DNA_IUPAC['T'+256*'T']=0;DNA_IUPAC[256*'T'+'T']=0;
- DNA_IUPAC['G'+256*'G']=0;DNA_IUPAC[256*'G'+'G']=0;
- DNA_IUPAC['C'+256*'C']=0;DNA_IUPAC[256*'C'+'C']=0;
- //DEBUG
-
- //fake use to surpress compiler warnings
- //if (sdm_version == 0.f){ cout << "too low version" << sdm_status; }
- //if (strcmp(sdm_status, "XX")){ cout << "Some"; }
-
-}
\ No newline at end of file
diff --git a/configs/sdm_src/DNAconsts.h b/configs/sdm_src/DNAconsts.h
deleted file mode 100644
index 0265bfb..0000000
--- a/configs/sdm_src/DNAconsts.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/* sdm: simple demultiplexer
-Copyright (C) 2013 Falk Hildebrand
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program. If not, see .
-*/
-
-
-#ifndef _DNAconsts_h
-#define _DNAconsts_h
-
-//compile with multi threading support
-#define _THREADE //D
-
-//do vector based DNA matching
-#define _NEWMATCH
-
-//sum up uc file to OTU abundance matrix in seed extension step
-#define matrix_sum
-
-//match barcodes based on maps
-#define _fastBCmatch
-
-//keep a map of dereplicated file
-#define _MAPDEREPLICATE
-
-//KHASH for faster / lower mem access
-#define KHAS_H
-
-//disable win warning about fopen
-#define _CRT_SECURE_NO_WARNINGS
-
-
-//read gzip'd files using zlib.h
-#if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__)
-#define _gziprea//d
-#else
-#define _gzipread
-#endif
-
-//DEBUG mode: more output
-#define DEB//UG
-
-
-#include
-#include
-#include
-#include
-#include
-#include
-//#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-//#include
-#ifdef KHASH
-#include "khash.hh"
-#endif
-
-#ifdef _gzipread
-#include "gzstream.h"
-#endif
-
-
-#ifdef _THREADED
-#include
-#include
-#endif
-//#include
-
-#ifdef _WIN32
-//#include
-#endif // _WIN32
-
-
-static const float sdm_version = 1.50f;
-static const char* sdm_status = "beta";
-
-
-using namespace std;
-
-static const double SAqualP[110] = {1.000000e+00,7.943282e-01,6.309573e-01,5.011872e-01,3.981072e-01,3.162278e-01,2.511886e-01,1.995262e-01,1.584893e-01,1.258925e-01
-,1.000000e-01,7.943282e-02,6.309573e-02,5.011872e-02,3.981072e-02,3.162278e-02,2.511886e-02,1.995262e-02,1.584893e-02,1.258925e-02
-,1.000000e-02,7.943282e-03,6.309573e-03,5.011872e-03,3.981072e-03,3.162278e-03,2.511886e-03,1.995262e-03,1.584893e-03,1.258925e-03
-,1.000000e-03,7.943282e-04,6.309573e-04,5.011872e-04,3.981072e-04,3.162278e-04,2.511886e-04,1.995262e-04,1.584893e-04,1.258925e-04
-,1.000000e-04,7.943282e-05,6.309573e-05,5.011872e-05,3.981072e-05,3.162278e-05,2.511886e-05,1.995262e-05,1.584893e-05,1.258925e-05
-,1.000000e-05,7.943282e-06,6.309573e-06,5.011872e-06,3.981072e-06,3.162278e-06,2.511886e-06,1.995262e-06,1.584893e-06,1.258925e-06
-,1.000000e-06,7.943282e-07,6.309573e-07,5.011872e-07,3.981072e-07,3.162278e-07,2.511886e-07,1.995262e-07,1.584893e-07,1.258925e-07
-,1.000000e-07, 1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07
-,1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07 ,1.000000e-07
-, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07
-, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07, 1.000000e-07 };
-
-static const char DNA_SPACE[15] = {'A','C','G','T','N','R','Y','M','K','W','S','B','D','H','V'};
-static const int DNAinMemory = 5000;
-static const unsigned int maxFileStreams = 500;
-static const int RDBUFFER = 4096;
-
-typedef unsigned int uint;
-typedef unsigned long ulong;
-typedef int qual_score; //used for quality scores in vectors
-
-//seeding
-static const float BestLengthRatio = 0.83f;
-static const float RefLengthRatio = 0.9f;
-static const qual_score MinQualDiff = 5;
-
-
-
-void ini_DNAconstants();
-
-
-// first base is [ACTG], second can be IUPAC
-/*code description
-A Adenine
-C Cytosine
-G Guanine
-T Thymine
-U Uracil
-R Purine (A or G)
-Y Pyrimidine (C, T, or U)
-M C or A
-K T, U, or G
-W T, U, or A
-S C or G
-B C, T, U, or G (not A)
-D A, T, U, or G (not C)
-H A, T, U, or C (not G)
-V A, C, or G (not T, not U)
-N Any base (A, C, G, T, or U)
-*/
-
-
-
-#endif
\ No newline at end of file
diff --git a/configs/sdm_src/DNAconsts.o b/configs/sdm_src/DNAconsts.o
deleted file mode 100644
index e0d0472..0000000
Binary files a/configs/sdm_src/DNAconsts.o and /dev/null differ
diff --git a/configs/sdm_src/Demultipl.cpp b/configs/sdm_src/Demultipl.cpp
deleted file mode 100644
index 3f3769e..0000000
--- a/configs/sdm_src/Demultipl.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/* sdm: simple demultiplexer
-Copyright (C) 2013 Falk Hildebrand
-email: Falk.Hildebrand@gmail.com
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program. If not, see .
-*/
-
-
-
-#include "IO.h"
-
-
-
-int main(int argc, char* argv[])
-{
- if (argc<3){
- //help_options,help_map,help_commands
- if (argc==2){
- if (string(argv[1])=="-help_commands"){
- printCmdsHelp();
- } else if (string(argv[1])=="-help_options"){
- printOptionHelp();
- } else if (string(argv[1])=="-help_map"){
- printMapHelp();
- }
- else if (string(argv[1]) == "-version" || string(argv[1]) == "-v"){
- printVersion();
- }
- exit(0);
- }
- general_help();
- exit(0);
- }
-#ifdef DEBUG
- cerr << "DEBUG mode"< fil = make_shared(cmdArgs);
- bool bReads = fil->readMap(cmdArgs);
-#ifdef DEBUG
- cerr << "filter setup & map is read" << endl;
-#endif
- if (!bReads){cerr<<"Failed to read Map.\n";exit(3);}
- //cerr<setcmdArgsFiles(cmdArgs);
-
- clock_t tStart = clock();
- //main function
-
- separateByFile(fil,cmdArgs);
- //cerr << "\nXXXX\n\n";
-// delete fil;
-
- fprintf(stderr,"Time taken: %.2fs\n", (double)(clock() - tStart) / CLOCKS_PER_SEC);
-
- return 0;
-}
-
-
-
diff --git a/configs/sdm_src/Demultipl.o b/configs/sdm_src/Demultipl.o
deleted file mode 100644
index 6d09da8..0000000
Binary files a/configs/sdm_src/Demultipl.o and /dev/null differ
diff --git a/configs/sdm_src/IO.cpp b/configs/sdm_src/IO.cpp
deleted file mode 100644
index 74c6965..0000000
--- a/configs/sdm_src/IO.cpp
+++ /dev/null
@@ -1,1282 +0,0 @@
-/* sdm: simple demultiplexer
-Copyright (C) 2013 Falk Hildebrand
-email: Falk.Hildebrand@gmail.com
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program. If not, see .
-*/
-
-
-#include "IO.h"
-//#include
-void threadAnalyzeDNA(shared_ptr tdn, shared_ptr MD,int thrCnt){
- //if (threadActive){
- // threads[thrCnt].join();
- //}
- //threads[thrdsCnt] =
-
- //auto f1 = std::async(&MultiDNA::analyzeDNA,this,tdn);
- int tagIdx(-2);
- MD->analyzeDNA(tdn,thrCnt,-1,tagIdx);
- MD->saveForWrite(tdn);
-}
-/*void trippleThreadAnalyzeDNA(shared_ptr MD, shared_ptr tdn,shared_ptr tdn2,
- shared_ptr MIDseq,bool changePHead){//,int thrCnt){
-
-
- int thrCnt = 0;
- vector chs = MD->analyzeDNA(tdn,tdn2,MIDseq,changePHead,thrCnt);
-
- if (chs[0] && chs[1]){
- MD->saveForWrite(tdn,1);
- MD->saveForWrite(tdn2,2);
- } else if (chs[0]){
- MD->saveForWrite(tdn,3);
- MD->getFilters(thrCnt)->colStats[0].singleton++;
- } else if (chs[1]){
- MD->saveForWrite(tdn2,4);
- MD->getFilters(thrCnt)->colStats[1].singleton++;
- }
-
-}*/
-
-void read_single(OptContainer& cmdArgs, shared_ptr MD, shared_ptr IS){
- //output files
-#ifdef _THREADED
- int Nthrds = atoi(cmdArgs["-threads"].c_str()) -2 ;
- int thrCnt = 0;
- MD->setSubfilters(Nthrds);
- bool threadActive(false);
- bool writeThread(false);
- vector threads( 0 );
- if (Nthrds>=0){
- MD->createWriteThread();writeThread=true;
- threads.resize(Nthrds);
- }
-#endif
- shared_ptr curFil = MD->getFilters();
- bool cont(true); bool sync(false);
- while (cont){
- shared_ptr tdn1 = IS->getDNA(cont,0,sync);
- if (tdn1 == NULL) {
-#ifdef DEBUG
- cerr << "NULL read returned" << endl;
-#endif
- break;
- }
-
- /*if (!tdn1->isPassed()){
- MD->addNoHeadDNA(tdn1);
- tdn1 = tdn2; tdn2 = new DNA("","");
- continue;
- }*/
-#ifdef _THREADED
- if (Nthrds>0){
- if (threadActive){
- threads[thrCnt].join();
- }
-
- //threadAnalyzeDNA(MD,tdn);
- threads[thrCnt] = std::thread(threadAnalyzeDNA,tdn,MD,thrCnt);
- thrCnt++;
- if (thrCnt >= Nthrds){
- thrCnt=0;
- threadActive=true;
- }
- } else { //single Core
- MD->analyzeDNA(tdn);
- MD->saveForWrite(tdn);
- }
-#else
- curFil->sTotalPlus(0);
- int tagIdx(-2);
- MD->analyzeDNA(tdn1,-1,-1,tagIdx);
- //here BC has to be correctly set within DNA object
- if (tagIdx == -1 ) {
- tdn1->setBarcodeDetected(false);
- }
- MD->depPrep(tdn1,NULL);
- curFil->write2Demulti(tdn1, 0,MD->getfastQoutVer());
-
- if (!MD->saveForWrite(tdn1)) {
- cont = false;
- break;
- }
-
-#endif
- //if (tdn!=NULL && ch1 != tdn->isPassed()){cerr<<"isPassed is != ch1! Aborting..\n";exit(12);}
- }
-#ifdef _THREADED
- if (threadActive){
- for (uint i=0; icloseOutStreams();
-}
-
-
-//is called from a while loop, that reads the DNA pairs
-bool read_paired_DNAready(shared_ptr tdn, shared_ptr tdn2, shared_ptr MIDseq, bool MIDuse, shared_ptr MD, int& revConstellation) {
-
- if (tdn == NULL) { return true; } //|| tdn->length()==0
-
- shared_ptr curFil = MD->getFilters();
-
- //register read at all with stat counter:
- curFil->sTotalPlus(0); curFil->sTotalPlus(1);
-
- //prep some variables
- int BCoffs = curFil->getBCoffset();
- bool checkBC2ndRd = curFil->checkBC2ndRd();
- bool dualBCs = curFil->doubleBarcodes();
- bool doBCsAtAll = curFil->doBarcodes();
- bool checkReversedRead = curFil->checkRevRd();
-
-
- int tagIdx(-2); int tagIdx2(-2);
- string presentBC(""); int c_err(0);
- bool isReversed(false);//was a reversion detected?
-
- if (MIDuse && MIDseq!= 0) {
- tagIdx = curFil->cutTag(MIDseq, presentBC, c_err, true);
-// delete MIDseq;
- tdn->setBCnumber(tagIdx, BCoffs);
- }
- if (checkBC2ndRd ) {
- if (!dualBCs) {
- bool revT = false;
- bool Pr1 = curFil->findPrimer(tdn, 0, false, 0);
- bool Pr2 = curFil->findPrimer(tdn2, 0, false, 0);
- tagIdx = curFil->findTag(tdn, presentBC, c_err, true);
- tagIdx2 = curFil->findTag(tdn2, presentBC, c_err, true);
- if ( true &&checkReversedRead && (tagIdx2 < 0 && tagIdx < 0) ) {
- tdn->reverse_transcribe(); tdn2->reverse_transcribe();
- Pr1 = curFil->findPrimer(tdn, 0, false, 0);
- Pr2 = curFil->findPrimer(tdn2, 0, false, 0);
- tagIdx = curFil->findTag(tdn, presentBC, c_err, true);
- tagIdx2 = curFil->findTag(tdn2, presentBC, c_err, true);
- revT = true;
- }
- if ((tagIdx2 >= 0 && tagIdx < 0 && !Pr1) || (Pr2 && !Pr1)) { //swap first & second read
- swap(tdn, tdn2);
- revConstellation++;
- }
- /*else if (tagIdx2 < 0 && tagIdx < 0) {
- int x = 0;
- }*/
- if (revT) {
- tdn->reverse_transcribe(); tdn2->reverse_transcribe();
- }
- }
- tagIdx2 = -2; tagIdx = -2;
- tdn2->setpairREV(); tdn->setpairFWD();
- }
-
- //tdn->reverse_transcribe();
- MD->analyzeDNA(tdn, -1, 0, tagIdx);
- //tdn->matchSeqRev
- bool ch1(false); if (tdn != NULL) { ch1 = tdn->isPassed(); }
- bool ch2(false); bool ch2n(false);
-
- //this is all about barcodes..
- if (checkReversedRead && tdn != NULL && tagIdx < 0) {
- if (!MIDuse) { tagIdx = -2; }
-// curFil->sTotalMinus(0);
- tdn->reverse_transcribe();
- MD->analyzeDNA(tdn, -1, 0, tagIdx);
- ch1 = tdn->isPassed();
- isReversed = ch1;
- if (!isReversed) {//reset
- tdn->reverse_transcribe();
- }
- }
-
- //test for reverse complemented reads (mohammad samples), when BC not found (NOT dual BC)
- //in that case, this is the first read
- if (false &&checkBC2ndRd && tagIdx < 0 && tdn2 != NULL) {// && !tdn->getBarcodeDetected() ) {
- //tdn2->reverse_transcribe();
- if (!MIDuse) { tagIdx = -2; }
-// curFil->sTotalMinus(0);
- MD->analyzeDNA(tdn2, -1, 0, tagIdx);
- ch2n = tdn2->isPassed();
- if (!ch2n && checkReversedRead) {
- if (!MIDuse) { tagIdx = -2; }
-// curFil->sTotalMinus(0);
- tdn2->reverse_transcribe();
- MD->analyzeDNA(tdn2, -1, 0, tagIdx);
- ch2n = tdn2->isPassed();
- isReversed = ch2n;
- if (!ch2n) { tdn2->reverse_transcribe(); }//reset to ori
- }
- if (ch2n) {//passed ch2 through BC filter, now really reverse
- //1st, now 2nd pair
- tdn2->setpairFWD();
- ch1 = ch2n;
- if (tdn != NULL) {
- //tdn->reverse_transcribe();
- tdn->setpairREV();
- tdn->reset();
- if (!dualBCs) { tagIdx2 = tdn->getBCnumber(); } // no 2nd BC, thus no BC search in 2nd read
- MD->analyzeDNA(tdn, -1, 1, tagIdx2);
- ch2 = tdn->isPassed();
- }
- swap(tdn, tdn2);
- revConstellation++;
- }
-
- }
-
-
-
- //if ( ch1 ) { cerr << cnt << " \n"; }
- //normal case for check 2nd read
- if (!ch2 && tdn2 != NULL) { //ch1&&
- //tdn2->setBCnumber(tdn->getBCnumber());
- if (doBCsAtAll && !dualBCs) { //only check in read1 for BC, if not dual BCing!!
- tagIdx2 = tdn->getBCnumber(); // no 2nd BC, thus no BC search in 2nd read
- if (tagIdx2 >= 0) {
- tagIdx2 -= BCoffs;
- }else if (tagIdx2 < -1) {//something wrong with BCoffs
- cerr << "tagidx2 wrongly truncated to " << tagIdx2 << endl;
- }
- }
- if (isReversed) { tdn2->reverse_transcribe(); }
- MD->analyzeDNA(tdn2, -1, 1, tagIdx2);
- ch2 = tdn2->isPassed();
- }
-
- //set up BC in DNA header
- //remember that dual BCs are only valid after this step!
- if (dualBCs) {
- //tagIdx2 = -2; //reset just to be sure
- curFil->dblBCeval(tagIdx, tagIdx2, presentBC, tdn, tdn2);
- c_err = -1;
-
- //check a second time that barcode was correctly identified, just to be double sure...
- if (tagIdx != tagIdx2 || tdn->getBCnumber() != tdn2->getBCnumber()) {
- cerr << "Unequal BC numbers:" << tagIdx << " : " << tagIdx2 << "; in object: " << tdn->getBCnumber() << " : " << tdn2->getBCnumber() << endl;
- cerr << "In read:" << tdn->getID() << endl;
- exit(835);
- }
- }
- else if (tagIdx >= 0) {
- if (MIDuse&&ch1) { curFil->BCintoHead(tagIdx, tdn, presentBC, c_err, true); }
- else { curFil->setBCdna(tagIdx, tdn); }
- if (ch2) { curFil->BCintoHead(tagIdx, tdn2, presentBC, c_err, true); }
- }
-
- if (tagIdx == -1 || tagIdx2 == -1) {
- if (ch1) {
- tdn->setBarcodeDetected(false);
- }
- if (ch2) {
- tdn2->setBarcodeDetected(false);
- }
- }
-
- //demultiplex write? do this first before DNA is deleted..
- if (curFil->Demulti2Fls()) {
- curFil->write2Demulti(tdn, 0, MD->getfastQoutVer());
- curFil->write2Demulti(tdn2, 1, MD->getfastQoutVer());
- }
-
-
- //at this point the tagIDX *MUST* be correctly set + BCoffset (in the DNA object, tagIDX doesn;t matter)
- MD->depPrep(tdn, tdn2);
- MD->writeNonBCReads(tdn, tdn2);
-
- int idx1 = 1; int idx2 = 2;
- if (ch1 && !ch2) {
- idx1 = 3; idx2 = 4;
- if (tdn2 != NULL) { tdn2->failed(); }
-// delete tdn2;
- }
- else if (ch2 && !ch1) {
- idx2 = 4; idx1 = 3;
- if (tdn != NULL) { tdn->failed(); }
-// delete tdn;
- }
- else if (!ch1 && !ch2){ //nothing passes
- if (tdn != NULL) { tdn->failed(); }
- if (tdn2 != NULL) { tdn2->failed(); }
-// delete tdn; delete tdn2;
- }
-
- //save for later .. and collect stats
- if (!MD->saveForWrite(tdn, idx1) ||
- !MD->saveForWrite(tdn2, idx2)) {
- return false;
- }
- return true;
-}
-
-bool read_paired(OptContainer& cmdArgs, shared_ptr MD, shared_ptr IS, bool MIDuse) {
- DNAmap oldMIDs;
- bool fqHeadVer(true);
- shared_ptr MIDseq(NULL);
- bool syncedMID(MD->getFilters()->synRdPairs());
-
- bool sync2pair(true);
- DNAmap pair1rem,pair2rem,MIDrem;
-
- /*if (sync2pair && MIDuse) {
- cout << "Can not sync read pairs, while explicit MID sequences are being used! (not supported, sorry)\n";
- sync2pair = false;
- } */
-
- //bool syncedMID = false;
-#ifdef DEBUG
- cerr << "Read paired routine" << endl;
-#endif
-#ifdef _THREADED
- vector threads( Nthrds );
- int Nthrds = atoi(cmdArgs["-threads"].c_str()) -1 ;
- int thrCnt = 0;
- bool threadActive(false);
- MD->setSubfilters(Nthrds);
- int DNAinMem(0);
-#endif
- bool cont(true),cont2(true),cont3(true);
- int revConstellation(0);
- int cnt(0);
- string tdnSh(""), tdnSh2("");
- bool switching(true); // important to keep track of this, to fix swapped read pairs
-
- while ( cont ) {
-
-
- bool settdnSh(false);
- shared_ptr tdn = IS->getDNA(cont, 0, sync2pair);
-
- cnt++;
- if ( tdn == NULL && !cont) { break; }
- //tagIdx = -2; tagIdx2 = -2;
- shared_ptr tdn2 = IS->getDNA(cont2, 1, sync2pair);//read_fastq_entry(fna2,fastQver,minQScore,lnCnt);
- if (!sync2pair && !cont2 && cont ) {
- cerr << "Second provided file has not the same number of entries as first file.\n";
- exit(5);
- }
- //syn 2nd pair
- if (sync2pair) {
- if (!settdnSh) { tdnSh = tdn->getIDshort(); settdnSh = true; }
- if (tdn2 != NULL) { tdnSh2 = tdn2->getIDshort(); }else { tdnSh2 = ""; }
- DNAmap::iterator search;
- if (tdnSh2 != tdnSh) {//something wrong at all? ini search on old pair2
- if (switching) {
- search = pair2rem.find(tdnSh);
- if (search != pair2rem.end()) { pair2rem[tdnSh2] = tdn2; tdn2 = search->second; pair2rem.erase(search); tdnSh2 = tdn2->getIDshort(); switching = !switching;}
- } else {
- search = pair1rem.find(tdnSh2);
- if (search != pair1rem.end()) { pair1rem[tdnSh] = tdn; tdn = search->second; pair1rem.erase(search); tdnSh = tdn->getIDshort(); switching = !switching;}
- }
- }
- while (tdnSh2 != tdnSh) {// still wrong? search in old unmatched reads, switching between 1/2
- if (switching) {
- search = pair1rem.find(tdnSh2); pair1rem[tdnSh] = tdn;
- if (search != pair1rem.end()) {
- tdn = search->second; pair1rem.erase(search);
- }else {//nothing? try getting new reads, maybe there is a match here
- tdn = IS->getDNA(cont, 0, sync2pair);
- }
- if (tdn != NULL) { tdnSh = tdn->getIDshort(); } else {tdnSh = "";}
- } else {
- search = pair2rem.find(tdnSh); pair2rem[tdnSh2] = tdn2;
- if (search != pair2rem.end()) { tdn2 = search->second; pair2rem.erase(search);
- } else {
- tdn2 = IS->getDNA(cont2, 1, sync2pair);
- }
- if (tdn2 != NULL) { tdnSh2 = tdn2->getIDshort(); } else { tdnSh2 = ""; }
- }
-
- //read 1 / read 2
- switching = !switching;
- }
- }
- if (cnt % 50 == 0) { switching = !switching; } //add some randomness to the process.
- //security check that pairs are in sync
- if (!sync2pair && cnt % 1000 == 0) {//default check for synced reads, no matter what
- if (!settdnSh) { tdnSh = tdn->getIDshort(); settdnSh = true; }
- if (!tdn2->sameHead(tdnSh) ) { cerr << "WARNING: read pairs out of sync (" << cnt << "): " << tdnSh << " " << tdnSh2 << endl; }
- }
- //sync mid sequence header
- if (MIDuse) {
- if (!settdnSh) {tdnSh = tdn->getIDshort(); settdnSh = true; }
- //1st try to find in old heap
- if ( !syncedMID && (cont || cont2) ) {
- auto search = oldMIDs.find(tdnSh);
- if ( search != oldMIDs.end() ) {
- //it's in old.. give it to MIDseq, other rountines will del it
- MIDseq = (*search).second; oldMIDs.erase(search);
- } else {
- //read some new lines, maybe here?
- MIDseq = IS->getDNA(cont3, 2, sync2pair);
- bool SameMIDHead(MIDseq->sameHead(tdnSh));
- while ( !SameMIDHead && cont3 ) {
- //current MID not matching.. stove away
- oldMIDs[MIDseq->getIDshort()] = MIDseq; MIDseq = IS->getDNA(cont3, 2, sync2pair);
- SameMIDHead = MIDseq->sameHead(tdnSh);
- }
- }
- } else {MIDseq = IS->getDNA(cont3, 2, sync2pair);}
- MIDseq->setMIDseq(true);
- //FQ header version changes have to occur, before the MID tag is labelled
-
- }
- //check if the PE format is right
- if ( fqHeadVer ) { MD->checkFastqHeadVersion(tdn); fqHeadVer = false; }
-
- cont = read_paired_DNAready(tdn,tdn2, MIDseq, MIDuse, MD, revConstellation);
- }
-
- //check on remainders in pair2rem / pair1rem
- if (sync2pair) {
- if (pair1rem.size() > 0 && pair2rem.size() > 0) {
- cerr << "Trying to match " << pair1rem.size() << " / " << pair2rem.size() << " out-of-sync read pairs..\n";
- DNAmap::iterator search;
- for (DNAmap::iterator sr = pair1rem.begin(); sr != pair1rem.end(); sr++) {
- search = pair2rem.find(sr->first);
- if (search != pair2rem.end()) {
- cont = read_paired_DNAready(sr->second, search->second, MIDseq, MIDuse, MD, revConstellation);
- pair2rem[search->first] = NULL; pair1rem[sr->first] = NULL;
- pair2rem.erase(search); pair1rem.erase( sr );
- }
- }
- cerr << "Writing remaining " << pair1rem.size() << " / " << pair2rem.size() << " out-of-sync reads as singletons.\n";
- } else if (pair1rem.size() > 0 || pair2rem.size() > 0) {
- cerr << "Found " << pair1rem.size() << " / " << pair2rem.size() << " out-of-sync read pairs.\n";
- }
- for (DNAmap::iterator sr = pair1rem.begin(); sr != pair1rem.end(); sr++) {
- cont = read_paired_DNAready(sr->second, NULL, MIDseq, MIDuse, MD, revConstellation);
- pair1rem[sr->first] = NULL;//object needs to remain in mem
- }
- for (DNAmap::iterator sr = pair2rem.begin(); sr != pair2rem.end(); sr++) {
- cont = read_paired_DNAready(NULL, sr->second, MIDseq, MIDuse, MD, revConstellation);
- pair2rem[sr->first] = NULL;
-// pair2rem.erase(sr);
- }
-
- }
- //close shop
- MD->revConstellationCnts(revConstellation);
- MD->closeOutStreams();
- return true;
-}
-
-bool readCmdArgs(int argc, char* argv[],OptContainer& cmdArgs){
- if (argc%2!=1){
- cerr<<"It seems command line arguments were not passed in pairs. Aborting.\n";
- exit(666);
- }
- for (int i=1; i\n";
- exit(2);
- }
- if (cmdArgs.find("-i_qual") == cmdArgs.end()){
- string newQ = cmdArgs["-i_fna"];
- int pos = (int)newQ.find_last_of(".");
- newQ = newQ.substr(0,pos);
- newQ += string(".qual");
- fstream fin;
- fin.open(newQ.c_str(),ios::in);
- if( fin.is_open() ) {
- cerr<<"Using quality file: "<\n";
- newQ = "";
- //fin.close(); exit(2);
- }
- fin.close();
- cmdArgs["-i_qual"] = newQ;
- }
- }
- //auto create output file name
- if (cmdArgs.find("-o_fna") == cmdArgs.end()){
- if (cmdArgs.find("-o_fastq") == cmdArgs.end()){
- //cmdArgs["-o_fna"] = cmdArgs["-i_fna"]+string(".sdm");
- //cerr<<"Writing output fasta into "<\n";
- exit(2);
- } */
- if (cmdArgs.find("-o_qual") == cmdArgs.end()){
- cmdArgs["-o_qual"] = "";
- } else {
- if (cmdArgs.find("-o_fastq") != cmdArgs.end()){
- cerr<<"\"-o_qual\" was over-writen by \"-o_fastq\"\n";
- cmdArgs["-o_qual"] = "";
- }
- }
- if (cmdArgs.find("-options") == cmdArgs.end()){
- cmdArgs["-options"] = string("sdm_options.txt");
- }
- if (cmdArgs.find("-threads") == cmdArgs.end()){
- cmdArgs["-threads"] = "1";
- }
- if (cmdArgs.find("-log") == cmdArgs.end()){
- string ofile1 = cmdArgs["-o_fna"];
- if (ofile1==""){ofile1 = cmdArgs["-o_fastq"];}
- vector tvec = splitByComma(ofile1,false);
- ofile1 = tvec[0];
- //remove file ending
- unsigned int pos = (unsigned int) ofile1.find_last_of(".");
- if (pos != string::npos){ofile1 = ofile1.substr(0,pos); }
- if (tvec.size()==2){
- ofile1+= "_" + getFileNoPath(tvec[1]);
- pos = (unsigned int) ofile1.find_last_of(".");
- if (pos != string::npos){ofile1 = ofile1.substr(0,pos); }
- }
- cmdArgs["-log"] = ofile1 + string(".log");
- }
- string ofile1 = cmdArgs["-log"];
- ofile1.find_last_of(".log");
- size_t logPos = ofile1.find_last_of(".");
- if (logPos != std::string::npos){
- ofile1 = ofile1.substr(0,logPos);
- }
- if (cmdArgs.find("-length_hist") == cmdArgs.end()){
- cmdArgs["-length_hist"] = ofile1 + string("_lenHist.txt");
- }
- if (cmdArgs.find("-qual_hist") == cmdArgs.end()){
- cmdArgs["-qual_hist"] = ofile1 + string("_qualHist.txt");
- }
- //-length_hist -qual_hist
-
- if (cmdArgs.find("-sample_sep") == cmdArgs.end()){
- cmdArgs["-sample_sep"] = DEFAULT_BarcodeNameSep;
- } else if (cmdArgs["-sample_sep"]==""){
- cerr<<"Invalid sample separator (empty).\nAborting..\n";exit(82);
- }
-
-
- if (cmdArgs.find("-o_qual_offset") == cmdArgs.end()) {
- cmdArgs["-o_qual_offset"] = DEFAULT_output_qual_offset;
- }
-
- if (cmdArgs.find("-ignore_IO_errors") == cmdArgs.end()) {
- cmdArgs["-ignore_IO_errors"] = DEFAULT_ignore_IO_errors;
- } else if (cmdArgs["-ignore_IO_errors"] != "0" && cmdArgs["-ignore_IO_errors"] != "1") {
- cerr << "Argument \"ignore_IO_errors\" can only be \"1\" or \"0\". Instead it has value: " << cmdArgs["-ignore_IO_errors"] << endl;
- exit(323);
- }
- if (cmdArgs.find("-o_dereplicate") == cmdArgs.end()) {
- cmdArgs["-o_dereplicate"] = "";
- }
- if (cmdArgs.find("-derep_map") == cmdArgs.end()) {
- cmdArgs["-derep_map"] = "";
- }
-
-
-
- //if (cmdArgs.count("-i_fna")==0){}
-
- return true;
-}
-
-
-/*******************************************
-* read_fasta *
-*******************************************
-
-void openOutFiles(string files, string fmt, string xtr){
- ofstream fnaOut;
-
- vector tfnaout(0);
- if (files.find(",") != string::npos){
- tfnaout = splitByCommas(files);
- } else {
- tfnaout.push_back(files);
- }
- bool multiple = tfnaout.size() > 1;
- string xtr2 = "";
- if (multiple){xtr2 = "paired ";}
- for (uint i =0; i< tfnaout.size(); i++){
- fnaOut.open ( tfnaout[i].c_str(),ios_base::out);
- if (!fnaOut){ cerr<<"Could not open "< tfnaout = splitByComma(cmdArgs["-o_fna"],false);
- for (unsigned int i=0; i tqout = splitByComma(cmdArgs["-o_qual"],false);
- for (unsigned int i=0; i mainFil,OptContainer& cmdArgs){
-#ifdef DEBUG
- cerr << "separateByFile"< FastaF = mainFil->getFastaFiles();
- vector QualF = mainFil->getQualFiles();
- vector FastqF = mainFil->getFastqFiles();
- vector MIDfq = mainFil->getMIDfqFiles();
- vector tar;
- vector < vector > idx(0);
- bool bFASTQ = true;
- //prepareOutFiles(cmdArgs);
- string path="";
- if (cmdArgs.find("-i_path") != cmdArgs.end() && cmdArgs["-i_path"].length() > 2){
- path=cmdArgs["-i_path"] + string("/");
- }
-
-
- if (FastaF.size()>0){ //fasta way
- tar = FastaF;
- bFASTQ = false;
- } else { // fastq way
- tar = FastqF;
- if (FastqF.size()==0){
- cerr<<"No FastQ or Fasta file given.\n Aborting..\n";
- exit(12);
- }
- }
-
- vector uniqueFas(1,tar[0]);
- idx.push_back(vector (1,0));
-
- for (unsigned int i=1; i (1,i));
- }
- }
-
- //unique Fas files set up.. check for their existence
- shared_ptr testFiles =
- make_shared(!bFASTQ, mainFil->getuserReqFastqVer(), "1");
- for (unsigned int i = 0; i < uniqueFas.size(); i++) {
- int tarID = idx[i][0]; string tmp;
- string x = testFiles->setupInput(path, i, tarID, uniqueFas, FastqF, FastaF, QualF, MIDfq, mainFil->isPaired(), cmdArgs["-onlyPair"], tmp, true);
- }
-// delete testFiles;
-
- string mainFile = "", outFile = cmdArgs["-o_fna"];
-
- //prepare for Seed extension or Read subselection, if requested
- UClinks *ucl = NULL; shared_ptr RDSset;
- shared_ptr Dere ;
- if (mainFil->doOptimalClusterSeq()){
- ucl = new UClinks(cmdArgs);
- if (cmdArgs.find("-mergedPairs") != cmdArgs.end() && cmdArgs["-mergedPairs"] == "1"){
- ucl->pairedSeqsMerged(mainFil);
- }
- else {
- mainFil->setFloatingEWin(10, 25);
- }
- //are fallback fasta sequences available?
- if (cmdArgs["-OTU_fallback"] != ""){
- shared_ptr FALL = make_shared(true, mainFil->getuserReqFastqVer(), cmdArgs["-ignore_IO_errors"]);
- FALL->setupFna(cmdArgs["-OTU_fallback"]);
- ucl->setupDefSeeds(FALL,mainFil);
- }
- }
- else if (mainFil->doSubselReads()){
- //this will select a list of reads and distribute these into multiple files
- RDSset = make_shared(cmdArgs["-specificReads"],"");
- } else if (mainFil->doDereplicate()) {
- Dere = make_shared(cmdArgs);
- }
- //needs to attach to existing file sometimes
- std::ios_base::openmode writeStatus = ios_base::out;
- bool shortStats = false;
- string shrtLog = "";
-
-
- // main loop that goes over different files
- int maxRds = mainFil->getXreads();
- int totReadsRead(0);
- for (unsigned int i=0; i0 && maxRds - totReadsRead <= 0) { break; }
- shared_ptr fil = make_shared(mainFil, idx[i][0]);
- unsigned int tarSi = (unsigned int) idx[i].size();
- fil->allResize(tarSi);
- int tarID=-1;
- bool BC2mode = mainFil->doubleBarcodes();
- //int readsRead(0);
-
-
- for (unsigned int j=0; jPrimerIdx[tarID]>-1) {
- fil->addPrimerL(mainFil->PrimerL[mainFil->PrimerIdx[tarID]], j);
- }
- if (mainFil->doReversePrimers() && mainFil->PrimerIdxRev[tarID]>-1) {
- fil->addPrimerR(mainFil->PrimerR[mainFil->PrimerIdxRev[tarID]], j);
- }
- fil->Barcode[j] = mainFil->Barcode[tarID];
- if ( BC2mode ) {
- fil->Barcode2[j] = mainFil->Barcode2[tarID];
- }
- fil->SampleID[j] = mainFil->SampleID[tarID];
- fil->SampleID_Combi[j] = mainFil->SampleID_Combi[tarID];
- fil->HeadSmplID[j] = mainFil->HeadSmplID[tarID];
- if (fil->Demulti2Fls()) {
- fil->demultiSinglFiles[j] = mainFil->demultiSinglFiles[tarID];
- fil->demultiSinglFilesF[j] = mainFil->demultiSinglFilesF[tarID];
-
- //closing of ofstreams is only handled on the main object
- //mainFil->demultiSinglFiles[tarID] = vector (2,NULL);
- }
- }
- fil->checkDoubleBarcode();
-
- if (tarID==-1){cerr<<"tar == -1. abort.\n";exit(10);}
-
- //initialize object to handle all input file combinations
-
- shared_ptr IS = make_shared(!bFASTQ, mainFil->getuserReqFastqVer(), cmdArgs["-ignore_IO_errors"]);
- if (tarSi < 2 && uniqueFas.size() > 1) {
- IS->atFileYofX(i + 1, (uint)uniqueFas.size(), tarSi);
- }
- string mainFileShort = "";
- mainFile = IS->setupInput(path, i, tarID, uniqueFas, FastqF, FastaF, QualF, MIDfq, fil->isPaired(), cmdArgs["-onlyPair"], mainFileShort, false);
- if (!IS->qualityPresent()) {
- fil->deactivateQualFilter();
- cerr << "\n*********\nWarning:: Quality file is not present.\nRecommended to abort demultiplexing.\n*********\n\n";
- }
- fil->BarcodePreStats();
- fil->checkDoubleBarcode();
- fil->checkDoubleSampleIDHead();
-
-
- if (mainFil->doOptimalClusterSeq()){
- ucl->findSeq2UCinstruction(IS,bFASTQ,mainFil);
- continue;
- }
-
-
-#ifdef DEBUG
- cerr << "Setting up output" << endl;
-#endif
- //MultiDNA MD = MultiDNA(&fil, cmdArgs, writeStatus, RDSset);
- shared_ptr MD = make_shared(fil, cmdArgs, writeStatus, RDSset);
- fil->setMultiDNA(MD);
- if (maxRds > 0) { MD->setReadLimit(maxRds - totReadsRead); }
- writeStatus = ofstream::app;
- //prepare for BC checking (rev/fwd)
- if (fil->doDemultiplex()){
- MD->setBCfixed(false, true);
- if (MD->isPEseq() == 2) { MD->setBCfixed(false, false); }
- }
- if (cmdArgs.find("-oneLineFastaFormat") != cmdArgs.end() && cmdArgs["-oneLineFastaFormat"] == "1") {
- MD->setOneLinerFastaFmt(true);
- }
- //cout << Dere->Nms_size() << " DEBCs\n";
- MD->attachDereplicator(Dere);
- //only pull out a subset of sequences
- if (mainFil->doSubselReads()) {
- if (cmdArgs.find("-mocatFix") != cmdArgs.end()) {
- cerr << "MOCAT fix appplies\n";
- RDSset->findMatches(IS, MD, true);
- }else{
- RDSset->findMatches(IS, MD, false);
- }
- //delete MD;
- continue;
- }
-
-
-#ifdef DEBUG
- cerr << "Processing reads" << endl;
-#endif
-
-
-
-
-
- //**********************
- //heavy reading routine
- //**********************
- if (MD->isPEseq() == 2){
- //read_paired(cmdArgs,MD,IS);
- while ( !read_paired(cmdArgs, MD, IS, IS->hasMIDseqs()) ) {
- //reset output files to previous state
- MD->resetOutFilesAndFilter();
- }
- } else {
- read_single(cmdArgs,MD,IS);
- }
-
-
-
-
-#ifdef DEBUG
- cerr << "All read processed" << endl;
-#endif
- outFile = MD->leadOutFile();
-// delete MD;
-#ifdef DEBUG
- cerr << "MD deleted" << endl;
-#endif
-
- //stats
- fil->prepStats();
- if (IS->getCurFileN() == 0) {
- fil->printStats(cerr, mainFile, outFile, true);
- } else {
- cerr<shortStats(""); shortStats = true;
- }
-
- totReadsRead += fil->totalAccepts();
-
- shrtLog += fil->shortStats( mainFileShort);
-// delete IS;
- //write log file
- if (uniqueFas.size() > 1){//only print sub log if neccessary
- ofstream log;
- string logF = cmdArgs["-log"] + string("0") + itos(i);
- log.open (logF.c_str() ,ios_base::out);
- fil->printStats(log,mainFile,outFile,true);
- log.close();
- }
- mainFil->addStats(fil,idx[i]);
-#ifdef DEBUG
- cerr << "Delete tmp filter" << endl;
-#endif
- //and cleanup
- //
- //fil;
- }
-#ifdef DEBUG
- cerr << "Prep final logging" << endl;
-#endif
-//write log files
- if (uniqueFas.size() > 1){
- mainFile = "several";
- }
-
- ofstream log; string deLog("");
- string logF = cmdArgs["-log"], logFA = cmdArgs["-log"].substr(0, cmdArgs["-log"].length()-3) + "add.log";
-
- //different logfile for SEED extension
- if (mainFil->doOptimalClusterSeq()){
- //finish up dereplication file (creating pseudo seeds with counts)
- ucl->finishMAPfile();
- if (cmdArgs["-ucAdditionalCounts"] != ""){
- ucl->set2UC();
- ucl->finishUCfile(mainFil, cmdArgs["-ucAdditionalCounts"], true);//with smplHead (.mid)
- ucl->finishUCfile(mainFil, cmdArgs["-ucAdditionalCounts1"], false);//without smplHead (.rest)
- }
- if (cmdArgs["-ucAdditionalCounts_refclust"] != ""){
- //reference based clustering has some high qual seqs (no replacement with reads..)
- shared_ptr FALL = make_shared(true, mainFil->getuserReqFastqVer());
- //this reads in the SLV fna's & creates matrix entries for these
- FALL->setupFna(cmdArgs["-OTU_fallback_refclust"]);
- ucl->setRefMode();
- ucl->addDefSeeds(FALL, mainFil);
- ucl->set2UC();
- //mapping from ref OTU clustering
- ucl->finishUCfile(mainFil, cmdArgs["-optimalRead2Cluster_ref"], false);
- //mid / rest mappings
- ucl->finishUCfile(mainFil, cmdArgs["-ucAdditionalCounts_refclust"], true);//with smplHead (.rest)
- ucl->finishUCfile(mainFil, cmdArgs["-ucAdditionalCounts_refclust1"], false);//without smplHead (.rest)
-
- }
- if (cmdArgs["-log"] != "nolog") {
- log.open(logF.c_str(), ios_base::out);
- ucl->printStats(cerr);
- ucl->printStats(log);
- log.close();
- }
-
- //everything done on DNA? Then write & delete
- if (cmdArgs["-otu_matrix"] != "") {
- ucl->writeOTUmatrix(cmdArgs["-otu_matrix"], mainFil);
- }
- //needs to be written after OTU matrix (renaming scheme)
- shared_ptr MD = make_shared(mainFil, cmdArgs, ios::out, RDSset);
- mainFil->setMultiDNA(MD);
- ucl->writeNewSeeds(MD, mainFil,false);
- //delete MD;
- //new fastas also need to be written..
- MD.reset(new MultiDNA(mainFil, cmdArgs, ios::out, RDSset, ".ref", 1));//force fna output
- mainFil->setMultiDNA( MD );
- ucl->writeNewSeeds(MD, mainFil,true,true);
- //delete MD;
-
-
- return;
- } else if (mainFil->doDereplicate()) {
-#ifdef DEBUG
- cerr << "write Dereplicated DNA" << endl;
-#endif
- deLog = Dere->writeDereplDNA(mainFil);
-#ifdef DEBUG
- cerr << "done write Dere" << endl;
-#endif
- }
-#ifdef DEBUG
- cerr << "Logging almost finished" << endl;
-#endif
-
- if (cmdArgs["-log"] == "nolog") {
-// delete Dere;
- return;
- }
-#ifdef DEBUG
- cerr << "DereLog start" << endl;
-#endif
- if (mainFil->doDereplicate()) {
- string dereLog = logF.substr(0,logF.length()-3) + "dere";
- Dere->writeLog(dereLog, deLog);
-// delete Dere;
- }
-
-#ifdef DEBUG
- cerr << "DereLog end" << endl;
-#endif
- if (shortStats) {
- mainFil->printStats(std::cerr, mainFile, outFile, true);
- }
-#ifdef DEBUG
- cerr << "other logs start" << endl;
-#endif
- //per sample success rate
- string logPS = logF.substr(0, logF.length() - 3) + "acceptsPerSample.log";
- log.open(logPS.c_str(), ios_base::out);
- mainFil->SmplSpecStats(log);
- log.close();
- log.open(logF.c_str(), ios_base::out);
- mainFil->printStats(log,mainFile,outFile,true);
- log.close();
-
- string logFs = logF.substr(0, logF.length() - 3) + "acceptsPerFile.log";
- log.open(logFs.c_str(), ios_base::out);
- log << shrtLog;
- log.close();
-
- string logFGC = logF.substr(0, logF.length() - 3) + "GC.txt";
- log.open(logFGC.c_str(), ios_base::out);
- mainFil->printGC(log, mainFil->isPaired());
- log.close();
-#ifdef DEBUG
- cerr << "other logs end" << endl;
-#endif
-
-
-//for additional files
- if (mainFil->secondaryOutput()){
- log.open (logFA.c_str() ,ios_base::out);
- mainFil->printStats(log,mainFile,outFile,false);
- log.close();
- }
-
-
- //length histogram
- logF = cmdArgs["-length_hist"];
- log.open (logF.c_str() ,ios_base::out);
- mainFil->printHisto(log,0);
- log.close();
- //quality histogram
- logF = cmdArgs["-qual_hist"];
- log.open (logF.c_str() ,ios_base::out);
- mainFil->printHisto(log,1);
- log.close();
- mainFil->close_outFiles_demulti();
-#ifdef DEBUG
- cerr << "separateByFile finished" << endl;
-#endif
-
-}
-
-void rewriteNumbers(OptContainer& cmdArgs){
- //no renumbering asked for
- if (!(cmdArgs.find("-number") != cmdArgs.end() && cmdArgs["-number"]=="T")){
- return;
- }
- string prefix="";
- if (cmdArgs.find("-prefix") != cmdArgs.end()){
- prefix = cmdArgs["-prefix"];
- }
- //read fasta & write with new headers
- int cnt=0;
-
- string line;
- //ofstream qualOut,fnaOut;
- ifstream fna;
- ofstream ofna;
-
- // rerwite input fasta file
- string tname="",tseq="";
- fna.open(cmdArgs["-i_fna"].c_str(),ios::in);
- ofna.open(cmdArgs["-o_fna"].c_str(),ios::out);
- while (getline(fna,line,'\n')){
-
- if (line[0]=='$'){ //$ marks comment
- continue;
- }
- if(line[0] == '>'){ //fasta description
- if (cnt!=0){
- tname = ">"+prefix+itos(cnt)+"\n";
- ofna << tname << tseq;
- }
- cnt++;tseq="";
- continue;
- }
- tseq += line+"\n";
- }
- tname = ">"+prefix+itos(cnt)+"\n";
- ofna << tname << tseq;
- ofna.close(); fna.close();
-
- exit(0);
-}
-
-void Announce_sdm(){
- cerr << endl << "This is sdm (simple demultiplexer) " << sdm_version << " " << sdm_status << "." << endl << endl;
-}
-void help_head(){
- cout <<"------------------------------\nThis is sdm version "<\n------OR------\n -i \n------OR------\n -i_fastq \n------OR------\n -i_fna (required)\n -i_qual (required, unless quality file is \"xx1.qual\" and fasta is \"xx1.yy\")\n\n -map (optional)\n -o_fna (optional)\n -o_qual (optional)\n -o_fastq (optional)\n -log (optional). Set to \"nolog\" to deactivate alltogether.\n \n-sample_sep \"X\" string X is used to delimit samples and ID (optional, default:\"" << def_sep << "\")\n -paired 1/2/3 (input is paired end sequenced(2), assumes two input files delimited by \',\'. 1=singleton (default); 3=paired end (R1,R3) + one file with MID (R2))\n";
- cout << " -o_demultiplex [path] write input into single, demultiplexed files\n";
- cout << " -onlyPair [1/2] consider only read pair 1 or 2. Useful when streamlining inputs (LotuS) or considering double barcoding.\n -i_MID_fastq fastq file with only MID sequences; if paired reads are supplied with -i_fna/-i_fastq and the MID identifier via -i_MID_fastq, paired has to be set to 2. If e.g. merged reads are supplied + mids, paired has to be set to 1.\n";
- cout << " -oneLineFastaFormat [0/1] write Fasta and Quality file sequence string in one line, opposed to default 80 characters per line.\n -o_dereplicate of dereplicated DNA reads (with size in header)\n -dere_size_fmt [0/1] either (0) usearch format \"size=X;\" or (1) \"_X\"\n -min_derep_copies only print seq if at least X copies present. Can be complex terms like \"10:1,3:3\" -> meaning at least 10x in 1 sample or 3x in 3 different samples.\n";
- cout << " -SyncReadPairs [T/F] sdm can check, if read pairs occur in the same (correct) order in the input files, and correct this in case not (T).\n";
- cout << " -maxReadsPerOutput number of filtered reads in output files. If more reads, a new file is created. Only works with -o_fna\n -mergedPairs <1/0> 1: paired sequences were merged externally, important for assumption that read quality is detoriating.\n -OTU_fallback : Fallback fasta sequences for OTU's, only used in SEED extension mode\n";
- cout << " -i_qual_offset [0-64] fastq offset for quality values. Set this to \'0\' or \'auto\' if you are unsure which fastq version is being used (default: read from sdm option file)\n -o_qual_offset [0-64] set quality offset for fastq outfile. Default: 33\n";
- cout << " -ignore_IO_errors [0/1]: 1=Errors in fastq reads are ignored, with sdm trying to sync reads pairs after corrupted single reads (default: 0)\n";
- //-binomialFilterBothPairs [1/0]
- //-count_chimeras [T/F]
- // ucAdditionalCounts_refclust -OTU_fallback_refclust -optimalRead2Cluster_ref
- cout<<"\nMinimal Example:\n./sdm -i test.fna -map mapping.txt (assuming quality file is \"test.qual\")\n";
- // further options (undocumented) :
- //-length_hist -qual_hist
- //-suppressOutput[0/1]
- //
-}
-void printOptionHelp(){
- help_head();
- cout<<"The option file, specified via the \"-options\" argument, provides more specific control over filtering, barcode handling, and sequencing technologies, among others. A reference option file is printed below.\n\n";
- string helpOptionFile="";
- /*helpOptionFile += "minSeqLength - minimal accepted Sequence Length\nmaxSeqLength - maximal Length of Sequence\nminAvgQuality - minimal average Quality\nmaxAmbiguousNT - max number of Ambigous nt's in sequence\nQualWindowThreshhold - Q threshold where seq is rejected\nQualWindowWidth - average quality in this windows is used for QualWindowThreshhold\n";
- helpOptionFile += string("TrimWindowThreshhold - Q value below which sequence is 3' trimmed\nTrimWindowWidth - window size used for TrimWindowThreshhold\nmaxBarcodeErrs - max accepted barcode errors\nmaxPrimerErrs - max accepted Primer errors\nkeepBarcodeSeq - leave Barcode Seq on read? (0/1)\n");
- helpOptionFile += "keepPrimerSeq - keep Primer attached to seq? (0/1)\nmaxHomonucleotide - sequences with a homonucleotide run longer will be rejected\nmaxAccumulatedError - if P is surpassed, sequence is trimmed at that point\nTechnicalAdapter - sequence of the technical adapter (will be removed, if found 5')\nPEheaderPairFmt - ?\nTrimStartNTs - trim X nucleotides from the start of the sequence ";
- helpOptionFile += "fastqVersion - 1 = ";*/
-
- helpOptionFile += "#--- Example ---\n#copy into new file\n#sequence length refers to sequence length AFTER removal of Primers, Barcodes and trimming. this ensures that downstream analyis tools will have appropiate sequence information\nminSeqLength 250\nmaxSeqLength 1000\nminAvgQuality 25\n\n";
- helpOptionFile += "#Ambiguous bases in Sequence - uclust only supports 0 ambiguous nucleotides\nmaxAmbiguousNT 0\n\n#Homonucleotide Runs.. this should normally be filtered by sequencer software\nmaxHomonucleotide 8\n\n";
- helpOptionFile += "#Filter whole sequence if one window of quality scores is below average\nQualWindowWidth 50\nQualWindowThreshhold 25\n\n#Trim the end of a sequence if a window falls below quality threshhold. Useful for removing low qulaity trailing ends of sequence\n\nTrimWindowWidth 20\nTrimWindowThreshhold 25\n\n#Max number of accumulated P for a mismatch. After this length, the rest of the sequence will be deleted. Complimentary to TrimWindowThreshhold. (-1) deactivates this option.\nmaxAccumulatedError 1\n\n";
- helpOptionFile += "#Barcode Errors - currently this can only be 0; \nmaxBarcodeErrs 0\nmaxPrimerErrs 0\n\n#keep Barcode / Primer Sequence in the output fasta file - in a normal 16S analysis this should be deactivated (0) for Barcode and de-activated (0) for primer\nkeepBarcodeSeq 0\nkeepPrimerSeq 0\n\n";
- helpOptionFile += "#set fastqVersion to 1 if you use Sanger, Illumina 1.8+ or NCBI SRA files. Set fastqVersion to 2, if you use Illumina 1.3+ - 1.7+ or Solexa fastq files.\n\nfastqVersion 1\n\n#if one or more files have a technical adapter still included (e.g. TCAG 454) this can be removed by setting this option\n\nTechnicalAdapter TCAG\n\n#delete X NTs (e.g. if the first 5 bases are known to have strange biases)\n\nTrimStartNTs 0\n";
- helpOptionFile += "#truncate total Sequence length to X (length after Barcode, Adapter and Primer removals)\nTruncateSequenceLength 200\n";
- helpOptionFile += "#correct PE header format (1/2) this is to accomodate the illumina miSeq paired end annotations 2=\"@XXX 1:0:4\" instead of 1=\"@XXX/1\". Note that the format will be automatically detected\nPEheaderPairFmt 1\n\n#sets if sequences without match to reverse primer will be accepted (T=reject ; F=accept all); default=F\nRejectSeqWithoutRevPrim F\n";
- helpOptionFile += "#sets if sequences without a forward (LinkerPrimerSequence) primer will be accepted (T=reject ; F=accept all); default=T\nRejectSeqWithoutFwdPrim T\n\n";
- helpOptionFile += "#checks if the whole amplicon was reverse-transcribed sequenced (Default = F)\nCheckForReversedSeqs F\n\n";
- helpOptionFile += "#this option should be \"T\" if your amplicons are possibly shorter than a read in a paired end sequencing run (e.g. amplicon of 300 in 250x2 miSeq is \"T\")\nAmpliconShortPE T\n\n";
- //CheckForMixedPairs CheckForReversedSeqs
- cout<& cmdArgs);
diff --git a/configs/sdm_src/IO.h b/configs/sdm_src/IO.h
deleted file mode 100644
index 88d593f..0000000
--- a/configs/sdm_src/IO.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* sdm: simple demultiplexer
-Copyright (C) 2013 Falk Hildebrand
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program. If not, see .
-*/
-//most obvious input / output operations
-
-#ifndef _IO_h
-#define _IO_h
-
-
-#include "containers.h"
-
-
-typedef std::map> DNAmap;
-
-
-//void openOutFiles(string files, string fmt,string );
-//void prepareOutFiles(OptContainer& cmdArgs);
-//void read_fastq(OptContainer& cmdArgs, MultiDNA* MD,string fileS);
-bool read_paired(OptContainer& cmdArgs, shared_ptr MD, shared_ptr,bool );
-bool read_paired_DNAready(shared_ptr tdn, shared_ptr tdn2, shared_ptr MIDseq, bool MIDuse, MultiDNA* MD, int& revConstellation);
-
-//bool read_tripple(OptContainer& cmdArgs, MultiDNA* MD, InputStreamer*);
-
-void separateByFile(shared_ptr mainFil,OptContainer& cmdArgs);
-
-void threadAnalyzeDNA(shared_ptr tdn, shared_ptr MD,int thrCnt);
-//void trippleThreadAnalyzeDNA(shared_ptr MD, shared_ptr tdn,shared_ptrtdn2,shared_ptr MIDseq,bool changePHead);//,int thrCnt=0);
-
-void read_single(OptContainer& cmdArgs, shared_ptr MD, shared_ptr IS);
-
-bool readCmdArgs(int argc, char* argv[],OptContainer& cmdArgs);
-
-
-
-
-//specialized functions .. end sdm after execution
-void rewriteNumbers(OptContainer& cmdArgs);
-
-
-void Announce_sdm();
-void help_head();
-void general_help();
-void printCmdsHelp();
-void printOptionHelp();
-void printMapHelp();
-void printVersion();
-
-
-//bool readCmdArgs(int argc, char* argv[],map& cmdArgs);
-#endif
\ No newline at end of file
diff --git a/configs/sdm_src/IO.o b/configs/sdm_src/IO.o
deleted file mode 100644
index b2a1b21..0000000
Binary files a/configs/sdm_src/IO.o and /dev/null differ
diff --git a/configs/sdm_src/InputStream.cpp b/configs/sdm_src/InputStream.cpp
deleted file mode 100644
index ea76b4a..0000000
--- a/configs/sdm_src/InputStream.cpp
+++ /dev/null
@@ -1,1974 +0,0 @@
-/* sdm: simple demultiplexer
-Copyright (C) 2013 Falk Hildebrand
-email: Falk.Hildebrand@gmail.com
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program. If not, see .
-*/
-
-#include "InputStream.h"
-string spaceX(uint k){
- string ret = "";
- for (uint i = 0; i < k; i++){
- ret += " ";
- }
- return ret;
-}
-
-int digitsInt(int x){
- int length = 1;
- while (x /= 10)
- length++;
- return length;
-}
-int digitsFlt(float x){
- std::stringstream s;
- s << x;
- return (int)s.str().length();
-}
-string intwithcommas(int value) {
- string numWithCommas = std::to_string((long long)value);
- int insertPosition = (int)numWithCommas.length() - 3;
- while (insertPosition > 0) {
- numWithCommas.insert(insertPosition, ",");
- insertPosition -= 3;
- }
- return (numWithCommas);
-}
-
-std::string itos(int number) {
- std::stringstream ss;
- ss << number;
- return ss.str();
-}
-std::string ftos(float number) {
- std::stringstream ss;
- ss << number;
- return ss.str();
-}
-bool isGZfile(const string fi) {
- string subst = fi.substr(fi.length() - 3);
- if (subst == ".gz") {
- return true;
- }
- return false;
-}
-
-std::istream& safeGetline(std::istream& is, std::string& t) {
- t.clear();
- //from http://stackoverflow.com/questions/6089231/getting-std-ifstream-to-handle-lf-cr-and-crlf
- // The characters in the stream are read one-by-one using a std::streambuf.
- // That is faster than reading them one-by-one using the std::istream.
- // Code that uses streambuf this way must be guarded by a sentry object.
- // The sentry object performs various tasks,
- // such as thread synchronization and updating the stream state.
-
- std::istream::sentry se(is, true);
- std::streambuf* sb = is.rdbuf();
-
- for (;;) {
- int c = sb->sbumpc();
- switch (c) {
- case '\n':
- return is;
- case '\r':
- if (sb->sgetc() == '\n')
- sb->sbumpc();
- return is;
- case EOF:
- // Also handle the case when the last line has no line ending
- if (t.empty())
- is.setstate(std::ios::eofbit);
- return is;
- default:
- t += (char)c;
- }
- }
-}
-//MOCAT
-std::vector header_string_split(const std::string str, const std::string sep) {
- std::vector tokens;
- tokens.reserve(13);
- size_t start = 0;
- size_t pos = 0;
- while ((pos = str.find_first_of(sep, start)) != std::string::npos) {
- tokens.push_back(str.substr(start, pos - start));
- start = pos + 1;
- }
- if (start < str.length()) {
- tokens.push_back(str.substr(start));
- } else if (start == str.length()) {
- tokens.push_back("0");
- }
- return tokens;
-}
-void remove_paired_info(string &s, short RP) {
- size_t f1 = s.find(" ");
- if (f1 != string::npos) {
- s = s.substr(0, f1);
- f1 = string::npos;
- }
- if (RP == 0) {
- f1 = s.find("/1");
- }
- else if (RP == 1) {
- f1 = s.find("/2");
- }
- else {
- f1 = s.find("/1");
- if (f1 == string::npos) { f1 = s.find("/2"); }
- }
- if (f1 != string::npos && f1 == s.size() - 2) {
- s = s.substr(0, f1);
- }
-}
-std::string header_stem(string& header) {
- const size_t slash = header.find('/');
- if (slash != std::string::npos) {
- return header.substr(0, slash);
- }
-
- std::vector tokens = header_string_split(header, ": ");
-
- if (tokens.size() == 11) {
- //if (tokens[8] == "Y") seq.clear();
- return tokens[0] + ":" + tokens[3] + ":" + tokens[4]
- + ":" + tokens[5] + ":" + tokens[6];// +"#" + tokens[10];
- } else if (tokens.size() == 6 || tokens.size() == 7) {
- return tokens[0] + ":" + tokens[2] + ":" + tokens[3]
- + ":" + tokens[4] + ":" + tokens[5];// + "#0";
- } else {
- cerr << "fastq header format error\n"; exit(98);
- }
- //throw std::runtime_error("fastq header format error\n" + header); }
-}
-void reverseTS(string & Seq) {
- int qs = (int)Seq.length() - 1;
- string S2 = Seq.c_str();
- for (int i = qs; i >= 0; i--) {
- Seq[i] = DNA_trans[(int)S2[qs - i]];
- }
-}
-string reverseTS2(const string & Seq) {
- int qs = (int)Seq.length() - 1;
- string S2 = Seq.c_str();
- for (int i = qs; i >= 0; i--) {
- S2[i] = DNA_trans[(int)Seq[qs - i]];
- }
- return S2;
-}
-bool any_lowered(const string& is) {
- for (uint i = 0; i < is.length(); i++) {
- char c = is[i];
- if (islower(c)) { return true; }
- }
- return false;
-}
-string applyFileIT(string x, int it,const string xtr){
-
- size_t pos = x.find_last_of(".");
- if (pos != string::npos && isGZfile(x)) {
- pos = x.find_last_of(".", pos-1);
- }
- if (it == 0) {
- if (pos == string::npos) {
- return x + xtr;
- }
- return x.substr(0, pos) + xtr + x.substr(pos);
- }
- ostringstream ss;
- ss << it;
- if (pos == string::npos) {
- return x + "." + ss.str() + xtr ;
- }
- return x.substr(0,pos) + "."+ss.str() + xtr + x.substr(pos);
-
-
-}
-bool fileExists(const std::string& name, int i, bool extiffail) {
- if (name == "") { return true; }
- if (FILE *file = fopen(name.c_str(), "r")) {
- fclose(file); return true;
- } else {
- if (extiffail) {
- cerr << "ERROR: Could not find file " << name << endl;
- if (i >= 0) {
- cerr << "on mapping file line " << i << endl;
- }
- exit(92);
- }
- return false;
- }
-}
-
-string detectSeqFmt(const string inF) {
-
- vector tfasP = splitByCommas(inF, ';');
- for (size_t i = 0; i < tfasP.size(); i++) {
- vector tfas = splitByCommas(tfasP[i]);
- string fileS = tfas[0];
- istream* fnax(NULL);
- string file_type = "test file";
- if (fileS != "") {
- if (isGZfile(fileS)) {
-#ifdef _gzipread
- file_type = "gzipped fasta file";
- fnax = new igzstream(fileS.c_str(), ios::in);
-#else
- cerr << "gzip not supported in your sdm build\n" << fileS; exit(50);
-#endif
- }
- else {
- fnax = new ifstream(fileS.c_str(), ios::in);
-
- }
- if (!*(fnax)) { cerr << "\nCouldn't find " << file_type << " file \"" << fileS << "\"!\n Aborting..\n"; exit(4); }
- //char Buffer[RDBUFFER];
- //fna_u[0]->rdbuf()->pubsetbuf(Buffer, RDBUFFER);
- }
- string tmp("");
- string ret = "";
- while (safeGetline(*fnax, tmp)) {
- if (tmp[0] == '>') {
- ret = "-i_fna"; break;
- }
- else if (tmp[0] == '@') {
- ret = "-i_fastq"; break;
- }
- else if (tmp.length() == 0) {//do nothing
- ;
- }
- else {
- cerr << " Could not auto detect input format. First non-empty line of your file looked like:\n" << tmp << endl;
- exit(888);
- }
- }
- if (ret == "") {
- cerr << "Empty input file detected:\n" << fileS << endl;
- } else {
- return ret;
- }
-
- delete fnax;
- }
-
- return "empty";
-}
-
-
-/*vector orderOfVec(vector& vin) {
- struct MyStruct
- {
- int key;
- int Value;
- MyStruct() :key(0), Value(0) {}
- MyStruct(int k, const int s) : key(k), Value(s) {}
-
- bool operator < (const MyStruct& str) const {
- return (key > str.key);
- }
- };
- std::vector < MyStruct > vec(vin.size());
- //fill vector
- for (int i = 0; i < (int)vin.size(); i++) {
- vec[i] = MyStruct(vin[i], i);
- }
-
- sort(vec.begin(), vec.end());
- //extract from sorted vector
- vector ret(vin.size(), 0);
- for (size_t i = 0; i < vin.size(); i++) {
- ret[i] = vec[i].Value;
- }
- return ret;
- }
- */
-bool DNA::seal() {//DN = Seq.c_str();
- size_t QSi = Qual.size();
- if (QSi == 0 && Seq == "" ) {
-
- this->setPassed(false);
- return true;//nothing to be done, just empty DNA
- }
- if (QSi != Seq.length()) {
- cerr << "Unequal length of seq and quality for name " << this->getID() << "\n";
- this->setPassed(false);
- return false;
- }
- //uppercase DNA
- std::transform(Seq.begin(), Seq.end(), Seq.begin(), ::toupper);
- SeqLength = Seq.length();
- return true;
-}
-
-string DNA::getIDPosFree() { // remove /1 /2 #1:0 etc
- string s = this->getIDshort();
- remove_paired_info(s,Read_position);
- return s;
-}
-
-
-int DNA::numACGT(){
- int DNAch = 0;
- for (unsigned int i = 0; i < length(); i++){
-
- DNAch += DNA_amb[(int)Seq[i]];
-// if (tmp == 'A' || tmp == 'C' || tmp == 'G' || tmp == 'T'){ DNAch++; }
- }
- return static_cast (length()) - DNAch;
-}
-void DNA::Qappend(const vector &q)
-{
- Qual.insert(Qual.end(), q.begin(), q.end());
- /*unsigned int Qsiz = (unsigned int) Qual.size();
- Qual.resize(Qsiz+q.size(),0);
- for (register unsigned int i=0; ilength(); i++){
- Qsum += Qual[i];
- }
- }
- avgQual = static_cast (Qsum) / static_cast (this->length());
-
- }
- return avgQual;
-}
-
-/*float DNA::qualWinfloat_hybr(int W, float T, int W2, float T2, int& reason){//not used
- //if (T==0.f){return true;}
- int AQS=0, AQL;
- int TotQ = 0;
- int upTs = static_cast(T * W);
- int upTl = static_cast(T2 * W2);
- int QS = int (Qual.size());
- if (W>=QS){W = QS;} // too short
- int smallerW = W, largerW = W2;
-
- bool W1IsSmall = true;
- if (smallerW > W2){ largerW=W; smallerW = W2; W1IsSmall=false;
- std::swap(upTl,upTs); }
-
- for (unsigned int i=0; i<(unsigned int) smallerW; i++){
- AQS += Qual[i];
- }
- AQL = AQS;
-
- //hybrid schleife
- for (unsigned int i=smallerW; i<(unsigned int) largerW; i++){
- AQL += Qual[i];
- AQS += Qual[i]; AQS -= Qual[i-smallerW];
- if (AQS < upTs){
- if (W1IsSmall){ reason=0; return 0.f;
- } else {reason=1; this->cutSeq(i/2,this->length()); return float(AQL)/float(QS);}
- }
- }
-
- TotQ = AQL;
- for (unsigned int i=W; i<(unsigned int) QS; i++){
- AQS += Qual[i]; AQL += Qual[i];
- TotQ += Qual[i];
- AQS -= Qual[i-W];AQL -= Qual[i-W];
- if (AQS < upTs || AQL < upTl){
- if (W1IsSmall){ reason=0; return 0.f;
- } else {reason=1; this->cutSeq(i/2,this->length()); return float(AQL)/float(QS);}
- }
- }
- //if (averageQ > static_cast (TotQ) /static_cast ( Qual.size())){return false;}
- return static_cast (TotQ) /static_cast ( QS);
- }
- */
-
-// modified from https://github.com/fpusan/moira/blob/master/moira/bernoullimodule.c
-float DNA::interpolate(int errors1, float prob1, int errors2, float prob2, float alpha)
-{
- float result = errors1 + ((errors2 - errors1) * ((1 - alpha) - prob1) / (prob2 - prob1));
- if (result < 0) //Happens only for very-short high qual sequences in which the probability of having 0 errors is higher than 1 - alpha.
- {
- result = 0;
- }
- return result;
-}
-float DNA::prob_j_errors(float p, float j, float n) //Where p is the error probability, j is the number of errors and n the number of observations.
-{
- float per_position_accum_probs;
- if (j > n) {
- return 0.0f; //The formula below would also return 0.
- }
- per_position_accum_probs = pow((1 - p), n); //For j == 0.
- float i(1);
- for (; i <= j; i += 1.f) {//For j > 0.
- per_position_accum_probs = ((n - i + 1.f) / (1.0f*i)) * (p / (1.f - p)) * per_position_accum_probs;
- }
- return per_position_accum_probs;
-
-}
-float DNA::sum_of_binomials(const float j, int k, float n, int qual_length, const vector& error_probs, const vector< vector> & per_position_accum_probs)
-//#Where j is the number of errors and k is the position in the sequence.
-{
- float probability = 0;
- float i(0); int k1 = (int) k - 1;
-
- for (; i <= j; i+=1.f)
- {
- probability += DNA::prob_j_errors(error_probs[k], i, n) * per_position_accum_probs[int(j - i)][k1];
- //Where error_probs[k] is the error probability of the k-th position.
- //Where per_position_accum_probs[j-i][k-1] is the probability that all the bases from position 0 to k-1 had a total of j-i errors.
- }
-
- return probability;
-}
-
-float DNA::binomialFilter(int maxErr, float alpha){
-
- if (alpha == -1.f|| this->length()<3){ return 0; }//deactivated
-
- ///Initialize some variables.
-
- int SeqLengthI = (int)SeqLength;
- int n = 1; //Since we have a Bernoulli distribution.
- float n_f = (float)n;
- float alpha1 = 1.f - alpha;
-
- ///Translate quality scores into error probabilities.
- vector error_probs (SeqLength,0.f);
- for (size_t i = 0; i < SeqLength; i++){
- error_probs[i] = (float)SAqualP[Qual[i]];//pow(10, (contig_quals[i] / -10.0)); //Since we want a continuous list of non-N error_probs.
- }
-
- ///Actually get the job done.
- int max_expected_errors = maxErr + 3;// (int)SeqLength + 1;
- int expected_errors = 0;
- float probability;
- vector accumulated_probs (max_expected_errors,0.f);
- //int j;
- int k;
- vector empty(SeqLength, 0.f);
- vector< vector> per_position_accum_probs(max_expected_errors, empty);
-
- while (1)
- {
- float expected_errors_f = (float)expected_errors;
- //vector per_position_accum_probs(SeqLength, 0.f);
- for (k = 0; k < (int)SeqLength; k++) {
- if (k == 0) {
- per_position_accum_probs[expected_errors][k] = DNA::prob_j_errors(error_probs[k], expected_errors_f, n_f);
- } else {
-
- per_position_accum_probs[expected_errors][k] = DNA::sum_of_binomials((float)expected_errors, k, n_f, SeqLengthI, error_probs, per_position_accum_probs);
- }
- }
- probability = per_position_accum_probs[expected_errors][SeqLengthI - 1];
-
- if (expected_errors == 0){
- accumulated_probs[expected_errors] = probability;
- }else{
- accumulated_probs[expected_errors] = accumulated_probs[expected_errors - 1] + probability;
- }
-
- if (accumulated_probs[expected_errors] > (alpha1) || expected_errors >= (max_expected_errors-1)){
- break;
- }else{
- expected_errors++;
- }
- }
- if (expected_errors == 0){
- return 0;
- }
- float EXE = interpolate(expected_errors - 1, accumulated_probs[expected_errors - 1], expected_errors, accumulated_probs[expected_errors], alpha);
- return EXE;
-}
-
-float DNA::qualWinfloat(unsigned int W, float T, int& reason){
- //if (T==0.f){return true;}
- int AQS=0;
- int TotQ = 0;
- int upTs = static_cast(T * W);
- unsigned int QS = this->length();//static_cast (Qual.size());
- if (W>=QS){W = QS;} // too short
-
-//1st loop to ini window
- for (unsigned int i=0; i<(unsigned int) W; i++){
- AQS += Qual[i];
- }
- TotQ = AQS;
-
- for (unsigned int i=W; i<(unsigned int) QS; i++){
- AQS += Qual[i] - Qual[i - W];
- TotQ += Qual[i];
- if (AQS < upTs ){
- reason=1; return 0.f;
- }
- }
- //if (averageQ > static_cast (TotQ) /static_cast ( Qual.size())){return false;}
- return static_cast (TotQ) /static_cast ( QS);
-}
-
-int DNA::qualAccumulate(double d){
- unsigned int i(0);double accErr(0.0);
- for (; ilength(); i++) {
- accErr+=SAqualP[Qual[i]];
- if (accErr >= d){break;}
- }
-
- this->AccumError = accErr;
-
- return i;
-}
-void DNA::NTspecQualScores(vector& qsc, vector& ntcnt) {
- size_t sql = Seq.length();
- if (qsc.size() < sql) {
- // cerr << "qsc";
- qsc.resize(sql,0);
- }
- if (ntcnt.size() < sql) {
- // cerr << "ntcnt";
- ntcnt.resize(sql,0);
- }
- for ( uint i = 0; i < sql; i++ ) {
- short p = NT_POS[(int)Seq[i]];
- qsc[p] += Qual[i];
- ntcnt[p]++;
- }
-}
-
-bool DNA::qualAccumTrim(double d){
- if (d == -1.) {
- return true;
- }
- unsigned int i(qualAccumulate(d));
- if (i != this->length()){
- //cut 3' end
- this->cutSeq(i,this->length());
- this->QualCtrl.AccErrTrimmed = true;
- return false;
- }
- //did not cut this sequence:
- return true;
-}
-bool DNA::qualWinPos(unsigned int W, float T){
- if (T==0.f){return true;}
- int AQ=0;
- int unT = static_cast((float)W*T);
- unsigned int QS = this->length();// (unsigned int)Qual.size();
- unsigned int QSh = QS >> 2;
- QSh = max(QSh,W);
- if (W>=QS){return true;} // too short
-
- vector WQ((int) W);
- //TODO: check that the right num of nucs is taken..
- int cnt=0;
- if (QS < W) {return true;}
- for (unsigned int i=QS-1; i> QS-(unsigned int) W-1; i--){
- AQ += Qual[i];
- cnt++;
- }
- if (AQ > unT){return true;}
- int curW = QS-(unsigned int) W;
- for (uint i=QS-(unsigned int) W-1; i > QSh; i--){
- AQ += Qual[i];
- AQ -= Qual[i+W];
-
- if (AQ < unT){ //min Window qual was broken.. kick seq
- curW = i;
- } else {
- break;
- }
- }
-
- //partial seq removal
- int pos = curW - (W>>1);
- this->cutSeq(pos);
- this->QualCtrl.QWinTrimmed = true;
- return false;
-}
-
-//removes part of seq and qual indexes between start & stop
-bool DNA::cutSeq(int start, int stop,bool Pseudo){
-
- if (stop == -1) {
- if (start >= (int) SeqLength) { return false; }
- } else if (start >= stop || stop > (int)Qual.size() || start >= (int)Qual.size()) {
- return false;
- }
-
- //pseudo deactivates cutting of 3'
- if (Pseudo) {
- if (stop == -1 || stop <= (int) SeqLength) {
- SeqLength = start;
- return true;
- }
- }
-
- string se = Seq;
- if (stop == -1) {
- stop = (int)Seq.length();
- }
- if (start == 0) {
- Seq = se.substr(stop);
- } else {
- Seq = se.substr(0, start) + se.substr(stop);
- }
-
- //DN = Seq.c_str();
- //Quali
- Qual.erase(Qual.begin()+start, Qual.begin()+stop);
-
- SeqLength = Seq.length();
-
- return true;
-}
-
-int DNA::matchSeq(std::string PrSt,int Err,int tolerance, int startPos){
- //const char* DN = Seq.c_str();
- //const char* Pr = PrSt.c_str();
- int PrL = (int) PrSt.length();
- int mthL = this->length() - PrL;
- //int wantSc = PrL - Err;
- int endPos(-1),pos(startPos), Prp(0), c_err(0),Prp2(0);
- //bool res(false);
- for (; pos< tolerance; pos++){
- if (pos > mthL) { break; }
- c_err=0;Prp=0; Prp2=pos;
- do {
-
-#ifdef _NEWMATCH
- //new vector based matching
- c_err += DNA_IUPAC[Seq[Prp2]+256*PrSt[Prp]]; if (c_err > Err){break;}
-#else
- //old, direct match
- if (!matchDNA(Seq[Prp2],PrSt[Prp])){c_err++;if (c_err > Err){break;}}
-#endif
- Prp++; Prp2++;
- } while ( Prp < PrL);
- if (c_err<=Err ){endPos=pos;break;}
- }
- //if(!suc){pos=-1;}
- return endPos;
-}
-void DNA::reset() {
- AccumError = 0.; goodQual = false; midQual = false;
- //FtsDetected.reset();
- avgQual = -1.f; Qsum = 0; tempFloat = 0.f;
- QualTraf = "";
-
- this->resetTruncation();
-}
-
-void DNA::reverse_transcribe() {
- reverseTS(Seq);
- std::reverse(Qual.begin(), Qual.end());
- AccumError = 0.; goodQual = false; midQual = false;
- avgQual = -1.f; Qsum = 0; tempFloat = 0.f;
- QualTraf = "";
-
-}
-
-//match from end of Seq to find rev primer
-int DNA::matchSeqRev(std::string PrSt,int Err, int check_l,
- int coverage){
- //fail::ret -1
- int PrL = (int) PrSt.length();
- if (coverage==0){coverage=5;} //default seed set to 5
- int SeL = (int) Seq.size();
- //int wantSc = PrL - Err;
- int pos(SeL-coverage), Prp(0), c_err(0),endPos(-1);
- for (; pos> check_l; pos--){
- c_err=0;Prp=0;
- int PrL2 = min(PrL,SeL-pos);
- do {
-#ifdef _NEWMATCH
- c_err += DNA_IUPAC[Seq[pos+Prp]+256*PrSt[Prp]]; if (c_err > Err){break;}
-#else
- if(!matchDNA(Seq[pos+Prp],PrSt[Prp])){c_err++;if (c_err > Err){break;} }
-#endif
- Prp++;
- } while (Prp < PrL2);
- if (c_err<=Err ){
- endPos=pos;break;}
- }
- //secondary check for last few NT's
- if (endPos==-2){
- pos = (SeL-1);
- for (; pos> (SeL-coverage); pos--){
- c_err=0;Prp=0;
- int PrL2 = min(PrL,SeL-pos);
- do {
-#ifdef _NEWMATCH
- c_err += DNA_IUPAC[Seq[pos+Prp]+256*PrSt[Prp]]; if (c_err > Err){break;}
-#else
- if(!matchDNA(Seq[pos+Prp],PrSt[Prp])){c_err++; if (c_err > Err){break;}}
-#endif
- Prp++;
- } while (Prp < PrL2);
- if (c_err<=Err ){
- endPos=pos;break;}
- }
- }
- return endPos;
-}
-// looks through total DNA seq
-int DNA::matchSeq_tot(std::string Pr,int Err,int MaxPos, int& c_err){
- //const char* Pr = PrSt.c_str();
- int PrL = (int) Pr.length();
- int pos(0), Prp(0), Prp2(0);
- bool suc(false);
- for (pos=0; pos< MaxPos; pos++){
- c_err=0;Prp=0;Prp2=pos;
- do {
- if(Seq[Prp2]!=Pr[Prp]){
- c_err++;
- }
- Prp++; Prp2++;
- } while (c_err <= Err && Prp< PrL);
- if (c_err<=Err){suc=true;break;}
- }
- if(!suc){pos=-1;}
- return pos;
-}
-
-
-bool DNA::matchDNA(char t1,char t2){
- if (t1==t2){
- return true;
- }
- switch (t2){
- case 'N': return true;
- case 'R': if (t1=='A' || t1=='G' ) {return true;}break;
- case 'Y': if (t1=='T' || t1=='C' ) {return true;}break;
- case 'M': if (t1=='C' || t1=='A' ) {return true;}break;
- case 'K': if (t1=='T' || t1=='G' ) {return true;}break;
- case 'W': if (t1=='T' || t1=='A' ) {return true;}break;
- case 'S': if (t1=='C' || t1=='G' ) {return true;}break;
- case 'B': if (t1!='A') {return true;}break;
- case 'D': if (t1!='C' ) {return true;}break;
- case 'H': if (t1!='G' ) {return true;}break;
- case 'V': if (t1!='T' ) {return true;}break;
- }
- return false;
-}
-bool DNA::HomoNTRuns(int max){
- char lastC = Seq[0];
- int rowC=1;
- for (unsigned int i=1;i= max){
- return false;
- }
- } else {
- rowC = 1;
- lastC = Seq[i];
- }
- }
- return true;
-}
-
-/*
-void DNA::writeSeq(ofstream& wr){
- int cnt=0;
- if (Seq.size()==0){return;}
- wr<<">"<0){
- wr<"<0){
- wr<"<0){
- wr< Q2(Qual);
- for (int i=qs;i>=0;i--){
- Qual[i] = Q2[qs-i];
- }
- reverseTS(Seq);
-}
-*/
-bool DNA::sameHead(shared_ptr d){
- if (d == NULL){return false;}
- return sameHead(d->getIDshort());
-}
-bool DNA::sameHead(const string& oID) {
- size_t pos = getShorterHeadPos(ID);
- if (oID.size() < pos) { return false; }
- if (ID.substr(0, pos) == oID.substr(0, pos)) {
- return true;
- }
- return false;
-}
-
-
-void DNA::setPassed(bool b){
- goodQual=b;
- if (goodQual && midQual) {
- midQual = false;
- }
-}
-int DNA::getBCnumber() {
- //if (Sample==-1 )
- return Sample;
-}
-
-void DNA::prepareWrite(int ofastQver) {
- uint len = length();
- if (QualTraf.size() == len) {
- return;
- }
- QualTraf.resize(len);
- unsigned int i = 0;
- for (; i < len; i++) {
- QualTraf[i] = char(Qual[i] + ofastQver);
- }
- QualTraf[i] = '\0';
-}
-void DNA::resetQualOffset(int x, bool fqSol) {
- for (size_t i = 0; i < Qual.size(); i++) { Qual[i] += x; }
- if (fqSol) {//quick hack, since q<13 scores are the only deviants and uninteresting in most cases..
- for (size_t i = 0; i < Qual.size(); i++) {
- if (Qual[i] < 0) {
- Qual[i] = 0;
- }
- }
- }
-}
-
-///////////////////////////////////////////////////////////////
-//INPUT STREAMER
-
-
-
-void DNAunique::addSmpl(int k) {
- if (k < 0) {
- return;
- }
- Count++;
-#ifdef _MAPDEREPLICATE
- unordered_map::iterator smID = occurence.find(k);
- if (smID == occurence.end()) {
- occurence[k] = 1;
- } else {
- smID->second++;
- }
-#endif
-}
-void DNAunique::setOccurence(int smpl, int N) {
- unordered_map::iterator smID = occurence.find(smpl);
- if (smID == occurence.end()) {
- occurence[smpl] = N;
- } else {
- smID->second += N;
- }
-}
-/*vector> DNAunique::getDerepMapSort2(size_t wh ){
- typedef std::pair mypair;
- size_t siz = occurence.size();
- if (wh > siz) { wh = siz; }
-
- struct IntCmp {
- bool operator()(const mypair &lhs, const mypair &rhs) {
- return lhs.second > rhs.second;
- }
- };
-
-
- vector myvec(occurence.begin(), occurence.end());
- std::partial_sort(myvec.begin(), myvec.begin() + wh, myvec.end(), IntCmp());
-
- return myvec;
-}*/
-
-bool sortDescending(int i, int j) { return (i>j); }//descending sort
-vector DNAunique::getDerepMapSort(size_t wh) {
- vector vals;
- size_t siz = occurence.size();
- if (wh > siz) { wh = siz;}
- vals.reserve(siz);
- for (auto kv = occurence.begin(); kv != occurence.end(); kv++) {
- vals.push_back(kv->second);
- }
- //partial sort doesn't make sense, as I want to break border asap
- //partial_sort(vals.begin(), vals.begin() + wh, vals.end(), sortDescending);
- sort(vals.begin(), vals.end(), sortDescending);
- return vals;
-}
-
-bool DNAunique::pass_deprep_smplSpc(const vector& cv) {
- unordered_map occ;
- //combined samples will not be considered
- //occ = occurence;
- for (std::unordered_map::iterator iter = occurence.begin(); iter != occurence.end(); ++iter) {
- //int cnts = iter->second;
- int ref = cv[iter->first];
- if (ref != -1 && iter->second >= ref ) {
- return true;
- }
- }
- return false;
-
-}
-
-
-void DNAunique::transferOccurence(shared_ptr odu) {
- if (occurence.size() == 0) {
- occurence = odu->occurence;
- Count = odu->Count;
- } else {
- //which sample contains this dna?
- unordered_map oldMap = odu->getDerepMap();
- unordered_map::iterator smID;
- for (std::unordered_map::iterator oID = oldMap.begin(); oID != oldMap.end(); ++oID) {
- smID = occurence.find(oID->first);
- if (smID == occurence.end()) {
- occurence[oID->first] = oID->second;
- } else {
- smID->second += oID->second;
- }
- }
- //size track
- Count += odu->Count;
- }
-}
-void DNAunique::writeMap(ofstream & o, const string & hd,
- vector & cntspersmpl, const vector& combiID) {
- if (occurence.size() == 0) { return; }
- int totCnt(0);
- unordered_map occ;
- if (combiID.size() > 0){//combine all counts on combined categories
- std::unordered_map::iterator fnd;
- for (std::unordered_map::iterator iter = occurence.begin(); iter != occurence.end(); ++iter) {
- //aim: occ[combiID[iter->first]] += iter->second;
- fnd = occ.find(combiID[iter->first]);
- if (fnd == occ.end()){
- occ[combiID[iter->first]] = iter->second;
- }else{
- fnd->second += iter->second;
- }
- }
- }
- else {
- occ = occurence;
- }
-
- //prints combined sample counts
- o << hd;
- for (std::unordered_map::iterator iter = occ.begin(); iter != occ.end(); ++iter) {
- int cnts = iter->second;
- totCnt += cnts;
- o << "\t";
- o << iter->first << ":" << cnts;
- }
- //counts non-combined sample counts
- for (std::unordered_map::iterator iter = occurence.begin(); iter != occurence.end(); ++iter) {
- //int cnts = iter->second;
- cntspersmpl[iter->first] += iter->second;
- }
- o << endl;
-
- if (totCnt != Count) {
- cerr << "Unequal Counts in Map("<getID()<getIDshort();
- if (!usFmt) {
- NewID += "_" + itos(Count);
- } else {
- NewID += ";size=" + itos(Count) + ";";
- }
- IDfixed = true;
-}
-
-
-///////////////////////////////////////////////////////////////
-//INPUT STREAMER
-
-InputStreamer::~InputStreamer(){
- allStreamClose();
-// for (uint i = 0; i < tdn1.size(); i++) { if (tdn1[i] != NULL) { delete tdn1[i]; } }
-// for (uint i = 0; i < tdn2.size(); i++) { if (tdn2[i] != NULL) { delete tdn2[i]; } }
-}
-
-bool InputStreamer::getFastaQualLine(istream&fna, string&line) {
-
- if (!safeGetline(fna, line)) { return false; }
- while (line[0] == '$') { //$ marks comment
- safeGetline(fna, line);
- }
- return true;
-}
-bool InputStreamer::read_fasta_entry(istream&fna,istream&qual,shared_ptr tdn1, shared_ptrtdn2,int &cnt){
-
- if(fna.eof()){return false;}
-
- //int in_int; //char in_char;
- //int cnt=0;
- string tqual(""),tseq("");
- string line(""), lineQ("");
- if (!getFastaQualLine(fna, line)) { return false; }
- if (line == "" && fna.eof()){return false;}
- if (!qualAbsent) { getFastaQualLine( qual, lineQ); }
- cnt++;
-
- if (cnt == 1) { //fasta description
- if (line[0] != '>' && fna) { cerr << "ERROR: Line 1 in fasta file does not start with \">\" \n"; exit(23); }
- //new DNA, set up in tdn1
- tdn1->newHEad(line.substr(1));
- if (!getFastaQualLine(fna, line)) { return false; }
- if (!qualAbsent) { getFastaQualLine(qual, lineQ); }
- cnt++;
- }
- //continous read in until ">" is hit
- while (line[0] != '>') {
- tseq += line;
- if (!getFastaQualLine(fna, line)) { break; }
- }
- if (!qualAbsent) {
- while (lineQ[0] != '>') {
- tqual += " " + lineQ;
- if (!getFastaQualLine(qual, lineQ)) { break; }
- }
- }
-
- //fna
- tdn1->setSeq(tseq);
- size_t lsize = tseq.size();
- vector Iqual(lsize, 0);
- //qual
- if (!qualAbsent) {
- const char* lQ = rtrim(tqual).c_str();
- uint ii = 0; int nn(0);
- for (; ii < lsize; ii++) {
- nn = parseInt(&lQ);// , posStr);
- Iqual[ii] = nn;
- if (*lQ == '\0') {
- break;
- }
- //issQ >> Iqual[ii];
- }
- if (ii != (lsize - 1)) {
- cerr << "ERROR: quality counts (" << ii << ") not the same length as DNA base counts in Sequence (" << (lsize - 1) << ")\n" << tdn1->getID() << "\n";
- exit(54);
- }
- }
- tdn1->Qappend(Iqual);
-
-
- //since already read in 1 more line, this line needs to be used on new object
- if (fna ) {
- if (!qualAbsent && (line[0] != '>' || lineQ[0] != '>')) { cerr << "ERROR: Desynced fasta reader\n"; exit(23); }
- tdn2->newHEad(line.substr(1));
- } else {
- return false;
- }
- //a new DNA obj was set up, return to process tdni, tdno will be completed next call
- return true;
-}
-inline int InputStreamer::parseInt(const char** p1){//,int& pos){//, const char ** curPos) {
- /*from http://stackoverflow.com/questions/5830868/c-stringstream-is-too-slow-how-to-speed-up*/
- //size_t nxtPos = input.find_first_of(' ', curPos);
- //const char *p = input.substr(curPos,nxtPos).c_str();
- //if (!*p || *p == '?') return 0;
- //int s = 1;
- //p = (const char*)curPos;
- const char* p = *p1;
- while (*p == ' ') p++;
-
- int acc = 0;
- while (*p >= '0' && *p <= '9')
- acc = acc * 10 + *p++ - '0';
-
-
- *p1 = p;
- //curPos = (size_t) p;
- return acc;
-}
-void InputStreamer::jmp_fastq(istream & fna, int &lnCnt) {
- string line;
- string tname = "", tseq = ""; //temporary storage
- size_t cnt = 0, qcnt = 0, DNAlength = 0;
- bool mode = true; //mode=T:fna,F:qual
- bool needsAT = true, needsPlus = false; // checks if quality was completly read in and a '@' char is now expected in the next line
- // getline(fna2,line2,'\n');
-
- while (getline(fna, line, '\n')) {
- lnCnt++;
- if (line == "") { return ; }
- if (line[0] == '@' && needsAT) { //fasta description
- if (cnt != 0) {
- cerr << "Line " << lnCnt << ": Could not find \'@\' when expected: on line \n" << line << endl;
- }
- //tname = line;
- needsAT = false;
- needsPlus = true;
- continue;
- } else if (line[0] == '+' && needsPlus) {
- needsPlus = false;
- mode = false;
- continue;
- } else if (needsAT) {
- cerr << "Line " << lnCnt << " :Could not find \'@\' symbol where expected on line \n " << line << endl;
- exit(6);
- }
-
- istringstream iss(line);
- iss >> tseq;
- if (mode) {
- DNAlength += tseq.length();
- } else {
- qcnt += tseq.length();
- if (DNAlength == qcnt) { return; }
- }
-
- cnt++;
- }
-}
-//reads out single seq + quality entry from fastq file
-shared_ptr InputStreamer::read_fastq_entry(istream & fna, int &minQScore, int& lnCnt,
- bool&corrupt,bool repairInStream) {
- string line;
- //string tseq = ""; //temporary storage
- uint qcnt = 0;
- bool mode = true; //mode=T:fna,F:qual
- shared_ptr tdn = make_shared("", "");
- vector Iqual(0);
- bool needsAT = true, needsPlus = false; // checks if quality was completly read in and a '@' char is now expected in the next line
-
- if (!fna) { return NULL; cerr << "Read Fastq: Input stream does not exist" << lnCnt << endl; exit(53); }
-
- while (safeGetline(fna, line)) {
- lnCnt++;
- while (repairInStream) {
- if (line[0] == '@') {repairInStream = false;}
- else {safeGetline(fna, line);}
- }
- //if (line.length() == 0) { return tdn; }
- if (needsAT) { //fasta description
- if (line[0] == '@') {
- if ((lnCnt-1) % 4 != 0) {
- fqPassedFQsdt = false;
- }
- tdn->newHEad(line.substr(1));
- mode = true; qcnt = 0;
- needsAT = false;
- needsPlus = true;
- continue;
- } else {
- corrupt = true;//try again,could be last empty line in file..
- if (line.length() != 0) {
- IO_Error("Line " + itos(lnCnt) + " :Could not find \'@\' symbol where expected on line \n " +line);// << endl;
- }
- return tdn;
- }
- } else if (needsPlus && line[0] == '+') {
- if ((lnCnt-3) % 4 != 0) {
- fqPassedFQsdt = false;
- }
-
-
- Iqual.resize(tdn->length(), 0);
- needsPlus = false;
- mode = false;
- continue;
- }
-
- //istringstream iss(line);
- //iss >> tseq;
- if (mode) {
- //fna
- if ((lnCnt-2) % 4 != 0) {
- fqPassedFQsdt = false;
- }
-
- //if (std::any_of(line.begin(), line.end(), [](char c) {return (islower(c)); })) {
- if (any_lowered(line)){
- fqPassedFQsdt = false;
- std::transform(line.begin(), line.end(), line.begin(), ::toupper);
- }
- tdn->append(line);
- } else if (!mode) {
- //qual
- if ((lnCnt ) % 4 != 0) {
- fqPassedFQsdt = false;
- }
-
- for (size_t i = 0; i < line.length(); i++) {
- //really 33?
- Iqual[qcnt] = minmaxQscore((qual_score)line[i] - fastQver);
- qcnt++;
- }
- if (qcnt == tdn->length()) {
- //needsAT=true;
- tdn->Qappend(Iqual);
-
- } else if (line.length() + qcnt > tdn->length()) {
- //check that quality gets not more length than DNA
- IO_Error("ERROR: More quality positions than nucleotides detected for sequence\n " + tdn->getID());
- //tdn->setPassed(false);
- corrupt = true;
- return tdn;
- }
- break;
-
- }
- }
- //if (tdn!= NULL && !tdn->control()){delete tdn; tdn=NULL;}
- corrupt = false;
- return tdn;
-
- //to check for fast fastq reader: 1. DNA in uppercase? 2. DNA/QUAL in single line?
-}
-void InputStreamer::IO_Error(string x) {
- cerr << x << endl;
- if (DieOnError) {
- exit(632);
- }
- ErrorLog.push_back(x);
-}
-//reads out single seq + quality entry from fastq file
-shared_ptr InputStreamer::read_fastq_entry_fast(istream & fna, int& lnCnt, bool& corrupt) {
- string line;
- if (!fna) { return NULL; cerr << "Read Fastq_f: Input stream does not exist " << lnCnt << ".\n"; exit(53); }
-
- if (!safeGetline(fna, line)) { return NULL; }
- shared_ptr tdn = make_shared("", "");
- if (line.length() == 0) { return tdn; }
- while (line[0] != '@') {
- IO_Error("ERROR on line " + itos(lnCnt) + ": Could not find \'@\' when expected (file likely corrupt, trying to recover):\n" + line);// << endl;
- //exit(55);
- //recover instead and go to next entry..
- corrupt = true;
- if (!safeGetline(fna, line)) { corrupt = true; return NULL; }//delete tdn;
- }
- tdn->newHEad(line.substr(1));
- //cerr << line.substr(1);
- if (!safeGetline(fna, tdn->getSeq())) { corrupt = true; return NULL; }//delete tdn;
- //if (line.length() == 0) { return NULL; }
- //std::transform(line.begin(), line.end(), line.begin(), ::toupper);
- //tdn->append(line);
- //"+"
- if (!safeGetline(fna, line)) { corrupt = true; return NULL; }//delete tdn;
- while (line[0] != '+') {
- //recovery is hard, just give up this read
- IO_Error("Error input line " + itos(lnCnt + 2) + ": Could not find \'+\' when expected (file likely corrupt, aborting):\n" + line);// << endl;
- corrupt = true;
- return tdn;
- //if (!safeGetline(fna, line)) { delete tdn; return NULL; }
- }
-
- //qual score
- vector Iqual(tdn->mem_length(), 0);
- if (!safeGetline(fna, line)) { corrupt = true; return NULL; }//delete tdn;
- uint qcnt(0); uint lline = (uint)line.length();
- for (; qcnt < lline; qcnt++) {
- Iqual[qcnt] = minmaxQscore((qual_score)line[qcnt] - fastQver);
-
- }
-
- if (qcnt == tdn->mem_length()) {
- //needsAT=true;
- tdn->setQual(Iqual);
- } else if (line.length() + qcnt != tdn->length()) {
- //check that quality gets not more length than DNA
- corrupt = true;
- IO_Error("Error input line " + itos(lnCnt + 3) + ": More quality positions than nucleotides detected for sequence\n " +tdn->getID());// << endl;
-// exit(7);
- }
- lnCnt+=4;
- //if (tdn!= NULL && !tdn->control()){delete tdn; tdn=NULL;}
- corrupt = false;
- return tdn;
-}
-
-int InputStreamer::minmaxQscore(qual_score t) {
- if (t < 0) { ////quick hack, since q<13 scores are the only deviants and uninteresting in most cases..
- if (fqSolexaFmt){
- if (t < -5) {
- cerr << "Unusually low sloexa quality score (" << t << "); setting to 0.\n";
- }
- } else {
- if (t >= -5) {
- cerr << "Resetting auto format to Solexa (illumina 1.0-1.3) format.\n";
- fqSolexaFmt = true;
- } else {
- cerr << "Unusually low quality score (" << t << "); setting to 0.\n";
- }
- }
- t = 0;
- }
- if (minQScore > t) {
- minQScore = t;
- if (minQScore < 0) {
- }
- } else if (maxQScore < t) {
- maxQScore = t;
- }
- return t;
-}
-bool InputStreamer::checkInFileStatus() {
- for (uint i = 0; i < 3; i++) {
- if (fnaRead) {
- if (fna_u[i] != NULL && *fna_u[i]) {
- return true;
- }
- } else {
- if (fastq_u[i] != NULL && *fastq_u[i]) {
- return true;
- }
- }
- }
- return false;
-}
-void InputStreamer::allStreamReset() {
- resetLineCounts();
-#ifdef DEBUG
- cerr << "Resetting input streams" << endl;
-#endif
- //reopen streams in gz case // sdm 1.01: make default
- if (true || openedGZ) {
- allStreamClose();
- setupFastq_2(inFiles_fq[0], inFiles_fq[1], inFiles_fq[2]);
- setupFastaQual2(inFiles_fna[0], inFiles_qual[0]);
- } else {
- for (uint i = 0; i < 3; i++) {
- if (fna_u[i] != NULL && * (fna_u[i])) { fna_u[i]->clear(); fna_u[i]->seekg(0, ios::beg); }
- if (qual_u[i] != NULL && *(qual_u[i])) { qual_u[i]->clear(); qual_u[i]->seekg(0, ios::beg); }
- if (fastq_u[i] != NULL && *(fastq_u[i])) { fastq_u[i]->clear(); fastq_u[i]->seekg(0, ios::beg); }
- }
- }
- //checkInFileStatus();
-}
-void InputStreamer::allStreamClose(){
- for (uint i = 0; i < 3; i++){
-/* if (*(fna[i])){ fna[i]->close(); }
- if (*(qual[i])){ qual[i]->close(); }
- if (*(fastq[i])){ fastq[i]->close(); }
-#ifdef _gzipread
- if (gzfna[i]){ gzfna[i].close(); }
- if (gzqual[i]){ gzqual[i].close(); }
- if (gzfastq[i]){ gzfastq[i].close(); }
-#endif
- */
- if (fna_u[i] != NULL) { delete fna_u[i]; } fna_u[i] = NULL;
- if (qual_u[i] != NULL) { delete qual_u[i]; } qual_u[i] = NULL;
- if (fastq_u[i] != NULL) { delete fastq_u[i]; }fastq_u[i] = NULL;
- }
-
- if (!fnaRead && minQScore < 1000) {
- maxminQualWarns_fq( );
- }
-}
-void InputStreamer::jumpToNextDNA(bool&stillMore, int pos) {
- if (fnaRead) {//get DNA from fasta + qual files
- //shared_ptr ret;
- stillMore = read_fasta_entry(*(fna_u[pos]), *(qual_u[pos]), tdn1[pos], tdn2[pos], lnCnt[pos]);
- //tdn1 will be completed, tdn2 will have a header set up
- //ret = tdn1[pos];
- tdn1[pos] = tdn2[pos]; tdn2[pos].reset(new DNA("", ""));
- } else {
- jmp_fastq(*fastq_u[pos], lnCnt[pos]);
- if (!*(fastq_u[pos])) {
- stillMore = false;
- }
- }
-}
-
-shared_ptr InputStreamer::getDNA(bool& stillMore, int pos, bool& sync){
- //if (sync) {
- // while (desync(pos)) {
- // jumpToNextDNA(stillMore, pos);
- // }
- //}
- if (pos == 1 && numPairs <= 1) {
- return NULL;
- }
- shared_ptr ret;
- bool corrupt(true); //corrupt state isn't implemented for fnaread
-
- bool repairInStream(false);
- while (corrupt) {
- if (fnaRead) {//get DNA from fasta + qual files
- stillMore = read_fasta_entry(*(fna_u[pos]), *(qual_u[pos]), tdn1[pos], tdn2[pos], lnCnt[pos]);
- corrupt = false;
- //tdn1 will be completed, tdn2 will have a header set up
- ret = tdn1[pos];
- tdn1[pos] = tdn2[pos]; tdn2[pos].reset( new DNA("", ""));
- if (!ret->seal() || ret->isEmpty()) { ret = NULL; }
- if (pos == 0 && pairs_read[pos] % 100 == 0) {
- _drawbar(*(fna_u[pos]));
- }
- else if (!stillMore) { _drawbar(*(fna_u[pos])); }
- }
- else { //fqRead
- if (fqReadSafe) {
- ret = read_fastq_entry(*(fastq_u[pos]), minQScore, lnCnt[pos], corrupt, repairInStream);
- if (fastQver == 0 && ret->length() > 5 && !corrupt) {//autodetect
- ret->resetQualOffset(auto_fq_version(), fqSolexaFmt);
- //reset streams
- }
- if (lnCnt[pos] > 100) {//tmp set back to 500
- if (fqPassedFQsdt) {
- fqReadSafe = false;
-#ifdef DEBUG
- cerr << "Switching to fast fastq reader..\n ";
-#endif
- }
- }
- }
- else {
- ret = read_fastq_entry_fast(*(fastq_u[pos]), lnCnt[pos], corrupt);
- }
- if (!stillMore || fastq_u[pos]->eof() || (!*(fastq_u[pos])) ) {
- if (ret != NULL) { if (!ret->seal() || ret->isEmpty()) { ret = NULL; } } //delete ret;
- stillMore = false; break;
- } else if (ret == NULL || !ret->seal() || ret->isEmpty()) {
- corrupt = true;
- }
- if (pos == 0 && pairs_read[pos] % 100 == 0) {
- if (_drawbar(*(fastq_u[pos]))) { stillMore = false; break; }
- }
- else if (!stillMore) { _drawbar(*(fastq_u[pos])); }
-
-
- if (corrupt) {
- //delete ret;
- ret = NULL;
- sync = true;
- repairInStream = true;
- }
- }
- pairs_read[pos]++;
- //last check
- }
- //
- return ret;
-}
-
-void InputStreamer::openMIDseqs(string p,string in){
- if (in==""){return;}
-
- if (fastq_u[2] != NULL){
- cerr << "MID file was already initialized" << endl;
- }
-#ifdef DEBUG
- cerr << "Open Mid Seq file" << endl;
-#endif
-
- string file_type = "MID specific fastq";
- string tmp = (p + in);
- inFiles_fq[2] = tmp;
- if (isGZfile(tmp)){
-#ifdef _gzipread
- fastq_u[2] = new igzstream(tmp.c_str(), ios::in);
- file_type = "MID specific gzipped fastq";
-#else
- cerr << "gzip not supported in your sdm build\n" << tmp; exit(50);
-#endif
- }
- else {
- fastq_u[2] = new ifstream(tmp.c_str(), ios::in);
- }
- if (!*(fastq_u[2])){ cerr << "\nCouldn't find " << file_type<<": " << in << " !\n Aborting..\n"; exit(4); }
-
- hasMIDs=true;
-}
-
-bool InputStreamer::setupFastq(string path, string fileS, int& pairs, string subsPairs,bool simu) {
- allStreamClose(); openedGZ = false;
- minQScore = 1000; maxQScore = -1; fqSolexaFmt = false;
- resetLineCounts();
- vector tfas = splitByCommas(fileS);
- if ( pairs == -1 ) {
- pairs = (int) tfas.size();
- if ( pairs > 1 ) { cerr << "Paired input (\",\" separated) detected\n"; }
- }
- numPairs = pairs;
- string p1(""), p2(""), midp("");
- string xtraMsg = "";
- if (getCurFileN() > 0) {
- xtraMsg = " " + itos(getCurFileN()) + " of " + itos(totalFiles);
- if (BCnumber > 1) {
- xtraMsg = ", looking for " + itos(BCnumber) + "BCs.\n";
- } else {
- xtraMsg = ".\n";
- }
- }
-
- if (tfas.size() != (uint)pairs && subsPairs == "") {
- cerr << "Unequal number of files (" << tfas.size() << ") and option-set paired files (" << pairs << ").\n Aborting...\n"; exit(52);
- }
- if (tfas.size() == 3) {
- if (tfas.size() != 3) { cerr << "Could not detect 3 input files in string\n" << fileS << "\n Aborting.." << endl; exit(76); }
- midp = path + tfas[1];
- p1 = path + tfas[0];
- p2 = path + tfas[2];
-// cerr << p1 << " + " << p2 << " and " << midp << endl;
- } else if (tfas.size() == 2) {
- p1 = path + tfas[0];
- p2 = path + tfas[1];
- } else if (tfas.size() == 1) {
- p1 = path + fileS;
- //cerr << "Reading fastq " << p1 << endl;
- }
- if (subsPairs == "1") {
- p2 = "";
- } else if (subsPairs == "2") {
- p1 = p2; p2 = "";
- }
-
- if (simu) {
- return fileExists(p1) && fileExists(p2) && fileExists(midp);
- }
-
-
- if (p1 != ""&&p2 != "") {
- if (midp != "") {
- cerr << "Reading paired fastq + MID file" << xtraMsg<<"."< 1.f) { _print(_max + 1, 1); _fileLength = 0; return false; }
-
- // Number of #'s as function of current progress "prog"
- int cur((int) ceil(prog * (float) _max));
- if (_last != cur) _last = cur, _print(cur, prog);
- if (prog == 1.f) {
- return true;
- }
- return false;
-
-}
-inline void InputStreamer::_measure(istream& tar) {
- tar.seekg(0, ios_base::end);
- _fileLength = (int) tar.tellg();
- tar.seekg(0, ios_base::beg);
- tar.clear();
-}
-
-bool InputStreamer::setupFastq_2(string p1, string p2, string midp) {
- //setupFastq_2(inFiles_fq[0],inFiles_fq[1],inFiles_fq[2]);
-#ifdef DEBUG
- cerr << "setupFastq2 " << p1<openMIDseqs("", midp);
- }
- return true;
-}
-//mainFile = IS->setupInput(path, uniqueFas[i], FastqF[tarID], FastaF[tarID], QualF[tarID], MIDfq[tarID], fil->isPaired(), cmdArgs["-onlyPair"]);
-string InputStreamer::setupInput(string path, int i, int t, const vector& uF, const vector& FQ, const vector& Fas,
- const vector& Qual, const vector& midf, int &paired, string onlyPair,
- string& shMain, bool simu) {
- string mainFile("");
- if (fnaRead) {
- if (Fas[t] != uF[i]) {
- cerr << "Error in matching FASTA target filenames.\n";
- exit(11);
- }
- this->setupFastaQual(path, Fas[t], Qual[t], paired, onlyPair,simu);
- mainFile = path + Fas[t];
- shMain = Fas[t];
- } else {
- if (FQ[t] != uF[i]) {
- cerr << "Error in matching target filenames.\n";
- exit(11);
- }
- this->setupFastq(path, FQ[t], paired, onlyPair,simu);
- mainFile = path + FQ[t];
- shMain = FQ[t];
- }
- this->openMIDseqs(path, midf[t]);
- return mainFile;
-}
-
-
-bool InputStreamer::setupFastaQual(string path, string Sfil, string Qfil, int& pairs, string subsPairs, bool simu) {
- resetLineCounts();
- allStreamClose();
- vector tfas = splitByCommas(Sfil);
- if ( pairs == -1 ) {
- pairs = (int) tfas.size();
- if ( pairs > 1 ) { cerr << "Paired input (\",\" separated) detected\n"; }
- }
- if (pairs > 1) { cerr << "Paired fasta+qual is currently not implemented\n"; exit(72); }
- if (subsPairs == "1") {
- //
- } else if (subsPairs == "2") {
- //
- }
-
- numPairs = pairs;
- tdn1[0].reset( new DNA("", "")); tdn2[0].reset( new DNA("", ""));
-
- inFiles_fna[0] = path + Sfil;
- inFiles_qual[0] = path + Qfil;
- if (simu) {
- if (!fileExists(inFiles_qual[0],-1,false)) {
- cerr << "Warning: corresponding quality file is missing: " << inFiles_qual[0]<rdbuf()->pubsetbuf(Buffer, RDBUFFER);
- }
- //quality file
- if (fileQ != "") {
- if (isGZfile(fileQ)) {
-#ifdef _gzipread
- qual_u[0] = new igzstream(fileQ.c_str(), ios::in);
- file_typeq = "gzipped quality file";
-#else
- cerr << "gzip not supported in your sdm build \n" << fileQ; exit(50);
-#endif
- } else if (fileQ == "") {
- //setup empty stream
- qual_u[0] = new ifstream();
- qualAbsent = true;
- } else {
- qual_u[0] = new ifstream(fileQ.c_str(), ios::in);
- }
- if (!*(qual_u[0])) {
- cerr << "\nCouldn't find " << file_typeq << " file \"" << fileQ << "\"!\n Running in no qual filter mode\n";
- qualAbsent = true;
- //exit(55);
- }
- if (getCurFileN() > 0) {
- cerr << "Reading Fasta + Quality file " << getCurFileN() << " of " << totalFiles << ".\n";
- } else {
- cerr << "Reading Fasta + Quality file.\n";
- }
- cerr<< file_type << " : " << fileS << endl << file_typeq << " : " << fileQ << endl;
- } else if (fileS != "") {
- if (getCurFileN() > 0) {
- cerr << "Reading Fasta file " << getCurFileN() << " of " << totalFiles << ".\n";
- } else {
- cerr << "Reading Fasta.\n";
- }
- qual_u[0] = new ifstream();
- qualAbsent = true;
- }
- return true;
-}
-void InputStreamer::setupFna(string fileS){
- allStreamClose();
- resetLineCounts();
- numPairs = 1;
- tdn1[0].reset(new DNA("", "")); tdn2[0].reset(new DNA("", ""));
- setupFastaQual2(fileS, "","seq");
- /*cerr << "Reading Fasta file.\n" << fileS << endl;
- string file_type = "seq";
- // INPUT file
- if (isGZfile(fileS)){
-#ifdef _gzipread
- fna_u[0] = new igzstream(fileS.c_str(), ios::in);
- file_type = "gzipped seq";
-#else
- cerr << "gzip not supported in your sdm build"; exit(50);
-#endif
- }
- else {
- //fna[0].open(fileS.c_str(), ios::in);
- fna_u[0] = new ifstream(fileS.c_str(), ios::in);
- }
- if (!fna_u[0]){ cerr << "\nCouldn't find "<= 100 || maxQScore < 2) {
- return fqDiff;
- }
- fqSolexaFmt = false;
- if (minQScore >= 59 && maxQScore > 74){
- fqDiff = (fastQver - 64); fastQver = 64;
- if (minQScore < 64) { //set to illumina1.0 (solexa)
- fqSolexaFmt = true;
- cerr << "\nSetting to illumina 1.0-1.3 (solexa) fastq version (q offset = 64, min Q=-5).\n\n";
- } else {
- cerr << "\nSetting to illumina 1.3-1.8 fastq version (q offset = 64).\n\n";
- }
- } else if (minQScore >= 33 && maxQScore <= 74) {
- fqDiff = (fastQver - 33); fastQver = 33;
- cerr << "\nSetting to Sanger fastq version (q offset = 33).\n\n";
- } else {
- cerr << "\nUndecided fastq version..\n";
- fqDiff = (fastQver - 33); fastQver = 0;
- //exit(53);
- }
- QverSet = true;
- return fqDiff;
-}
-
-void InputStreamer::maxminQualWarns_fq(){
- if (minQScore >= 59-33 && fastQver==33 ){ //set to sanger version, but no low qual over whole dataset -> probably illumina version
- cerr << " WARNING :: \nQuality scores in your dataset are unusually high (min Q=" << minQScore<<"). Please check that you have a fastQ file in NCBI SRA, Sanger or Illumina 1.8+ version.\nIf not, set fastqVersion in option file to \"3\" (Illumina 1.0) or \"2\" (Illumina 1.3 < 1.8) .\n\n";
- }
- if (minQScore < 0 ){ //set to sanger version, but no low qual over whole dataset -> probably illumina version
- cerr << " WARNING :: \nQuality scores in your dataset are unusually low (min Q=" << minQScore << "). Please check that you have a fastQ file in Illumina 1.0 or Illumina 1.3 < 1.8 version.\nIf not, set fastqVersion in option file to \"1\" (NCBI SRA, Sanger or Illumina 1.8+ version).\n\n";
- }
-}
-
-#ifdef _gzipread2
-
-std::vector< char > readline(gzFile f) {
- // gzFile fp =gzopen(fname,"r");
-
- std::vector< char > v(2056);
- int pos = 0;
- for (;;) {
- if (gzgets(f, &v[pos], (int)v.size() - pos) == 0) {
- // end-of-file or error
- int err;
- //const char *msg = gzerror(f, &err);
- if (err != Z_OK) {
- // handle error
- }
- break;
- }
- unsigned read = strlen(&v[pos]);
- if (v[pos + read - 1] == '\n') {
- if (v[pos + read - 2] == '\r') {
- pos = pos + read - 2;
- }
- else {
- pos = pos + read - 1;
- }
- break;
- }
- if (read == 0 || pos + read < v.size() - 1) {
- pos = read + pos;
- break;
- }
- pos = v.size() - 1;
- v.resize(v.size() * 2);
- }
- v.resize(pos);
- return v;
-}
-#endif
\ No newline at end of file
diff --git a/configs/sdm_src/InputStream.h b/configs/sdm_src/InputStream.h
deleted file mode 100644
index 3874649..0000000
--- a/configs/sdm_src/InputStream.h
+++ /dev/null
@@ -1,539 +0,0 @@
-/* sdm: simple demultiplexer
-Copyright (C) 2013 Falk Hildebrand
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program. If not, see .
-*/
-
-
-#ifndef _InputStr_h
-#define _InputStr_h
-
-
-#include "DNAconsts.h"
-#include
-#include
-#include
-
-extern char DNA_trans[256];
-extern short DNA_amb[256];
-extern short NT_POS[256];
-extern short DNA_IUPAC[256 * 256];
-typedef float matrixUnit;
-
-
-string spaceX(uint k);
-int digitsInt(int x);
-int digitsFlt(float x);
-string intwithcommas(int value);
-std::string itos(int number);
-std::string ftos(float number);
-bool isGZfile(const string fileS);//test if file is gzipped input
-
-static inline std::string &rtrim(std::string &s) {
- s.erase(std::find_if(s.rbegin(), s.rend(), std::not1(std::ptr_fun(std::isspace))).base(), s.end());
- return s;
-}
-
-
-//MOCAT header fix
-std::vector header_string_split(const std::string str, const std::string sep);
-void remove_paired_info(string&, short = -1);
-//MOCAT header fix
-std::string header_stem(string& header);
-std::istream& safeGetline(std::istream& is, std::string& t);
-string reverseTS2(const std::string & Seq);
-void reverseTS(std::string & Seq);
-
-
-bool any_lowered(const string& is);
-//this function changes input string (file location) to have consistent file names
-string applyFileIT(string x, int it, const string xtr = "");
-bool fileExists(const std::string& name, int i=-1,bool extiffail=true);
-//vector orderOfVec(vector&);
-
-
-
-
-class ofbufstream {
-public:
- ofbufstream(const string IF, int mif) :file(IF), modeIO(mif), used(0) {
- if (modeIO == ios::out) {
- remove(file.c_str());
- }
- keeper = new char[bufS];
- }
- ~ofbufstream() {
- writeStream();
- delete[] keeper;
- }
- void operator<< (const string& X) {
- size_t lX(X.length());
- if (lX + used > bufS) {
- writeStream();
- }
- memcpy(keeper + used, X.c_str(), lX);
- used += lX;
- }
-private:
- void writeStream() {
- if (used == 0) { return; }
- ofstream of(file.c_str(), ios::app);
- of.write(keeper, used);
- of.close();
- used = 0;
- }
- string file;
- char *keeper;
- int modeIO;
- size_t used;
- static const size_t bufS = 500000;
-};
-
-
-
-inline vector splitByComma(const string& fileS,bool requireTwo, char SrchStr=','){
- string::size_type pos = fileS.find(SrchStr);
- if (pos == string::npos){
- if (requireTwo){
- cerr< (1,fileS);
- }
- }
- vector tfas(2,"");
- tfas[0] = fileS.substr(0,pos);
- tfas[1] = fileS.substr(pos+1);
- return tfas;
-}
-
-inline vector splitByCommas(const string& fileS, char SrchStr = ',') {
- if (fileS.find(SrchStr) == string::npos) { return vector(1, fileS); }
- vector res = splitByComma(fileS, true, SrchStr);
- vector ret(0); ret.push_back(res[0]);
- while (res[1].find(SrchStr) != string::npos) {
- res = splitByComma(res[1], true, SrchStr);
- ret.push_back(res[0]);
- }
- ret.push_back(res[1]);
- return ret;
-}
-
-
-//requires sorted vector with the entries being actual datapoints
-template
-TYPE calc_median2(vector& in, float perc){
- size_t sum = in.size();
- size_t tar = (size_t)(((float)sum) * perc);
- return in[tar];
-}
-
-
-//returns "i_fna" or "i_fastq"
-string detectSeqFmt(const string);
-
-
-class Filters;
-
-class DNA{
-public:
- DNA(string seq, string names) :Seq(seq), SeqLength(Seq.length()),
- ID(names), NewID(names),
- Qual(0),QualTraf(""),Sample(-1),avgQual(-1.f),
- Qsum(0),AccumError(0.),goodQual(false),midQual(false),
- Read_position(-1),
- FtsDetected(),
- IDfixed(false), tempFloat(0.f){}
- DNA():Seq(""),SeqLength(0),ID(""),NewID(""),Qual(0),QualTraf(""),
- Sample(-1), avgQual(-1.f),
- Qsum(0), AccumError(0.), goodQual(false), midQual(false),
- Read_position(-1),
- FtsDetected(),
- IDfixed(false), tempFloat(0.f) {
- }
- bool operator==(DNA i) {
- if (i.getSeqPseudo() == this->getSeqPseudo()) {
- return true;
- } else {
- return false;
- }
- }
- bool operator==(shared_ptr i) {
- if (i->getSeqPseudo() == this->getSeqPseudo()) {
- return true;
- } else {
- return false;
- }
- }
-
- //~DNA(){}
- void append(const string &s) { Seq += s; SeqLength = Seq.length(); }
- void Qappend(const vector &q);
- void setSeq(string & s) { Seq = s; SeqLength = Seq.length();/*DN = Seq.c_str();*/ }
- string &getSeq() { return Seq; }
- string getSeq_c() { return Seq; }
- string getSeqPseudo() { return Seq.substr(0, SeqLength); }
- void setQual(vector& Q) { Qual = Q; avgQual = -1.f; }
- const string& getID() { if (IDfixed) { return NewID; }return ID; }
- string getID_copy() { string x = getID(); string y = x; return y; }
- string getIDPosFree(); // remove /1 /2 #1:0 etc
- const string& getOldID() { return ID; }
- string getIDshort() { return ID.substr(0, getShorterHeadPos(ID)); }
- string getNewIDshort() { return NewID.substr(0, getShorterHeadPos(NewID)); }
- bool seal();
- bool isEmpty() { if (ID.length() == 0 && Seq.length() == 0) { this->setPassed(false); return true; } return false; }
-
- int numACGT();
- float getAvgQual();
- unsigned int getQsum(){return Qsum;}
- float qualWinfloat(unsigned int,float,int&);
-
-
- float binomialFilter(int, float);
- //float qualWinfloat_hybr(int,float,int,float,int&);
- bool qualWinPos(unsigned int,float);
- bool qualAccumTrim(double d);
- int qualAccumulate(double d);
- double getAccumError(){
- if (AccumError == 0.f) { for (uint i = 0; i < Qual.size(); i++) { if (Qual[i] >= 0) { AccumError += SAqualP[Qual[i]]; } } }
- if (std::isinf((double)AccumError)) {
- AccumError = 5.f;
- }
- return AccumError;}
- int minQual(){int mq=50; for (uint i=0;i&, vector&);
-
-
- inline uint length() { return (uint)SeqLength; }
- inline uint mem_length() { return (uint) Seq.length(); }
- bool cutSeq(int start, int stop=-1, bool = false);
- bool HomoNTRuns(int);
- int matchSeq(string, int, int, int);
- void reverse_transcribe();
- int matchSeqRev(string, int, int, int=0);
- int matchSeq_tot(string, int, int, int&);
- void writeSeq(ostream&,bool singleLine=false);
- void writeQual(ostream&, bool singleLine = false);
- void writeFastQ(ostream&,bool=true);
- void writeFastQ(ofbufstream&, bool = true);
- void writeFastQEmpty(ostream&);
- void setNewID(string x) { NewID = x; }
- void newHEad(string x){NewID=x;ID=x;}
- void changeHeadPver(int ver);
- void setTA_cut(bool x) { FtsDetected.TA_cut = x; }
- bool getTA_cut() { return FtsDetected.TA_cut; }
- void setBarcodeCut() { FtsDetected.Barcode_cut = true; FtsDetected.Barcode_detected = true; }
- bool getBarcodeCut() { return FtsDetected.Barcode_cut; }
- void setBarcodeDetected(bool x){ FtsDetected.Barcode_detected = x; }
- bool getBarcodeDetected() { return FtsDetected.Barcode_detected; }
- bool isMIDseq() { if (Read_position == 3) { return true; } return false; }
- void setMIDseq(bool b){ if (b){ Read_position = 3; } }
- void setpairFWD(){ Read_position = 0; }
- void setpairREV(){ Read_position = 1; }
- int getReadMatePos() { return (int) Read_position; }
- bool sameHead(shared_ptr);
- bool sameHead(const string&);
- //inline void reverseTranscribe();
- void setTempFloat(float i){tempFloat = i;}
- float getTempFloat(){return tempFloat;}
- void adaptHead(shared_ptr,const int,const int);
- void failed(){goodQual=false;midQual=false;}
- bool control(){ if (Qual.size()==0){return false;}return true;}
- void setBCnumber(int i, int BCoff) { if (i < 0) { Sample = i ; FtsDetected.Barcode_detected = false; } else { Sample = i + BCoff; FtsDetected.Barcode_detected = true; } }
- int getBCnumber();//always return BC tag IDX global (no local filter idx accounted for, use getBCoffset() to correct)
-
- void prepareWrite(int fastQver);
- void reset();
- void resetTruncation() { SeqLength = Seq.length(); }
- void setPassed(bool b);
- void setMidQual(bool b) { midQual = b; }
- bool isPassed(void){return goodQual;}
- bool isMidQual(void){return midQual;}
- string getSubSeq(int sta, int sto){return Seq.substr(sta,sto);}
- void resetQualOffset(int off, bool solexaFmt);
-
- //control & check what happened to any primers (if)
- bool has2PrimersDetected() { return (FtsDetected.reverse && FtsDetected.forward); }
- bool getRevPrimCut() { return FtsDetected.reverse; }
- bool getFwdPrimCut() { return FtsDetected.forward; }
- void setRevPrimCut() { FtsDetected.reverse = true; }
- void setFwdPrimCut() { FtsDetected.forward = true; }
- //only used in pre best seed step
- //float getSeedScore() { return tempFloat; }
- //void setSeedScore(float i) { tempFloat = (float)i; }
-
- struct QualStats {
- bool maxL; bool PrimerFail; bool AvgQual; //sAvgQual
- bool HomoNT; bool PrimerRevFail; bool minL;
- bool minLqualTrim; //<-sMinQTrim trimmed due to quality
- bool TagFail; bool MaxAmb; bool QualWin;//sQualWin
- bool AccErrTrimmed; bool QWinTrimmed; // either of these makes bool Trimmed;
- bool fail_correct_BC; bool suc_correct_BC; bool
- failedDNAread;
- //bool adapterRem; -> setTA_cut
- bool RevPrimFound;
- bool BinomialErr; bool dblTagFail;
- QualStats() :
- maxL(false), PrimerFail(false), AvgQual(false), HomoNT(false),
- PrimerRevFail(true), minL(false), minLqualTrim(false),
- TagFail(false), MaxAmb(false), QualWin(false),
- AccErrTrimmed(false), QWinTrimmed(false),
- fail_correct_BC(false), suc_correct_BC(false),
- failedDNAread(false), RevPrimFound(false),
- BinomialErr(false),
- dblTagFail(false)
- {}
- } QualCtrl;
-
-protected:
- size_t getShorterHeadPos(const string & x, int fastQheadVer=-1) {
-
- size_t pos(string::npos);
- if (fastQheadVer != 0) {
- if (Read_position == 1) {
- pos = x.find("/2");
- // if (pos == string::npos) { pos = x.find_first_of(" 1:");}
- // if (pos == string::npos) { pos = x.find_first_of("/1"); }
- }
- else if (Read_position == 0) {
- pos = x.find("/1");
- }
- else {
- pos = x.find("/1");
- if (pos == string::npos) { pos = x.find("/2"); }
- }
- }
- //if (pos == string::npos){pos=x.length()-min((size_t)5,x.length());}}
- //if(pos<0){pos=0;}
- if (pos == string::npos) { pos = min(x.find(' '), x.find('\t')); }
-
- if (pos == string::npos) { pos = x.length(); }
- return pos;
- }
- //mainly used to mark if rev/Fwd primer was detected
- string xtraHdStr();
- size_t getSpaceHeadPos(const string & x) {
- size_t pos = x.find(' ');
- if (pos == string::npos) { pos = x.length(); }
- return pos;
- }
- //binomial accumulated error calc
- inline float interpolate(int errors1, float prob1, int errors2, float prob2, float alpha);
- float sum_of_binomials(const float j, int k, float n, int qual_length, const vector& error_probs, const vector< vector> & per_position_accum_probs);
- inline float prob_j_errors(float p, float j, float n);
-
- inline bool matchDNA(char,char);
-
- string Seq;
- size_t SeqLength;
- string ID,NewID; //original and newly constructed ID
- vector Qual;
- string QualTraf;
- int Sample;
-
- //const char* DN;
- float avgQual;
- unsigned int Qsum;
- double AccumError;
- bool goodQual,midQual;
- //bool TA_cut, Barcode_cut; //technical adapter, barcode (tag)
- short Read_position;//-1=unkown; 0=pair1 (fwd primer); 1=pair2 (rev primer); 3=MID seq ;
-
- struct ElementsDetection{
- bool forward; bool reverse;//primers detected
- bool TA_cut; bool Barcode_detected; bool Barcode_cut;
- ElementsDetection() :forward(false), reverse(false), TA_cut(false), Barcode_detected(false), Barcode_cut(false) {}
- void reset() { forward = false; reverse = false; TA_cut = false; Barcode_detected = false; Barcode_cut = false; }
- } FtsDetected;
-
-
-
- bool IDfixed;
- float tempFloat;
-};
-
-struct DNAHasher
-{
- size_t operator()(shared_ptr k) const
- {
- // Compute individual hash values for two data members and combine them using XOR and bit shifting
- return ((hash()(k->getSeqPseudo())) >> 1);
- }
-};
-
-
-
-
-
-class DNAunique : public DNA{//used for dereplication
-public:
- DNAunique() : DNA(), Count(0), pair(0){}//chimeraCnt((matrixUnit) 1),
- DNAunique(string s, string x) :DNA(s, x), Count(1) {}
- DNAunique(shared_ptrd, int BC) : DNA(*d), Count(0), BestSeedLength( (uint)Seq.size()),pair(0){ addSmpl(BC); }
- ~DNAunique() { ; }// if (pair != NULL) { delete pair; }
- //string Seq; string ID;
- void Count2Head(bool);
- void addSmpl(int k);
- void writeMap(ofstream & o, const string&, vector&, const vector&);
- inline int getCount() { return Count; }
- uint getBestSeedLength() { return BestSeedLength; }
- void setBestSeedLength(uint i) { BestSeedLength = i; }
- void setOccurence(int smpl, int N);
- void transferOccurence(shared_ptr);
- const unordered_map & getDerepMap() { return occurence; }
- vector getDerepMapSort(size_t);
- //vector> getDerepMapSort2(size_t wh);
- void getDerepMapSort(vector&, vector&);
- void saveMem() { QualTraf = ""; NewID = ID.substr(0, getSpaceHeadPos(ID)); ID = ""; }
- void attachPair(shared_ptr d) { pair = d; pair->saveMem(); }
- shared_ptr getPair(void) { return pair; }
- //estimates if one sample occurence covers the unique counts required for sample specific derep min counts
- bool pass_deprep_smplSpc( const vector&);
-
- //matrixUnit chimeraSplitNum() { return chimeraCnt; }
- //void setChimSplitNum(matrixUnit x) { chimeraCnt = x; }
- //sort
- //bool operator < (const DNAunique& str) const { return (Count < str.Count); }
-private:
- int Count;
- //matrixUnit chimeraCnt;
- int BestSeedLength;
- unordered_map