From 59d4082ab717c1cce4316bdf7ec0b4ca0ec4c73a Mon Sep 17 00:00:00 2001 From: Falk Hildebrand Date: Fri, 2 Aug 2024 17:40:42 +0100 Subject: [PATCH] sdm 2.18: added AVITI fastq support --- DNAconsts.h | 3 ++- InputStream.cpp | 47 ++++++++++++++++++++++++++++++----------------- InputStream.h | 4 ++-- containers.cpp | 25 ++++++++++++++++++++----- 4 files changed, 54 insertions(+), 25 deletions(-) diff --git a/DNAconsts.h b/DNAconsts.h index 650107f..780afd3 100644 --- a/DNAconsts.h +++ b/DNAconsts.h @@ -127,8 +127,9 @@ along with this program. If not, see . //2.14: new model of preallocating space for input strings //2.16: new threading model (no more dynamic threadpool) (reversed to old model) //2.17: fixed PB rev seq checks.. +//2.18: fastq autoversion updated (more robust) -static const float sdm_version = 2.17f; +static const float sdm_version = 2.18f; static const char* sdm_status = "beta"; diff --git a/InputStream.cpp b/InputStream.cpp index 75f8c1a..60cea89 100644 --- a/InputStream.cpp +++ b/InputStream.cpp @@ -1969,10 +1969,10 @@ shared_ptr InputStreamer::read_fastq_entry(istream & fna, qual_score &minQ if ((lnCnt ) % 4 != 0) { fqPassedFQsdt = false; } - + bool printB(false); for (size_t i = 0; i < line.length(); i++) { //really 33? - Iqual[qcnt] = minmaxQscore((qual_score)line[i] - fastQver); + Iqual[qcnt] = minmaxQscore((qual_score)line[i] - fastQver, printB); qcnt++; } if (qcnt == tdn->length()) { @@ -2068,27 +2068,34 @@ shared_ptr InputStreamer::read_fastq_entry_fast(istream & fna, int& lnCnt, corrupt = false; return tdn; } -void InputStreamer::minmaxQscore(shared_ptr t) { +void InputStreamer::minmaxQscore(shared_ptr t, bool& print) { fqverMTX.lock(); vector Qs = t->getQual(); for (size_t i = 0; i < Qs.size(); i++) { - minmaxQscore(Qs[i]); + minmaxQscore(Qs[i], print); } fqverMTX.unlock(); } -qual_score InputStreamer::minmaxQscore(qual_score t) { +qual_score InputStreamer::minmaxQscore(qual_score t,bool& print) { if (t < 0) { ////quick hack, since q<13 scores are the only deviants and uninteresting in most cases.. - if (fqSolexaFmt){ - if (t < -5) { + if (fqSolexaFmt ){ + if (t < -5 && print) { cerr << "Unusually low sloexa quality score (" << t << "); setting to 0.\n"; + print = false; } } else { if (t >= -5) { - cerr << "Resetting auto format to Solexa (illumina 1.0-1.3) format.\n"; - fqSolexaFmt = true; + if (print) { + cerr << "Resetting auto format to Solexa (illumina 1.0-1.3) format.\n"; + print = false; + } + //fqSolexaFmt = true; } else { - cerr << "Unusually low quality score (" << t << "); setting to 0.\n"; + if (print) { + cerr << "Unusually low quality score (" << t << "); setting to 0.\n"; + print = false; + } } } t = 0; @@ -2565,9 +2572,10 @@ bool InputStreamer::getDNAlines(vector& ret, int pos) { }*/ lnCnt[pos] += 4; if (fastQver == 0 || lnCnt[pos] == 4) { // ok first time just has to be done in function, after this can be unloaded outside + bool printB(true); //cerr << "AutoFQ!!"; shared_ptr tdn = make_shared(ret, fastQver); - minmaxQscore(tdn); + minmaxQscore(tdn,printB); auto_fq_version(); //tdn->resetQualOffset(auto_fq_version(), fqSolexaFmt); } @@ -2983,26 +2991,31 @@ void InputStreamer::setupFna(string fileS){ int InputStreamer::auto_fq_version() { //if (QverSet) { return 33; } int fqDiff(0); - if (minQScore >= 100 || maxQScore < 2) { + if (minQScore >= 100 || maxQScore < 2 || fastQver!=0) { + //don;t know what to do here.. return fqDiff; } fqverMTX.lock(); fqSolexaFmt = false; - if (minQScore >= 59 && maxQScore > 74){ + + if (minQScore >= 33 && maxQScore <= 88 ){ + fqDiff = (fastQver - 33); fastQver = 33; + + } else if (minQScore >= 59 && maxQScore > 74) { fqDiff = (fastQver - 64); fastQver = 64; if (minQScore < 64) { //set to illumina1.0 (solexa) fqSolexaFmt = true; //cerr << "Setting to illumina 1.0-1.3 (solexa) fastq version (q offset = 64, min Q=-5)."; - } else { + } + else { //cerr << "Setting to illumina 1.3-1.8 fastq version (q offset = 64)."; } - } else if (minQScore >= 33 && maxQScore <= 74) { - fqDiff = (fastQver - 33); fastQver = 33; //cerr << "Setting to Sanger fastq version (q offset = 33)."; } else { if (verbose) { - cerr << "\nUndecided fastq version..\n"; verbose = 0; + cerr << "\nUndecided fastq version.. reverting to +33\n"; verbose = 0; } + //fallback to sensible std.. fqDiff = (fastQver - 33); fastQver = 33; //exit(53); diff --git a/InputStream.h b/InputStream.h index 85a18f4..605567c 100644 --- a/InputStream.h +++ b/InputStream.h @@ -1227,8 +1227,8 @@ class InputStreamer{ private: string current_infiles(); - inline qual_score minmaxQscore(qual_score t);// , int lnCnt); - void minmaxQscore(shared_ptr t);// , int lnCnt); + inline qual_score minmaxQscore(qual_score t, bool& print);// , int lnCnt); + void minmaxQscore(shared_ptr t, bool&);// , int lnCnt); bool setupFastq_2(string, string, string); bool setupFastaQual2(string, string, string = "fasta file"); shared_ptr read_fastq_entry(istream & fna, qual_score &minQScore, diff --git a/containers.cpp b/containers.cpp index c4a7ae2..c7e469f 100644 --- a/containers.cpp +++ b/containers.cpp @@ -6876,6 +6876,13 @@ bool UClinks::getMAPPERline(string& segs, string& segs2,float& perID, else if (tmp == "noisy_chimera" || tmp == "good_chimera") { //tmp == "perfect_chimera" || if (!doChimeraCnt) { continue; } chimera = true; +//M04428:252:000000000-BM3M5:1:1109:4503:15901;size=52; noisy_chimera +// dqt=20;dqm=2;div=9.0;segs=2;parents=M04428:252:000000000-BM3M5:1:2110:24763:14436(1-78/0)+M04428:252:000000000-BM3M5:1:1116:5359:16203(78-186/0); + + getline(ss, tmp, '\t'); + size_t p1(tmp.find("parents=") + 4);//4 + size_t p2(tmp.find(");", p1) + 2); + segs2 = tmp.substr(p1, p2 - p1 - 1); } else if (tmp == "perfect_chimera") { removeSizeStr(segs); @@ -6915,12 +6922,20 @@ bool UClinks::getMAPPERline(string& segs, string& segs2,float& perID, //finished parsing, now reformat to get matching sample - if ( chimera && UPARSE8up) { - tarsV = splitByComma(segs2, false, '+'); - for ( uint kk = 0; kk < tarsV.size(); kk++ ) { - tarsV[kk] = tarsV[kk].substr(0,tarsV[kk].find_last_of("(")); + if (chimera){ + if (UPARSE9up) { + tarsV = splitByComma(segs2, false, '+'); + for (uint kk = 0; kk < tarsV.size(); kk++) { + tarsV[kk] = tarsV[kk].substr(0, tarsV[kk].find_last_of("(")); + } } - } else if (!chimera){ + else if (UPARSE8up) { + tarsV = splitByComma(segs2, false, '+'); + for (uint kk = 0; kk < tarsV.size(); kk++) { + tarsV[kk] = tarsV[kk].substr(0, tarsV[kk].find_last_of("(")); + } + } + } else { tarsV.push_back(segs2); } matrixUnit splChim = (matrixUnit)tarsV.size();