From 85c8320f85d59f152bf2b7788b1238155725a101 Mon Sep 17 00:00:00 2001
From: Giuseppe Narzisi <gnarzisi@nygenome.org>
Date: Fri, 9 Feb 2018 13:21:44 -0500
Subject: [PATCH] minor release v1.0.3; updated readme; check input start/end
 of input regions for out of range values.

---
 README.md             |  4 ++--
 src/Lancet.cc         | 29 ++++++++++++++++++++++-------
 src/Lancet.hh         |  2 +-
 src/Microassembler.cc |  2 +-
 4 files changed, 26 insertions(+), 11 deletions(-)
diff --git a/README.md b/README.md
index 177161c1..7898aa65 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,6 @@ Lancet is based on the colored de Bruijn graph assembly paradigm where tumor and
 
 Narzisi G, Corvelo A, Arora K, Bergmann E, Shah M, Musunuri R, Emde AK, Robine N, Vacic V, Zody MC. *Lancet: genome-wide somatic variant calling using localized colored DeBruijn graphs.* (2017) bioRxiv 196311; doi: [https://doi.org/10.1101/196311](https://doi.org/10.1101/196311)
 
-* Version: 1.0.1
 * Author: Giuseppe Narzisi, [New York Genome Center](https://www.nygenome.org)
 
 Lancet is freely available for academic and non-commercial research purposes ([`LICENSE.txt`](https://github.com/nygenome/lancet/blob/master/LICENSE.txt)).  
@@ -143,7 +142,7 @@ The final graph (after compression) containing one single variant is depicted be
  _____|\__,_|_|  _|\___|\___|\__|
 
 Program: lancet (micro-assembly somatic variant caller)
-Version: 1.0.2 (beta), Ocotber 17 2017
+Version: 1.0.3 (beta), January 31 2018
 Contact: Giuseppe Narzisi <gnarzisi@nygenome.org>
 
 Usage: lancet [options] --tumor <BAM file> --normal <BAM file> --ref <FASTA file> --reg <chr:start-end>
@@ -169,6 +168,7 @@ Optional
    --max-avg-cov, -u         <int>         : maximum average coverage allowed per region [default: 10000]
    --low-cov, -d             <int>         : low coverage threshold [default: 1]
    --window-size, -w         <int>         : window size of the region to assemble (in base-pairs) [default: 600]
+   --padding, -P             <int>         : left/right padding (in base-pairs) applied to the input genomic regions [default: 300]
    --dfs-limit, -F           <int>         : limit dfs/bfs graph traversal search space [default: 1000000]
    --max-indel-len, -T       <int>         : limit on size of detectable indel [default: 500]
    --max-mismatch, -M        <int>         : max number of mismatches for near-perfect repeats [default: 2]
diff --git a/src/Lancet.cc b/src/Lancet.cc
index f254bdcf..a820925d 100644
--- a/src/Lancet.cc
+++ b/src/Lancet.cc
@@ -70,7 +70,7 @@ void printHelpText(Filters & filters) {
 		"   --max-avg-cov, -u         <int>         : maximum average coverage allowed per region [default: " << MAX_AVG_COV << "]\n"
 		"   --low-cov, -d             <int>         : low coverage threshold [default: " << LOW_COV_THRESHOLD << "]\n"
 		"   --window-size, -w         <int>         : window size of the region to assemble (in base-pairs) [default: " << WINDOW_SIZE << "]\n"
-		"   --padding, -P             <int>         : left/right padding (in base-pairs) for regions in BED file [default: " << PADDING << "]\n"
+		"   --padding, -P             <int>         : left/right padding (in base-pairs) applied to the input genomic regions [default: " << PADDING << "]\n"
 		"   --dfs-limit, -F           <int>         : limit dfs/bfs graph traversal search space [default: " << DFS_LIMIT << "]\n"
 		"   --max-indel-len, -T       <int>         : limit on size of detectable indel [default: " << MAX_INDEL_LEN << "]\n"
 		"   --max-mismatch, -M        <int>         : max number of mismatches for near-perfect repeats [default: " << MAX_MISMATCH << "]\n"
@@ -173,7 +173,6 @@ void printConfiguration(ostream & out, Filters & filters)
 	out << endl;
 }
 
-
 // loadRef
 //////////////////////////////////////////////////////////////
 int loadRefs(const string reference, const string region, vector< map<string, Ref_t *> > &reftable, RefVector &bamrefs, int num_threads, int thread)	
@@ -214,6 +213,22 @@ int loadRefs(const string reference, const string region, vector< map<string, Re
 		CHR  	 = hdr.substr(0,x);
 		START	 = hdr.substr(x+1, y-x-1);
 		END   	 = hdr.substr(y+1, string::npos);
+				
+		int SP = stoi(START) - PADDING;
+		int EP = stoi(END) + PADDING;
+		
+		if(SP<0) {SP=0;} // start position cannot be negative
+		// check chromosome size
+		std::vector<RefData>::iterator it;
+	    for (it = bamrefs.begin() ; it != bamrefs.end(); ++it) {
+			if (it->RefName == CHR) {
+				if(EP > it->RefLength) { EP = it->RefLength; } 
+				break; 
+			}
+		}		
+		// save updated coordinates
+		START = itos(SP);
+		END = itos(EP);
 	}
 	//cerr << CHR << ":" << START << "-" << END << endl; 
 	string REG = CHR+":"+START+"-"+END;
@@ -692,13 +707,13 @@ int main(int argc, char** argv)
 			case 'Y': MIN_REPORT_LEN   = atoi(optarg); break;
 			case 'D': DIST_FROM_STR    = atoi(optarg); break;
 
-			case 'E': filters.minPhredFisherSTR = atoi(optarg); break;			
-			case 's': filters.minPhredFisher = atoi(optarg); break;
-			case 'f': filters.minStrandBias = atoi(optarg); break;
+			case 'E': filters.minPhredFisherSTR = atof(optarg); break;			
+			case 's': filters.minPhredFisher = atof(optarg); break;
+			case 'f': filters.minStrandBias = atof(optarg); break;
 			case 'a': filters.minAltCntTumor = atoi(optarg); break;
 			case 'm': filters.maxAltCntNormal = atoi(optarg); break;
-			case 'e': filters.minVafTumor = atoi(optarg); break;
-			case 'i': filters.maxVafNormal = atoi(optarg); break;
+			case 'e': filters.minVafTumor = atof(optarg); break;
+			case 'i': filters.maxVafNormal = atof(optarg); break;
 			case 'o': filters.minCovTumor = atoi(optarg); break;
 			case 'y': filters.maxCovTumor = atoi(optarg); break;
 			case 'z': filters.minCovNormal = atoi(optarg); break;
diff --git a/src/Lancet.hh b/src/Lancet.hh
index 3a092c04..659dbd33 100644
--- a/src/Lancet.hh
+++ b/src/Lancet.hh
@@ -26,7 +26,7 @@
 
 #include "Microassembler.hh"
 
-string VERSION = "1.0.3 (beta), January 31 2018";
+string VERSION = "1.0.3 (beta), February 9 2018";
 
 /****  configuration parameters ****/
 int NUM_THREADS = 1;
diff --git a/src/Microassembler.cc b/src/Microassembler.cc
index b94a53eb..d9a50fef 100644
--- a/src/Microassembler.cc
+++ b/src/Microassembler.cc
@@ -293,7 +293,7 @@ bool Microassembler::isActiveRegion(BamReader &reader, Ref_t *refinfo, BamRegion
 				// parse MD string
 				// String for mismatching positions. Regex : [0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*10
 				al.GetTag("MD", md); // get string of mismatching positions
-				//cerr << "Q: " << al.Qualities << endl;
+				//cerr << "MD: " << md << " alstart: " << alstart << " Q: " << al.Qualities << " MinQ " << MIN_QUAL_CALL << endl;
 				parseMD(md, mapX, alstart, al.Qualities, MIN_QUAL_CALL);
 				
 				// add SNV to database