Merge pull request #68 from gymreklab/max_read_thresh

Added max processed reads per sample to avoid getting stuck in telomeres
gymreklab · Apr 22, 2019 · 2a0e229 · 2a0e229
2 parents 7ff3a23 + 075fabf
commit 2a0e229
Show file tree

Hide file tree

Showing 5 changed files with 17 additions and 3 deletions.
diff --git a/src/main_gangstr.cpp b/src/main_gangstr.cpp
@@ -78,6 +78,7 @@ void show_help() {
 	   << "\t" << "--numbstrap   <int>           " << "\t" << "Number of bootstrap samples. Default: " << options.num_boot_samp << "\n"
 	   << "\t" << "--grid-threshold <int>        " << "\t" << "Use optimization rather than grid search to find MLE if more than this many possible alleles. Default: " << options.grid_threshold << "\n"
 	   << "\t" << "--rescue-count <int>          " << "\t" << "Number of regions that GangSTR attempts to rescue mates from (excluding off-target regions) Default: " << options.rescue_count << "\n"
+	   << "\t" << "--max-proc-read <int>         " << "\t" << "Maximum number of processed reads per sample before a region is skipped. Default: " << options.max_processed_reads_per_sample << "\n"
 	   << "\n Parameters for local realignment:\n"
 	   << "\t" << "--minscore    <int>           " << "\t" << "Minimum alignment score (out of 100). Default: " << options.min_score << "\n"
 	   << "\t" << "--minmatch    <int>           " << "\t" << "Minimum number of matching basepairs on each end of enclosing reads. Default: " << options.min_match<< "\n"
@@ -104,6 +105,7 @@ void show_help() {
 
 void parse_commandline_options(int argc, char* argv[], Options* options) {
   enum LONG_OPTIONS {
+    OPT_MAXPROCREAD,
     OPT_SKIPQ,
     OPT_MINREAD,
     OPT_PERIOD,
@@ -147,6 +149,7 @@ void parse_commandline_options(int argc, char* argv[], Options* options) {
     OPT_VERSION,
   };
   static struct option long_options[] = {
+    {"max-proc-read", required_argument, NULL, OPT_MAXPROCREAD},
     {"skip-qscore", no_argument, NULL, OPT_SKIPQ},
     {"min-sample-reads", required_argument, NULL, OPT_MINREAD},
     {"period",      required_argument,  NULL, OPT_PERIOD},
@@ -197,6 +200,9 @@ void parse_commandline_options(int argc, char* argv[], Options* options) {
                    long_options, &option_index);
   while (ch != -1) {
     switch (ch) {
+    case OPT_MAXPROCREAD:
+      options->max_processed_reads_per_sample = atoi(optarg);
+      break;
     case OPT_SKIPQ:
       options->skip_qscore = true;
       break;

diff --git a/src/options.cpp b/src/options.cpp
@@ -88,6 +88,7 @@ Options::Options() {
   hist_mode = false;
   rescue_count = 0;
   min_reads_per_sample = 500;
+  max_processed_reads_per_sample = 3000;
   skip_qscore = false;
 }
 

diff --git a/src/options.h b/src/options.h
@@ -102,6 +102,8 @@ class Options {
   int32_t rescue_count;
   // Minimum number of reads per sample for successfull calculation of coverage
   int32_t min_reads_per_sample;
+  // Maximum number of processed reads per sample (to avoid massive coverage regions like telomeres)
+  int32_t max_processed_reads_per_sample;
   // Skip calculation of qscore
   bool skip_qscore;
 };

diff --git a/src/read_extractor.cpp b/src/read_extractor.cpp
@@ -49,14 +49,14 @@ bool ReadExtractor::ExtractReads(BamCramMultiReader* bamreader,
 				 const int32_t& regionsize,
 				 const int32_t& min_match, 
 				 std::map<std::string, LikelihoodMaximizer*> sample_likelihood_maximizers) {
-
+  total_processed_reads = 0;
   // This will keep track of information for each read pair
   std::map<std::string, ReadPair> read_pairs;
 
   if (!ProcessReadPairs(bamreader, locus, regionsize, min_match, &read_pairs, sample_info.GetIsCustomRG())) {
     return false;
   }
-
+  
   int32_t frr = 0, span = 0, encl = 0, flank = 0, offt = 0; // TODO do these need to be per sample?
   if (read_pairs.size() == 0){
     PrintMessageDieOnError("\tNot enough reads extracted. Aborting..", M_PROGRESS, options.quiet);
@@ -243,6 +243,10 @@ bool ReadExtractor::ProcessReadPairs(BamCramMultiReader* bamreader,
   BamAlignment alignment;
 
   while (bamreader->GetNextAlignment(alignment)) {
+    if (total_processed_reads > options.max_processed_reads_per_sample) {
+      PrintMessageDieOnError("Region exceeds maximum total processed reads per sample.", M_WARNING, false);
+      return false;
+    }
     std::string read_group;
     std::string fname = alignment.file_;
     if (debug) {
@@ -678,7 +682,7 @@ bool ReadExtractor::ProcessSingleRead(BamAlignment alignment,
 				      int32_t* score_value,
 				      ReadType* read_type,
 				      SingleReadType* srt) {
-
+  total_processed_reads++;
   /* Pull out sample ID so can get mean/sdev */
   std::string sample, rgid, rgtag;
   /* Pull out sample ID so can get mean/sdev */

diff --git a/src/read_extractor.h b/src/read_extractor.h
@@ -96,6 +96,7 @@ class ReadExtractor {
   const Options options;
   SampleInfo sample_info;
   ofstream readfile_;
+  int32_t total_processed_reads;
   // SSW Objects
   StripedSmithWaterman::Aligner* ssw_aligner;
   StripedSmithWaterman::Filter* ssw_filter;