Skip to content

Commit

Permalink
Merge pull request #68 from gymreklab/max_read_thresh
Browse files Browse the repository at this point in the history
Added max processed reads per sample to avoid getting stuck in telomeres
  • Loading branch information
nmmsv authored Apr 22, 2019
2 parents 7ff3a23 + 075fabf commit 2a0e229
Show file tree
Hide file tree
Showing 5 changed files with 17 additions and 3 deletions.
6 changes: 6 additions & 0 deletions src/main_gangstr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ void show_help() {
<< "\t" << "--numbstrap <int> " << "\t" << "Number of bootstrap samples. Default: " << options.num_boot_samp << "\n"
<< "\t" << "--grid-threshold <int> " << "\t" << "Use optimization rather than grid search to find MLE if more than this many possible alleles. Default: " << options.grid_threshold << "\n"
<< "\t" << "--rescue-count <int> " << "\t" << "Number of regions that GangSTR attempts to rescue mates from (excluding off-target regions) Default: " << options.rescue_count << "\n"
<< "\t" << "--max-proc-read <int> " << "\t" << "Maximum number of processed reads per sample before a region is skipped. Default: " << options.max_processed_reads_per_sample << "\n"
<< "\n Parameters for local realignment:\n"
<< "\t" << "--minscore <int> " << "\t" << "Minimum alignment score (out of 100). Default: " << options.min_score << "\n"
<< "\t" << "--minmatch <int> " << "\t" << "Minimum number of matching basepairs on each end of enclosing reads. Default: " << options.min_match<< "\n"
Expand All @@ -104,6 +105,7 @@ void show_help() {

void parse_commandline_options(int argc, char* argv[], Options* options) {
enum LONG_OPTIONS {
OPT_MAXPROCREAD,
OPT_SKIPQ,
OPT_MINREAD,
OPT_PERIOD,
Expand Down Expand Up @@ -147,6 +149,7 @@ void parse_commandline_options(int argc, char* argv[], Options* options) {
OPT_VERSION,
};
static struct option long_options[] = {
{"max-proc-read", required_argument, NULL, OPT_MAXPROCREAD},
{"skip-qscore", no_argument, NULL, OPT_SKIPQ},
{"min-sample-reads", required_argument, NULL, OPT_MINREAD},
{"period", required_argument, NULL, OPT_PERIOD},
Expand Down Expand Up @@ -197,6 +200,9 @@ void parse_commandline_options(int argc, char* argv[], Options* options) {
long_options, &option_index);
while (ch != -1) {
switch (ch) {
case OPT_MAXPROCREAD:
options->max_processed_reads_per_sample = atoi(optarg);
break;
case OPT_SKIPQ:
options->skip_qscore = true;
break;
Expand Down
1 change: 1 addition & 0 deletions src/options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ Options::Options() {
hist_mode = false;
rescue_count = 0;
min_reads_per_sample = 500;
max_processed_reads_per_sample = 3000;
skip_qscore = false;
}

Expand Down
2 changes: 2 additions & 0 deletions src/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ class Options {
int32_t rescue_count;
// Minimum number of reads per sample for successfull calculation of coverage
int32_t min_reads_per_sample;
// Maximum number of processed reads per sample (to avoid massive coverage regions like telomeres)
int32_t max_processed_reads_per_sample;
// Skip calculation of qscore
bool skip_qscore;
};
Expand Down
10 changes: 7 additions & 3 deletions src/read_extractor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,14 +49,14 @@ bool ReadExtractor::ExtractReads(BamCramMultiReader* bamreader,
const int32_t& regionsize,
const int32_t& min_match,
std::map<std::string, LikelihoodMaximizer*> sample_likelihood_maximizers) {

total_processed_reads = 0;
// This will keep track of information for each read pair
std::map<std::string, ReadPair> read_pairs;

if (!ProcessReadPairs(bamreader, locus, regionsize, min_match, &read_pairs, sample_info.GetIsCustomRG())) {
return false;
}

int32_t frr = 0, span = 0, encl = 0, flank = 0, offt = 0; // TODO do these need to be per sample?
if (read_pairs.size() == 0){
PrintMessageDieOnError("\tNot enough reads extracted. Aborting..", M_PROGRESS, options.quiet);
Expand Down Expand Up @@ -243,6 +243,10 @@ bool ReadExtractor::ProcessReadPairs(BamCramMultiReader* bamreader,
BamAlignment alignment;

while (bamreader->GetNextAlignment(alignment)) {
if (total_processed_reads > options.max_processed_reads_per_sample) {
PrintMessageDieOnError("Region exceeds maximum total processed reads per sample.", M_WARNING, false);
return false;
}
std::string read_group;
std::string fname = alignment.file_;
if (debug) {
Expand Down Expand Up @@ -678,7 +682,7 @@ bool ReadExtractor::ProcessSingleRead(BamAlignment alignment,
int32_t* score_value,
ReadType* read_type,
SingleReadType* srt) {

total_processed_reads++;
/* Pull out sample ID so can get mean/sdev */
std::string sample, rgid, rgtag;
/* Pull out sample ID so can get mean/sdev */
Expand Down
1 change: 1 addition & 0 deletions src/read_extractor.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ class ReadExtractor {
const Options options;
SampleInfo sample_info;
ofstream readfile_;
int32_t total_processed_reads;
// SSW Objects
StripedSmithWaterman::Aligner* ssw_aligner;
StripedSmithWaterman::Filter* ssw_filter;
Expand Down

0 comments on commit 2a0e229

Please sign in to comment.