Skip to content

Commit

Permalink
Add --max-graph-scale/-g option to surject and make giving up scale w…
Browse files Browse the repository at this point in the history
…ith read size
  • Loading branch information
adamnovak committed Jul 3, 2024
1 parent 88c3ff6 commit a6dfc43
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 8 deletions.
2 changes: 1 addition & 1 deletion src/subcommand/mpmap_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1850,7 +1850,7 @@ int main_mpmap(int argc, char** argv) {
surjector->adjust_alignments_for_base_quality = qual_adjusted;
if (transcriptomic) {
// FIXME: replicating the behavior in surject_main
surjector->max_subgraph_bases = 16 * 1024 * 1024;
surjector->max_subgraph_bases_per_read_base = Surjector::SPLICED_DEFAULT_SUBGRAPH_LIMIT;
}

if (!ref_paths_name.empty()) {
Expand Down
18 changes: 15 additions & 3 deletions src/subcommand/surject_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ void help_surject(char** argv) {
<< " -b, --bam-output write BAM to stdout" << endl
<< " -s, --sam-output write SAM to stdout" << endl
<< " -l, --subpath-local let the multipath mapping surjection produce local (rather than global) alignments" << endl
<< " -T, --max-tail-len N do not align read tails longer than N" << endl
<< " -T, --max-tail-len N do not align read tails longer than N" << endl
<< " -g, --max-graph-scale X make reads unmapped if alignment target subgraph size exceeds read length by a factor of X (default: " << Surjector::DEFAULT_SUBGRAPH_LIMIT << " or " << Surjector::SPLICED_DEFAULT_SUBGRAPH_LIMIT << " with -S)"<< endl
<< " -P, --prune-low-cplx prune short and low complexity anchors during realignment" << endl
<< " -a, --max-anchors N use no more than N anchors per target path (default: 200)" << endl
<< " -S, --spliced interpret long deletions against paths as spliced alignments" << endl
Expand Down Expand Up @@ -98,6 +99,8 @@ int main_surject(int argc, char** argv) {
size_t watchdog_timeout = 10;
bool subpath_global = true; // force full length alignments in mpmap resolution
size_t max_tail_len = std::numeric_limits<size_t>::max();
// THis needs to be nullable so that we can use the default for spliced if doing spliced mode.
std::unique_ptr<double> max_graph_scale;
bool qual_adj = false;
bool prune_anchors = false;
size_t max_anchors = 200;
Expand All @@ -118,6 +121,7 @@ int main_surject(int argc, char** argv) {
{"ref-paths", required_argument, 0, 'F'}, // Now an alias for --into-paths
{"subpath-local", no_argument, 0, 'l'},
{"max-tail-len", required_argument, 0, 'T'},
{"max-graph-scale", required_argument, 0, 'g'},
{"interleaved", no_argument, 0, 'i'},
{"multimap", no_argument, 0, 'M'},
{"gaf-input", no_argument, 0, 'G'},
Expand All @@ -140,7 +144,7 @@ int main_surject(int argc, char** argv) {
};

int option_index = 0;
c = getopt_long (argc, argv, "hx:p:F:lT:iGmcbsN:R:f:C:t:SPa:ALMVw:",
c = getopt_long (argc, argv, "hx:p:F:lT:g:iGmcbsN:R:f:C:t:SPa:ALMVw:",
long_options, &option_index);

// Detect the end of the options.
Expand Down Expand Up @@ -170,6 +174,10 @@ int main_surject(int argc, char** argv) {
max_tail_len = parse<size_t>(optarg);
break;

case 'g':
max_graph_scale.reset(new double(parse<double>(optarg)));
break;

case 'i':
interleaved = true;
break;
Expand Down Expand Up @@ -305,13 +313,17 @@ int main_surject(int argc, char** argv) {
if (spliced) {
surjector.min_splice_length = min_splice_length;
// we have to bump this up to be sure to align most splice junctions
surjector.max_subgraph_bases = 16 * 1024 * 1024;
surjector.max_subgraph_bases_per_read_base = Surjector::SPLICED_DEFAULT_SUBGRAPH_LIMIT;
}
else {
surjector.min_splice_length = numeric_limits<int64_t>::max();
}
surjector.max_tail_length = max_tail_len;
surjector.annotate_with_all_path_scores = annotate_with_all_path_scores;
if (max_graph_scale) {
// We have an override
surjector.max_subgraph_bases_per_read_base = *max_graph_scale;
}

// Count our threads
int thread_count = vg::get_thread_count();
Expand Down
4 changes: 2 additions & 2 deletions src/surjector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3006,14 +3006,14 @@ using namespace std;
#endif

size_t subgraph_bases = aln_graph->get_total_length();
if (subgraph_bases > max_subgraph_bases) {
if (source.sequence().size() > 0 && subgraph_bases / (double) source.sequence().size() > max_subgraph_bases_per_read_base) {
#ifdef debug_always_warn_on_too_long
cerr << "gave up on too long read " + source.name() + "\n";
#endif
if (!warned_about_subgraph_size.test_and_set()) {
cerr << "warning[vg::Surjector]: Refusing to perform very large alignment against "
<< subgraph_bases << " bp strand split subgraph for read " << source.name()
<< "; suppressing further warnings." << endl;
<< " length " << source.sequence().size() << "; suppressing further warnings." << endl;
}
surjected = move(make_null_alignment(source));
return surjected;
Expand Down
9 changes: 7 additions & 2 deletions src/surjector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,13 @@ using namespace std;
/// the maximum length of a tail that we will try to align
size_t max_tail_length = std::numeric_limits<size_t>::max();

/// How big of a graph in bp should we ever try to align against for realigning surjection?
size_t max_subgraph_bases = 100 * 1024;
/// We have a different default max_subgraph_bases_per_read_base to use for spliced alignment.
static constexpr double SPLICED_DEFAULT_SUBGRAPH_LIMIT = 16 * 1024 * 1024 / 125.0;
/// And an accessible default max_subgraph_bases_per_read_base for normal alignment.
static constexpr double DEFAULT_SUBGRAPH_LIMIT = 100 * 1024 / 125.0;
/// How big of a graph (in graph bases per read base) should we ever try to align against for realigning surjection?
double max_subgraph_bases_per_read_base = DEFAULT_SUBGRAPH_LIMIT;


/// in spliced surject, downsample if the base-wise average coverage by chunks is this high
int64_t min_fold_coverage_for_downsample = 8;
Expand Down

1 comment on commit a6dfc43

@adamnovak
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for branch lr-giraffe. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17587 seconds

Please sign in to comment.