From a6dfc43d7c2b47cbbf8c30c139ad603dd011dabd Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 3 Jul 2024 09:15:42 -0700 Subject: [PATCH] Add --max-graph-scale/-g option to surject and make giving up scale with read size --- src/subcommand/mpmap_main.cpp | 2 +- src/subcommand/surject_main.cpp | 18 +++++++++++++++--- src/surjector.cpp | 4 ++-- src/surjector.hpp | 9 +++++++-- 4 files changed, 25 insertions(+), 8 deletions(-) diff --git a/src/subcommand/mpmap_main.cpp b/src/subcommand/mpmap_main.cpp index 773159fdbb8..2058ec05c9a 100644 --- a/src/subcommand/mpmap_main.cpp +++ b/src/subcommand/mpmap_main.cpp @@ -1850,7 +1850,7 @@ int main_mpmap(int argc, char** argv) { surjector->adjust_alignments_for_base_quality = qual_adjusted; if (transcriptomic) { // FIXME: replicating the behavior in surject_main - surjector->max_subgraph_bases = 16 * 1024 * 1024; + surjector->max_subgraph_bases_per_read_base = Surjector::SPLICED_DEFAULT_SUBGRAPH_LIMIT; } if (!ref_paths_name.empty()) { diff --git a/src/subcommand/surject_main.cpp b/src/subcommand/surject_main.cpp index 78e397431c8..f00b1472301 100644 --- a/src/subcommand/surject_main.cpp +++ b/src/subcommand/surject_main.cpp @@ -47,7 +47,8 @@ void help_surject(char** argv) { << " -b, --bam-output write BAM to stdout" << endl << " -s, --sam-output write SAM to stdout" << endl << " -l, --subpath-local let the multipath mapping surjection produce local (rather than global) alignments" << endl - << " -T, --max-tail-len N do not align read tails longer than N" << endl + << " -T, --max-tail-len N do not align read tails longer than N" << endl + << " -g, --max-graph-scale X make reads unmapped if alignment target subgraph size exceeds read length by a factor of X (default: " << Surjector::DEFAULT_SUBGRAPH_LIMIT << " or " << Surjector::SPLICED_DEFAULT_SUBGRAPH_LIMIT << " with -S)"<< endl << " -P, --prune-low-cplx prune short and low complexity anchors during realignment" << endl << " -a, --max-anchors N use no more than N anchors per target path (default: 200)" << endl << " -S, --spliced interpret long deletions against paths as spliced alignments" << endl @@ -98,6 +99,8 @@ int main_surject(int argc, char** argv) { size_t watchdog_timeout = 10; bool subpath_global = true; // force full length alignments in mpmap resolution size_t max_tail_len = std::numeric_limits::max(); + // THis needs to be nullable so that we can use the default for spliced if doing spliced mode. + std::unique_ptr max_graph_scale; bool qual_adj = false; bool prune_anchors = false; size_t max_anchors = 200; @@ -118,6 +121,7 @@ int main_surject(int argc, char** argv) { {"ref-paths", required_argument, 0, 'F'}, // Now an alias for --into-paths {"subpath-local", no_argument, 0, 'l'}, {"max-tail-len", required_argument, 0, 'T'}, + {"max-graph-scale", required_argument, 0, 'g'}, {"interleaved", no_argument, 0, 'i'}, {"multimap", no_argument, 0, 'M'}, {"gaf-input", no_argument, 0, 'G'}, @@ -140,7 +144,7 @@ int main_surject(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "hx:p:F:lT:iGmcbsN:R:f:C:t:SPa:ALMVw:", + c = getopt_long (argc, argv, "hx:p:F:lT:g:iGmcbsN:R:f:C:t:SPa:ALMVw:", long_options, &option_index); // Detect the end of the options. @@ -170,6 +174,10 @@ int main_surject(int argc, char** argv) { max_tail_len = parse(optarg); break; + case 'g': + max_graph_scale.reset(new double(parse(optarg))); + break; + case 'i': interleaved = true; break; @@ -305,13 +313,17 @@ int main_surject(int argc, char** argv) { if (spliced) { surjector.min_splice_length = min_splice_length; // we have to bump this up to be sure to align most splice junctions - surjector.max_subgraph_bases = 16 * 1024 * 1024; + surjector.max_subgraph_bases_per_read_base = Surjector::SPLICED_DEFAULT_SUBGRAPH_LIMIT; } else { surjector.min_splice_length = numeric_limits::max(); } surjector.max_tail_length = max_tail_len; surjector.annotate_with_all_path_scores = annotate_with_all_path_scores; + if (max_graph_scale) { + // We have an override + surjector.max_subgraph_bases_per_read_base = *max_graph_scale; + } // Count our threads int thread_count = vg::get_thread_count(); diff --git a/src/surjector.cpp b/src/surjector.cpp index 9e993815bac..7929ff8a427 100644 --- a/src/surjector.cpp +++ b/src/surjector.cpp @@ -3006,14 +3006,14 @@ using namespace std; #endif size_t subgraph_bases = aln_graph->get_total_length(); - if (subgraph_bases > max_subgraph_bases) { + if (source.sequence().size() > 0 && subgraph_bases / (double) source.sequence().size() > max_subgraph_bases_per_read_base) { #ifdef debug_always_warn_on_too_long cerr << "gave up on too long read " + source.name() + "\n"; #endif if (!warned_about_subgraph_size.test_and_set()) { cerr << "warning[vg::Surjector]: Refusing to perform very large alignment against " << subgraph_bases << " bp strand split subgraph for read " << source.name() - << "; suppressing further warnings." << endl; + << " length " << source.sequence().size() << "; suppressing further warnings." << endl; } surjected = move(make_null_alignment(source)); return surjected; diff --git a/src/surjector.hpp b/src/surjector.hpp index 0b45591a4b1..6fb5a4462c0 100644 --- a/src/surjector.hpp +++ b/src/surjector.hpp @@ -111,8 +111,13 @@ using namespace std; /// the maximum length of a tail that we will try to align size_t max_tail_length = std::numeric_limits::max(); - /// How big of a graph in bp should we ever try to align against for realigning surjection? - size_t max_subgraph_bases = 100 * 1024; + /// We have a different default max_subgraph_bases_per_read_base to use for spliced alignment. + static constexpr double SPLICED_DEFAULT_SUBGRAPH_LIMIT = 16 * 1024 * 1024 / 125.0; + /// And an accessible default max_subgraph_bases_per_read_base for normal alignment. + static constexpr double DEFAULT_SUBGRAPH_LIMIT = 100 * 1024 / 125.0; + /// How big of a graph (in graph bases per read base) should we ever try to align against for realigning surjection? + double max_subgraph_bases_per_read_base = DEFAULT_SUBGRAPH_LIMIT; + /// in spliced surject, downsample if the base-wise average coverage by chunks is this high int64_t min_fold_coverage_for_downsample = 8;