Skip to content

Commit

Permalink
Merge pull request #4350 from vgteam/limit-surject-tails
Browse files Browse the repository at this point in the history
Turn surject's hard filter for long tails into a soft filter
  • Loading branch information
jeizenga authored Jul 18, 2024
2 parents c1bcdef + dfac868 commit a656a64
Show file tree
Hide file tree
Showing 7 changed files with 261 additions and 150 deletions.
352 changes: 223 additions & 129 deletions src/multipath_alignment_graph.cpp

Large diffs are not rendered by default.

19 changes: 12 additions & 7 deletions src/multipath_alignment_graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,10 +194,11 @@ namespace vg {
/// order, even if this MultipathAlignmentGraph is. You MUST sort it
/// with topologically_order_subpaths() before trying to run DP on it.
void align(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner, bool score_anchors_as_matches,
size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, bool simplify_topologies,
size_t unmergeable_len, size_t band_padding, multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls = nullptr,
SnarlDistanceIndex* dist_index = nullptr, const function<pair<id_t, bool>(id_t)>* project = nullptr,
bool allow_negative_scores = false, unordered_map<handle_t, bool>* left_align_strand = nullptr);
size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, size_t max_tail_length,
bool simplify_topologies, size_t unmergeable_len, size_t band_padding, multipath_alignment_t& multipath_aln_out,
SnarlManager* cutting_snarls = nullptr, SnarlDistanceIndex* dist_index = nullptr,
const function<pair<id_t, bool>(id_t)>* project = nullptr, bool allow_negative_scores = false,
unordered_map<handle_t, bool>* left_align_strand = nullptr);

/// Do intervening and tail alignments between the anchoring paths and
/// store the result in a multipath_alignment_t. Reachability edges must
Expand All @@ -212,8 +213,8 @@ namespace vg {
/// order, even if this MultipathAlignmentGraph is. You MUST sort it
/// with topologically_order_subpaths() before trying to run DP on it.
void align(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner, bool score_anchors_as_matches,
size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, bool simplify_topologies,
size_t unmergeable_len, function<size_t(const Alignment&,const HandleGraph&)> band_padding_function,
size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, size_t max_tail_length,
bool simplify_topologies, size_t unmergeable_len, function<size_t(const Alignment&,const HandleGraph&)> band_padding_function,
multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls = nullptr, SnarlDistanceIndex* dist_index = nullptr,
const function<pair<id_t, bool>(id_t)>* project = nullptr, bool allow_negative_scores = false,
unordered_map<handle_t, bool>* left_align_strand = nullptr);
Expand Down Expand Up @@ -312,12 +313,16 @@ namespace vg {
/// Alignments of the tail off of that subpath. Also computes the
/// source subpaths and adds their numbers to the given set if not
/// null.
///
/// If a tail is longer than max_tail_length, produces an alignment
/// softclipping it.
///
/// If dynamic alignment count is also selected, can indicate a minimum number
/// of paths that must be in the extending graph in order to do an alignment
unordered_map<bool, unordered_map<size_t, vector<Alignment>>>
align_tails(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner,
size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier,
size_t min_paths, unordered_set<size_t>* sources = nullptr);
size_t min_paths, size_t max_tail_length, unordered_set<size_t>* sources = nullptr);

/// Removes alignments that follow the same path through the graph, retaining only the
/// highest scoring ones. If deduplicating leftward, then also removes paths that take a
Expand Down
10 changes: 5 additions & 5 deletions src/multipath_mapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6256,9 +6256,9 @@ namespace vg {

// do the connecting alignments and fill out the multipath_alignment_t object
multi_aln_graph.align(alignment, *align_dag, aligner, true, num_alt_alns, dynamic_max_alt_alns, max_alignment_gap,
use_pessimistic_tail_alignment ? pessimistic_gap_multiplier : 0.0, simplify_topologies,
max_tail_merge_supress_length, choose_band_padding, multipath_aln_out, snarl_manager,
distance_index, &translator);
use_pessimistic_tail_alignment ? pessimistic_gap_multiplier : 0.0, std::numeric_limits<size_t>::max(),
simplify_topologies, max_tail_merge_supress_length, choose_band_padding, multipath_aln_out,
snarl_manager, distance_index, &translator);

// Note that we do NOT topologically order the multipath_alignment_t. The
// caller has to do that, after it is finished breaking it up into
Expand Down Expand Up @@ -6313,8 +6313,8 @@ namespace vg {

// do the connecting alignments and fill out the multipath_alignment_t object
multi_aln_graph.align(alignment, subgraph, aligner, false, num_alt_alns, dynamic_max_alt_alns, max_alignment_gap,
use_pessimistic_tail_alignment ? pessimistic_gap_multiplier : 0.0, simplify_topologies,
max_tail_merge_supress_length, choose_band_padding, multipath_aln_out);
use_pessimistic_tail_alignment ? pessimistic_gap_multiplier : 0.0, std::numeric_limits<size_t>::max(),
simplify_topologies, max_tail_merge_supress_length, choose_band_padding, multipath_aln_out);

for (size_t j = 0; j < multipath_aln_out.subpath_size(); j++) {
translate_oriented_node_ids(*multipath_aln_out.mutable_subpath(j)->mutable_path(), translator);
Expand Down
14 changes: 11 additions & 3 deletions src/subcommand/surject_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ void help_surject(char** argv) {
<< " -b, --bam-output write BAM to stdout" << endl
<< " -s, --sam-output write SAM to stdout" << endl
<< " -l, --subpath-local let the multipath mapping surjection produce local (rather than global) alignments" << endl
<< " -T, --max-tail-len N only align up to N bases of read tails (default: 10000)" << endl
<< " -P, --prune-low-cplx prune short and low complexity anchors during realignment" << endl
<< " -a, --max-anchors N use no more than N anchors per target path (default: 200)" << endl
<< " -S, --spliced interpret long deletions against paths as spliced alignments" << endl
Expand Down Expand Up @@ -96,6 +97,7 @@ int main_surject(int argc, char** argv) {
int min_splice_length = 20;
size_t watchdog_timeout = 10;
bool subpath_global = true; // force full length alignments in mpmap resolution
size_t max_tail_len = 10000;
bool qual_adj = false;
bool prune_anchors = false;
size_t max_anchors = 200;
Expand All @@ -114,7 +116,8 @@ int main_surject(int argc, char** argv) {
{"into-path", required_argument, 0, 'p'},
{"into-paths", required_argument, 0, 'F'},
{"ref-paths", required_argument, 0, 'F'}, // Now an alias for --into-paths
{"subpath-local", required_argument, 0, 'l'},
{"subpath-local", no_argument, 0, 'l'},
{"max-tail-len", required_argument, 0, 'T'},
{"interleaved", no_argument, 0, 'i'},
{"multimap", no_argument, 0, 'M'},
{"gaf-input", no_argument, 0, 'G'},
Expand All @@ -137,7 +140,7 @@ int main_surject(int argc, char** argv) {
};

int option_index = 0;
c = getopt_long (argc, argv, "hx:p:F:liGmcbsN:R:f:C:t:SPa:ALMVw:",
c = getopt_long (argc, argv, "hx:p:F:lT:iGmcbsN:R:f:C:t:SPa:ALMVw:",
long_options, &option_index);

// Detect the end of the options.
Expand All @@ -163,6 +166,10 @@ int main_surject(int argc, char** argv) {
subpath_global = false;
break;

case 'T':
max_tail_len = parse<size_t>(optarg);
break;

case 'i':
interleaved = true;
break;
Expand Down Expand Up @@ -309,8 +316,9 @@ int main_surject(int argc, char** argv) {
else {
surjector.min_splice_length = numeric_limits<int64_t>::max();
}
surjector.max_tail_length = max_tail_len;
surjector.annotate_with_all_path_scores = annotate_with_all_path_scores;

// Count our threads
int thread_count = vg::get_thread_count();

Expand Down
1 change: 1 addition & 0 deletions src/surjector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3095,6 +3095,7 @@ using namespace std;
false, // dynamic alt alns
numeric_limits<int64_t>::max(), // max gap
0.0, // pessimistic tail gap multiplier
max_tail_length, // max length of tail to align
false, // simplify topologies
0, // unmergeable len
1, // band padding
Expand Down
3 changes: 3 additions & 0 deletions src/surjector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,9 @@ using namespace std;

/// the minimum length apparent intron that we will try to repair
int64_t min_splice_repair_length = 250;

/// the maximum length of a tail that we will try to align
size_t max_tail_length = 10000;

/// How big of a graph in bp should we ever try to align against for realigning surjection?
size_t max_subgraph_bases = 100 * 1024;
Expand Down
12 changes: 6 additions & 6 deletions src/unittest/multipath_alignment_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ TEST_CASE( "MultipathAlignmentGraph::align handles tails correctly", "[multipath
mpg.resect_snarls_from_paths(&snarl_manager, identity, 5);
// Make it align, with alignments per gap/tail
mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, 5, out);
mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, std::numeric_limits<size_t>::max(), false, 5, out);
// Make sure to topologically sort the resulting alignment. TODO: Should
// the MultipathAlignmentGraph guarantee this for us by construction?
Expand Down Expand Up @@ -148,7 +148,7 @@ TEST_CASE( "MultipathAlignmentGraph::align handles tails correctly", "[multipath
SECTION("Handles tails when anchors for them are not generated") {
// Make it align, with alignments per gap/tail
mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, 5, out);
mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, std::numeric_limits<size_t>::max(), false, 5, out);
// Make sure to topologically sort the resulting alignment. TODO: Should
// the MultipathAlignmentGraph guarantee this for us by construction?
Expand Down Expand Up @@ -208,7 +208,7 @@ TEST_CASE( "MultipathAlignmentGraph::align handles tails correctly", "[multipath
MultipathAlignmentGraph::create_projector(identity), 5);

// Make it align, with alignments per gap/tail
mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, false, 0, 5, out);
mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, std::numeric_limits<size_t>::max(), false, 0, 5, out);

// Make sure to topologically sort the resulting alignment. TODO: Should
// the MultipathAlignmentGraph guarantee this for us by construction?
Expand Down Expand Up @@ -244,7 +244,7 @@ TEST_CASE( "MultipathAlignmentGraph::align handles tails correctly", "[multipath

SECTION("Handles tails when anchors for them are not generated") {
// Make it align, with alignments per gap/tail
mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, false, 0, 5, out);
mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, std::numeric_limits<size_t>::max(), false, 0, 5, out);

// Make sure to topologically sort the resulting alignment. TODO: Should
// the MultipathAlignmentGraph guarantee this for us by construction?
Expand Down Expand Up @@ -301,7 +301,7 @@ TEST_CASE( "MultipathAlignmentGraph::align handles tails correctly", "[multipath
MultipathAlignmentGraph::create_projector(identity), 5);

// Make it align, with alignments per gap/tail
mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, false, 0, 5, out);
mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, std::numeric_limits<size_t>::max(), false, 0, 5, out);

// Make sure to topologically sort the resulting alignment. TODO: Should
// the MultipathAlignmentGraph guarantee this for us by construction?
Expand Down Expand Up @@ -338,7 +338,7 @@ TEST_CASE( "MultipathAlignmentGraph::align handles tails correctly", "[multipath

SECTION("Handles tails when anchors for them are not generated") {
// Make it align, with alignments per gap/tail
mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, false, 0, 5, out);
mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, std::numeric_limits<size_t>::max(), false, 0, 5, out);

// Make sure to topologically sort the resulting alignment. TODO: Should
// the MultipathAlignmentGraph guarantee this for us by construction?
Expand Down

1 comment on commit a656a64

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

14 tests passed, 2 tests failed and 0 tests skipped in 15725 seconds

Failed tests:

  • test_sim_mhc_snp1kg (504 seconds)
  • test_sim_mhc_snp1kg_mpmap (0 seconds)

Please sign in to comment.