Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/no-tail-surject' into limit-su…
Browse files Browse the repository at this point in the history
…rject-tails
  • Loading branch information
jeizenga committed Jul 15, 2024
2 parents d3f9dec + 094b956 commit 755a634
Show file tree
Hide file tree
Showing 7 changed files with 152 additions and 82 deletions.
175 changes: 114 additions & 61 deletions src/multipath_alignment_graph.cpp

Large diffs are not rendered by default.

19 changes: 12 additions & 7 deletions src/multipath_alignment_graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,10 +194,11 @@ namespace vg {
/// order, even if this MultipathAlignmentGraph is. You MUST sort it
/// with topologically_order_subpaths() before trying to run DP on it.
void align(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner, bool score_anchors_as_matches,
size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, bool simplify_topologies,
size_t unmergeable_len, size_t band_padding, multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls = nullptr,
SnarlDistanceIndex* dist_index = nullptr, const function<pair<id_t, bool>(id_t)>* project = nullptr,
bool allow_negative_scores = false, unordered_map<handle_t, bool>* left_align_strand = nullptr);
size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, size_t max_tail_length,
bool simplify_topologies, size_t unmergeable_len, size_t band_padding, multipath_alignment_t& multipath_aln_out,
SnarlManager* cutting_snarls = nullptr, SnarlDistanceIndex* dist_index = nullptr,
const function<pair<id_t, bool>(id_t)>* project = nullptr, bool allow_negative_scores = false,
unordered_map<handle_t, bool>* left_align_strand = nullptr);

/// Do intervening and tail alignments between the anchoring paths and
/// store the result in a multipath_alignment_t. Reachability edges must
Expand All @@ -212,8 +213,8 @@ namespace vg {
/// order, even if this MultipathAlignmentGraph is. You MUST sort it
/// with topologically_order_subpaths() before trying to run DP on it.
void align(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner, bool score_anchors_as_matches,
size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, bool simplify_topologies,
size_t unmergeable_len, function<size_t(const Alignment&,const HandleGraph&)> band_padding_function,
size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, size_t max_tail_length,
bool simplify_topologies, size_t unmergeable_len, function<size_t(const Alignment&,const HandleGraph&)> band_padding_function,
multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls = nullptr, SnarlDistanceIndex* dist_index = nullptr,
const function<pair<id_t, bool>(id_t)>* project = nullptr, bool allow_negative_scores = false,
unordered_map<handle_t, bool>* left_align_strand = nullptr);
Expand Down Expand Up @@ -312,12 +313,16 @@ namespace vg {
/// Alignments of the tail off of that subpath. Also computes the
/// source subpaths and adds their numbers to the given set if not
/// null.
///
/// If a tail is longer than max_tail_length, produces an alignment
/// softclipping it.
///
/// If dynamic alignment count is also selected, can indicate a minimum number
/// of paths that must be in the extending graph in order to do an alignment
unordered_map<bool, unordered_map<size_t, vector<Alignment>>>
align_tails(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner,
size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier,
size_t min_paths, unordered_set<size_t>* sources = nullptr);
size_t min_paths, size_t max_tail_length, unordered_set<size_t>* sources = nullptr);

/// Removes alignments that follow the same path through the graph, retaining only the
/// highest scoring ones. If deduplicating leftward, then also removes paths that take a
Expand Down
10 changes: 5 additions & 5 deletions src/multipath_mapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6256,9 +6256,9 @@ namespace vg {

// do the connecting alignments and fill out the multipath_alignment_t object
multi_aln_graph.align(alignment, *align_dag, aligner, true, num_alt_alns, dynamic_max_alt_alns, max_alignment_gap,
use_pessimistic_tail_alignment ? pessimistic_gap_multiplier : 0.0, simplify_topologies,
max_tail_merge_supress_length, choose_band_padding, multipath_aln_out, snarl_manager,
distance_index, &translator);
use_pessimistic_tail_alignment ? pessimistic_gap_multiplier : 0.0, std::numeric_limits<size_t>::max(),
simplify_topologies, max_tail_merge_supress_length, choose_band_padding, multipath_aln_out,
snarl_manager, distance_index, &translator);

// Note that we do NOT topologically order the multipath_alignment_t. The
// caller has to do that, after it is finished breaking it up into
Expand Down Expand Up @@ -6313,8 +6313,8 @@ namespace vg {

// do the connecting alignments and fill out the multipath_alignment_t object
multi_aln_graph.align(alignment, subgraph, aligner, false, num_alt_alns, dynamic_max_alt_alns, max_alignment_gap,
use_pessimistic_tail_alignment ? pessimistic_gap_multiplier : 0.0, simplify_topologies,
max_tail_merge_supress_length, choose_band_padding, multipath_aln_out);
use_pessimistic_tail_alignment ? pessimistic_gap_multiplier : 0.0, std::numeric_limits<size_t>::max(),
simplify_topologies, max_tail_merge_supress_length, choose_band_padding, multipath_aln_out);

for (size_t j = 0; j < multipath_aln_out.subpath_size(); j++) {
translate_oriented_node_ids(*multipath_aln_out.mutable_subpath(j)->mutable_path(), translator);
Expand Down
14 changes: 11 additions & 3 deletions src/subcommand/surject_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ void help_surject(char** argv) {
<< " -b, --bam-output write BAM to stdout" << endl
<< " -s, --sam-output write SAM to stdout" << endl
<< " -l, --subpath-local let the multipath mapping surjection produce local (rather than global) alignments" << endl
<< " -T, --max-tail-len N do not align read tails longer than N" << endl
<< " -P, --prune-low-cplx prune short and low complexity anchors during realignment" << endl
<< " -a, --max-anchors N use no more than N anchors per target path (default: 200)" << endl
<< " -S, --spliced interpret long deletions against paths as spliced alignments" << endl
Expand Down Expand Up @@ -96,6 +97,7 @@ int main_surject(int argc, char** argv) {
int min_splice_length = 20;
size_t watchdog_timeout = 10;
bool subpath_global = true; // force full length alignments in mpmap resolution
size_t max_tail_len = std::numeric_limits<size_t>::max();
bool qual_adj = false;
bool prune_anchors = false;
size_t max_anchors = 200;
Expand All @@ -114,7 +116,8 @@ int main_surject(int argc, char** argv) {
{"into-path", required_argument, 0, 'p'},
{"into-paths", required_argument, 0, 'F'},
{"ref-paths", required_argument, 0, 'F'}, // Now an alias for --into-paths
{"subpath-local", required_argument, 0, 'l'},
{"subpath-local", no_argument, 0, 'l'},
{"max-tail-len", required_argument, 0, 'T'},
{"interleaved", no_argument, 0, 'i'},
{"multimap", no_argument, 0, 'M'},
{"gaf-input", no_argument, 0, 'G'},
Expand All @@ -137,7 +140,7 @@ int main_surject(int argc, char** argv) {
};

int option_index = 0;
c = getopt_long (argc, argv, "hx:p:F:liGmcbsN:R:f:C:t:SPa:ALMVw:",
c = getopt_long (argc, argv, "hx:p:F:lT:iGmcbsN:R:f:C:t:SPa:ALMVw:",
long_options, &option_index);

// Detect the end of the options.
Expand All @@ -163,6 +166,10 @@ int main_surject(int argc, char** argv) {
subpath_global = false;
break;

case 'T':
max_tail_len = parse<size_t>(optarg);
break;

case 'i':
interleaved = true;
break;
Expand Down Expand Up @@ -303,8 +310,9 @@ int main_surject(int argc, char** argv) {
else {
surjector.min_splice_length = numeric_limits<int64_t>::max();
}
surjector.max_tail_length = max_tail_len;
surjector.annotate_with_all_path_scores = annotate_with_all_path_scores;

// Count our threads
int thread_count = vg::get_thread_count();

Expand Down
1 change: 1 addition & 0 deletions src/surjector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3095,6 +3095,7 @@ using namespace std;
false, // dynamic alt alns
numeric_limits<int64_t>::max(), // max gap
0.0, // pessimistic tail gap multiplier
max_tail_length, // max length of tail to align
false, // simplify topologies
0, // unmergeable len
1, // band padding
Expand Down
3 changes: 3 additions & 0 deletions src/surjector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,9 @@ using namespace std;

/// the minimum length apparent intron that we will try to repair
int64_t min_splice_repair_length = 250;

/// the maximum length of a tail that we will try to align
size_t max_tail_length = std::numeric_limits<size_t>::max();

/// How big of a graph in bp should we ever try to align against for realigning surjection?
size_t max_subgraph_bases = 100 * 1024;
Expand Down
12 changes: 6 additions & 6 deletions src/unittest/multipath_alignment_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ TEST_CASE( "MultipathAlignmentGraph::align handles tails correctly", "[multipath
mpg.resect_snarls_from_paths(&snarl_manager, identity, 5);
// Make it align, with alignments per gap/tail
mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, 5, out);
mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, std::numeric_limits<size_t>::max(), false, 5, out);
// Make sure to topologically sort the resulting alignment. TODO: Should
// the MultipathAlignmentGraph guarantee this for us by construction?
Expand Down Expand Up @@ -148,7 +148,7 @@ TEST_CASE( "MultipathAlignmentGraph::align handles tails correctly", "[multipath
SECTION("Handles tails when anchors for them are not generated") {
// Make it align, with alignments per gap/tail
mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, 5, out);
mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, std::numeric_limits<size_t>::max(), false, 5, out);
// Make sure to topologically sort the resulting alignment. TODO: Should
// the MultipathAlignmentGraph guarantee this for us by construction?
Expand Down Expand Up @@ -208,7 +208,7 @@ TEST_CASE( "MultipathAlignmentGraph::align handles tails correctly", "[multipath
MultipathAlignmentGraph::create_projector(identity), 5);

// Make it align, with alignments per gap/tail
mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, false, 0, 5, out);
mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, std::numeric_limits<size_t>::max(), false, 0, 5, out);

// Make sure to topologically sort the resulting alignment. TODO: Should
// the MultipathAlignmentGraph guarantee this for us by construction?
Expand Down Expand Up @@ -244,7 +244,7 @@ TEST_CASE( "MultipathAlignmentGraph::align handles tails correctly", "[multipath

SECTION("Handles tails when anchors for them are not generated") {
// Make it align, with alignments per gap/tail
mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, false, 0, 5, out);
mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, std::numeric_limits<size_t>::max(), false, 0, 5, out);

// Make sure to topologically sort the resulting alignment. TODO: Should
// the MultipathAlignmentGraph guarantee this for us by construction?
Expand Down Expand Up @@ -301,7 +301,7 @@ TEST_CASE( "MultipathAlignmentGraph::align handles tails correctly", "[multipath
MultipathAlignmentGraph::create_projector(identity), 5);

// Make it align, with alignments per gap/tail
mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, false, 0, 5, out);
mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, std::numeric_limits<size_t>::max(), false, 0, 5, out);

// Make sure to topologically sort the resulting alignment. TODO: Should
// the MultipathAlignmentGraph guarantee this for us by construction?
Expand Down Expand Up @@ -338,7 +338,7 @@ TEST_CASE( "MultipathAlignmentGraph::align handles tails correctly", "[multipath

SECTION("Handles tails when anchors for them are not generated") {
// Make it align, with alignments per gap/tail
mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, false, 0, 5, out);
mpg.align(query, vg, &aligner, true, 2, false, 100, 0.0, std::numeric_limits<size_t>::max(), false, 0, 5, out);

// Make sure to topologically sort the resulting alignment. TODO: Should
// the MultipathAlignmentGraph guarantee this for us by construction?
Expand Down

0 comments on commit 755a634

Please sign in to comment.