Skip to content

Commit

Permalink
Add --max-tail-len option to vg surject
Browse files Browse the repository at this point in the history
  • Loading branch information
adamnovak committed Jun 20, 2024
1 parent b7983e9 commit 9a837c6
Show file tree
Hide file tree
Showing 7 changed files with 75 additions and 29 deletions.
45 changes: 37 additions & 8 deletions src/multipath_alignment_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2289,7 +2289,7 @@ namespace vg {
// Align the tails, not collecting a set of source subpaths.
// TODO: factor of 1/2 is arbitray, but i do think it should be fewer than the max
auto tail_alignments = align_tails(alignment, align_graph, aligner, max<size_t>(1, max_alt_alns / 2),
dynamic_alt_alns, max_gap, pessimistic_tail_gap_multiplier, max_alt_alns, nullptr);
dynamic_alt_alns, max_gap, pessimistic_tail_gap_multiplier, max_alt_alns, std::numeric_limits<size_t>::max(), nullptr);


for (bool handling_right_tail : {false, true}) {
Expand Down Expand Up @@ -4225,8 +4225,9 @@ namespace vg {

void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner,
bool score_anchors_as_matches, size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap,
double pessimistic_tail_gap_multiplier, bool simplify_topologies, size_t unmergeable_len,
size_t band_padding, multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls,
double pessimistic_tail_gap_multiplier, size_t max_tail_length, bool simplify_topologies,
size_t unmergeable_len, size_t band_padding,
multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls,
SnarlDistanceIndex* dist_index, const function<pair<id_t, bool>(id_t)>* project,
bool allow_negative_scores, unordered_map<handle_t, bool>* left_align_strand) {

Expand All @@ -4242,6 +4243,7 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap
dynamic_alt_alns,
max_gap,
pessimistic_tail_gap_multiplier,
max_tail_length,
simplify_topologies,
unmergeable_len,
constant_padding,
Expand Down Expand Up @@ -5182,7 +5184,8 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap

void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner,
bool score_anchors_as_matches, size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap,
double pessimistic_tail_gap_multiplier, bool simplify_topologies, size_t unmergeable_len,
double pessimistic_tail_gap_multiplier, size_t max_tail_length,
bool simplify_topologies, size_t unmergeable_len,
function<size_t(const Alignment&,const HandleGraph&)> band_padding_function,
multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls,
SnarlDistanceIndex* dist_index, const function<pair<id_t, bool>(id_t)>* project,
Expand Down Expand Up @@ -5448,7 +5451,7 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap

// Actually align the tails
auto tail_alignments = align_tails(alignment, align_graph, aligner, max_alt_alns, dynamic_alt_alns,
max_gap, pessimistic_tail_gap_multiplier, 0, &sources);
max_gap, pessimistic_tail_gap_multiplier, 0, max_tail_length, &sources);

// TODO: merge and simplify the tail alignments? rescoring would be kind of a pain...

Expand Down Expand Up @@ -5992,7 +5995,7 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap
unordered_map<bool, unordered_map<size_t, vector<Alignment>>>
MultipathAlignmentGraph::align_tails(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner,
size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier,
size_t min_paths, unordered_set<size_t>* sources) {
size_t min_paths, size_t max_tail_length, unordered_set<size_t>* sources) {

#ifdef debug_multipath_alignment
cerr << "doing tail alignments to:" << endl;
Expand Down Expand Up @@ -6108,7 +6111,20 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap

// align against the graph
auto& alt_alignments = right_alignments[j];
if (num_alt_alns == 1) {
if (right_tail_sequence.sequence().size() > max_tail_length) {
#ifdef debug_multipath_alignment
cerr << "softclip long right" << endl;
#endif
alt_alignments.emplace_back(std::move(right_tail_sequence));
Mapping* m = alt_alignments.back().mutable_path()->add_mapping();
m->mutable_position()->set_node_id(id(end_pos));
m->mutable_position()->set_is_reverse(is_rev(end_pos));
m->mutable_position()->set_offset(offset(end_pos));
Edit* e = m->add_edit();
e->set_to_length(alt_alignments.back().sequence().size());
e->set_sequence(alt_alignments.back().sequence());
}
else if (num_alt_alns == 1) {
#ifdef debug_multipath_alignment
cerr << "align right with dozeu with gap " << gap << endl;
#endif
Expand Down Expand Up @@ -6229,7 +6245,20 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap

// align against the graph
auto& alt_alignments = left_alignments[j];
if (num_alt_alns == 1) {
if (left_tail_sequence.sequence().size() > max_tail_length) {
#ifdef debug_multipath_alignment
cerr << "softclip long left" << endl;
#endif
alt_alignments.emplace_back(std::move(left_tail_sequence));
Mapping* m = alt_alignments.back().mutable_path()->add_mapping();
m->mutable_position()->set_node_id(id(begin_pos));
m->mutable_position()->set_is_reverse(is_rev(begin_pos));
m->mutable_position()->set_offset(offset(begin_pos));
Edit* e = m->add_edit();
e->set_to_length(alt_alignments.back().sequence().size());
e->set_sequence(alt_alignments.back().sequence());
}
else if (num_alt_alns == 1) {
#ifdef debug_multipath_alignment
cerr << "align left with dozeu using gap " << gap << endl;
#endif
Expand Down
19 changes: 12 additions & 7 deletions src/multipath_alignment_graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,10 +194,11 @@ namespace vg {
/// order, even if this MultipathAlignmentGraph is. You MUST sort it
/// with topologically_order_subpaths() before trying to run DP on it.
void align(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner, bool score_anchors_as_matches,
size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, bool simplify_topologies,
size_t unmergeable_len, size_t band_padding, multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls = nullptr,
SnarlDistanceIndex* dist_index = nullptr, const function<pair<id_t, bool>(id_t)>* project = nullptr,
bool allow_negative_scores = false, unordered_map<handle_t, bool>* left_align_strand = nullptr);
size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, size_t max_tail_length,
bool simplify_topologies, size_t unmergeable_len, size_t band_padding, multipath_alignment_t& multipath_aln_out,
SnarlManager* cutting_snarls = nullptr, SnarlDistanceIndex* dist_index = nullptr,
const function<pair<id_t, bool>(id_t)>* project = nullptr, bool allow_negative_scores = false,
unordered_map<handle_t, bool>* left_align_strand = nullptr);

/// Do intervening and tail alignments between the anchoring paths and
/// store the result in a multipath_alignment_t. Reachability edges must
Expand All @@ -212,8 +213,8 @@ namespace vg {
/// order, even if this MultipathAlignmentGraph is. You MUST sort it
/// with topologically_order_subpaths() before trying to run DP on it.
void align(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner, bool score_anchors_as_matches,
size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, bool simplify_topologies,
size_t unmergeable_len, function<size_t(const Alignment&,const HandleGraph&)> band_padding_function,
size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, size_t max_tail_length,
bool simplify_topologies, size_t unmergeable_len, function<size_t(const Alignment&,const HandleGraph&)> band_padding_function,
multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls = nullptr, SnarlDistanceIndex* dist_index = nullptr,
const function<pair<id_t, bool>(id_t)>* project = nullptr, bool allow_negative_scores = false,
unordered_map<handle_t, bool>* left_align_strand = nullptr);
Expand Down Expand Up @@ -312,12 +313,16 @@ namespace vg {
/// Alignments of the tail off of that subpath. Also computes the
/// source subpaths and adds their numbers to the given set if not
/// null.
///
/// If a tail is longer than max_tail_length, produces an alignment
/// softclipping it.
///
/// If dynamic alignment count is also selected, can indicate a minimum number
/// of paths that must be in the extending graph in order to do an alignment
unordered_map<bool, unordered_map<size_t, vector<Alignment>>>
align_tails(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner,
size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier,
size_t min_paths, unordered_set<size_t>* sources = nullptr);
size_t min_paths, size_t max_tail_length, unordered_set<size_t>* sources = nullptr);

/// Removes alignments that follow the same path through the graph, retaining only the
/// highest scoring ones. If deduplicating leftward, then also removes paths that take a
Expand Down
10 changes: 5 additions & 5 deletions src/multipath_mapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6256,9 +6256,9 @@ namespace vg {

// do the connecting alignments and fill out the multipath_alignment_t object
multi_aln_graph.align(alignment, *align_dag, aligner, true, num_alt_alns, dynamic_max_alt_alns, max_alignment_gap,
use_pessimistic_tail_alignment ? pessimistic_gap_multiplier : 0.0, simplify_topologies,
max_tail_merge_supress_length, choose_band_padding, multipath_aln_out, snarl_manager,
distance_index, &translator);
use_pessimistic_tail_alignment ? pessimistic_gap_multiplier : 0.0, std::numeric_limits<size_t>::max(),
simplify_topologies, max_tail_merge_supress_length, choose_band_padding, multipath_aln_out,
snarl_manager, distance_index, &translator);

// Note that we do NOT topologically order the multipath_alignment_t. The
// caller has to do that, after it is finished breaking it up into
Expand Down Expand Up @@ -6313,8 +6313,8 @@ namespace vg {

// do the connecting alignments and fill out the multipath_alignment_t object
multi_aln_graph.align(alignment, subgraph, aligner, false, num_alt_alns, dynamic_max_alt_alns, max_alignment_gap,
use_pessimistic_tail_alignment ? pessimistic_gap_multiplier : 0.0, simplify_topologies,
max_tail_merge_supress_length, choose_band_padding, multipath_aln_out);
use_pessimistic_tail_alignment ? pessimistic_gap_multiplier : 0.0, std::numeric_limits<size_t>::max(),
simplify_topologies, max_tail_merge_supress_length, choose_band_padding, multipath_aln_out);

for (size_t j = 0; j < multipath_aln_out.subpath_size(); j++) {
translate_oriented_node_ids(*multipath_aln_out.mutable_subpath(j)->mutable_path(), translator);
Expand Down
14 changes: 11 additions & 3 deletions src/subcommand/surject_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ void help_surject(char** argv) {
<< " -b, --bam-output write BAM to stdout" << endl
<< " -s, --sam-output write SAM to stdout" << endl
<< " -l, --subpath-local let the multipath mapping surjection produce local (rather than global) alignments" << endl
<< " -T, --max-tail-len N do not align read tails longer than N" << endl
<< " -P, --prune-low-cplx prune short and low complexity anchors during realignment" << endl
<< " -a, --max-anchors N use no more than N anchors per target path (default: 200)" << endl
<< " -S, --spliced interpret long deletions against paths as spliced alignments" << endl
Expand Down Expand Up @@ -96,6 +97,7 @@ int main_surject(int argc, char** argv) {
int min_splice_length = 20;
size_t watchdog_timeout = 10;
bool subpath_global = true; // force full length alignments in mpmap resolution
size_t max_tail_len = std::numeric_limits<size_t>::max();
bool qual_adj = false;
bool prune_anchors = false;
size_t max_anchors = 200;
Expand All @@ -114,7 +116,8 @@ int main_surject(int argc, char** argv) {
{"into-path", required_argument, 0, 'p'},
{"into-paths", required_argument, 0, 'F'},
{"ref-paths", required_argument, 0, 'F'}, // Now an alias for --into-paths
{"subpath-local", required_argument, 0, 'l'},
{"subpath-local", no_argument, 0, 'l'},
{"max-tail-len", no_argument, 0, 'T'},
{"interleaved", no_argument, 0, 'i'},
{"multimap", no_argument, 0, 'M'},
{"gaf-input", no_argument, 0, 'G'},
Expand All @@ -137,7 +140,7 @@ int main_surject(int argc, char** argv) {
};

int option_index = 0;
c = getopt_long (argc, argv, "hx:p:F:liGmcbsN:R:f:C:t:SPa:ALMVw:",
c = getopt_long (argc, argv, "hx:p:F:lT:iGmcbsN:R:f:C:t:SPa:ALMVw:",
long_options, &option_index);

// Detect the end of the options.
Expand All @@ -163,6 +166,10 @@ int main_surject(int argc, char** argv) {
subpath_global = false;
break;

case 'T':
max_tail_len = parse<size_t>(optarg);
break;

case 'i':
interleaved = true;
break;
Expand Down Expand Up @@ -303,8 +310,9 @@ int main_surject(int argc, char** argv) {
else {
surjector.min_splice_length = numeric_limits<int64_t>::max();
}
surjector.max_tail_length = max_tail_len;
surjector.annotate_with_all_path_scores = annotate_with_all_path_scores;

// Count our threads
int thread_count = vg::get_thread_count();

Expand Down
1 change: 1 addition & 0 deletions src/surjector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3089,6 +3089,7 @@ using namespace std;
false, // dynamic alt alns
numeric_limits<int64_t>::max(), // max gap
0.0, // pessimistic tail gap multiplier
max_tail_length, // max length of tail to align
false, // simplify topologies
0, // unmergeable len
1, // band padding
Expand Down
3 changes: 3 additions & 0 deletions src/surjector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,9 @@ using namespace std;

/// the minimum length apparent intron that we will try to repair
int64_t min_splice_repair_length = 250;

/// the maximum length of a tail that we will try to align
size_t max_tail_length = std::numeric_limits<size_t>::max();

/// How big of a graph in bp should we ever try to align against for realigning surjection?
size_t max_subgraph_bases = 100 * 1024;
Expand Down
Loading

1 comment on commit 9a837c6

@adamnovak
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for branch no-tail-surject. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17316 seconds

Please sign in to comment.