Skip to content

Commit

Permalink
Merge pull request #4376 from vgteam/left-align-better
Browse files Browse the repository at this point in the history
Even more tweaks to anchor pruning/trimming
  • Loading branch information
jeizenga committed Aug 13, 2024
2 parents e6f0326 + 41fbb59 commit e9fbbc3
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 5 deletions.
14 changes: 9 additions & 5 deletions src/surjector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4059,10 +4059,10 @@ using namespace std;
SeqComplexity<6> context_complexity(read_context_begin, read_context_end);
// TODO: repetitive
for (int order = 1, max_order = 6; order <= max_order; ++order) {

//cerr << "padded anchor " << i << " (read[" << (chunk.first.first - sequence.begin()) << ":" << (chunk.first.second - sequence.begin()) << "]), seq " << string(read_context_begin, read_context_end) << ", order " << order << " with p-value " << context_complexity.p_value(order) << ", repetitive fraction " << chunk_complexity.repetitiveness(order) << endl;
if (context_complexity.p_value(order) < low_complexity_p_value) {
#ifdef debug_anchored_surject
cerr << "anchor " << i << " (read[" << (chunk.first.first - sequence.begin()) << ":" << (chunk.first.second - sequence.begin()) << "]) pruned being for having context with low complexity at order " << order << " with p-value " << context_complexity.p_value(order) << " and anchor repetitive fraction " << chunk_complexity.repetitiveness(order) << endl;
cerr << "anchor " << i << " (read[" << (chunk.first.first - sequence.begin()) << ":" << (chunk.first.second - sequence.begin()) << "]) pruned being for having context with low complexity at order " << order << ", p-value " << context_complexity.p_value(order) << " and anchor repetitive fraction " << chunk_complexity.repetitiveness(order) << endl;
#endif
// the sequences is repetitive at this order
keep[i] = false;
Expand All @@ -4072,6 +4072,8 @@ using namespace std;
}
else {
for (int order = 1; order <= 6; ++order) {
//cerr << "unpadded anchor " << i << " (read[" << (chunk.first.first - sequence.begin()) << ":" << (chunk.first.second - sequence.begin()) << "]), order " << order << ", p-value " << chunk_complexity.p_value(order) << ", repetitive fraction " << chunk_complexity.repetitiveness(order) << endl;

if (chunk_complexity.p_value(order) < low_complexity_p_value) {
#ifdef debug_anchored_surject
cerr << "anchor " << i << " (read[" << (chunk.first.first - sequence.begin()) << ":" << (chunk.first.second - sequence.begin()) << "]) pruned for being low complexity at order " << order << " with p-value " << chunk_complexity.p_value(order) << " and repetitive fraction " << chunk_complexity.repetitiveness(order) << endl;
Expand Down Expand Up @@ -4151,12 +4153,14 @@ using namespace std;
bool exited_indel = false;
// note: relying on underflow for the break conditions in the reverse direction
for (mapping_idx = left_end ? 0 : path_chunk.second.mapping_size() - 1;
(mapping_idx < path_chunk.second.mapping_size() && (walked_to_length < max_low_complexity_anchor_prune || found_indel));
(mapping_idx < path_chunk.second.mapping_size() &&
(walked_to_length < max_low_complexity_anchor_trim || walked_from_length < max_low_complexity_anchor_trim || found_indel));
mapping_idx += incr) {

const auto& mapping = path_chunk.second.mapping(mapping_idx);
for (edit_idx = left_end ? 0 : mapping.edit_size() - 1;
edit_idx < mapping.edit_size() && (walked_to_length < max_low_complexity_anchor_prune || found_indel);
(edit_idx < mapping.edit_size() &&
(walked_to_length < max_low_complexity_anchor_trim || walked_from_length < max_low_complexity_anchor_trim || found_indel));
edit_idx += incr) {

const auto& edit = mapping.edit(edit_idx);
Expand All @@ -4175,7 +4179,7 @@ using namespace std;
}
}

if (found_indel && walked_to_length < max_low_complexity_anchor_prune && walked_from_length < max_low_complexity_anchor_prune) {
if (found_indel && (walked_to_length < max_low_complexity_anchor_trim || walked_from_length < max_low_complexity_anchor_trim)) {
#ifdef debug_anchored_surject
cerr << "anchor " << i << " at read pos " << (path_chunk.first.first - sequence.begin()) << " has indel ending at at mapping " << mapping_idx << ", edit " << edit_idx << ", walked to length " << walked_to_length << ", walked from length " << walked_from_length << ", which is within " << max_low_complexity_anchor_prune << " of the " << (left_end ? "left" : "right") << " end of the anchor" << endl;
#endif
Expand Down
1 change: 1 addition & 0 deletions src/surjector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ using namespace std;
int64_t max_tail_anchor_prune = 4;
double low_complexity_p_value = .0075;
int64_t max_low_complexity_anchor_prune = 40;
int64_t max_low_complexity_anchor_trim = 60;
int64_t pad_suspicious_anchors_to_length = 12;

// A function for computing band padding
Expand Down

1 comment on commit e9fbbc3

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17481 seconds

Please sign in to comment.