Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ref-path stubbification option -S to vg clip #4061

Merged
merged 1 commit into from
Sep 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions src/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1157,5 +1157,55 @@ void clip_contained_stubs(MutablePathMutableHandleGraph* graph, PathPositionHand

}

void stubbify_ref_paths(MutablePathMutableHandleGraph* graph, const vector<string>& ref_prefixes, int64_t min_fragment_len, bool verbose) {
unordered_set<edge_t> edges_to_delete;
int64_t stubbified_path_count = 0; // just for logging
graph->for_each_path_handle([&](path_handle_t path_handle) {
string path_name = graph->get_path_name(path_handle);
for (const string& ref_prefix : ref_prefixes) {
bool was_stubbified = false;
if (path_name.compare(0, ref_prefix.length(), ref_prefix) == 0) {
step_handle_t first_step = graph->path_begin(path_handle);
handle_t first_handle = graph->get_handle_of_step(first_step);
graph->follow_edges(first_handle, !graph->get_is_reverse(first_handle), [&](handle_t next_handle) {
edge_t edge = graph->get_is_reverse(first_handle) ? graph->edge_handle(first_handle, next_handle) :
graph->edge_handle(next_handle, first_handle);
edges_to_delete.insert(edge);
was_stubbified = true;
});

step_handle_t last_step = graph->path_back(path_handle);
handle_t last_handle = graph->get_handle_of_step(last_step);
graph->follow_edges(last_handle, graph->get_is_reverse(last_handle), [&](handle_t next_handle) {
edge_t edge = graph->get_is_reverse(last_handle) ? graph->edge_handle(next_handle, last_handle) :
graph->edge_handle(last_handle, next_handle);
edges_to_delete.insert(edge);
was_stubbified = true;
});
}
if (was_stubbified) {
++stubbified_path_count;
}
}
});

// just for logging
unordered_map<string, size_t> clip_counts;

if (verbose) {
cerr << "[vg-clip]: Clipping " << edges_to_delete.size() << " edges to stubbify " << stubbified_path_count
<< " reference paths" << endl;
}

// delete the edges
delete_nodes_and_chop_paths(graph, {}, edges_to_delete, min_fragment_len, verbose ? &clip_counts : nullptr);

if (verbose) {
for (const auto& kv : clip_counts) {
cerr << "[vg-clip]: Ref path stubbification creating " << kv.second << " fragments from path " << kv.first << endl;
}
clip_counts.clear();
}
}

}
5 changes: 5 additions & 0 deletions src/clip.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,11 @@ void clip_contained_stubs(MutablePathMutableHandleGraph* graph, PathPositionHand
SnarlManager& snarl_manager, bool include_endpoints, int64_t min_fragment_len, bool verbose);


/**
* stubbify reference
*/
void stubbify_ref_paths(MutablePathMutableHandleGraph* graph, const vector<string>& ref_prefixes, int64_t min_fragment_len, bool verbose);

}


Expand Down
32 changes: 25 additions & 7 deletions src/subcommand/clip_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ void help_clip(char** argv) {
<< " -d, --depth N Clip out nodes and edges with path depth below N" << endl
<< "stub clipping options:" << endl
<< " -s, --stubs Clip out all stubs (nodes with degree-0 sides that aren't on reference)" << endl
<< " -S, --stubbify-paths Clip out all edges necessary to ensure selected reference paths have exactly two stubs" << endl
<< "snarl complexity clipping options: [default mode]" << endl
<< " -n, --max-nodes N Only clip out snarls with > N nodes" << endl
<< " -e, --max-edges N Only clip out snarls with > N edges" << endl
Expand Down Expand Up @@ -58,6 +59,7 @@ int main_clip(int argc, char** argv) {
bool verbose = false;
bool depth_clipping = false;
bool stub_clipping = false;
bool stubbify_reference = false;

size_t max_nodes = 0;
size_t max_edges = 0;
Expand Down Expand Up @@ -86,6 +88,7 @@ int main_clip(int argc, char** argv) {
{"bed", required_argument, 0, 'b'},
{"depth", required_argument, 0, 'd'},
{"stubs", no_argument, 0, 's'},
{"stubbify-paths", no_argument, 0, 'S'},
{"max-nodes", required_argument, 0, 'n'},
{"max-edges", required_argument, 0, 'e'},
{"max-nodes-shallow", required_argument, 0, 'N'},
Expand All @@ -105,7 +108,7 @@ int main_clip(int argc, char** argv) {

};
int option_index = 0;
c = getopt_long (argc, argv, "hb:d:sn:e:N:E:a:l:L:D:c:P:r:m:Bt:v",
c = getopt_long (argc, argv, "hb:d:sSn:e:N:E:a:l:L:D:c:P:r:m:Bt:v",
long_options, &option_index);

// Detect the end of the options.
Expand All @@ -128,6 +131,9 @@ int main_clip(int argc, char** argv) {
case 's':
stub_clipping = true;
break;
case 'S':
stubbify_reference = true;
break;
case 'n':
max_nodes = parse<size_t>(optarg);
snarl_option = true;
Expand Down Expand Up @@ -197,8 +203,8 @@ int main_clip(int argc, char** argv) {
return 1;
}

if ((min_depth >= 0 || max_deletion >= 0 || stub_clipping) && (snarl_option || out_bed)) {
cerr << "error:[vg-clip] bed output (-B) and snarl complexity options (-n, -e, -N, -E, -a, -l, -L) cannot be used with -d, -D or -s" << endl;
if ((min_depth >= 0 || max_deletion >= 0 || stub_clipping || stubbify_reference) && (snarl_option || out_bed)) {
cerr << "error:[vg-clip] bed output (-B) and snarl complexity options (-n, -e, -N, -E, -a, -l, -L) cannot be used with -d, -D, -s or -S" << endl;
return 1;
}

Expand All @@ -209,8 +215,8 @@ int main_clip(int argc, char** argv) {
}

// ditto about combining
if (stub_clipping && (min_depth >= 0 || max_deletion >= 0)) {
cerr << "error:[vg-clip] -s cannot (yet?) be used with -d or -D" << endl;
if ((stub_clipping || stubbify_reference) && (min_depth >= 0 || max_deletion >= 0)) {
cerr << "error:[vg-clip] -s and -S cannot (yet?) be used with -d or -D" << endl;
return 1;
}

Expand All @@ -219,6 +225,11 @@ int main_clip(int argc, char** argv) {
return 1;
}

if (stubbify_reference && ref_prefixes.empty()) {
cerr << "error:[vg-clip] -S can only be used with -P" << endl;
return 1;
}

// default to same
if (max_deletion > 0 && context_steps < 0) {
context_steps = max_deletion;
Expand Down Expand Up @@ -337,12 +348,19 @@ int main_clip(int argc, char** argv) {
} else if (max_deletion >= 0) {
// run the deletion edge clipping on the whole graph
clip_deletion_edges(graph.get(), max_deletion, context_steps, ref_prefixes, min_fragment_len, verbose);
} else if (stub_clipping) {
} else if (stub_clipping || stubbify_reference) {
// run the stub clipping
if (bed_path.empty()) {
// do the whole graph
clip_stubs(graph.get(), ref_prefixes, min_fragment_len, verbose);
if (stubbify_reference) {
// important that this is done first, as it can actually create non-reference stubs that'd need removal below
stubbify_ref_paths(graph.get(), ref_prefixes, min_fragment_len, verbose);
}
if (stub_clipping) {
clip_stubs(graph.get(), ref_prefixes, min_fragment_len, verbose);
}
} else {
assert(stub_clipping && !stubbify_reference);
// do the contained snarls
clip_contained_stubs(graph.get(), pp_graph, bed_regions, *snarl_manager, false, min_fragment_len, verbose);
}
Expand Down
8 changes: 7 additions & 1 deletion test/t/53_clip.t
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap

PATH=../bin:$PATH # for vg

plan tests 15
plan tests 17

vg msga -f GRCh38_alts/FASTA/HLA/V-352962.fa -t 1 -k 16 | vg mod -U 10 - | vg mod -c - > hla.vg

Expand Down Expand Up @@ -88,4 +88,10 @@ is "$?" 0 "stub clipping removed all stubs"
printf "x\t5\t25\n" > region.bed
is $(vg clip tiny-stubs.gfa -s -b region.bed | vg stats -N -) "17" "region clipping filtered out only 2 / 4 stub nodes"

printf "L\t100\t+\t2\t-\t0M\n" >> tiny-stubs.gfa
printf "L\t15\t+\t13\t-\t0M\n" >> tiny-stubs.gfa
is $(vg clip tiny-stubs.gfa -sS -P x | vg stats -HT - | sort -nk 2 | awk '{print $2}' | head -1) "1" "Correct head after path stubbification"
is $(vg clip tiny-stubs.gfa -sS -P x | vg stats -HT - | sort -nk 2 | awk '{print $2}' | tail -1) "15" "Correct tail after path stubbification"


rm -f tiny.gfa tiny-stubs.gfa region.bed tiny-nostubs.gfa
Loading