Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/master' into rna-crash
Browse files Browse the repository at this point in the history
  • Loading branch information
jeizenga committed Sep 19, 2023
2 parents a77b723 + 89eefd9 commit 191fc3e
Show file tree
Hide file tree
Showing 9 changed files with 122 additions and 54 deletions.
62 changes: 36 additions & 26 deletions .gitlab-ci.yml

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion deps/backward-cpp
2 changes: 1 addition & 1 deletion deps/gbwtgraph
2 changes: 1 addition & 1 deletion deps/libbdsg
31 changes: 24 additions & 7 deletions src/subcommand/convert_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,10 @@ int main_convert(int argc, char** argv) {
bool rgfa_pline = false;
bool wline = true;
algorithm_type gfa_output_algorithm = ALGORITHM_DEFAULT;
int num_threads = omp_get_max_threads(); // For GBWTGraph to GFA.

// For GBWTGraph to GFA.
int num_threads = omp_get_max_threads();
bool use_translation = true;

if (argc == 2) {
help_convert(argv);
Expand All @@ -74,6 +77,7 @@ int main_convert(int argc, char** argv) {
constexpr int OPT_REF_SAMPLE = 1000;
constexpr int OPT_GBWTGRAPH_ALGORITHM = 1001;
constexpr int OPT_VG_ALGORITHM = 1002;
constexpr int OPT_NO_TRANSLATION = 1003;

int c;
optind = 2; // force optind past command positional argument
Expand All @@ -98,6 +102,7 @@ int main_convert(int argc, char** argv) {
{"no-wline", no_argument, 0, 'W'},
{"gbwtgraph-algorithm", no_argument, 0, OPT_GBWTGRAPH_ALGORITHM},
{"vg-algorithm", no_argument, 0, OPT_VG_ALGORITHM},
{"no-translation", no_argument, 0, OPT_NO_TRANSLATION},
{"gam-to-gaf", required_argument, 0, 'G'},
{"gaf-to-gam", required_argument, 0, 'F'},
{"threads", required_argument, 0, 't'},
Expand Down Expand Up @@ -173,6 +178,9 @@ int main_convert(int argc, char** argv) {
case OPT_VG_ALGORITHM:
gfa_output_algorithm = algorithm_vg;
break;
case OPT_NO_TRANSLATION:
use_translation = false;
break;
case 'G':
no_multiple_inputs(input);
input = input_gam;
Expand Down Expand Up @@ -411,6 +419,7 @@ int main_convert(int argc, char** argv) {

gbwtgraph::GFAExtractionParameters parameters;
parameters.num_threads = num_threads;
parameters.use_translation = use_translation;
gbwtgraph::gbwt_to_gfa(*gbwt_graph, std::cout, parameters);
} else if (gfa_output_algorithm == algorithm_vg) {
// Use HandleGraph GFA conversion code
Expand Down Expand Up @@ -468,14 +477,22 @@ void help_convert(char** argv) {
<< " -p, --packed-out output in PackedGraph format [default]" << endl
<< " -x, --xg-out output in XG format" << endl
<< " -f, --gfa-out output in GFA format" << endl
<< " -H, --drop-haplotypes do not include haplotype paths in the output (useful with GBWTGraph / GBZ inputs)" << endl
<< " -H, --drop-haplotypes do not include haplotype paths in the output" << endl
<< " (useful with GBWTGraph / GBZ inputs)" << endl
<< "gfa output options (use with -f):" << endl
<< " -P, --rgfa-path STR write given path as rGFA tags instead of lines (multiple allowed, only rank-0 supported)" << endl
<< " -Q, --rgfa-prefix STR write paths with given prefix as rGFA tags instead of lines (multiple allowed, only rank-0 supported)" << endl
<< " -P, --rgfa-path STR write given path as rGFA tags instead of lines" << endl
<< " (multiple allowed, only rank-0 supported)" << endl
<< " -Q, --rgfa-prefix STR write paths with given prefix as rGFA tags instead of lines" << endl
<< " (multiple allowed, only rank-0 supported)" << endl
<< " -B, --rgfa-pline paths written as rGFA tags also written as lines" << endl
<< " -W, --no-wline write all paths as GFA P-lines instead of W-lines. Allows handling multiple phase blocks and subranges used together." << endl
<< " --gbwtgraph-algorithm Always use the GBWTGraph library GFA algorithm. Not compatible with other GBWT output options or non-GBWT graphs." << endl
<< " --vg-algorithm Always use the VG GFA algorithm. Works with all options and graph types, but can't preserve original GFA coordinates." << endl
<< " -W, --no-wline Write all paths as GFA P-lines instead of W-lines." << endl
<< " Allows handling multiple phase blocks and subranges used together." << endl
<< " --gbwtgraph-algorithm Always use the GBWTGraph library GFA algorithm." << endl
<< " Not compatible with other GFA output options or non-GBWT graphs." << endl
<< " --vg-algorithm Always use the VG GFA algorithm. Works with all options and graph types," << endl
<< " but can't preserve original GFA coordinates." << endl
<< " --no-translation When using the GBWTGraph algorithm, convert the graph directly to GFA." << endl
<< " Do not use the translation to preserve original coordinates." << endl
<< "alignment options:" << endl
<< " -G, --gam-to-gaf FILE convert GAM FILE to GAF" << endl
<< " -F, --gaf-to-gam FILE convert GAF FILE to GAM" << endl
Expand Down
49 changes: 35 additions & 14 deletions src/subcommand/paths_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,9 @@ void help_paths(char** argv) {
<< " -Q, --paths-by STR select the paths with the given name prefix" << endl
<< " -S, --sample STR select the haplotypes or reference paths for this sample" << endl
<< " -a, --variant-paths select the variant paths added by 'vg construct -a'" << endl
<< " -G, --generic-paths select the generic, non-reference, non-haplotype paths" << endl;
<< " -G, --generic-paths select the generic, non-reference, non-haplotype paths" << endl
<< " -R, --reference-paths select the reference paths" << endl
<< " -H, --haplotype-paths select the haplotype paths" << endl;
}

/// Chunk a path and emit it in Graph messages.
Expand Down Expand Up @@ -106,11 +108,8 @@ int main_paths(int argc, char** argv) {
string path_file;
bool select_alt_paths = false;
// What kinds of paths are we interested in?
unordered_set<PathSense> path_senses {
PathSense::REFERENCE,
PathSense::GENERIC,
PathSense::HAPLOTYPE
};
// Starts empty. If the options put nothing in it, defaults are filled in after
// option parsing: reference and haplotype senses, plus generic when no sample is selected.
unordered_set<PathSense> path_senses;
bool list_lengths = false;
bool list_metadata = false;
bool list_cyclicity = false;
Expand Down Expand Up @@ -143,6 +142,8 @@ int main_paths(int argc, char** argv) {
{"sample", required_argument, 0, 'S'},
{"variant-paths", no_argument, 0, 'a'},
{"generic-paths", no_argument, 0, 'G'},
{"reference-paths", no_argument, 0, 'R'},
{"haplotype-paths", no_argument, 0, 'H'},
{"coverage", no_argument, 0, 'c'},

// Hidden options for backward compatibility.
Expand All @@ -153,7 +154,7 @@ int main_paths(int argc, char** argv) {
};

int option_index = 0;
c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:draGp:c",
c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:draGRHp:c",
long_options, &option_index);

// Detect the end of the options.
Expand Down Expand Up @@ -239,8 +240,6 @@ int main_paths(int argc, char** argv) {

case 'S':
sample_name = optarg;
// We only care about things with references now.
path_senses = {PathSense::REFERENCE, PathSense::HAPLOTYPE};
selection_criteria++;
break;

Expand All @@ -250,9 +249,15 @@ int main_paths(int argc, char** argv) {
break;

case 'G':
// We only care about generic paths now.
path_senses = {PathSense::GENERIC};
selection_criteria++;
path_senses.insert(PathSense::GENERIC);
break;

case 'R':
path_senses.insert(PathSense::REFERENCE);
break;

case 'H':
path_senses.insert(PathSense::HAPLOTYPE);
break;

case 'c':
Expand Down Expand Up @@ -281,6 +286,22 @@ int main_paths(int argc, char** argv) {
}
}

if (path_senses.empty()) {
// No path senses were asked for explicitly.
// Fill in default ones.
path_senses = {
PathSense::REFERENCE,
PathSense::HAPLOTYPE
};
if (sample_name.empty()) {
// We can support paths with no sample.
path_senses.insert(PathSense::GENERIC);
}
} else {
// We asked for path senses specifically
selection_criteria++;
}

if (input_formats != 1 && input_formats != 2) {
std::cerr << "error: [vg paths] at least one input format (-x, -g) must be specified" << std::endl;
std::exit(EXIT_FAILURE);
Expand All @@ -303,7 +324,7 @@ int main_paths(int argc, char** argv) {
std::exit(EXIT_FAILURE);
}
if (selection_criteria > 1) {
std::cerr << "error: [vg paths] multiple selection criteria (-Q, -S, -a, -G, -p) cannot be used" << std::endl;
std::cerr << "error: [vg paths] multiple selection criteria (-Q, -S, -a, -G/-R/-H, -p) cannot be used" << std::endl;
std::exit(EXIT_FAILURE);
}
if (select_alt_paths && !gbwt_file.empty()) {
Expand Down Expand Up @@ -363,7 +384,7 @@ int main_paths(int argc, char** argv) {

string line;
while (getline(path_stream, line)) {
path_names.emplace(move(line));
path_names.emplace(std::move(line));
}
}

Expand Down
12 changes: 10 additions & 2 deletions src/subcommand/validate_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,15 @@ int main_validate(int argc, char** argv) {
}
});

unordered_set<string> path_names;
graph->for_each_path_handle([&](path_handle_t path_handle) {
string path_name = graph->get_path_name(path_handle);
if (path_names.count(path_name)) {
cerr << "graph invalid: duplicate path name, " << path_name <<", detected" << endl;
valid_graph = false;
} else {
path_names.insert(path_name);
}
size_t i = 0;
handle_t prev;
graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) {
Expand All @@ -151,14 +159,14 @@ int main_validate(int argc, char** argv) {
cerr << "graph invalid: missing edge between " << (i-1) << "th step ("
<< graph->get_id(prev) << ":" << graph->get_is_reverse(prev) << ") and "
<< i << "th step (" << graph->get_id(handle) << ":" << graph->get_is_reverse(handle)
<< ") of path " << graph->get_path_name(path_handle) << endl;
<< ") of path " << path_name << endl;
valid_graph = false;
}
if (!graph->has_edge(graph->flip(handle), graph->flip(prev))) {
cerr << "graph invalid: missing edge between " << (i) << "th step ("
<< graph->get_id(handle) << ":" << !graph->get_is_reverse(handle) << ") and "
<< (i-1) << "th step (" << graph->get_id(prev) << ":" << graph->get_is_reverse(prev)
<< ") of path " << graph->get_path_name(path_handle) << endl;
<< ") of path " << path_name << endl;
valid_graph = false;
}
}
Expand Down
4 changes: 3 additions & 1 deletion test/t/11_vg_paths.t
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ PATH=../bin:$PATH # for vg

export LC_ALL="C" # force a consistent sort order

plan tests 18
plan tests 20

vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg
vg construct -r small/x.fa -v small/x.vcf.gz > x2.vg
Expand All @@ -16,7 +16,9 @@ vg index -x x.xg -G x.gbwt -v small/x.vcf.gz x.vg
# List path/thread names from various input formats
is "$(vg paths --list -v x2.vg)" "x" "path listing works from vg"
is "$(vg paths --list -x x.xg)" "x" "path listing works from XG"
is "$(vg paths --list -x x.xg -G)" "x" "generic path listing works from XG"
is $(vg paths --list -g x.gbwt | wc -l) 2 "thread listing works from GBWT"
is $(vg paths --list -g x.gbwt -H | wc -l) 2 "haplotype thread listing works from GBWT"

# Select threads by name
is $(vg paths --list -Q "1#0#x#" -g x.gbwt | wc -l) 1 "thread selection by name prefix works correctly"
Expand Down
12 changes: 11 additions & 1 deletion test/t/48_vg_convert.t
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ PATH=../bin:$PATH # for vg

export LC_ALL="C" # force a consistent sort order

plan tests 102
plan tests 106

vg construct -r complex/c.fa -v complex/c.vcf.gz > c.vg
cat <(vg view c.vg | grep ^S | sort) <(vg view c.vg | grep L | uniq | wc -l) <(vg paths -v c.vg -E) > c.info
Expand Down Expand Up @@ -361,13 +361,23 @@ vg convert -f components.gbz | sort > sorted.gfa
cmp sorted.gfa correct.gfa
is $? 0 "GBZ to GFA conversion works with multiple threads"

# GFA extraction from GBZ with/without translation.
vg gbwt --gbz-format -g chopping.gbz --max-node 2 -G graphs/chopping_walks.gfa
vg convert -f -t 1 chopping.gbz > with-translation.gfa
is $? 0 "GBZ to GFA with translation"
is "$(grep -c "^S" with-translation.gfa)" "8" "8 segments"
vg convert -f -t 1 --no-translation chopping.gbz > no-translation.gfa
is $? 0 "GBZ to GFA without translation"
is "$(grep -c "^S" no-translation.gfa)" "9" "9 segments"

rm -f components.gbwt components.gg components.gbz
rm -f direct.hg correct_paths.gaf correct_haplotypes.gaf
rm -f components.hg hg_paths.gaf hg_haplotypes.gaf gbz_hg_paths.gaf gbz_hg_haplotypes.gaf
rm -f components.xg xg_paths.gaf xg_haplotypes.gaf gbz_xg_paths.gaf gbz_xg_haplotypes.gaf
rm -f no_haplotypes.xg no_haplotypes.hg
rm -f extracted.gfa gbz.gfa extracted.hg
rm -f sorted.gfa correct.gfa
rm -f chopping.gbz with-translation.gfa no-translation.gfa

#####
# Reference path conversion
Expand Down

2 comments on commit 191fc3e

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for branch rna-crash. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17498 seconds

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for branch upstream-master. View the full report here.

14 tests passed, 0 tests failed and 0 tests skipped in 15239 seconds

Please sign in to comment.