Skip to content

Commit

Permalink
Require a distance index in vg minimizer
Browse files Browse the repository at this point in the history
  • Loading branch information
jltsiren committed Jul 23, 2023
1 parent 4b12d7d commit cbc6bbe
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 23 deletions.
25 changes: 18 additions & 7 deletions src/subcommand/minimizer_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,14 @@ int get_default_threads() {
size_t estimate_hash_table_size(const gbwtgraph::GBZ& gbz, bool progress);

void help_minimizer(char** argv) {
std::cerr << "usage: " << argv[0] << " minimizer [options] graph" << std::endl;
std::cerr << "usage: " << argv[0] << " minimizer [options] -d graph.dist -o graph.min graph" << std::endl;
std::cerr << std::endl;
std::cerr << "Builds a (w, k)-minimizer index or a (k, s)-syncmer index of the threads in the GBWT" << std::endl;
std::cerr << "index. The graph can be any HandleGraph, which will be transformed into a GBWTGraph." << std::endl;
std::cerr << "The transformation can be avoided by providing a GBWTGraph or a GBZ graph." << std::endl;
std::cerr << std::endl;
std::cerr << "Required options:" << std::endl;
std::cerr << " -d, --distance-index X annotate the hits with positions in this distance index" << std::endl;
std::cerr << " -o, --output-name X store the index to file X" << std::endl;
std::cerr << std::endl;
std::cerr << "Minimizer options:" << std::endl;
Expand All @@ -81,13 +82,13 @@ void help_minimizer(char** argv) {
std::cerr << " --hash-table N use 2^N-cell hash tables for kmer counting (default: guess)" << std::endl;
std::cerr << std::endl;
std::cerr << "Other options:" << std::endl;
std::cerr << " -d, --distance-index X annotate the hits with positions in this distance index" << std::endl;
std::cerr << " -l, --load-index X load the index from file X and insert the new kmers into it" << std::endl;
std::cerr << " (overrides minimizer / weighted minimizer options)" << std::endl;
std::cerr << " -g, --gbwt-name X use the GBWT index in file X (required with a non-GBZ graph)" << std::endl;
std::cerr << " -p, --progress show progress information" << std::endl;
std::cerr << " -t, --threads N use N threads for index construction (default " << get_default_threads() << ")" << std::endl;
std::cerr << " (using more than " << DEFAULT_MAX_THREADS << " threads rarely helps)" << std::endl;
std::cerr << " --no-dist build the index without distance index annotations (not recommended)" << std::endl;
std::cerr << std::endl;
}

Expand All @@ -105,19 +106,22 @@ int main_minimizer(int argc, char** argv) {
size_t threshold = DEFAULT_THRESHOLD, iterations = DEFAULT_ITERATIONS, hash_table_size = 0;
bool progress = false;
int threads = get_default_threads();
bool require_distance_index = true;

constexpr int OPT_THRESHOLD = 1001;
constexpr int OPT_ITERATIONS = 1002;
constexpr int OPT_FAST_COUNTING = 1003;
constexpr int OPT_SAVE_MEMORY = 1004;
constexpr int OPT_HASH_TABLE = 1005;
constexpr int OPT_NO_DIST = 1100;

int c;
optind = 2; // force optind past command positional argument
while (true) {
static struct option long_options[] =
{
{ "gbwt-name", required_argument, 0, 'g' },
{ "distance-index", required_argument, 0, 'd' },
{ "output-name", required_argument, 0, 'o' },
{ "index-name", required_argument, 0, 'i' }, // deprecated
{ "kmer-length", required_argument, 0, 'k' },
Expand All @@ -131,23 +135,26 @@ int main_minimizer(int argc, char** argv) {
{ "fast-counting", no_argument, 0, OPT_FAST_COUNTING },
{ "save-memory", no_argument, 0, OPT_SAVE_MEMORY },
{ "hash-table", required_argument, 0, OPT_HASH_TABLE },
{ "distance-index", required_argument, 0, 'd' },
{ "load-index", required_argument, 0, 'l' },
{ "gbwt-graph", no_argument, 0, 'G' }, // deprecated
{ "progress", no_argument, 0, 'p' },
{ "threads", required_argument, 0, 't' },
{ "no-dist", no_argument, 0, OPT_NO_DIST },
{ 0, 0, 0, 0 }
};

int option_index = 0;
c = getopt_long(argc, argv, "g:o:i:k:w:bcs:Wd:l:Gpt:h", long_options, &option_index);
c = getopt_long(argc, argv, "g:d:o:i:k:w:bcs:Wl:Gpt:h", long_options, &option_index);
if (c == -1) { break; } // End of options.

switch (c)
{
case 'g':
gbwt_name = optarg;
break;
case 'd':
distance_name = optarg;
break;
case 'o':
output_name = optarg;
break;
Expand Down Expand Up @@ -199,9 +206,6 @@ int main_minimizer(int argc, char** argv) {
}
break;

case 'd':
distance_name = optarg;
break;
case 'l':
load_index = optarg;
break;
Expand All @@ -216,6 +220,9 @@ int main_minimizer(int argc, char** argv) {
threads = std::min(threads, omp_get_max_threads());
threads = std::max(threads, 1);
break;
case OPT_NO_DIST:
require_distance_index = false;
break;

case 'h':
case '?':
Expand All @@ -234,6 +241,10 @@ int main_minimizer(int argc, char** argv) {
return 1;
}
graph_name = argv[optind];
if (require_distance_index && distance_name.empty()) {
std::cerr << "[vg minimizer] error: one of options --distance-index and --no-dist is required" << std::endl;
return 1;
}
if (!load_index.empty() || use_syncmers) {
weighted = false;
}
Expand Down
22 changes: 13 additions & 9 deletions test/t/46_vg_minimizer.t
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap

PATH=../bin:$PATH # for vg

plan tests 15
plan tests 16


# Indexing a single graph
Expand All @@ -15,33 +15,37 @@ vg gbwt -x x.vg -o x.gbwt -v small/xy2.vcf.gz
vg index -j x.dist x.xg

# Default construction
vg minimizer -o x.mi -g x.gbwt x.xg
vg minimizer --no-dist -o x.mi -g x.gbwt x.xg
is $? 0 "default parameters"

# Construction fails without a distance index
vg minimizer -o x.mi -g x.gbwt x.xg 2> /dev/null
is $? 1 "distance index or --no-dist is required"

# Single-threaded for deterministic results
vg minimizer -t 1 -o x.mi -g x.gbwt x.xg
vg minimizer --no-dist -t 1 -o x.mi -g x.gbwt x.xg
is $? 0 "single-threaded construction"
is $(md5sum x.mi | cut -f 1 -d\ ) 0d75343d78d1e7d9e9fbc3d7d2386ce2 "construction is deterministic"

# Indexing syncmers
vg minimizer -t 1 -o x.mi -c -g x.gbwt x.xg
vg minimizer --no-dist -t 1 -o x.mi -c -g x.gbwt x.xg
is $? 0 "syncmer index"
is $(md5sum x.mi | cut -f 1 -d\ ) 74d836b46799590835c7d61e283df4f0 "construction is deterministic"

# Minimizer parameters
vg minimizer -t 1 -k 7 -w 3 -o x.mi -g x.gbwt -p x.xg
vg minimizer --no-dist -t 1 -k 7 -w 3 -o x.mi -g x.gbwt x.xg
is $? 0 "minimizer parameters"
is $(md5sum x.mi | cut -f 1 -d\ ) 7260e5ea22f063a8dff1f9dd60f92288 "setting -k -w works correctly"

# Construction from GBWTGraph
vg gbwt -x x.xg -g x.gg x.gbwt
vg minimizer -t 1 -g x.gbwt -o x.mi x.gg
vg minimizer --no-dist -t 1 -g x.gbwt -o x.mi x.gg
is $? 0 "construction from GBWTGraph"
is $(md5sum x.mi | cut -f 1 -d\ ) 0d75343d78d1e7d9e9fbc3d7d2386ce2 "construction is deterministic"

# Construction from GBZ
vg gbwt -x x.xg -g x.gbz --gbz-format x.gbwt
vg minimizer -t 1 -o x.mi x.gbz
vg minimizer --no-dist -t 1 -o x.mi x.gbz
is $? 0 "construction from GBZ"
is $(md5sum x.mi | cut -f 1 -d\ ) 0d75343d78d1e7d9e9fbc3d7d2386ce2 "construction is deterministic"

Expand All @@ -62,9 +66,9 @@ vg index -x x.xg -G x.gbwt -v small/xy2.vcf.gz x.vg
vg index -x y.xg -G y.gbwt -v small/xy2.vcf.gz y.vg

# Appending to the index
vg minimizer -t 1 -o x.mi -g x.gbwt x.xg
vg minimizer --no-dist -t 1 -o x.mi -g x.gbwt x.xg
is $? 0 "multiple graphs: first"
vg minimizer -t 1 -l x.mi -o xy.mi -g y.gbwt y.xg
vg minimizer --no-dist -t 1 -l x.mi -o xy.mi -g y.gbwt y.xg
is $? 0 "multiple graphs: second"
is $(md5sum xy.mi | cut -f 1 -d\ ) 1ca39921b15cc3e7d27919a3ec7f47fa "construction is deterministic"

Expand Down
13 changes: 6 additions & 7 deletions test/t/50_vg_giraffe.t
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,8 @@ vg gbwt -o x-haps.gbwt -x x.vg -v small/x.vcf.gz
vg gbwt -o x-paths.gbwt -x x.vg --index-paths
vg gbwt -o x-merged.gbwt -m x-haps.gbwt x-paths.gbwt
vg gbwt -o x.gbwt --augment-gbwt -x x.vg x-merged.gbwt
vg snarls --include-trivial x.vg > x.snarls
vg index -j x.dist x.vg
vg minimizer -k 29 -w 11 -g x.gbwt -o x.min x.xg
vg minimizer -k 29 -w 11 -d x.dist -g x.gbwt -o x.min x.xg

vg giraffe -x x.xg -H x.gbwt -m x.min -d x.dist -f reads/small.middle.ref.fq > mapped1.gam
is "${?}" "0" "a read can be mapped with xg + gbwt + min + dist specified without crashing"
Expand Down Expand Up @@ -50,7 +49,7 @@ vg giraffe -Z x.giraffe.gbz -f reads/small.middle.ref.fq --full-l-bonus 0 > mapp
is "$(vg view -aj mapped-nobonus.gam | jq '.score')" "63" "Mapping without a full length bonus produces the correct score"
rm -f mapped-nobonus.gam

vg minimizer -k 29 -b -s 18 -g x.gbwt -o x.sync x.xg
vg minimizer -k 29 -b -s 18 -d x.dist -g x.gbwt -o x.sync x.xg

vg giraffe -x x.xg -H x.gbwt -m x.sync -d x.dist -f reads/small.middle.ref.fq > mapped.sync.gam
is "${?}" "0" "a read can be mapped with syncmer indexes without crashing"
Expand All @@ -59,7 +58,7 @@ chmod 400 x.dist
vg giraffe -x x.xg -H x.gbwt -m x.min -d x.dist -f reads/small.middle.ref.fq >/dev/null
is "${?}" "0" "a read can be mapped when the distance index is not writable"

rm -f x.vg x.xg x.gbwt x.snarls x.min x.sync x.dist x.gg
rm -f x.vg x.xg x.gbwt x.min x.sync x.dist x.gg
rm -f x.giraffe.gbz


Expand Down Expand Up @@ -114,10 +113,10 @@ is "$(cat surjected.sam | grep -v '^@' | cut -f 1 | sort | uniq | wc -l)" "2" "s
is "$(cat surjected.sam | grep -v '^@' | cut -f 7)" "$(printf '*\n*')" "surjection of unpaired reads to SAM produces absent partner contigs"
is "$(cat surjected.sam | grep -v '^@' | sort -k4 | cut -f 2)" "$(printf '0\n16')" "surjection of unpaired reads to SAM produces correct flags"

rm -f x.vg x.gbwt x.xg x.snarls x.min x.dist x.gg x.fa x.fa.fai x.vcf.gz x.vcf.gz.tbi single.gam paired.gam surjected.sam
rm -f x.vg x.gbwt x.xg x.min x.dist x.gg x.fa x.fa.fai x.vcf.gz x.vcf.gz.tbi single.gam paired.gam surjected.sam
rm -f x.giraffe.gbz

rm -f xy.vg xy.gbwt xy.xg xy.snarls xy.min xy.dist xy.gg xy.fa xy.fa.fai xy.vcf.gz xy.vcf.gz.tbi
rm -f xy.vg xy.gbwt xy.xg xy.min xy.dist xy.gg xy.fa xy.fa.fai xy.vcf.gz xy.vcf.gz.tbi
cp small/xy.fa .
cp small/xy.vcf.gz .
cp small/xy.vcf.gz.tbi .
Expand Down Expand Up @@ -171,7 +170,7 @@ is "$(cat xy.json | grep "correct-minimizer-coverage" | wc -l)" "2000" "paired r
is "$(cat xy.json | jq '.annotation["correct-minimizer-coverage"] | select(. > 0)' | wc -l)" "2000" "paired reads all have nonzero correct minimizer coverage"

rm -f x.vg xy.sam xy.json
rm -f xy.vg xy.gbwt xy.xg xy.snarls xy.min xy.dist xy.gg xy.fa xy.fa.fai xy.vcf.gz xy.vcf.gz.tbi
rm -f xy.vg xy.gbwt xy.xg xy.min xy.dist xy.gg xy.fa xy.fa.fai xy.vcf.gz xy.vcf.gz.tbi

vg giraffe -Z xy.giraffe.gbz -G x.gam -o BAM >xy.bam
is $? "0" "vg giraffe can map to BAM using a GBZ alone"
Expand Down

1 comment on commit cbc6bbe

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for branch require-dist. View the full report here.

15 tests passed, 0 tests failed and 0 tests skipped in 17729 seconds

Please sign in to comment.