diff --git a/src/subcommand/minimizer_main.cpp b/src/subcommand/minimizer_main.cpp index ba5266690e6..86f0039cd25 100644 --- a/src/subcommand/minimizer_main.cpp +++ b/src/subcommand/minimizer_main.cpp @@ -57,13 +57,14 @@ int get_default_threads() { size_t estimate_hash_table_size(const gbwtgraph::GBZ& gbz, bool progress); void help_minimizer(char** argv) { - std::cerr << "usage: " << argv[0] << " minimizer [options] graph" << std::endl; + std::cerr << "usage: " << argv[0] << " minimizer [options] -d graph.dist -o graph.min graph" << std::endl; std::cerr << std::endl; std::cerr << "Builds a (w, k)-minimizer index or a (k, s)-syncmer index of the threads in the GBWT" << std::endl; std::cerr << "index. The graph can be any HandleGraph, which will be transformed into a GBWTGraph." << std::endl; std::cerr << "The transformation can be avoided by providing a GBWTGraph or a GBZ graph." << std::endl; std::cerr << std::endl; std::cerr << "Required options:" << std::endl; + std::cerr << " -d, --distance-index X annotate the hits with positions in this distance index" << std::endl; std::cerr << " -o, --output-name X store the index to file X" << std::endl; std::cerr << std::endl; std::cerr << "Minimizer options:" << std::endl; @@ -81,13 +82,13 @@ void help_minimizer(char** argv) { std::cerr << " --hash-table N use 2^N-cell hash tables for kmer counting (default: guess)" << std::endl; std::cerr << std::endl; std::cerr << "Other options:" << std::endl; - std::cerr << " -d, --distance-index X annotate the hits with positions in this distance index" << std::endl; std::cerr << " -l, --load-index X load the index from file X and insert the new kmers into it" << std::endl; std::cerr << " (overrides minimizer / weighted minimizer options)" << std::endl; std::cerr << " -g, --gbwt-name X use the GBWT index in file X (required with a non-GBZ graph)" << std::endl; std::cerr << " -p, --progress show progress information" << std::endl; std::cerr << " -t, --threads N use N threads for index construction (default " << get_default_threads() << ")" << std::endl; std::cerr << " (using more than " << DEFAULT_MAX_THREADS << " threads rarely helps)" << std::endl; + std::cerr << " --no-dist build the index without distance index annotations (not recommended)" << std::endl; std::cerr << std::endl; } @@ -105,12 +106,14 @@ int main_minimizer(int argc, char** argv) { size_t threshold = DEFAULT_THRESHOLD, iterations = DEFAULT_ITERATIONS, hash_table_size = 0; bool progress = false; int threads = get_default_threads(); + bool require_distance_index = true; constexpr int OPT_THRESHOLD = 1001; constexpr int OPT_ITERATIONS = 1002; constexpr int OPT_FAST_COUNTING = 1003; constexpr int OPT_SAVE_MEMORY = 1004; constexpr int OPT_HASH_TABLE = 1005; + constexpr int OPT_NO_DIST = 1100; int c; optind = 2; // force optind past command positional argument @@ -118,6 +121,7 @@ int main_minimizer(int argc, char** argv) { static struct option long_options[] = { { "gbwt-name", required_argument, 0, 'g' }, + { "distance-index", required_argument, 0, 'd' }, { "output-name", required_argument, 0, 'o' }, { "index-name", required_argument, 0, 'i' }, // deprecated { "kmer-length", required_argument, 0, 'k' }, @@ -131,16 +135,16 @@ int main_minimizer(int argc, char** argv) { { "fast-counting", no_argument, 0, OPT_FAST_COUNTING }, { "save-memory", no_argument, 0, OPT_SAVE_MEMORY }, { "hash-table", required_argument, 0, OPT_HASH_TABLE }, - { "distance-index", required_argument, 0, 'd' }, { "load-index", required_argument, 0, 'l' }, { "gbwt-graph", no_argument, 0, 'G' }, // deprecated { "progress", no_argument, 0, 'p' }, { "threads", required_argument, 0, 't' }, + { "no-dist", no_argument, 0, OPT_NO_DIST }, { 0, 0, 0, 0 } }; int option_index = 0; - c = getopt_long(argc, argv, "g:o:i:k:w:bcs:Wd:l:Gpt:h", long_options, &option_index); + c = getopt_long(argc, argv, "g:d:o:i:k:w:bcs:Wl:Gpt:h", long_options, &option_index); if (c == -1) { break; } // End of options. switch (c) @@ -148,6 +152,9 @@ int main_minimizer(int argc, char** argv) { case 'g': gbwt_name = optarg; break; + case 'd': + distance_name = optarg; + break; case 'o': output_name = optarg; break; @@ -199,9 +206,6 @@ int main_minimizer(int argc, char** argv) { } break; - case 'd': - distance_name = optarg; - break; case 'l': load_index = optarg; break; @@ -216,6 +220,9 @@ int main_minimizer(int argc, char** argv) { threads = std::min(threads, omp_get_max_threads()); threads = std::max(threads, 1); break; + case OPT_NO_DIST: + require_distance_index = false; + break; case 'h': case '?': @@ -234,6 +241,10 @@ int main_minimizer(int argc, char** argv) { return 1; } graph_name = argv[optind]; + if (require_distance_index && distance_name.empty()) { + std::cerr << "[vg minimizer] error: one of options --distance-index and --no-dist is required" << std::endl; + return 1; + } if (!load_index.empty() || use_syncmers) { weighted = false; } diff --git a/test/t/46_vg_minimizer.t b/test/t/46_vg_minimizer.t index 2216d77b515..06f9133f09d 100644 --- a/test/t/46_vg_minimizer.t +++ b/test/t/46_vg_minimizer.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 15 +plan tests 16 # Indexing a single graph @@ -15,33 +15,37 @@ vg gbwt -x x.vg -o x.gbwt -v small/xy2.vcf.gz vg index -j x.dist x.xg # Default construction -vg minimizer -o x.mi -g x.gbwt x.xg +vg minimizer --no-dist -o x.mi -g x.gbwt x.xg is $? 0 "default parameters" +# Construction fails without a distance index +vg minimizer -o x.mi -g x.gbwt x.xg 2> /dev/null +is $? 1 "distance index or --no-dist is required" + # Single-threaded for deterministic results -vg minimizer -t 1 -o x.mi -g x.gbwt x.xg +vg minimizer --no-dist -t 1 -o x.mi -g x.gbwt x.xg is $? 0 "single-threaded construction" is $(md5sum x.mi | cut -f 1 -d\ ) 0d75343d78d1e7d9e9fbc3d7d2386ce2 "construction is deterministic" # Indexing syncmers -vg minimizer -t 1 -o x.mi -c -g x.gbwt x.xg +vg minimizer --no-dist -t 1 -o x.mi -c -g x.gbwt x.xg is $? 0 "syncmer index" is $(md5sum x.mi | cut -f 1 -d\ ) 74d836b46799590835c7d61e283df4f0 "construction is deterministic" # Minimizer parameters -vg minimizer -t 1 -k 7 -w 3 -o x.mi -g x.gbwt -p x.xg +vg minimizer --no-dist -t 1 -k 7 -w 3 -o x.mi -g x.gbwt x.xg is $? 0 "minimizer parameters" is $(md5sum x.mi | cut -f 1 -d\ ) 7260e5ea22f063a8dff1f9dd60f92288 "setting -k -w works correctly" # Construction from GBWTGraph vg gbwt -x x.xg -g x.gg x.gbwt -vg minimizer -t 1 -g x.gbwt -o x.mi x.gg +vg minimizer --no-dist -t 1 -g x.gbwt -o x.mi x.gg is $? 0 "construction from GBWTGraph" is $(md5sum x.mi | cut -f 1 -d\ ) 0d75343d78d1e7d9e9fbc3d7d2386ce2 "construction is deterministic" # Construction from GBZ vg gbwt -x x.xg -g x.gbz --gbz-format x.gbwt -vg minimizer -t 1 -o x.mi x.gbz +vg minimizer --no-dist -t 1 -o x.mi x.gbz is $? 0 "construction from GBZ" is $(md5sum x.mi | cut -f 1 -d\ ) 0d75343d78d1e7d9e9fbc3d7d2386ce2 "construction is deterministic" @@ -62,9 +66,9 @@ vg index -x x.xg -G x.gbwt -v small/xy2.vcf.gz x.vg vg index -x y.xg -G y.gbwt -v small/xy2.vcf.gz y.vg # Appending to the index -vg minimizer -t 1 -o x.mi -g x.gbwt x.xg +vg minimizer --no-dist -t 1 -o x.mi -g x.gbwt x.xg is $? 0 "multiple graphs: first" -vg minimizer -t 1 -l x.mi -o xy.mi -g y.gbwt y.xg +vg minimizer --no-dist -t 1 -l x.mi -o xy.mi -g y.gbwt y.xg is $? 0 "multiple graphs: second" is $(md5sum xy.mi | cut -f 1 -d\ ) 1ca39921b15cc3e7d27919a3ec7f47fa "construction is deterministic" diff --git a/test/t/50_vg_giraffe.t b/test/t/50_vg_giraffe.t index 500f199568b..5e3f8dac9eb 100644 --- a/test/t/50_vg_giraffe.t +++ b/test/t/50_vg_giraffe.t @@ -13,9 +13,8 @@ vg gbwt -o x-haps.gbwt -x x.vg -v small/x.vcf.gz vg gbwt -o x-paths.gbwt -x x.vg --index-paths vg gbwt -o x-merged.gbwt -m x-haps.gbwt x-paths.gbwt vg gbwt -o x.gbwt --augment-gbwt -x x.vg x-merged.gbwt -vg snarls --include-trivial x.vg > x.snarls vg index -j x.dist x.vg -vg minimizer -k 29 -w 11 -g x.gbwt -o x.min x.xg +vg minimizer -k 29 -w 11 -d x.dist -g x.gbwt -o x.min x.xg vg giraffe -x x.xg -H x.gbwt -m x.min -d x.dist -f reads/small.middle.ref.fq > mapped1.gam is "${?}" "0" "a read can be mapped with xg + gbwt + min + dist specified without crashing" @@ -50,7 +49,7 @@ vg giraffe -Z x.giraffe.gbz -f reads/small.middle.ref.fq --full-l-bonus 0 > mapp is "$(vg view -aj mapped-nobonus.gam | jq '.score')" "63" "Mapping without a full length bonus produces the correct score" rm -f mapped-nobonus.gam -vg minimizer -k 29 -b -s 18 -g x.gbwt -o x.sync x.xg +vg minimizer -k 29 -b -s 18 -d x.dist -g x.gbwt -o x.sync x.xg vg giraffe -x x.xg -H x.gbwt -m x.sync -d x.dist -f reads/small.middle.ref.fq > mapped.sync.gam is "${?}" "0" "a read can be mapped with syncmer indexes without crashing" @@ -59,7 +58,7 @@ chmod 400 x.dist vg giraffe -x x.xg -H x.gbwt -m x.min -d x.dist -f reads/small.middle.ref.fq >/dev/null is "${?}" "0" "a read can be mapped when the distance index is not writable" -rm -f x.vg x.xg x.gbwt x.snarls x.min x.sync x.dist x.gg +rm -f x.vg x.xg x.gbwt x.min x.sync x.dist x.gg rm -f x.giraffe.gbz @@ -114,10 +113,10 @@ is "$(cat surjected.sam | grep -v '^@' | cut -f 1 | sort | uniq | wc -l)" "2" "s is "$(cat surjected.sam | grep -v '^@' | cut -f 7)" "$(printf '*\n*')" "surjection of unpaired reads to SAM produces absent partner contigs" is "$(cat surjected.sam | grep -v '^@' | sort -k4 | cut -f 2)" "$(printf '0\n16')" "surjection of unpaired reads to SAM produces correct flags" -rm -f x.vg x.gbwt x.xg x.snarls x.min x.dist x.gg x.fa x.fa.fai x.vcf.gz x.vcf.gz.tbi single.gam paired.gam surjected.sam +rm -f x.vg x.gbwt x.xg x.min x.dist x.gg x.fa x.fa.fai x.vcf.gz x.vcf.gz.tbi single.gam paired.gam surjected.sam rm -f x.giraffe.gbz -rm -f xy.vg xy.gbwt xy.xg xy.snarls xy.min xy.dist xy.gg xy.fa xy.fa.fai xy.vcf.gz xy.vcf.gz.tbi +rm -f xy.vg xy.gbwt xy.xg xy.min xy.dist xy.gg xy.fa xy.fa.fai xy.vcf.gz xy.vcf.gz.tbi cp small/xy.fa . cp small/xy.vcf.gz . cp small/xy.vcf.gz.tbi . @@ -171,7 +170,7 @@ is "$(cat xy.json | grep "correct-minimizer-coverage" | wc -l)" "2000" "paired r is "$(cat xy.json | jq '.annotation["correct-minimizer-coverage"] | select(. > 0)' | wc -l)" "2000" "paired reads all have nonzero correct minimizer coverage" rm -f x.vg xy.sam xy.json -rm -f xy.vg xy.gbwt xy.xg xy.snarls xy.min xy.dist xy.gg xy.fa xy.fa.fai xy.vcf.gz xy.vcf.gz.tbi +rm -f xy.vg xy.gbwt xy.xg xy.min xy.dist xy.gg xy.fa xy.fa.fai xy.vcf.gz xy.vcf.gz.tbi vg giraffe -Z xy.giraffe.gbz -G x.gam -o BAM >xy.bam is $? "0" "vg giraffe can map to BAM using a GBZ alone"