From 6343cca14fb5b7be18ef4b84d7cfaee8e4470bd2 Mon Sep 17 00:00:00 2001 From: Luca Denti Date: Thu, 31 Aug 2023 10:08:05 +0200 Subject: [PATCH 1/2] Add CLI option to save paths for alts using plain variant ID (if possible) --- src/constructor.cpp | 2 +- src/constructor.hpp | 1 + src/subcommand/construct_main.cpp | 11 +++++++++-- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/constructor.cpp b/src/constructor.cpp index f5f4653b921..0c36ceaaae5 100644 --- a/src/constructor.cpp +++ b/src/constructor.cpp @@ -696,7 +696,7 @@ namespace vg { // Name the variant and place it in the order that we'll // actually construct nodes in (see utility.hpp) - string variant_name = make_variant_id(*variant); + string variant_name = sha1_variant_name ? make_variant_id(*variant) : get_or_make_variant_id(*variant); if (variants_by_name.count(variant_name)) { // Some VCFs may include multiple variants at the same // position with the same ref and alt. We will only take the diff --git a/src/constructor.hpp b/src/constructor.hpp index ede856cf46a..9b682a42855 100644 --- a/src/constructor.hpp +++ b/src/constructor.hpp @@ -80,6 +80,7 @@ class Constructor : public Progressive, public NameMapper { // _alt_6079b4a76d0ddd6b4b44aeb14d738509e266961c_0 and // _alt_6079b4a76d0ddd6b4b44aeb14d738509e266961c_1? bool alt_paths = false; + bool sha1_variant_name = true; // Should we handle structural variants in the VCF file, // or at least the ones we know how to? diff --git a/src/subcommand/construct_main.cpp b/src/subcommand/construct_main.cpp index 153aa2a2cb3..d41e148affe 100644 --- a/src/subcommand/construct_main.cpp +++ b/src/subcommand/construct_main.cpp @@ -25,7 +25,8 @@ void help_construct(char** argv) { << " -r, --reference FILE input FASTA reference (may repeat)" << endl << " -v, --vcf FILE input VCF (may repeat)" << endl << " -n, --rename V=F match contig V in the VCFs to contig F in the FASTAs (may repeat)" << endl - << " -a, --alt-paths save paths for alts of variants by variant ID" << endl + << " -a, --alt-paths save paths for alts of variants by SHA1 hash" << endl + << " -A, --alt-paths-plain save paths for alts of variants by variant ID (if possible, otherwise SHA1)" << endl << " -R, --region REGION specify a VCF contig name or 1-based inclusive region (may repeat, if on different contigs)" << endl << " -C, --region-is-chrom don't attempt to parse the regions (use when the reference" << endl << " sequence name could be inadvertently parsed as a region)" << endl @@ -87,6 +88,7 @@ int main_construct(int argc, char** argv) { {"drop-msa-paths", no_argument, 0, 'd'}, {"rename", required_argument, 0, 'n'}, {"alt-paths", no_argument, 0, 'a'}, + {"alt-paths-plain", no_argument, 0, 'A'}, {"handle-sv", no_argument, 0, 'S'}, {"insertions", required_argument, 0, 'I'}, {"progress", no_argument, 0, 'p'}, @@ -103,7 +105,7 @@ int main_construct(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "v:r:n:ph?z:t:R:m:aCfl:SI:M:dF:iN", + c = getopt_long (argc, argv, "v:r:n:ph?z:t:R:m:aACfl:SI:M:dF:iN", long_options, &option_index); /* Detect the end of the options. */ @@ -170,6 +172,11 @@ int main_construct(int argc, char** argv) { constructor.alt_paths = true; break; + case 'A': + constructor.alt_paths = true; + constructor.sha1_variant_name = false; + break; + case 'p': show_progress = true; break; From 79fbc66f1c15f598ae238d1420b57a6dc4fe2bb5 Mon Sep 17 00:00:00 2001 From: Luca Denti Date: Tue, 19 Sep 2023 09:07:00 +0200 Subject: [PATCH 2/2] Update construct help message (ID uniqueness) --- src/subcommand/construct_main.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/subcommand/construct_main.cpp b/src/subcommand/construct_main.cpp index d41e148affe..53ea4a20fd8 100644 --- a/src/subcommand/construct_main.cpp +++ b/src/subcommand/construct_main.cpp @@ -26,7 +26,8 @@ void help_construct(char** argv) { << " -v, --vcf FILE input VCF (may repeat)" << endl << " -n, --rename V=F match contig V in the VCFs to contig F in the FASTAs (may repeat)" << endl << " -a, --alt-paths save paths for alts of variants by SHA1 hash" << endl - << " -A, --alt-paths-plain save paths for alts of variants by variant ID (if possible, otherwise SHA1)" << endl + << " -A, --alt-paths-plain save paths for alts of variants by variant ID if possible, otherwise SHA1" << endl + << " (IDs must be unique across all input VCFs)" << endl << " -R, --region REGION specify a VCF contig name or 1-based inclusive region (may repeat, if on different contigs)" << endl << " -C, --region-is-chrom don't attempt to parse the regions (use when the reference" << endl << " sequence name could be inadvertently parsed as a region)" << endl