Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow GBZ files to contain reference paths without haplotype numbers #4111

Merged
merged 11 commits into from
Oct 5, 2023
10 changes: 4 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -92,12 +92,6 @@ LD_STATIC_LIB_FLAGS += $(shell $(PKG_CONFIG) --libs --static $(PKG_CONFIG_STATIC
# So we make a list of prefixes to search for it.
OMP_PREFIXES:=/

# Travis needs -latomic for all builds *but* GCC on Mac
ifeq ($(strip $(shell $(CXX) -latomic /dev/null -o/dev/null 2>&1 | grep latomic | wc -l)), 0)
# Use -latomic if the compiler doesn't complain about it
LD_LIB_FLAGS += -latomic
endif

COMPILER_ID=$(strip $(shell $(CXX) --version 2>&1))
ifeq ($(shell uname -s),Darwin)
$(info OS is Mac)
Expand Down Expand Up @@ -239,6 +233,10 @@ else
# We want to link against the elfutils libraries
LD_LIB_FLAGS += -ldwfl -ldw -ldwelf -lelf -lebl

# We want to link against libatomic which the GNU C++ standard library needs.
# See <https://github.com/nodejs/node/issues/30093> and <https://stackoverflow.com/q/30591313>
LD_LIB_FLAGS += -latomic

# We get OpenMP the normal way, using whatever the compiler knows about
CXXFLAGS += -fopenmp

Expand Down
2 changes: 1 addition & 1 deletion deps/vcflib
Submodule vcflib updated 1 files
+2 −2 src/Variant.h
2 changes: 1 addition & 1 deletion doc/wiki
Submodule wiki updated from acd5d1 to f70ea3
26 changes: 6 additions & 20 deletions src/subcommand/convert_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -609,16 +609,9 @@ void graph_to_xg_adjusting_paths(const PathHandleGraph* input, xg::XG* output, c
// Compose the new reference-ified metadata
std::string sample = input->get_sample_name(path);
std::string locus = input->get_locus_name(path);
int64_t haplotype;
if (sample_to_haplotypes[sample].size() > 1) {
// We should preserve the haplotype because we have multiple
// haplotype phases of this sample.
haplotype = input->get_haplotype(path);
} else {
// We should drop the haplotype number because this sample has only
// one haplotype phase.
haplotype = PathMetadata::NO_HAPLOTYPE;
}
// We should always preserve the haplotype phase number; we
// will need it if we ever want to go back to haplotype sense.
int64_t haplotype = input->get_haplotype(path);
auto subrange = input->get_subrange(path);

// Make a new name with reference-ified metadata.
Expand Down Expand Up @@ -668,16 +661,9 @@ void add_and_adjust_paths(const PathHandleGraph* input, MutablePathHandleGraph*
// Compose the new reference-ified metadata
std::string sample = input->get_sample_name(path);
std::string locus = input->get_locus_name(path);
int64_t haplotype;
if (sample_to_haplotypes[sample].size() > 1) {
// We should preserve the haplotype because we have multiple
// haplotype phases of this sample.
haplotype = input->get_haplotype(path);
} else {
// We should drop the haplotype number because this sample has only
// one haplotype phase.
haplotype = PathMetadata::NO_HAPLOTYPE;
}
// We should always preserve the haplotype phase number; we
// will need it if we ever want to go back to haplotype sense.
int64_t haplotype = input->get_haplotype(path);
auto subrange = input->get_subrange(path);
bool is_circular = input->get_is_circular(path);

Expand Down
4 changes: 2 additions & 2 deletions test/t/07_vg_map.t
Original file line number Diff line number Diff line change
Expand Up @@ -182,13 +182,13 @@ rm -f x.vg.idx x.vg.gcsa x.vg.gcsa.lcp x.vg x.xg x.gcsa x.gcsa.lcp x.gbwt graphs

vg construct -r tiny/tiny.fa -v tiny/tiny.vcf.gz >tiny.vg
vg index -k 16 -x tiny.xg -g tiny.gcsa tiny.vg
is $(vg map -d tiny -f tiny/tiny.fa -j | jq -r .identity) 1 "mapper can read FASTA input"
is $(vg map -d tiny -f tiny/tiny.fa -j | jq -r .identity | head -c1) 1 "mapper can read FASTA input"
cat <<EOF >t.fa
>x
CAAATAAGGCTTGGAAATTTTCTGGA
GTTCTATTATATTCCAACTCTCTG
EOF
is $(vg map -d tiny -F t.fa -j | jq -r .identity) 1 "mapper can read multiline FASTA input"
is $(vg map -d tiny -F t.fa -j | jq -r .identity | head -c1) 1 "mapper can read multiline FASTA input"

rm -f tiny.vg tiny.xg tiny.gcsa tiny.gcsa.lcp t.fa t.fa.fai

Expand Down
6 changes: 3 additions & 3 deletions test/t/18_vg_call.t
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ vg call HGSVC_alts.xg -k HGSVC_alts.pack -v call/HGSVC_chr22_17200000_17800000.v
gzip -dc call/HGSVC_chr22_17200000_17800000.vcf.gz | grep -v '#' | awk '{print $10}' | awk -F ':' '{print $1}' > baseline_gts.txt
# extract the called genotypes
grep -v '#' HGSVC.vcf | sort -k1,1d -k2,2n | awk '{print $10}' | awk -F ':' '{print $1}' | sed 's/\//\|/g' > gts.txt
DIFF_COUNT=$(diff -y --suppress-common-lines baseline_gts.txt gts.txt | grep '^' | wc -l)
DIFF_COUNT=$(diff -U1000 <(nl baseline_gts.txt) <(nl gts.txt) | tail -n +4 | grep '^+' | wc -l)
LESS_EIGHT=$(if (( $DIFF_COUNT < 8 )); then echo 1; else echo 0; fi)
is "${LESS_EIGHT}" "1" "Fewer than 8 differences between called and true SV genotypes"

Expand All @@ -76,7 +76,7 @@ vg call HGSVC_alts.xg -k HGSVC_alts.pack -v call/HGSVC_chr22_17200000_17800000.v
gzip -dc call/HGSVC_chr22_17200000_17800000.vcf.gz | grep -v '#' | awk '{print $10}' | awk -F ':' '{print $1}' | awk -F '|' '{print $1}' > baseline_gts1.txt
# extract the called genotypes
grep -v '#' HGSVC1.vcf | sort -k1,1d -k2,2n | awk '{print $10}' | awk -F ':' '{print $1}' | sed 's/\//\|/g' > gts1.txt
DIFF_COUNT=$(diff -y --suppress-common-lines baseline_gts1.txt gts1.txt | grep '^' | wc -l)
DIFF_COUNT=$(diff -U1000 <(nl baseline_gts1.txt) <(nl gts1.txt) | tail -n +4 | grep '^+' | wc -l)
LESS_EIGHT=$(if (( $DIFF_COUNT <= 8 )); then echo 1; else echo 0; fi)
is "${LESS_EIGHT}" "1" "Fewer than 8 differences between called haploid and truncated true SV genotypes"

Expand All @@ -100,7 +100,7 @@ is "$?" "0" "Calling from extracted traversals by way of GBWT produces same geno

grep -v '#' HGSVC_travs.vcf | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > calls-travs.txt
grep -v '#' HGSVC_direct.vcf | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > calls-direct.txt
DIFF_COUNT=$(diff -y --suppress-common-lines calls-travs.txt calls-direct.txt | grep '^' | wc -l)
DIFF_COUNT=$(diff -U1000 <(nl calls-travs.txt) <(nl calls-direct.txt) | tail -n +4 | grep '^+' | wc -l)
LESS_THREE=$(if (( $DIFF_COUNT < 3 )); then echo 1; else echo 0; fi)
# because call makes an attempt to call multiple snarls at once when outputting traversals (to make bigger traversals)
# there is some wobble here
Expand Down
15 changes: 12 additions & 3 deletions test/t/37_vg_gbwt.t
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap

PATH=../bin:$PATH # for vg

plan tests 148
plan tests 149


# Build vg graphs for two chromosomes
Expand Down Expand Up @@ -393,9 +393,18 @@ is $? 0 "GBZ GBWT tag extraction works"
is "$(grep reference_samples tags.tsv | cut -f2 | tr ' ' '\\n' | sort | tr '\\n' ' ')" "GRCh37 GRCh38" "GBWT tags contain the correct reference samples"
vg gbwt -g gfa2.gbz --gbz-format -Z gfa.gbz --set-tag "reference_samples=GRCh38 CHM13"
is $? 0 "GBZ GBWT tag modification works"
is "$(vg paths -M -S GRCh37 -x gfa2.gbz | grep -v "^#" | grep HAPLOTYPE | wc -l)" "1" "Changing reference_samples tag can make a reference a haplotype"
is "$(vg paths -M -S CHM13 -x gfa2.gbz | grep -v "^#" | grep REFERENCE | wc -l)" "1" "Changing reference_samples tag can make a haplotype a reference"
is "$(vg paths -M -S GRCh37 -x gfa2.gbz | grep -v "^#" | cut -f2 | grep HAPLOTYPE | wc -l)" "1" "Changing reference_samples tag can make a reference a haplotype"
is "$(vg paths -M -S CHM13 -x gfa2.gbz | grep -v "^#" | cut -f2 | grep REFERENCE | wc -l)" "1" "Changing reference_samples tag can make a haplotype a reference"
vg gbwt -g gfa2.gbz --gbz-format -Z gfa.gbz --set-tag "reference_samples=GRCh38#1 CHM13" 2>/dev/null
is $? 1 "GBZ GBWT tag modification validation works"

rm -f gfa.gbz gfa2.gbz tags.tsv

# Build a GBZ from a graph with a reference but no haplotype phase number
# TODO: When <https://github.com/vgteam/vg/issues/4110> is fixed, actually parse this as GFA
vg gbwt -g gfa.gbz --gbz-format -E -x graphs/gfa_two_part_reference.gfa
is "$(vg paths -M --reference-paths -x gfa.gbz | grep -v "^#" | cut -f4 | grep NO_HAPLOTYPE | wc -l)" "2" "GBZ can represent reference paths without haplotype numbers"

rm -f gfa.gbz


2 changes: 1 addition & 1 deletion test/t/48_vg_convert.t
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ vg convert x.vg -F mut.gaf -t 1 > mut-back.gam
vg view -a mut.gam | jq .path > mut.path
vg view -a mut-back.gam | jq .path > mut-back.path
# Json comparison that is not order dependent: https://stackoverflow.com/a/31933234
is $(jq --argfile a mut.path --argfile b mut-back.path -n 'def post_recurse(f): def r: (f | select(. != null) | r), .; r; def post_recurse: post_recurse(.[]?); ($a | (post_recurse | arrays) |= sort) as $a | ($b | (post_recurse | arrays) |= sort) as $b | $a == $b') true "vg convert gam -> gaf -> gam produces same gam Paths with snps and indels"
is $(jq --slurpfile a mut.path --slurpfile b mut-back.path -n 'def post_recurse(f): def r: (f | select(. != null) | r), .; r; def post_recurse: post_recurse(.[]?); ($a | (post_recurse | arrays) |= sort) as $a | ($b | (post_recurse | arrays) |= sort) as $b | $a == $b') true "vg convert gam -> gaf -> gam produces same gam Paths with snps and indels"

vg view -a mut.gam | jq .sequence > mut.seq
vg view -a mut-back.gam | jq .sequence > mut-back.seq
Expand Down
Loading