Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix vcf-merge when using COLLAPSE_NONE #1624

Open
wants to merge 16 commits into
base: develop
Choose a base branch
from
9 changes: 9 additions & 0 deletions test/merge.10.a.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
##fileformat=VCFv4.1
##contig=<ID=1,assembly=b37,length=249250621>
##reference=file:///ref.fa
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotypes">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A
1 10 . C G . . AN=4;AC=2 GT 0/1
1 12 . T A . . AN=4;AC=2 GT 0/1
8 changes: 8 additions & 0 deletions test/merge.10.b.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
##fileformat=VCFv4.1
##contig=<ID=1,assembly=b37,length=249250621>
##reference=file:///ref.fa
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotypes">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT B
1 10 . CGT CGA,AGT . . AN=4;AC=1 GT 1/2
11 changes: 11 additions & 0 deletions test/merge.10.none.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
##fileformat=VCFv4.1
##FILTER=<ID=PASS,Description="All filters passed">
##contig=<ID=1,assembly=b37,length=249250621>
##reference=file:///ref.fa
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotypes">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B
1 10 . C G . . AN=2;AC=1 GT 0/1 ./.
1 10 . CGT CGA,AGT . . AN=2;AC=1,1 GT ./. 1/2
1 12 . T A . . AN=2;AC=1 GT 0/1 ./.
1 change: 1 addition & 0 deletions test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@
test_vcf_merge($opts,in=>['merge.8.a','merge.8.b'],out=>'merge.8.out',args=>'-i AN:sum,AC:sum');
test_vcf_merge($opts,in=>['merge.9.a','merge.9.b'],out=>'merge.9.1.out',args=>'');
test_vcf_merge($opts,in=>['merge.9.a','merge.9.b'],out=>'merge.9.2.out',args=>'-i AN:sum,AC:sum');
test_vcf_merge($opts,in=>['merge.10.a','merge.10.b'],out=>'merge.10.none.out',args=>'-m none');
# test_vcf_merge_big($opts,in=>'merge_big.1',out=>'merge_big.1.1',nsmpl=>79000,nfiles=>79,nalts=>486,args=>''); # commented out for speed
test_vcf_query($opts,in=>'query.string',out=>'query.string.1.out',args=>q[-f '%CHROM\\t%POS\\t%CLNREVSTAT\\n' -i'CLNREVSTAT="criteria_provided,_conflicting_interpretations"']);
test_vcf_query($opts,in=>'query.string',out=>'query.string.1.out',args=>q[-f '%CHROM\\t%POS\\t%CLNREVSTAT\\n' -i'CLNREVSTAT="criteria_provided" || CLNREVSTAT="_conflicting_interpretations"']);
Expand Down
12 changes: 8 additions & 4 deletions vcfmerge.c
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ typedef struct
maux_t *maux;
regidx_t *regs; // apply regions only after the blocks are expanded
regitr_t *regs_itr;
int header_only, collapse, output_type, force_samples, merge_by_id, do_gvcf, filter_logic, missing_to_ref, no_index;
int header_only, collapse, output_type, force_samples, merge_by_id, do_gvcf, filter_logic, missing_to_ref, no_index, non_normalize_alleles;
char *header_fname, *output_fname, *regions_list, *info_rules, *file_list;
faidx_t *gvcf_fai;
info_rule_t *rules;
Expand Down Expand Up @@ -973,7 +973,7 @@ void merge_chrom2qual(args_t *args, bcf1_t *out)
for (i=0; i<ma->nals; i++)
if ( i==0 || al_idxs[i] ) ma->out_als[k++] = strdup(ma->als[i]);
assert( k==ma->nout_als );
normalize_alleles(ma->out_als, ma->nout_als);
if (args->non_normalize_alleles != 1) normalize_alleles(ma->out_als, ma->nout_als);
bcf_update_alleles(out_hdr, out, (const char**) ma->out_als, ma->nout_als);
free(al_idxs);
for (i=0; i<ma->nout_als; i++) free(ma->out_als[i]);
Expand Down Expand Up @@ -2799,7 +2799,7 @@ int can_merge(args_t *args)
// All alleles of the tested record must be present in the
// selected maux record plus variant types must be the same
if ( (maux->var_types & line_type) != line_type ) continue;
if ( vcmp_set_ref(args->vcmp,maux->als[0],line->d.allele[0]) < 0 ) continue; // refs not compatible
if ( vcmp_set_ref(args->vcmp,maux->als[0],line->d.allele[0]) <= 0 ) continue; // refs not perfect match
for (k=1; k<line->n_allele; k++)
{
if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,line->d.allele[k])>=0 ) break;
Expand Down Expand Up @@ -3134,6 +3134,7 @@ static void usage(void)
fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
fprintf(stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
fprintf(stderr, " -N, --non_normalize_alleles Do not normalize_alleles\n");
fprintf(stderr, "\n");
exit(1);
}
Expand All @@ -3150,6 +3151,7 @@ int main_vcfmerge(int argc, char *argv[])
args->record_cmd_line = 1;
args->collapse = COLLAPSE_BOTH;
args->clevel = -1;
args->non_normalize_alleles = 0;
int regions_is_file = 0;
int regions_overlap = 1;

Expand All @@ -3175,11 +3177,13 @@ int main_vcfmerge(int argc, char *argv[])
{"no-version",no_argument,NULL,8},
{"no-index",no_argument,NULL,10},
{"filter-logic",required_argument,NULL,'F'},
{"non_normalize_alleles",no_argument,NULL,'N'},
{NULL,0,NULL,0}
};
char *tmp;
while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0L:",loptions,NULL)) >= 0) {
while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0L:N",loptions,NULL)) >= 0) {
switch (c) {
case 'N': args->non_normalize_alleles = 1; break;
case 'L':
args->local_alleles = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --local-alleles %s\n", optarg);
Expand Down
25 changes: 13 additions & 12 deletions vcmp.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ int vcmp_set_ref(vcmp_t *vcmp, char *ref1, char *ref2)

char *a = ref1, *b = ref2;
while ( *a && *b && toupper(*a)==toupper(*b) ) { a++; b++; }
if ( !*a && !*b ) return 0;
if ( !*a && !*b ) return 1; // perfect match
if ( *a && *b ) return -1; // refs not compatible

int i;
Expand All @@ -70,18 +70,19 @@ int vcmp_set_ref(vcmp_t *vcmp, char *ref1, char *ref2)
hts_expand(char,vcmp->ndref+1,vcmp->mdref,vcmp->dref);
for (i=0; i<vcmp->ndref; i++) vcmp->dref[i] = toupper(ref1[vcmp->nmatch+i]);
vcmp->dref[vcmp->ndref] = 0;
return 0;
return 0; // compatible
} else if ( *b ) { // ref2 is longer
vcmp->nmatch = a-ref1;
while ( *b ) b++;
vcmp->ndref = (b-ref2) - vcmp->nmatch;
hts_expand(char,vcmp->ndref+1,vcmp->mdref,vcmp->dref);
for (i=0; i<vcmp->ndref; i++) vcmp->dref[i] = toupper(ref2[vcmp->nmatch+i]);
vcmp->dref[vcmp->ndref] = 0;
vcmp->ndref *= -1;
return 0; // compatible
} else {
return -1; // should not happen
}

// ref2 is longer
vcmp->nmatch = a-ref1;
while ( *b ) b++;
vcmp->ndref = (b-ref2) - vcmp->nmatch;
hts_expand(char,vcmp->ndref+1,vcmp->mdref,vcmp->dref);
for (i=0; i<vcmp->ndref; i++) vcmp->dref[i] = toupper(ref2[vcmp->nmatch+i]);
vcmp->dref[vcmp->ndref] = 0;
vcmp->ndref *= -1;
return 0;
}

int vcmp_find_allele(vcmp_t *vcmp, char **als1, int nals1, char *al2)
Expand Down