diff --git a/preprocessor/preprocessor/utilities.py b/preprocessor/preprocessor/utilities.py index 9160b3c4..1a9af114 100644 --- a/preprocessor/preprocessor/utilities.py +++ b/preprocessor/preprocessor/utilities.py @@ -320,9 +320,19 @@ def _check_for_gvcf(in_f) -> bool: else: line = line.rstrip('\n') fields: List[str] = line.split('\t') - # check whether input is a block gVCF - if re.search('END', fields[7]): - return True + v_len: int = abs(len(fields[4]) - len(fields[3])) + v_start: int = int(fields[1]) + end_in_info = re.search(r"[;|]?END=(\d+)", fields[7]) + if end_in_info: + end_pos: int = int(end_in_info.group(1)) + # calculate the length of the variant block indicated by the END annotation in the INFO column + end_block = end_pos - v_start + if end_block > v_len: + return True + elif end_block == v_len: # not a gVCF if an END block simply annotates the variant length + return False + else: # when end_len < v_len, it is unclear what END annotation stands for + return False return False diff --git a/preprocessor/tests/test_gvcf_block.vcf b/preprocessor/tests/test_gvcf_block.vcf new file mode 100644 index 00000000..8315a3f7 --- /dev/null +++ b/preprocessor/tests/test_gvcf_block.vcf @@ -0,0 +1,8 @@ +##fileformat=VCFv4.1 +##source=PharmCAT allele definitions +##reference=hg38 +##contig= +##FILTER= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample_1 +chr1 123456 . G A . PASS END=223456 GT 0|0 diff --git a/preprocessor/tests/test_not_gvcf_block.vcf b/preprocessor/tests/test_not_gvcf_block.vcf new file mode 100644 index 00000000..8f439114 --- /dev/null +++ b/preprocessor/tests/test_not_gvcf_block.vcf @@ -0,0 +1,9 @@ +##fileformat=VCFv4.1 +##source=PharmCAT allele definitions +##reference=hg38 +##contig= +##FILTER= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample_1 +chr1 123456 . A T . PASS P|ABEND5|ABEND5 GT 0|1 +chr1 123456 . GGT G . PASS END=123458;annotation_of_variant_ending_position GT 0|1 diff --git a/preprocessor/tests/test_utilities.py b/preprocessor/tests/test_utilities.py index 4d171219..e1d1d132 100644 --- a/preprocessor/tests/test_utilities.py +++ b/preprocessor/tests/test_utilities.py @@ -160,8 +160,14 @@ def test_is_vcf_file(): def test_is_gvcf_file(): test_vcf_file = helpers.test_dir / 'raw.vcf.bgz' test_gvcf_file = helpers.test_dir / 'test.g.vcf.bgz' + test_vcf_end_file = helpers.test_dir / 'test_not_gvcf_block.vcf' + test_gvcf_end_file = helpers.test_dir / 'test_gvcf_block.vcf' with tempfile.TemporaryDirectory() as td: assert not utils.is_gvcf_file(test_vcf_file) + # is gVCF as the END annotation indicates a large genomic block + assert utils.is_gvcf_file(test_gvcf_end_file) + # is not gVCF as the END annotation simply indicates the variant length + assert not utils.is_gvcf_file(test_vcf_end_file) tmp_dir = Path(td) f1 = tmp_dir / 'test.g.vcf.bgz'