From 488d7cb0355f0550b33f2f87a296a7814a6df044 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 15 Nov 2024 10:43:43 -0500 Subject: [PATCH] Fix multiallelic genotypes for biallelic CNVs (#748) * Initial commit * Modified dockstore to track cleanvcf5 * Minor change to trigger sync * Removed redundant dockstore module * Modifid +setGT call to use accurate GT expression * Modified to use sample-specific genotype filtering * Documented bug in cleanvcfpart5 py script * Removed dockstore sync --- .github/.dockstore.yml | 2 +- .../scripts/clean_vcf_part5_update_records.py | 2 +- wdl/CleanVcf5.wdl | 12 ++++++++++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/.dockstore.yml b/.github/.dockstore.yml index 259be5b57..035d1cc84 100644 --- a/.github/.dockstore.yml +++ b/.github/.dockstore.yml @@ -214,4 +214,4 @@ workflows: branches: - main tags: - - /.*/ + - /.*/ \ No newline at end of file diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py index 51675b5ab..a5bf42982 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py @@ -136,7 +136,7 @@ def main(): if gt5kb_dup: for sample_obj in record.samples.itervalues(): - # Leave no-calls + # Leave no-calls - also causes bug that skips multiallelic genotypes for a biallelic variant if sample_obj['GT'] == (None, None): continue if not sample_obj['GQ'] is None and \ diff --git a/wdl/CleanVcf5.wdl b/wdl/CleanVcf5.wdl index 085aaa5e5..f9be79029 100644 --- a/wdl/CleanVcf5.wdl +++ b/wdl/CleanVcf5.wdl @@ -247,8 +247,16 @@ task Polish { --exclude 'ID=@ids_to_remove.list' \ --output-type z -o polished.need_reheader.vcf.gz --threads ~{threads} + # replace multiallelic genotypes for CNVs with homref + bcftools +setGT polished.need_reheader.vcf.gz -- \ + -t q \ + -n c:'1/1' \ + -i '(INFO/SVTYPE="DEL" | INFO/SVTYPE="DUP") & (FMT/GT~"[2-9]" | FMT/GT~"[1-9][0-9]+") & FMT/RD_CN>3' > polished.need_reheader.regenotyped.vcf + + bgzip polished.need_reheader.regenotyped.vcf + # do the last bit of header cleanup - bcftools view -h polished.need_reheader.vcf.gz > original_header.vcf + bcftools view -h polished.need_reheader.regenotyped.vcf.gz > original_header.vcf cat original_header.vcf | fgrep '##fileformat' > new_header.vcf cat original_header.vcf \ | egrep -v "CIPOS|CIEND|RMSSTD|EVENT|INFO=> new_header.vcf cat original_header.vcf | fgrep '#CHROM' >> new_header.vcf - bcftools reheader polished.need_reheader.vcf.gz -h new_header.vcf -o ~{prefix}.vcf.gz + bcftools reheader polished.need_reheader.regenotyped.vcf.gz -h new_header.vcf -o ~{prefix}.vcf.gz >>> output {