From 941c0ac3ed95f4afe2b68920ed72e4aeef9fd61f Mon Sep 17 00:00:00 2001 From: Mike Lin Date: Tue, 8 Oct 2019 09:33:40 -1000 Subject: [PATCH] improve handling of DeepVariant non-called records Compared to other callers, DeepVariant generates more non-called (./.) gVCF records to express uncertainty; including both reference bands and variant records. In 497277a5 we made sure to preserve the non-called status for reference bands, albeit in a heavy-handed way that censored all the QC fields (RNC=PartialData). Here we introduce a new RNC InputNonCalled, used for both reference bands and variant records when applicable, which preserves the associated QC fields in the pVCF. --- include/genotyper.h | 3 +- src/genotyper.cc | 7 +- src/service.cc | 2 +- test/data/gvcf_test_cases/DP0_noAD.yml | 38 ++++---- test/data/gvcf_test_cases/deepvariant.yml | 45 ++++++++- test/data/gvcf_test_cases/deepvariant2.yml | 8 +- .../dv_1000G_chr21_5583275.yml | 95 +++++++++++++++++++ test/gvcf_test_cases.cc | 20 ++-- 8 files changed, 182 insertions(+), 36 deletions(-) create mode 100644 test/data/gvcf_test_cases/dv_1000G_chr21_5583275.yml diff --git a/include/genotyper.h b/include/genotyper.h index 7e4f7418..a0ffd89b 100644 --- a/include/genotyper.h +++ b/include/genotyper.h @@ -34,7 +34,8 @@ enum class NoCallReason { LostAllele, /// unrepresentable allele (other than overlapping deletion) UnphasedVariants, /// site spans multiple unphased variants OverlappingVariants, /// site spans multiple variants which overlap each other - MonoallelicSite /// site is monoallelic; no assertion about the presence of either ref or alt allele + MonoallelicSite, /// site is monoallelic; no assertion about the presence of either ref or alt allele + InputNonCalled, /// the relevant input gVCF record is itself non-called }; /// A single allele call and metadata; diploid samples each have two calls diff --git a/src/genotyper.cc b/src/genotyper.cc index 8429750f..ba08568a 100644 --- a/src/genotyper.cc +++ b/src/genotyper.cc @@ -306,7 +306,7 @@ Status prepare_dataset_records(const genotyper_config& cfg, const unified_site& ref_records, depth, min_ref_depth)); // ex post facto check for reference confidence records whose GT is other - // than 0/0 (probably ./.), which we'll translate to PartialData non-calls + // than 0/0 (probably ./.), which we'll translate to InputNonCalled // We exclude 'haploid' records from this treatment for now as observed // examples (e.g. in Strelka2 gVCFs) don't seem to require it, but this // may need to be configurable in the future. @@ -314,7 +314,7 @@ Status prepare_dataset_records(const genotyper_config& cfg, const unified_site& if (rp->is_ref && !rp->was_haploid) { for (unsigned i = 0; i < 2*rp->p->n_sample; i++) { if (bcf_gt_is_missing(rp->gt[i]) || bcf_gt_allele(rp->gt[i]) != 0) { - rnc = NoCallReason::PartialData; + rnc = NoCallReason::InputNonCalled; return Status::OK(); } } @@ -499,6 +499,7 @@ static Status translate_genotypes(const genotyper_config& cfg, const unified_sit // TODO: are depth and allele_mapping checks inside-out???? #define fill_allele(rec,depth,in_ofs,out_ofs) \ assert(rec); \ + genotypes[2*ij.second+(out_ofs)].RNC = NoCallReason::InputNonCalled; \ if (rec->gt[2*ij.first+in_ofs] != bcf_int32_vector_end && \ !bcf_gt_is_missing(rec->gt[2*ij.first+(in_ofs)])) { \ auto al = bcf_gt_allele(rec->gt[2*ij.first+(in_ofs)]); \ @@ -620,6 +621,7 @@ static Status translate_monoallelic(const genotyper_config& cfg, const unified_s assert(ij.second < min_ref_depth.size()); #define fill_monoallelic(ofs) \ + genotypes[2*ij.second+(ofs)].RNC = NoCallReason::InputNonCalled; \ if (record->gt[2*ij.first+ofs] != bcf_int32_vector_end && \ !bcf_gt_is_missing(record->gt[2*ij.first+(ofs)])) { \ auto al = bcf_gt_allele(record->gt[2*ij.first+(ofs)]); \ @@ -926,6 +928,7 @@ Status genotype_site(const genotyper_config& cfg, MetadataCache& cache, BCFData& RNC_CASE(UnphasedVariants,"U") RNC_CASE(OverlappingVariants,"O") RNC_CASE(MonoallelicSite,"1") + RNC_CASE(InputNonCalled, "I") default: assert(c.RNC == NoCallReason::MissingData); } diff --git a/src/service.cc b/src/service.cc index 72f39e7e..31d22a7a 100644 --- a/src/service.cc +++ b/src/service.cc @@ -199,7 +199,7 @@ static Status prepare_bcf_header(const vector >& contigs, hdr_lines.push_back("##INFO="); hdr_lines.push_back("##FILTER="); hdr_lines.push_back("##FORMAT="); - hdr_lines.push_back("##FORMAT="); + hdr_lines.push_back("##FORMAT="); for (auto& format_field : format_fields) { hdr_lines.push_back(format_field.description); } diff --git a/test/data/gvcf_test_cases/DP0_noAD.yml b/test/data/gvcf_test_cases/DP0_noAD.yml index e74671ba..794a95ec 100644 --- a/test/data/gvcf_test_cases/DP0_noAD.yml +++ b/test/data/gvcf_test_cases/DP0_noAD.yml @@ -31,24 +31,28 @@ input: A 1000 . C CG, 1.38 . DP=0 GT:GQ:PL:SB 1/1:3:35,3,0,35,3,35:0,0,0,0 A 1001 . G . . END=2000 GT:DP:GQ:MIN_DP:PL 0/0:2:0:1:0,0,0 +config_preset: gatk_unfiltered + genotyper_config: + required_dp: 0 + revise_genotypes: false liftover_fields: - - orig_names: [MIN_DP, DP] - name: DP - description: '##FORMAT=' - type: int - combi_method: min - number: basic - count: 1 - ignore_non_variants: true - - orig_names: [AD] - name: AD - description: '##FORMAT=' - type: int - number: alleles - combi_method: min - default_type: zero - count: 0 + - orig_names: [MIN_DP, DP] + name: DP + description: '##FORMAT=' + type: int + combi_method: min + number: basic + count: 1 + ignore_non_variants: true + - orig_names: [AD] + name: AD + description: '##FORMAT=' + type: int + number: alleles + combi_method: min + default_type: zero + count: 0 truth_unified_sites: - range: {ref: A, beg: 1000, end: 1000} @@ -69,4 +73,4 @@ truth_output_vcf: ##FORMAT= ##contig= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B - A 1000 . C CG 35 . . GT:DP:AD:RNC ./.:.:.:MM 1/1:.:0,0:.. + A 1000 A_1000_C_CG C CG 35 . AF=0.5;AQ=35 GT:DP:AD:RNC ./.:.:0,0:II 1/1:.:0,0:.. diff --git a/test/data/gvcf_test_cases/deepvariant.yml b/test/data/gvcf_test_cases/deepvariant.yml index 38b0be3f..e123a127 100644 --- a/test/data/gvcf_test_cases/deepvariant.yml +++ b/test/data/gvcf_test_cases/deepvariant.yml @@ -84,7 +84,48 @@ input: chr21 16188965 . TAC T,<*> 5.5 PASS . GT:GQ:DP:AD:VAF:PL 0/1:3:32:20,12,0:0.375,0:2,3,0,990,990,990 chr21 16188967 . CATAT TAT,<*> 5.5 PASS . GT:GQ:DP:AD:VAF:PL 0/1:3:32:20,12,0:0.375,0:2,3,0,990,990,990 -config_preset: DeepVariant +unifier_config: + min_AQ1: 0 + min_AQ2: 0 + min_GQ: 0 + monoallelic_sites_for_lost_alleles: true + +genotyper_config: + required_dp: 0 + revise_genotypes: false + liftover_fields: + - orig_names: [MIN_DP, DP] + name: DP + description: '##FORMAT=' + type: int + combi_method: min + number: basic + count: 1 + ignore_non_variants: true + - orig_names: [AD] + name: AD + description: '##FORMAT=' + type: int + number: alleles + combi_method: min + default_type: zero + count: 0 + - orig_names: [GQ] + name: GQ + description: '##FORMAT=' + type: int + number: basic + combi_method: min + count: 1 + ignore_non_variants: true + - orig_names: [PL] + name: PL + description: '##FORMAT=' + type: int + number: genotype + combi_method: missing + count: 0 + ignore_non_variants: true truth_unified_sites: - range: {ref: chr21, beg: 9412193, end: 9412193} @@ -197,7 +238,7 @@ truth_output_vcf: chr21 9412435 chr21_9412435_T_A T A 23 . AF=0.25;AQ=23 GT:GQ:PL:DP:AD:RNC 0/1:24:23,0,37:44:22,22:.. 0/0:14:0,13,42:103:89,14:.. chr21 9412441 chr21_9412441_TATA_T TATA T 31 . AF=0.5;AQ=31 GT:GQ:PL:DP:AD:RNC 0/1:26:31,0,27:47:25,22:.. 0/1:27:27,0,46:98:69,29:.. chr21 9418260 chr21_9418260_A_T;chr21_9418260_A_C A T,C 43 . AF=0.75,0.25;AQ=43,42 GT:GQ:PL:DP:AD:RNC 1/1:35:36,40,0,.,.,.:91:0,91,0:.. 1/2:38:45,48,42,48,0,43:135:0,106,29:.. - chr21 11000381 chr21_11000381_AG_A;chr21_11000382_G_A AG A,AA 2 . AF=0.5,0.5;AQ=2,1 GT:GQ:PL:DP:AD:RNC ./.:.:.:.:.:PP ./.:.:.:20:.:OO + chr21 11000381 chr21_11000381_AG_A;chr21_11000382_G_A AG A,AA 2 . AF=0.5,0.5;AQ=2,1 GT:GQ:PL:DP:AD:RNC ./.:0:.:1:1,0,0:II ./.:.:.:20:.:OO chr21 16188963 chr21_16188963_CAT_C CAT C 2 . AF=0.5;AQ=2 GT:DP:AD:GQ:PL:RNC ./0:32:20,0:3:.:-. 1/1:32:20,12:3:2,3,0:.. chr21 16188965 chr21_16188965_TAC_T TAC T 2 MONOALLELIC AF=0.5;AQ=2 GT:DP:AD:GQ:PL:RNC ./.:32:.,12:.:.:OO ./.:32:.,0:3:.:11 chr21 16188967 chr21_16188967_C_G C G 2 . AF=0.5;AQ=2 GT:DP:AD:GQ:PL:RNC ./.:32:.:.:.:OO 1/1:32:20,12:3:2,3,0:.. diff --git a/test/data/gvcf_test_cases/deepvariant2.yml b/test/data/gvcf_test_cases/deepvariant2.yml index d638c91b..fc9f57ed 100644 --- a/test/data/gvcf_test_cases/deepvariant2.yml +++ b/test/data/gvcf_test_cases/deepvariant2.yml @@ -861,11 +861,7 @@ input: chr12 111766623 . A G,<*> 61.1 PASS . GT:GQ:DP:AD:VAF:PL 0/1:61:49:25,24,0:0.489796,0:61,0,72,990,990,990 chr12 111766624 . A <*> 0 . END=111767000 GT:GQ:MIN_DP:PL 0/0:48:39:0,117,1409 -unifier_config: - min_AQ1: 10 - min_AQ2: 10 - min_GQ: 0 - monoallelic_sites_for_lost_alleles: true +config_preset: DeepVariant genotyper_config: required_dp: 0 @@ -1039,7 +1035,7 @@ truth_output_vcf: chr12 111760217 chr12_111760217_C_G C G 65 . AF=0.25;AQ=65 GT:DP:AD:GQ:PL:RNC 0/0:23:23,0:50:0,114,1139:.. 0/0:25:25,0:50:0,99,989:.. 0/0:21:21,0:50:0,81,809:.. 1/1:30:0,30:60:65,61,0:.. chr12 111760567 chr12_111760567_ATATTAT_A ATATTAT A 47 . AF=0.125;AQ=47 GT:DP:AD:GQ:PL:RNC 0/0:22:22,0:48:0,75,749:.. 0/0:25:25,0:50:0,99,989:.. 0/0:28:28,0:50:0,111,1109:.. 0/1:32:12,20:48:47,0,59:.. chr12 111762346 chr12_111762346_T_C T C 77 . AF=0.5;AQ=77 GT:DP:AD:GQ:PL:RNC 0/0:18:18,0:50:0,84,839:.. 0/1:39:20,19:60:61,0,66:.. 0/1:32:14,18:57:58,0,61:.. 1/1:39:0,39:66:77,66,0:.. - chr12 111762598 chr12_111762598_CAA_C CAA C 32 . AF=0.125;AQ=32 GT:DP:AD:GQ:PL:RNC ./.:.:.:.:0,0,0:PP 0/0:17:11,0:40:0,0,0:.. 0/0:20:15,0:44:0,0,0:.. 0/1:28:18,8:32:32,0,39:.. + chr12 111762598 chr12_111762598_CAA_C CAA C 32 . AF=0.125;AQ=32 GT:DP:AD:GQ:PL:RNC ./.:19:19,0:0:0,0,0:II 0/0:17:11,0:40:0,0,0:.. 0/0:20:15,0:44:0,0,0:.. 0/1:28:18,8:32:32,0,39:.. chr12 111763383 chr12_111763383_C_CA C CA 34 . AF=0.125;AQ=34 GT:DP:AD:GQ:PL:RNC 0/0:34:34,0:42:0,42,899:.. 0/0:34:34,0:12:0,12,839:.. 0/0:32:32,0:50:0,135,1349:.. 0/1:29:10,18:33:34,0,37:.. chr12 111763759 chr12_111763759_CA_C CA C 56 . AF=0.25;AQ=56 GT:DP:AD:GQ:PL:RNC 0/0:31:31,0:48:0,102,1019:.. 0/0:44:44,0:42:0,42,1139:.. 0/0:32:32,0:50:0,135,1349:.. 1/1:29:0,27:40:56,39,0:.. chr12 111764982 chr12_111764982_TTGTGTGTG_T;chr12_111764982_TTGTG_T;chr12_111764982_TTG_T;chr12_111764982_T_TTG;chr12_111764982_TTGTGTG_T TTGTGTGTG T,TTGTG,TTGTGTG,TTGTGTGTGTG,TTG 42 . AF=0.25,0.125,0.125,0.125,0.125;AQ=42,42,41,38,38 GT:DP:AD:GQ:PL:RNC 0/0:19:19,0,0,0,0,0:30:0,30,539,30,539,539,30,539,539,539,30,539,539,539,539,30,539,539,539,539,539:.. 1/3:22:1,7,0,14,0,0:38:65,41,50,990,990,990,41,0,990,50,990,990,990,990,990,990,990,990,990,990,990:.. 4/5:32:3,0,0,0,18,8:34:59,990,990,990,990,990,990,990,990,990,38,990,990,990,51,38,990,990,990,0,39:.. 1/2:25:0,8,16,0,0,0:39:66,42,53,42,0,58,990,990,990,990,990,990,990,990,990,990,990,990,990,990,990:.. diff --git a/test/data/gvcf_test_cases/dv_1000G_chr21_5583275.yml b/test/data/gvcf_test_cases/dv_1000G_chr21_5583275.yml new file mode 100644 index 00000000..c6a5c3fc --- /dev/null +++ b/test/data/gvcf_test_cases/dv_1000G_chr21_5583275.yml @@ -0,0 +1,95 @@ +readme: | + 1000G chr21_5583275_G_T + + A site exercising corner behaviors with a low-quality allele, non-called reference bands, + and RefCall variant records. + +input: + header : |- + ##fileformat=VCFv4.2 + ##FILTER= + ##FILTER= + ##FILTER= + ##FORMAT= + ##FORMAT= + ##FORMAT= + ##FORMAT= + ##FORMAT= + ##FORMAT= + ##FORMAT= + ##INFO= + ##contig= + ##contig= + #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT + body: + - HG00096.gvcf: | + HG00096 + chr21 5583273 . G <*> 0 . END=5583274 GT:GQ:MIN_DP:PL 0/0:15:5:0,15,149 + chr21 5583275 . G T,<*> 1 RefCall . GT:GQ:DP:AD:VAF:PL ./.:7:5:2,3,0:0.6,0:0,12,7,990,990,990 + chr21 5583276 . T <*> 0 . END=5583283 GT:GQ:MIN_DP:PL 0/0:15:5:0,15,149 + + - HG00097.gvcf: | + HG00097 + chr21 5583273 . G <*> 0 . END=5583274 GT:GQ:MIN_DP:PL 0/0:9:3:0,9,89 + chr21 5583275 . G <*> 0 . END=5583275 GT:GQ:MIN_DP:PL ./.:0:3:20,0,50 + chr21 5583276 . T <*> 0 . END=5583280 GT:GQ:MIN_DP:PL 0/0:9:3:0,9,89 + + - HG00099.gvcf: | + HG00099 + chr21 5583256 . G <*> 0 . END=5583274 GT:GQ:MIN_DP:PL 0/0:18:6:0,18,179 + chr21 5583275 . G T,<*> 0.5 RefCall . GT:GQ:DP:AD:VAF:PL ./.:10:6:3,3,0:0.5,0:0,9,21,990,990,990 + chr21 5583276 . T <*> 0 . END=5583290 GT:GQ:MIN_DP:PL 0/0:18:6:0,18,179 + + - HG01377.gvcf: | + HG01377 + chr21 5583188 . C <*> 0 . END=5583274 GT:GQ:MIN_DP:PL 0/0:6:2:0,9,89 + chr21 5583275 . G T,<*> 10.7 PASS . GT:GQ:DP:AD:VAF:PL 1/1:9:3:1,2,0:0.666667,0:10,11,0,990,990,990 + chr21 5583276 . T <*> 0 . END=5583277 GT:GQ:MIN_DP:PL 0/0:9:3:0,9,89 + +config_preset: DeepVariant + +unifier_config: + min_AQ1: 10 + min_AQ2: 10 + min_GQ: 10 + monoallelic_sites_for_lost_alleles: true + +truth_discovered_alleles: +- range: {ref: chr21, beg: 5583275, end: 5583275} + dna: 'G' + is_ref: true + all_filtered: false + top_AQ: [0] + zygosity_by_GQ: [[0,0],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0]] +- range: {ref: chr21, beg: 5583275, end: 5583275} + dna: 'T' + is_ref: false + all_filtered: false + top_AQ: [10] + zygosity_by_GQ: [[0,1],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0]] + +truth_unified_sites: +- range: {ref: chr21, beg: 5583275, end: 5583275} + alleles: + - dna: G + - dna: T + quality: 10 + frequency: 0.125 # unifier puts 1/2N floor under frequency as long as AQ is sufficient + quality: 10 + +truth_output_vcf: + - truth.vcf: | + ##fileformat=VCFv4.2 + ##INFO= + ##INFO= + ##FILTER= + ##FORMAT= + ##FORMAT= + ##FORMAT= + ##FORMAT= + ##FORMAT= + ##FORMAT= + ##contig= + #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00096 HG00097 HG00099 HG01377 + chr21 5583275 chr21_5583275_G_T G T 10 . AF=0.125;AQ=10 GT:DP:AD:GQ:PL:RNC ./.:5:2,3:7:0,12,7:II ./.:3:3,0:0:.:II ./.:6:3,3:10:0,9,21:II 1/1:3:1,2:1:10,11,0:.. diff --git a/test/gvcf_test_cases.cc b/test/gvcf_test_cases.cc index 04282998..a9bc8c0b 100644 --- a/test/gvcf_test_cases.cc +++ b/test/gvcf_test_cases.cc @@ -229,13 +229,13 @@ class GVCFTestCase { const auto n_config_preset = yaml["config_preset"]; if (n_config_preset) { - string crc32c; + string txt, crc32c; V(n_config_preset.IsScalar(), "config_preset invalid"); S(GLnexus::cli::utils::load_config(spdlog::null_logger_st(name), n_config_preset.Scalar(), unifier_cfg, genotyper_cfg, - crc32c)); + txt, crc32c)); } const auto n_unifier_config = yaml["unifier_config"]; @@ -755,7 +755,7 @@ TEST_CASE("join records with unifier preference for small alleles") { } TEST_CASE("DP0_noAD") { - vector v_formats = {"GT", "RNC", "DP", "SB", "AD", "GQ"}; + vector v_formats = {"GT", "RNC", "DP", "SB", "AD", "GQ", "RNC"}; vector v_infos = {}; GVCFTestCase DP0_case("DP0_noAD", v_formats, v_infos); DP0_case.perform_gvcf_test(); @@ -798,14 +798,14 @@ TEST_CASE("xAtlas") { } TEST_CASE("weCall") { - vector v_formats = {"DP", "GT", "GQ", "PL", "AD", "FT", "SBPV"}; + vector v_formats = {"DP", "GT", "GQ", "PL", "AD", "FT", "SBPV", "RNC"}; vector v_infos = {"ANR","AF","AQ"}; GVCFTestCase("weCall", v_formats, v_infos, false).perform_gvcf_test(); } TEST_CASE("weCall_squeeze") { - vector v_formats = {"DP", "GT", "GQ", "PL", "AD", "FT", "SBPV"}; + vector v_formats = {"DP", "GT", "GQ", "PL", "AD", "FT", "SBPV", "RNC"}; vector v_infos = {"ANR","AF","AQ"}; GVCFTestCase("weCall_squeeze", v_formats, v_infos, false).perform_gvcf_test(); } @@ -836,17 +836,23 @@ TEST_CASE("edge_spanning_deletion3") { } TEST_CASE("DeepVariant") { - vector v_formats = {"DP", "GT", "GQ", "PL", "AD"}; + vector v_formats = {"DP", "GT", "GQ", "PL", "AD", "RNC"}; vector v_infos = {"ANR","AF","AQ"}; GVCFTestCase("deepvariant", v_formats, v_infos, false).perform_gvcf_test(); } TEST_CASE("DeepVariant2") { - vector v_formats = {"DP", "GT", "GQ", "PL", "AD"}; + vector v_formats = {"DP", "GT", "GQ", "PL", "AD", "RNC"}; vector v_infos = {"ANR","AF","AQ"}; GVCFTestCase("deepvariant2", v_formats, v_infos, false).perform_gvcf_test(); } +TEST_CASE("dv_1000G_chr21_5583275") { + vector v_formats = {"DP", "GT", "GQ", "PL", "AD", "RNC"}; + vector v_infos = {"ANR","AF","AQ"}; + GVCFTestCase("dv_1000G_chr21_5583275", v_formats, v_infos, true).perform_gvcf_test(); +} + TEST_CASE("strelka2") { vector v_formats = {"DP", "GT", "GQ", "AD", "FT"}; vector v_infos = {"AF","AQ"};