diff --git a/bam2bcf.c b/bam2bcf.c index d9e55904..07fcd353 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -382,6 +382,10 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t // Note baseQ changes some output fields such as I16, but has no // significant affect on "call". baseQ = p->aux>>8&0xff; + + // Can we reuse baseQ as indelQ1 instead of indelQ? + // So we can distinguish between likelihood of any indel vs + // likelihood of this specific indel? } else { diff --git a/bam2bcf.h b/bam2bcf.h index 9d464d60..e4ed6628 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -131,6 +131,7 @@ typedef struct __bcf_callaux_t { void *rghash; float indel_bias; // adjusts indel score threshold; lower => call more. float del_bias; // (-.9 < x < .9) error profile; >0 => more del, <0 => more ins + float vs_ref; // 0 to 1. 0: score vs next-best. 1: score vs ref int32_t *ref_nm, *alt_nm; // pointers to bcf_call_t.{ref_nm,alt_nm} unsigned int nnm[2]; // number of nm observations float nm[2]; // cumulative count of mismatches in ref and alt reads diff --git a/bam2bcf_edlib.c b/bam2bcf_edlib.c index 9c69d749..34c68680 100644 --- a/bam2bcf_edlib.c +++ b/bam2bcf_edlib.c @@ -1069,6 +1069,8 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, // FIXME: n_types has a maximum; no need to alloc - use a #define? int sc[MAX_TYPES], sumq[MAX_TYPES], s, i, j, t, K, n_alt, tmp; memset(sumq, 0, n_types * sizeof(int)); + int sum_indelQ1[100] = {0}; // n + int sum_indelQ2[100] = {0}; // n // Confusing variable naming and bit usage. // @@ -1085,7 +1087,7 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, // sct is short for score. // sc is score + t(type) // Why aren't these variable names reversed? - int *sct = &score[K*n_types], seqQ, indelQ; + int *sct = &score[K*n_types], seqQ, indelQ1=0, indelQ2=0, indelQ=0; for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; for (t = 1; t < n_types; ++t) // insertion sort for (j = t; j > 0 && sc[j] < sc[j-1]; --j) @@ -1106,15 +1108,34 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, indelQ = (sc[1]>>14) - (sc[0]>>14); seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run, str_len1); } else { -// // Simplest code in edlib10f outputs -// indelQ = (sc[1]>>14) - (sc[0]>>14); +#if 0 + // Simplest code in edlib10f outputs + indelQ = (sc[1]>>14) - (sc[0]>>14); +#endif + +// Maybe just an option of x*indelQ1 + (1-x)*indelQ2 so we can adjust +// based on indel vs genotype accuracy, per instrument / dataset? + + +#if 1 +// Orig code; indelQ1 +// Good on HG002.GRCh38.PacBio_CCS_15Kb.bam + // look for the reference type + for (t = 0; t < n_types; ++t) { + if ((sc[t]&0x3f) == ref_type) + break; + } + indelQ = indelQ1 = (sc[t]>>14) - (sc[0]>>14); +// fprintf(stderr, "IndelQ = %d: %d-%d", +// indelQ, (sc[t]>>14), (sc[0]>>14)); +#endif -// Orig code -// for (t = 0; t < n_types; ++t) // look for the reference type -// if ((sc[t]&0x3f) == ref_type) break; -// indelQ = (sc[t]>>14) - (sc[0]>>14); +#if 1 +// 10e; indelQ + + // Revised code in edlib10e outputs + // Good on most other data sets, including the 53x CCS SequelII data. -// Revised code in edlib10e outputs // Best call is non-ref, compare vs next best non-ref, // or ref if it's just 2 choices (most common case). for (t = 1; t < n_types; t++) @@ -1123,13 +1144,108 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, else break; if (t == n_types) t--; // it's ref, but it'll do as next best. - indelQ = (sc[t]>>14) - (sc[0]>>14); + indelQ2 = (sc[t]>>14) - (sc[0]>>14); +// fprintf(stderr, "\tNEW %d: %d-%d\n", +// indelQ, (sc[t]>>14), (sc[0]>>14)); +#endif + +#if 0 + // Best call is non-ref, get the ref score and (if different) + // the next best non-ref. Average the two and then compare vs + // this call. It means we assign ADs better, but don't + // overly lose variants either when we have many choices. + int ref_t = -1, next_t = -1; + for (t = 1; t < n_types; t++) { + if ((sc[t]&0x3f) == ref_type) { + ref_t = t; + } else { + if (next_t <= 0) + next_t = t; + } + } + if (next_t < 0) + next_t = ref_t; + assert(MIN(ref_t, next_t) == 1); + + // Could also try avg(ref,1) too rather than avg(ref,next)? + // Tried - no better + + indelQ = (((sc[ref_t]>>14)+(sc[next_t]>>14))>>1) - (sc[0]>>14); +#endif + +#if 0 + int ref_t = -1, next_t = -1; + for (t = 1; t < n_types; t++) { + if ((sc[t]&0x3f) == ref_type) { + ref_t = t; + } else { + if (next_t <= 0) + next_t = t; + } + } + if (next_t < 0) + next_t = ref_t; + assert(MIN(ref_t, next_t) == 1); + + if (ref_t <= next_t) + indelQ = (sc[ref_t]>>14) - (sc[0]>>14); + else + //indelQ = (sc[next_t]>>14) - (sc[0]>>14); + indelQ = (((sc[ref_t]>>14)+(sc[next_t]>>14))>>1) - (sc[0]>>14); +#endif + +#if 0 + // Best call is non-ref, get the ref score and (if different) + // the next best non-ref. Average the two and then compare vs + // this call. It means we assign ADs better, but don't + // overly lose variants either when we have many choices. + int ref_t = -1, next_t = -1; + for (t = 1; t < n_types; t++) { + if ((sc[t]&0x3f) == ref_type) { + ref_t = t; + } else { + if (next_t <= 0) + next_t = t; + } + } + if (next_t < 0) + next_t = ref_t; + assert(MIN(ref_t, next_t) == 1); + + indelQ = (sc[next_t]>>14) - (sc[0]>>14); + indelQ += (sc[ref_t]>>14) > (sc[next_t]>>14); +// indelQ >>= (sc[ref_t]>>14) > (sc[next_t]>>14); // VBAD +#endif + + // Maybe boost seqQ if ref_t != next_t and sc[ref_t]-sc[0] is high + // while sc[next_t]-sc[0] is not? So we're saying the indel is good + // but this allele is not? Not sure how we explore that. + // It's the difference between making an indel-exists call vs making + // a call on this specific genotype. + // ORIG seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run, str_len1); + + // TO TEST + //seqQ += MAX(0, (indelQ1 - indelQ)/4); + //indelQ += MAX(0, (indelQ1 - indelQ2)/4); + + indelQ = bca->vs_ref*indelQ1 + (1-bca->vs_ref)*indelQ2; + +#if 0 + // If ref_t scores higher than next_t then we have more + // than 1 ALT allele. This requires a slightly higher + // threshold of confidence to call it as it's more likely + // down to sequencing error. + if (ref_t >= 2) + seqQ *= pow(0.9, ref_t - 1); +#endif } + // So we lower qual in some, but raise the average to keep FN/FP // ratios up. indelQ /= bca->indel_bias; + indelQ1 /= bca->indel_bias; // Or maybe just *2 if bca->poly_mqual and be done with it? // Or perhaps adjust the MIN(qavg/20, ...) to MIN(qavg/10) ? @@ -1147,6 +1263,8 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, uint8_t *seq = bam_get_seq(p->b); uint8_t *qual = bam_get_qual(p->b); int min_q = qual[qpos]; + int nbase = 0; + int sumq = 0; // scan homopolymer left char baseL = bam_seqi(seq, qpos+1 < p->b->core.l_qseq @@ -1165,8 +1283,19 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, min_q = qual[l]; if (bam_seqi(seq, l) != base) break; + nbase++; + sumq += qual[l]; } +// if (nbase) { +// sumq = sumq / (double)nbase; +// if (min_q < sumq / 4) +// min_q = sumq / 4; +// } + +// if (min_q < 10) +// min_q = 10; + // We reduce -h so homopolymers get reduced likelihood of being // called, but then optionally increase or decrease from there // based on base quality. Hence lack of low quality bases in @@ -1176,33 +1305,64 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, // now these work well (tuned on PB HiFi). seqQ += MIN(qavg/20, min_q - qavg/10); indelQ += MIN(qavg/20, min_q - qavg/5); + indelQ1+= MIN(qavg/20, min_q - qavg/5); if (seqQ < 0) seqQ = 0; if (indelQ < 0) indelQ = 0; + if (indelQ1< 0) indelQ1= 0; } // This is the length-normalised score from bcf_cgp_align_score tmp = sc[0]>>6 & 0xff; // reduce indelQ - // high score = bad, low score = good. + // high score = bad, low score = good; flip for indelQ // low normalised scores leave indelQ unmodified // high normalised scores set indelQ to 0 // inbetween scores have a linear scale from indelQ to 0 indelQ = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ + .499); + indelQ1= tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ1+ .499); // Doesn't really help accuracy, but permits -h to take // affect still. if (indelQ > seqQ) indelQ = seqQ; if (indelQ > 255) indelQ = 255; + if (indelQ1> 255) indelQ1= 255; if (seqQ > 255) seqQ = 255; - // use 22 bits in total + // Use 22 bits in total. + // 0-7 IndelQ + // 8-15 SeqQ + // 16-22 Score-per-base p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; sumq[sc[0]&0x3f] += indelQ; + + // Experiment in p->aux vs sumq. + // One gives likelihood of an indel being here, while the other + // is likelihood of a specific genotype? But which is which? + + sum_indelQ1[s] += indelQ1; + sum_indelQ2[s] += indelQ; } + +// double m = (double)sum_indelQ1[s] / sum_indelQ2[s]; +// if (m > 1) { +// m = sqrt(m); +// fprintf(stderr, "Sum %d %d m=%f\n", sum_indelQ1, sum_indelQ, m); +// for (i = 0; i < n_plp[s]; ++i) { +// bam_pileup1_t *p = plp[s] + i; +// int indelQ = p->aux & 0xff; +// indelQ *= m; +// if (indelQ > 255) +// indelQ = 255; +// p->aux = (p->aux & ~0xff) | indelQ; +// } +// } } - // determine bca->indel_types[] and bca->inscns + + // Determine bca->indel_types[] and bca->inscns. + // Sumq[0] is always reference. + // Sumq[1] is best non-ref (and maybe better than ref) bca->maxins = max_ins; bca->inscns = (char*) realloc(bca->inscns, bca->maxins * 4); if (bca->maxins && !bca->inscns) @@ -1214,20 +1374,27 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp; for (t = 0; t < n_types; ++t) // look for the reference type if ((sumq[t]&0x3f) == ref_type) break; + if (t) { // then move the reference type to the first tmp = sumq[t]; for (; t > 0; --t) sumq[t] = sumq[t-1]; sumq[0] = tmp; } + for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL; for (t = 0; t < 4 && t < n_types; ++t) { bca->indel_types[t] = types[sumq[t]&0x3f]; - if (bca->maxins) + if (bca->maxins) // potentially an insertion memcpy(&bca->inscns[t * bca->maxins], &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins); } - // update p->aux + + // Update p->aux. + // If per-alignment type isn't found, then indelQ/seqQ is 0, + // otherwise unchanged. for (s = n_alt = 0; s < n; ++s) { +// double m = sqrt((double)sum_indelQ1[s] / sum_indelQ2[s]); +// if (m < 1) m = 1; for (i = 0; i < n_plp[s]; ++i) { bam_pileup1_t *p = plp[s] + i; int x = types[p->aux>>16&0x3f]; @@ -1235,6 +1402,15 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, if (x == bca->indel_types[j]) break; p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff)); if ((p->aux>>16&0x3f) > 0) ++n_alt; + +// Poor +// // We recorded indelQ based on this allele, but +// // scale now by quality of any non-ref allele. +// // This reduces FN while keeping GT accurate (maybe!) +// int indelQ = (p->aux & 0xff) * m; +// if (indelQ > 255) indelQ = 255; +// p->aux = (p->aux & ~0xff) | indelQ; + //fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam_get_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff); } } diff --git a/mpileup.c b/mpileup.c index c023a9b2..b5e02c3d 100644 --- a/mpileup.c +++ b/mpileup.c @@ -75,6 +75,7 @@ typedef struct { double min_frac; // for indels double indel_bias, poly_mqual; double del_bias; // compensate for diff deletion vs insertion error rates + double vs_ref; char *reg_fname, *pl_list, *fai_fname, *output_fname; int reg_is_file, record_cmd_line, n_threads, clevel; faidx_t *fai; @@ -877,6 +878,7 @@ static int mpileup(mplp_conf_t *conf) conf->bca->indel_win_size = conf->indel_win_size; conf->bca->edlib = conf->edlib; conf->bca->poly_mqual = conf->poly_mqual; + conf->bca->vs_ref = conf->vs_ref; conf->bc.bcf_hdr = conf->bcf_hdr; conf->bc.n = nsmpl; @@ -1277,6 +1279,9 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " --indel-bias FLOAT Raise to favour recall over precision [%.2f]\n", mplp->indel_bias); fprintf(fp, " --del-bias FLOAT Relative likelihood of insertion to deletion [%.2f]\n", mplp->del_bias); + fprintf(fp, + " --score-vs-ref FLOAT\n" + " Ratio of score vs ref (1) or 2nd-best allele (0) [%.2f]\n", mplp->vs_ref); fprintf(fp, " --indel-size INT Approximate maximum indel size considered [%d]\n", mplp->indel_win_size); fprintf(fp, @@ -1300,7 +1305,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " -B -Q5 --max-BQ 50 -F0.10 -o25 -e1 -h300 --delta-BQ 10 \\\n" " --del-bias 0.4 --poly-mqual --edlib\n" " ultima or ultima-1.20:\n" - " -B -Q4 --max-BQ 40 -F0.15 -o20 -e15 -h250 --delta-BQ 99 \\\n" + " -B --max-BQ 30 -F0.15 -o20 -e15 -h250 --delta-BQ 10 \\\n" " --del-bias 0.3 --poly-mqual --edlib\n" "\n" "Notes: Assuming diploid individuals.\n" @@ -1331,6 +1336,7 @@ int main_mpileup(int argc, char *argv[]) mplp.max_depth = 250; mplp.max_indel_depth = 250; mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 500; mplp.min_frac = 0.05; mplp.indel_bias = 1.0; mplp.min_support = 2; + mplp.vs_ref = 0; mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_REALN_PARTIAL | MPLP_SMART_OVERLAPS; mplp.argc = argc; mplp.argv = argv; @@ -1423,6 +1429,8 @@ int main_mpileup(int argc, char *argv[]) {"write-index",no_argument,NULL,21}, {"del-bias", required_argument, NULL, 23}, {"poly-mqual", no_argument, NULL, 24}, + {"no-poly-mqual", no_argument, NULL, 26}, + {"score-vs-ref",required_argument, NULL, 27}, {NULL, 0, NULL, 0} }; while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) { @@ -1533,6 +1541,11 @@ int main_mpileup(int argc, char *argv[]) else mplp.indel_bias = 1/atof(optarg); break; + case 27: + mplp.vs_ref = atof(optarg); + //if (mplp.vs_ref < 0) mplp.vs_ref = 0; + if (mplp.vs_ref > 1) mplp.vs_ref = 1; + break; case 15: { char *tmp; mplp.indel_win_size = strtol(optarg,&tmp,10); @@ -1550,6 +1563,7 @@ int main_mpileup(int argc, char *argv[]) case 25: mplp.edlib = 0; break; case 23: mplp.del_bias = atof(optarg); break; case 24: mplp.poly_mqual = 1; break; + case 26: mplp.poly_mqual = 0; break; case 'A': use_orphan = 1; break; case 'F': mplp.min_frac = atof(optarg); break; case 'm': mplp.min_support = atoi(optarg); break; @@ -1585,8 +1599,10 @@ int main_mpileup(int argc, char *argv[]) mplp.extQ = 1; mplp.flag &= ~MPLP_REALN; mplp.del_bias = 0.4; + mplp.indel_bias = 1.2; mplp.poly_mqual = 1; mplp.edlib = 1; + mplp.vs_ref = 0.7; } else if (strcasecmp(optarg, "ont") == 0) { fprintf(stderr, "With old ONT data may be beneficial to also run bcftools call with " "a higher -P, eg -P0.01 or -P 0.1\n"); @@ -1611,16 +1627,17 @@ int main_mpileup(int argc, char *argv[]) } else if (strcasecmp(optarg, "ultima") == 0 || strcasecmp(optarg, "ultima-1.20") == 0) { mplp.min_frac = 0.15; - mplp.min_baseQ = 3; - mplp.max_baseQ = 40; - mplp.delta_baseQ = 99; + mplp.min_baseQ = 1; + mplp.max_baseQ = 30; + mplp.delta_baseQ = 10; mplp.openQ = 20; - mplp.extQ = 15; + mplp.extQ = 10; mplp.tandemQ = 250; mplp.flag &= ~MPLP_REALN; mplp.del_bias = 0.3; mplp.poly_mqual = 1; mplp.edlib = 1; + mplp.vs_ref = 0.3; } else if (strcasecmp(optarg, "1.12") == 0) { // 1.12 and earlier mplp.min_frac = 0.002; @@ -1634,8 +1651,16 @@ int main_mpileup(int argc, char *argv[]) mplp.flag |= MPLP_REALN_PARTIAL; } else if (strcasecmp(optarg, "illumina") == 0 || strcasecmp(optarg, "illumina-1.20") == 0) { + mplp.edlib = 1; + mplp.indel_win_size = 110; mplp.flag |= MPLP_REALN_PARTIAL; + } else if (strcasecmp(optarg, "bgi") == 0 || + strcasecmp(optarg, "bgi-1.20") == 0) { + // Largely as per Illumina mplp.edlib = 1; + mplp.indel_win_size = 110; + mplp.indel_bias = 0.9; + mplp.flag |= MPLP_REALN_PARTIAL; } else { fprintf(stderr, "Unknown configuration name '%s'\n" "Please choose from 1.12, illumina, pacbio-ccs or ont\n",