From 18e10afc5c6318d4bb0ce331fde0013a0efcbdd1 Mon Sep 17 00:00:00 2001 From: Christopher Chang Date: Sat, 3 Feb 2024 21:04:16 -0800 Subject: [PATCH] 10+ ALT allele --vcf-half-call missing bugfix, --hardy/--hwe log-pvalues --- 2.0/include/plink2_base.h | 2 +- 2.0/include/plink2_stats.cc | 1419 +++++++++++++++++++++++++++++++---- 2.0/include/plink2_stats.h | 29 +- 2.0/plink2.cc | 58 +- 2.0/plink2_filter.cc | 39 +- 2.0/plink2_filter.h | 2 +- 2.0/plink2_help.cc | 5 +- 2.0/plink2_import.cc | 10 + 2.0/plink2_ld.cc | 19 +- 2.0/plink2_misc.cc | 109 ++- 2.0/plink2_misc.h | 35 +- 11 files changed, 1462 insertions(+), 265 deletions(-) diff --git a/2.0/include/plink2_base.h b/2.0/include/plink2_base.h index 0d3a12ee..1950ef12 100644 --- a/2.0/include/plink2_base.h +++ b/2.0/include/plink2_base.h @@ -106,7 +106,7 @@ // 10000 * major + 100 * minor + patch // Exception to CONSTI32, since we want the preprocessor to have access // to this value. Named with all caps as a consequence. -#define PLINK2_BASE_VERNUM 812 +#define PLINK2_BASE_VERNUM 813 #define _FILE_OFFSET_BITS 64 diff --git a/2.0/include/plink2_stats.cc b/2.0/include/plink2_stats.cc index a2acbf6c..99559063 100644 --- a/2.0/include/plink2_stats.cc +++ b/2.0/include/plink2_stats.cc @@ -22,8 +22,8 @@ namespace plink2 { // Thread-unsafe portions of plink_stats.c have been replaced, mostly by code // derived from boost/math/special_functions/gamma.hpp and -// boost/math/special_functions/detail/igamma_inverse.hpp in Boost 1.60 -// (Maddock et al.). The derived portions are subject to the following +// boost/math/special_functions/detail/igamma_inverse.hpp in Boost 1.60 and +// 1.84 (Maddock et al.). The derived portions are subject to the following // license: // // ***** @@ -345,78 +345,122 @@ double finite_half_gamma_q2(uint32_t a_minus_half, double xx, double* p_derivati return ee; } -static const double kLanczosSumNumer[6] = {8706.3495925490091, 8523.650341121874, 3338.0292194764235, 653.64249942940087, 63.999518449381870, 2.5066282746310063}; -static const double kLanczosSumDenom[6] = {0, 24, 50, 35, 10, 1}; -static const double kLanczosSumExpgNumer[6] = {32.812445410297834, 32.123889414443320, 12.580347294552161, 2.4634444783532414, 0.2412010548258800, 0.0094469677045392}; +static const double kLanczosFloatSumNumer[6] = {8706.3495925490091, 8523.650341121874, 3338.0292194764235, 653.64249942940087, 63.999518449381870, 2.5066282746310063}; +static const double kLanczosFloatSumDenom[6] = {0, 24, 50, 35, 10, 1}; +static const double kLanczosFloatSumExpgNumer[6] = {32.812445410297834, 32.123889414443320, 12.580347294552161, 2.4634444783532414, 0.2412010548258800, 0.0094469677045392}; // this depends on the polynomial coefficients above -static const double kLanczosG = 5.581; +static const double kLanczosFloatG = 5.581; -double lanczos_sum(double zz) { +double lanczos_sum_f(double zz) { double s1; double s2; if (zz <= 1) { - s1 = kLanczosSumNumer[5]; - s2 = kLanczosSumDenom[5]; + s1 = kLanczosFloatSumNumer[5]; + s2 = kLanczosFloatSumDenom[5]; for (int32_t ii = 4; ii >= 0; --ii) { s1 *= zz; s2 *= zz; - s1 += kLanczosSumNumer[S_CAST(uint32_t, ii)]; - s2 += kLanczosSumDenom[S_CAST(uint32_t, ii)]; + s1 += kLanczosFloatSumNumer[S_CAST(uint32_t, ii)]; + s2 += kLanczosFloatSumDenom[S_CAST(uint32_t, ii)]; } } else { zz = 1 / zz; - s1 = kLanczosSumNumer[0]; - s2 = kLanczosSumDenom[0]; + s1 = kLanczosFloatSumNumer[0]; + s2 = kLanczosFloatSumDenom[0]; for (uint32_t uii = 1; uii != 6; ++uii) { s1 *= zz; s2 *= zz; - s1 += kLanczosSumNumer[uii]; - s2 += kLanczosSumDenom[uii]; + s1 += kLanczosFloatSumNumer[uii]; + s2 += kLanczosFloatSumDenom[uii]; } } return s1 / s2; } -double lanczos_sum_expg_scaled_imp(double zz, double* s2_ptr) { +double lanczos_sum_f_expg_scaled_imp(double zz, double* s2_ptr) { double s1; double s2; if (zz <= 1) { - s1 = kLanczosSumExpgNumer[5]; - s2 = kLanczosSumDenom[5]; + s1 = kLanczosFloatSumExpgNumer[5]; + s2 = kLanczosFloatSumDenom[5]; for (int32_t ii = 4; ii >= 0; --ii) { s1 *= zz; s2 *= zz; - s1 += kLanczosSumExpgNumer[S_CAST(uint32_t, ii)]; - s2 += kLanczosSumDenom[S_CAST(uint32_t, ii)]; + s1 += kLanczosFloatSumExpgNumer[S_CAST(uint32_t, ii)]; + s2 += kLanczosFloatSumDenom[S_CAST(uint32_t, ii)]; } } else { zz = 1 / zz; - s1 = kLanczosSumExpgNumer[0]; - s2 = kLanczosSumDenom[0]; + s1 = kLanczosFloatSumExpgNumer[0]; + s2 = kLanczosFloatSumDenom[0]; for (uint32_t uii = 1; uii != 6; ++uii) { s1 *= zz; s2 *= zz; - s1 += kLanczosSumExpgNumer[uii]; - s2 += kLanczosSumDenom[uii]; + s1 += kLanczosFloatSumExpgNumer[uii]; + s2 += kLanczosFloatSumDenom[uii]; } } *s2_ptr = s2; return s1; } -double lanczos_sum_expg_scaled(double zz) { +double lanczos_sum_f_expg_scaled(double zz) { double s2; - double s1 = lanczos_sum_expg_scaled_imp(zz, &s2); + double s1 = lanczos_sum_f_expg_scaled_imp(zz, &s2); return s1 / s2; } -double lanczos_sum_expg_scaled_recip(double zz) { +double lanczos_sum_f_expg_scaled_recip(double zz) { double s2; - double s1 = lanczos_sum_expg_scaled_imp(zz, &s2); + double s1 = lanczos_sum_f_expg_scaled_imp(zz, &s2); return s2 / s1; } +// We want better-than-float precision for our log-factorial function. +static const double kLanczosDoubleSumDenom[13] = {0, 39916800, 120543840, 150917976, 105258076, 45995730, 13339535, 2637558, 357423, 32670, 1925, 66, 1}; +static const double kLanczosDoubleSumExpgNumer[13] = {56906521.913471564, 103794043.11634455, 86363131.288138591, 43338889.324676138, 14605578.087685068, 3481712.1549806459, 601859.61716810988, 75999.293040145426, 6955.9996025153761, 449.94455690631681, 19.519927882476175, 0.50984166556566762, 0.0060618423462489065}; + +// this depends on the polynomial coefficients above +static const double kLanczosDoubleG = 6.0246800407767296; + +double lanczos_sum_d_expg_scaled_imp(double zz, double* s2_ptr) { + double s1; + double s2; + // Only called by Lfact() for now, where zz is guaranteed to be > 1. + /* + if (zz <= 1) { + s1 = kLanczosDoubleSumExpgNumer[12]; + s2 = kLanczosDoubleSumDenom[12]; + for (int32_t ii = 11; ii >= 0; --ii) { + s1 *= zz; + s2 *= zz; + s1 += kLanczosDoubleSumExpgNumer[S_CAST(uint32_t, ii)]; + s2 += kLanczosDoubleSumDenom[S_CAST(uint32_t, ii)]; + } + } else { + */ + zz = 1 / zz; + s1 = kLanczosDoubleSumExpgNumer[0]; + s2 = kLanczosDoubleSumDenom[0]; + for (uint32_t uii = 1; uii != 13; ++uii) { + s1 *= zz; + s2 *= zz; + s1 += kLanczosDoubleSumExpgNumer[uii]; + s2 += kLanczosDoubleSumDenom[uii]; + } + // } + *s2_ptr = s2; + return s1; +} + +double lanczos_sum_d_expg_scaled(double zz) { + double s2; + double s1 = lanczos_sum_d_expg_scaled_imp(zz, &s2); + return s1 / s2; +} + + double log1pmx(double xx) { // log(1+x) - x // assumes abs(xx) < 0.95 @@ -443,17 +487,18 @@ double log1pmx(double xx) { // compute (z^a)(e^{-z})/tgamma(a) double regularized_gamma_prefix(double aa, double zz) { // assumes a == 0.5 if a < 1. assumes z > 0. - // we are fine with float-level precision, so lanczos_n=6, kLanczosG=5.581 + // we are fine with float-level precision, so lanczos_n=6, + // kLanczosFloatG=5.581 if (aa < 1) { return sqrt(zz) * exp(-zz) * (1.0 / kSqrtPi); } - const double agh = aa + kLanczosG - 0.5; + const double agh = aa + kLanczosFloatG - 0.5; const double agh_recip = 1.0 / agh; - const double dd = ((zz - aa) - (kLanczosG - 0.5)) * agh_recip; + const double dd = ((zz - aa) - (kLanczosFloatG - 0.5)) * agh_recip; double prefix; if ((fabs(dd * dd * aa) <= 100) && (aa > 150)) { // abs(dd) < sqrt(2/3) < 0.95 - prefix = aa * log1pmx(dd) + zz * (0.5 - kLanczosG) * agh_recip; + prefix = aa * log1pmx(dd) + zz * (0.5 - kLanczosFloatG) * agh_recip; prefix = exp(prefix); } else { const double alz = aa * log(zz * agh_recip); @@ -479,7 +524,7 @@ double regularized_gamma_prefix(double aa, double zz) { prefix = pow(zz * agh_recip, aa) * exp(amz); } } - prefix *= sqrt(agh * kRecipE) * lanczos_sum_expg_scaled_recip(aa); + prefix *= sqrt(agh * kRecipE) * lanczos_sum_f_expg_scaled_recip(aa); return prefix; } @@ -731,21 +776,22 @@ static const double kLnSqrtPi = 0.5723649429247001; // compute -log((z^a)(e^{-z})/tgamma(a)) double regularized_gamma_prefix_ln(double aa, double zz) { // assumes a == 0.5 if a < 1. assumes z > 0. - // we are fine with float-level precision, so lanczos_n=6, kLanczosG=5.581 + // we are fine with float-level precision, so lanczos_n=6, + // kLanczosFloatG=5.581 if (aa < 1) { return -zz + 0.5 * log(zz) - kLnSqrtPi; } - const double agh = aa + kLanczosG - 0.5; + const double agh = aa + kLanczosFloatG - 0.5; const double agh_recip = 1.0 / agh; - const double dd = ((zz - aa) - (kLanczosG - 0.5)) * agh_recip; + const double dd = ((zz - aa) - (kLanczosFloatG - 0.5)) * agh_recip; double prefix_ln; if ((fabs(dd * dd * aa) <= 100) && (aa > 150)) { // abs(dd) < sqrt(2/3) < 0.95 - prefix_ln = aa * log1pmx(dd) + zz * (0.5 - kLanczosG) * agh_recip; + prefix_ln = aa * log1pmx(dd) + zz * (0.5 - kLanczosFloatG) * agh_recip; } else { prefix_ln = (aa - zz) + aa * log(zz * agh_recip); } - const double scaled_recip = lanczos_sum_expg_scaled_recip(aa); + const double scaled_recip = lanczos_sum_f_expg_scaled_recip(aa); return prefix_ln + 0.5 * log(agh * kRecipE * scaled_recip * scaled_recip); } @@ -1204,15 +1250,15 @@ double ibeta_series_ln(double aa, double bb, double xx, uint32_t inv) { const double cc = aa + bb; - const double agh = aa + kLanczosG - 0.5; - const double bgh = bb + kLanczosG - 0.5; - const double cgh = cc + kLanczosG - 0.5; + const double agh = aa + kLanczosFloatG - 0.5; + const double bgh = bb + kLanczosFloatG - 0.5; + const double cgh = cc + kLanczosFloatG - 0.5; double numer_a; - double denom_a = lanczos_sum_expg_scaled_imp(aa, &numer_a); + double denom_a = lanczos_sum_f_expg_scaled_imp(aa, &numer_a); double numer_b; - double denom_b = lanczos_sum_expg_scaled_imp(bb, &numer_b); + double denom_b = lanczos_sum_f_expg_scaled_imp(bb, &numer_b); double denom_c; - double numer_c = lanczos_sum_expg_scaled_imp(cc, &denom_c); + double numer_c = lanczos_sum_f_expg_scaled_imp(cc, &denom_c); double result = (numer_a * numer_b * numer_c) / (denom_a * denom_b * denom_c); double l1 = log(cgh / bgh) * (bb - 0.5); double l2 = log(xx * cgh / agh) * aa; @@ -1265,10 +1311,10 @@ double tgamma_delta_ratio(double zz, double delta) { } assert(delta == 0.5); // Can skip z < epsilon and z + delta == z branches for now. - double zgh = zz + kLanczosG - 0.5; + double zgh = zz + kLanczosFloatG - 0.5; // Also skip fabs(delta) >= 10 branch for now. double result = exp((0.5 - zz) * log1p(delta / zgh)); - result *= lanczos_sum(zz) / lanczos_sum(zz + delta); + result *= lanczos_sum_f(zz) / lanczos_sum_f(zz + delta); // exploit forced delta == 0.5 result *= sqrt(kE / (zgh + delta)); return result; @@ -1361,15 +1407,15 @@ double ibeta_power_terms_ln(double aa, double bb, double xx, double yy) { // prefix always 1 double cc = aa + bb; - const double agh = aa + kLanczosG - 0.5; - const double bgh = bb + kLanczosG - 0.5; - const double cgh = cc + kLanczosG - 0.5; + const double agh = aa + kLanczosFloatG - 0.5; + const double bgh = bb + kLanczosFloatG - 0.5; + const double cgh = cc + kLanczosFloatG - 0.5; double numer_a; - double denom_a = lanczos_sum_expg_scaled_imp(aa, &numer_a); + double denom_a = lanczos_sum_f_expg_scaled_imp(aa, &numer_a); double numer_b; - double denom_b = lanczos_sum_expg_scaled_imp(bb, &numer_b); + double denom_b = lanczos_sum_f_expg_scaled_imp(bb, &numer_b); double denom_c; - double numer_c = lanczos_sum_expg_scaled_imp(cc, &denom_c); + double numer_c = lanczos_sum_f_expg_scaled_imp(cc, &denom_c); double result = (numer_a * numer_b * numer_c) / (denom_a * denom_b * denom_c); result *= sqrt(agh * bgh * kRecipE / cgh); double result_ln = log(result); @@ -1717,12 +1763,122 @@ double QuantileToZscore(double pp) { (((((kIvnB[0]*r+kIvnB[1])*r+kIvnB[2])*r+kIvnB[3])*r+kIvnB[4])*r+1); } +static const double kLogFactorials[99] = { + 0, + 0, + 0.69314718055994529, + 1.791759469228055, + 3.1780538303479458, + 4.7874917427820458, + 6.5792512120101012, + 8.5251613610654129, + 10.604602902745251, + 12.801827480081471, + 15.104412573075518, + 17.502307845873887, + 19.987214495661888, + 22.552163853123428, + 25.19122118273868, + 27.89927138384089, + 30.671860106080675, + 33.505073450136891, + 36.395445208033053, + 39.339884187199495, + 42.335616460753485, + 45.380138898476915, + 48.471181351835227, + 51.606675567764377, + 54.784729398112312, + 58.003605222980525, + 61.261701761002001, + 64.557538627006323, + 67.88974313718154, + 71.257038967168015, + 74.658236348830172, + 78.092223553315307, + 81.557959456115029, + 85.05446701758153, + 88.580827542197667, + 92.136175603687093, + 95.719694542143202, + 99.330612454787428, + 102.9681986145138, + 106.63176026064346, + 110.32063971475741, + 114.03421178146171, + 117.77188139974507, + 121.53308151543862, + 125.31727114935688, + 129.12393363912722, + 132.95257503561629, + 136.80272263732635, + 140.67392364823425, + 144.56574394634487, + 148.47776695177302, + 152.40959258449737, + 156.3608363030788, + 160.3311282166309, + 164.3201122631952, + 168.32744544842768, + 172.35279713916282, + 176.39584840699732, + 180.45629141754378, + 184.53382886144945, + 188.6281734236716, + 192.73904728784493, + 196.86618167289001, + 201.00931639928149, + 205.16819948264117, + 209.34258675253682, + 213.53224149456327, + 217.73693411395425, + 221.95644181913033, + 226.19054832372763, + 230.43904356577696, + 234.70172344281826, + 238.97838956183432, + 243.26884900298268, + 247.57291409618685, + 251.89040220972319, + 256.22113555000954, + 260.56494097186322, + 264.92164979855278, + 269.29109765101975, + 273.67312428569375, + 278.06757344036617, + 282.47429268763045, + 286.89313329542699, + 291.32395009427029, + 295.7666013507606, + 300.22094864701415, + 304.68685676566872, + 309.1641935801469, + 313.65282994987905, + 318.1526396202093, + 322.66349912672621, + 327.18528770377526, + 331.71788719692847, + 336.26118197919845, + 340.81505887079902, + 345.37940706226686, + 349.95411804077025, + 354.53908551944079 +}; + +// port of Boost 1.84 implementation, double-precision +double Lfact(double xx) { + if (xx < 99.0) { + const uint32_t uii = S_CAST(int32_t, xx); + return kLogFactorials[uii]; + } + const double zz = xx + 1; + double result = (log(zz + kLanczosDoubleG - 0.5) - 1) * (zz - 0.5); + result += log(lanczos_sum_d_expg_scaled(zz)); + return result; +} -// HweP() and HweXchrP() are now licensed as GPL 2+. -// possible todo: variant which returns -ln(pval), at least up to long double -// range (i.e. normally use faster notlong-double code, but centerp > DBL_MAX -// launches the slower long double code path) -double HweP(int32_t obs_hets, int32_t obs_hom1, int32_t obs_hom2, uint32_t midp) { +// obs_hets + obs_hom1 + obs_hom2 assumed to be <2^31. +double HweLnP(int32_t obs_hets, int32_t obs_hom1, int32_t obs_hom2, uint32_t midp) { // This function implements an exact SNP test of Hardy-Weinberg // Equilibrium as described in Wigginton, JE, Cutler, DJ, and // Abecasis, GR (2005) A Note on Exact Tests of Hardy-Weinberg @@ -1743,10 +1899,12 @@ double HweP(int32_t obs_hets, int32_t obs_hom1, int32_t obs_hom2, uint32_t midp) // sums. // - Support for the mid-p variant of this test. See Graffelman J, Moreno V // (2013) The mid p-value in exact tests for Hardy-Weinberg equilibrium. + // - Log-p-value return (added Jan 2024). A p-value of 1e-400 may be worth + // distinguishing from 1e-40000 in a biobank-scale dataset. // - // Note that the HweThresh() function below is a lot more efficient for - // testing against a p-value inclusion threshold. HweP() should only be - // used if you need the actual p-value. + // Note that the HweThreshLn() function is a lot more efficient for testing + // against a p-value inclusion threshold. HweLnP() should only be used if + // you need the actual p-value. intptr_t obs_homc; intptr_t obs_homr; if (obs_hom1 < obs_hom2) { @@ -1756,126 +1914,349 @@ double HweP(int32_t obs_hets, int32_t obs_hom1, int32_t obs_hom2, uint32_t midp) obs_homc = obs_hom1; obs_homr = obs_hom2; } - const int64_t rare_copies = 2LL * obs_homr + obs_hets; - const int64_t genotypes2 = (obs_hets + obs_homc + obs_homr) * 2LL; - if (!genotypes2) { + const int64_t rare_ct = 2LL * obs_homr + obs_hets; + if (rare_ct < 2) { if (midp) { - return 0.5; + return -kLn2; } - return 1; + return 0; } + // MAF: rare_ct / allele_ct + // modal #hets: + // sample_ct * 2 * MAF * (1 - MAF) + // = rare_ct * (1 - MAF) + const int64_t sample_ct = obs_hom1 + obs_hom2 + obs_hets; + const double rare_ctd = rare_ct; + const double sample_ctd = sample_ct; + const double allele_ctd = sample_ctd * 2; + const double maf = rare_ctd / allele_ctd; + const double modal_nhet = rare_ctd * (1 - maf); + double curr_hets = obs_hets; + double curr_homr = obs_homr; + double curr_homc = obs_homc; + const double c_minus_r = curr_homc - curr_homr; int32_t tie_ct = 1; - double curr_hets_t2 = obs_hets; - double curr_homr_t2 = obs_homr; - double curr_homc_t2 = obs_homc; - double tailp = (1 - kSmallEpsilon) * kExactTestBias; - double centerp = 0; - double lastp2 = tailp; - double lastp1 = tailp; - - if (obs_hets * genotypes2 > rare_copies * (genotypes2 - rare_copies)) { - // tail 1 = upper - while (curr_hets_t2 > 1.5) { - // het_probs[curr_hets] = 1 - // het_probs[curr_hets - 2] = het_probs[curr_hets] * curr_hets * (curr_hets - 1.0) - curr_homr_t2 += 1; - curr_homc_t2 += 1; - lastp2 *= (curr_hets_t2 * (curr_hets_t2 - 1)) / (4 * curr_homr_t2 * curr_homc_t2); - curr_hets_t2 -= 2; - if (lastp2 < kExactTestBias) { - tie_ct += (lastp2 > (1 - 2 * kSmallEpsilon) * kExactTestBias); - tailp += lastp2; - break; - } - centerp += lastp2; - // doesn't seem to make a difference, but seems best to minimize use of - // INFINITY - if (centerp > DBL_MAX) { + if (curr_hets > modal_nhet) { + const double het_delta = curr_hets - modal_nhet; + if ((!midp) && (het_delta < 2.0)) { + // Fast path for p=1. + if (curr_hets * (curr_hets - 1) <= (4 * (1 + kSmallEpsilon)) * (curr_homr + 1) * (curr_homc + 1)) { return 0; } } - if ((centerp == 0) && (!midp)) { - return 1; + // Tried using old algorithm on rare_ct < 64, but this didn't make a + // noticeable difference to --hardy execution time on 1000 Genomes data, so + // I'd rather not bloat this function further. + + // If we're close enough to the center, we may be best off computing 1 - + // . But this approach is vulnerable to + // catastrophic cancellation. Doubles have 53 bits of precision, while + // kSmallEpsilon (which governs equality checks) is configured to 2^{-44}, + // so it makes sense to insist that the final subtraction does not cost us + // more than ~9 bits. + // If the starting cell has probability > 2^{-10}, that is close enough to + // true; and I think this catches most of the cases where we actively want + // to use the subtract-center strategy? + // + // From e.g. the Wigginton paper, P(N_{AB}=n_{AB} | N, n_A) is + // + // 2^{n_{AB}} N! n_A! n_B! + // ----------------------------- + // n_{AA}! n_{AB}! n_{BB}! (2N)! + const double common_lnprob_component = Lfact(sample_ctd) + Lfact(rare_ctd) + Lfact(allele_ctd - rare_ctd) - Lfact(allele_ctd); + const double starting_lnprob_other_component = curr_hets * kLn2 - Lfact(curr_hets) - Lfact(curr_homr) - Lfact(curr_homc); + const double starting_lnprob = common_lnprob_component + starting_lnprob_other_component; + double lastp = 1; + // Either obs_hets >= 2, or obs_homr == obs_hets == obs_homc == 1. In the + // latter case, starting_lnprob will be well over -10 * kLn2, so the other + // branch can assume obs_hets >= 2. + if (starting_lnprob > -10 * kLn2) { + // This condition stops triggering once we have more than ~3 million + // samples, but I expect it to be a significant timesaver before that + // point. + double centerp = 0; + // rescale this at the end. + while (curr_hets > 1.5) { + curr_homr += 1; + curr_homc += 1; + lastp *= (curr_hets * (curr_hets - 1)) / (4 * curr_homr * curr_homc); + curr_hets -= 2; + if (lastp < 1 + kSmallEpsilon) { + tie_ct += (lastp > 1 - kSmallEpsilon); + break; + } + centerp += lastp; + } + if (midp) { + centerp += S_CAST(double, tie_ct) * 0.5; + } + const double starting_prob = exp(starting_lnprob); + centerp *= starting_prob; + return log(1 - centerp); } - while (curr_hets_t2 > 1.5) { - curr_homr_t2 += 1; - curr_homc_t2 += 1; - lastp2 *= (curr_hets_t2 * (curr_hets_t2 - 1)) / (4 * curr_homr_t2 * curr_homc_t2); - curr_hets_t2 -= 2; + double tailp = 1; + curr_hets += 2; + while (curr_homr > 0.5) { + lastp *= (4 * curr_homr * curr_homc) / (curr_hets * (curr_hets - 1)); const double preaddp = tailp; - tailp += lastp2; + tailp += lastp; if (tailp <= preaddp) { break; } + curr_hets += 2; + curr_homr -= 1; + curr_homc -= 1; + } + // Now we want to jump near the other tail, without evaluating that many + // terms in between. + // + // Each full log-likelihood evaluation requires 3 Lfact() calls. Lfact() + // tends to be ~2-3 times as expensive as log in my testing, and I've seen + // log be anywhere from ~3 to ~12 times as expensive as division, so we + // want to limit ourselves to 1-2 full evaluations most of the time. + // + // The current heuristic starts by reflecting (obs_homr + curr_homr) * 0.5 + // across the mode, performing a full log-likelihood check at the nearest + // valid point. Hopefully we find that we're in (starting_lnprob - 62 * + // kLn2, starting_lnprob], so we're at or near a cell that actually + // contributes to the tail-sum. (This window is chosen to be wide enough + // to guarantee that at least one point falls inside when sample_ct < + // 2^31.) + // + // If not, we jump again, using Newton's method. + // If curr_homr is too low (i.e. current log-likelihood is too high), when + // we increase curr_homr by 1, the likelihood gets multiplied by + // curr_hets * (curr_hets-1) / (4 * (curr_homr+1) * (curr_homc+1)) + // i.e. we're adding the logarithm of this value to the log-likelihood. + // If curr_homr is too high, when we decrease curr_homr by 1, the + // likelihood gets multiplied by + // 4 * curr_homr * curr_homc / ((curr_hets+2) * (curr_hets+1)) + // We use the log of the first expression as the Newton's method f'(x) when + // we're jumping to higher curr_homr, and the negative-log of the second + // expression when we're jumping to lower curr_homr. + // f''(x) is always negative, so we can aim for starting_lnprob instead of + // the middle of the interval. + + // curr_hets moves twice as fast as curr_homr. So if we add + // 0.5 * (curr_hets + S_CAST(double, obs_hets)) - modal_nhet + // to 0.5 * (curr_homr + S_CAST(double, obs_homr)), that reflects curr_homr + // across the mode. + const double max_homr = S_CAST(double, rare_ct / 2); + { + const double delta = 0.5 * (curr_hets + S_CAST(double, obs_hets)) - modal_nhet; + curr_homr = 0.5 * (curr_homr + S_CAST(double, obs_homr)) + delta; + // Round and clamp. (er, does clamping ever come up in this direction?) + curr_homr = S_CAST(double, S_CAST(int32_t, curr_homr + 0.5)); + if (curr_homr > max_homr) { + curr_homr = max_homr; + } + } + // starting_lnprob_other_component is guaranteed to be positive here. + // Lfact() should be accurate enough for us to use a smaller-than-usual + // epsilon for identifying ties here; this isn't enough to cancel out the + // effective increase in epsilon from working in log-space, but it helps. + const double max_diff = starting_lnprob_other_component * (kSmallEpsilon / 8); + while (1) { + curr_hets = rare_ctd - curr_homr * 2; + curr_homc = curr_homr + c_minus_r; + const double lnprob_other_component = curr_hets * kLn2 - Lfact(curr_hets) - Lfact(curr_homr) - Lfact(curr_homc); + const double lnprob_diff = lnprob_other_component - starting_lnprob_other_component; + if (lnprob_diff > max_diff) { + if (curr_homr >= max_homr) { + // All terms on this tail are larger than the starting term. Exit. + // (is this possible?) + if (midp) { + tailp -= 0.5; + } + return starting_lnprob + log(tailp); + } + const double ll_deriv = log(curr_hets * (curr_hets - 1) / (4 * (curr_homr + 1) * (curr_homc + 1))); + // Round absolute value up, to guarantee that we make progress. + // (lnprob_diff is positive and ll_deriv is negative.) + // This may overshoot. But the function is guaranteed to terminate + // because we never overshoot (and we do always make progress on each + // step) once we're on the other side. + curr_homr += 1 - S_CAST(int64_t, (1 - kSmallEpsilon) * lnprob_diff / ll_deriv); + if (curr_homr > max_homr) { + curr_homr = max_homr; + } + } else if (lnprob_diff > -62 * kLn2) { + lastp = exp(lnprob_diff); + break; + } else { + const double ll_deriv = log((curr_hets + 2) * (curr_hets + 1) / (4 * curr_homr * curr_homc)); + // Round down, to guarantee we don't overshoot. + // We're guaranteed to make progress, since lnprob_diff >= 62 * log(2) + // and sample_ct < 2^31. + curr_homr -= S_CAST(int64_t, lnprob_diff / ll_deriv); + if (curr_homr < 0) { + curr_homr = 0; + } + } } - double curr_hets_t1 = obs_hets + 2; - double curr_homr_t1 = obs_homr; - double curr_homc_t1 = obs_homc; - while (curr_homr_t1 > 0.5) { - // het_probs[curr_hets + 2] = het_probs[curr_hets] * 4 * curr_homr * curr_homc / ((curr_hets + 2) * (curr_hets + 1)) - lastp1 *= (4 * curr_homr_t1 * curr_homc_t1) / (curr_hets_t1 * (curr_hets_t1 - 1)); + // Sum toward center, until lastp >= 1. + double lastp_tail = lastp; + if (lastp < 1 - kSmallEpsilon) { + double curr_homr_center = curr_homr; + double curr_homc_center = curr_homc; + double curr_hets_center = curr_hets; + while (1) { + tailp += lastp; + curr_hets_center += 2; + lastp *= (4 * curr_homr_center * curr_homc_center) / (curr_hets_center * (curr_hets_center - 1)); + if (lastp >= 1 - kSmallEpsilon) { + break; + } + curr_homr_center -= 1; + curr_homc_center -= 1; + } + } + if (lastp < 1 + kSmallEpsilon) { + tailp += lastp; + ++tie_ct; + } + // Sum away from center, until sums stop changing. + while (1) { + curr_homr += 1; + curr_homc += 1; + lastp_tail *= (curr_hets * (curr_hets - 1)) / (4 * curr_homr * curr_homc); const double preaddp = tailp; - tailp += lastp1; + tailp += lastp_tail; if (tailp <= preaddp) { break; } - curr_hets_t1 += 2; - curr_homr_t1 -= 1; - curr_homc_t1 -= 1; + curr_hets -= 2; } - } else { - // tail 1 = lower - while (curr_homr_t2 > 0.5) { - curr_hets_t2 += 2; - lastp2 *= (4 * curr_homr_t2 * curr_homc_t2) / (curr_hets_t2 * (curr_hets_t2 - 1)); - curr_homr_t2 -= 1; - curr_homc_t2 -= 1; - if (lastp2 < kExactTestBias) { - tie_ct += (lastp2 > (1 - 2 * kSmallEpsilon) * kExactTestBias); - tailp += lastp2; + if (midp) { + tailp -= S_CAST(double, tie_ct) * 0.5; + } + return starting_lnprob + log(tailp); + } + // Same as above, just with directions flipped. + const double het_delta = modal_nhet - curr_hets; + if ((!midp) && (het_delta < 2.0)) { + // Fast path for p=1. + if ((4 * (1 - kSmallEpsilon)) * curr_homr * curr_homc <= (curr_hets + 2) * (curr_hets + 1)) { + return 0; + } + } + const double common_lnprob_component = Lfact(sample_ctd) + Lfact(rare_ctd) + Lfact(allele_ctd - rare_ctd) - Lfact(allele_ctd); + const double starting_lnprob_other_component = curr_hets * kLn2 - Lfact(curr_hets) - Lfact(curr_homr) - Lfact(curr_homc); + const double starting_lnprob = common_lnprob_component + starting_lnprob_other_component; + double lastp = 1; + if (starting_lnprob > -10 * kLn2) { + double centerp = 0; + // rescale this at the end. + while (curr_homr > 0.5) { + curr_hets += 2; + lastp *= 4 * curr_homr * curr_homc / (curr_hets * (curr_hets - 1)); + curr_homr -= 1; + curr_homc -= 1; + if (lastp < 1 + kSmallEpsilon) { + tie_ct += (lastp > 1 - kSmallEpsilon); break; } - centerp += lastp2; - if (centerp > DBL_MAX) { - return 0; - } + centerp += lastp; } - if ((centerp == 0) && (!midp)) { - return 1; + if (midp) { + centerp += S_CAST(double, tie_ct) * 0.5; + } + const double starting_prob = exp(starting_lnprob); + centerp *= starting_prob; + return log(1 - centerp); + } + double tailp = 1; + while (curr_hets > 1.5) { + curr_homr += 1; + curr_homc += 1; + lastp *= curr_hets * (curr_hets - 1) / (4 * curr_homr * curr_homc); + curr_hets -= 2; + const double preaddp = tailp; + tailp += lastp; + if (tailp <= preaddp) { + break; } - while (curr_homr_t2 > 0.5) { - curr_hets_t2 += 2; - lastp2 *= (4 * curr_homr_t2 * curr_homc_t2) / (curr_hets_t2 * (curr_hets_t2 - 1)); - curr_homr_t2 -= 1; - curr_homc_t2 -= 1; - const double preaddp = tailp; - tailp += lastp2; - if (tailp <= preaddp) { - break; + } + // Jump to other tail. + { + const double delta = modal_nhet - 0.5 * (curr_hets + S_CAST(double, obs_hets)); + curr_homr = 0.5 * (curr_homr + S_CAST(double, obs_homr)) - delta; + // Round and clamp. + curr_homr = S_CAST(double, S_CAST(int32_t, curr_homr + 0.5)); + if (curr_homr < 0) { + curr_homr = 0; + } + } + const double max_diff = starting_lnprob_other_component * (kSmallEpsilon / 8); + const double max_homr = S_CAST(double, rare_ct / 2); + while (1) { + curr_hets = rare_ctd - curr_homr * 2; + curr_homc = curr_homr + c_minus_r; + const double lnprob_other_component = curr_hets * kLn2 - Lfact(curr_hets) - Lfact(curr_homr) - Lfact(curr_homc); + const double lnprob_diff = lnprob_other_component - starting_lnprob_other_component; + if (lnprob_diff > max_diff) { + if (curr_homr <= 0) { + // All terms on this tail are larger than the starting term. Exit. + if (midp) { + tailp -= 0.5; + } + return starting_lnprob + log(tailp); + } + const double ll_deriv = log(4 * curr_homr * curr_homc / ((curr_hets + 2) * (curr_hets + 1))); + curr_homr += S_CAST(int64_t, (1 - kSmallEpsilon) * lnprob_diff / ll_deriv) - 1; + if (curr_homr < 0) { + curr_homr = 0; + } + } else if (lnprob_diff > -62 * kLn2) { + lastp = exp(lnprob_diff); + break; + } else { + const double ll_deriv = log(4 * (curr_homr + 1) * (curr_homc + 1) / (curr_hets * (curr_hets - 1))); + curr_homr += S_CAST(int64_t, lnprob_diff / ll_deriv); + if (curr_homr > max_homr) { + curr_homr = max_homr; } } - double curr_hets_t1 = obs_hets; - double curr_homr_t1 = obs_homr; - double curr_homc_t1 = obs_homc; - while (curr_hets_t1 > 1.5) { - curr_homr_t1 += 1; - curr_homc_t1 += 1; - lastp1 *= (curr_hets_t1 * (curr_hets_t1 - 1)) / (4 * curr_homr_t1 * curr_homc_t1); - const double preaddp = tailp; - tailp += lastp1; - if (tailp <= preaddp) { + } + // Sum toward center, until lastp >= 1. + double lastp_tail = lastp; + if (lastp < 1 - kSmallEpsilon) { + double curr_homr_center = curr_homr; + double curr_homc_center = curr_homc; + double curr_hets_center = curr_hets; + while (1) { + tailp += lastp; + curr_homr_center += 1; + curr_homc_center += 1; + lastp *= curr_hets_center * (curr_hets_center - 1) / (4 * curr_homr_center * curr_homc_center); + if (lastp >= 1 - kSmallEpsilon) { break; } - curr_hets_t1 -= 2; + curr_hets_center -= 2; } } - if (!midp) { - return tailp / (tailp + centerp); + if (lastp < 1 + kSmallEpsilon) { + tailp += lastp; + ++tie_ct; } - return (tailp - ((1 - kSmallEpsilon) * kExactTestBias * 0.5) * tie_ct) / (tailp + centerp); + // Sum away from center, until sums stop changing. + while (1) { + curr_hets += 2; + lastp_tail *= 4 * curr_homr * curr_homc / (curr_hets * (curr_hets - 1)); + const double preaddp = tailp; + tailp += lastp_tail; + if (tailp <= preaddp) { + break; + } + curr_homr -= 1; + curr_homc -= 1; + } + if (midp) { + tailp -= S_CAST(double, tie_ct) * 0.5; + } + return starting_lnprob + log(tailp); } -// don't see any point to supporting long double range here uint32_t HweThresh(int32_t obs_hets, int32_t obs_hom1, int32_t obs_hom2, double thresh) { // Threshold-test-only version of HweP() which is usually able to exit // from the calculation earlier. Returns 0 if these counts are close enough @@ -2307,6 +2688,267 @@ uint32_t HweThreshMidp(int32_t obs_hets, int32_t obs_hom1, int32_t obs_hom2, dou return 1; } +uint32_t HweThreshLnMain(int32_t obs_hets, int32_t obs_hom1, int32_t obs_hom2, uint32_t midp, double ln_thresh) { + // Threshold-test-only version of HweLnP() which is usually able to exit + // from the calculation earlier. Returns 0 if these counts are close enough + // to Hardy-Weinberg equilibrium, 1 otherwise. + // + // Assumes ln_thresh < -708. + // + // Caller is responsible for including a tolerance in ln_thresh when + // appropriate; the only tolerance applied by this function is in the context + // of other-tail tie detection. + intptr_t obs_homc; + intptr_t obs_homr; + if (obs_hom1 < obs_hom2) { + obs_homc = obs_hom2; + obs_homr = obs_hom1; + } else { + obs_homc = obs_hom1; + obs_homr = obs_hom2; + } + const int64_t rare_ct = 2LL * obs_homr + obs_hets; + // Change this to "rare_ct < 2" if ln_thresh restriction is being loosened + // (to e.g. compare results against HweThresh()). + if (rare_ct < 64) { + return 0; + } + const int64_t sample_ct = obs_hom1 + obs_hom2 + obs_hets; + const double rare_ctd = rare_ct; + const double sample_ctd = sample_ct; + const double allele_ctd = sample_ctd * 2; + + // 1. Compute log-likelihood of test cell. This may be high enough on its + // own to immediately return 0. + // If likelihood is lower than threshold / , we can + // immediately return 1. + // 2. Determine tailsum we must hit to return 0. + // 3. The rest follows HweLnP(), except with an extra geometric-series-based + // early-exit attempt near the end. + const double common_lnprob_component = Lfact(sample_ctd) + Lfact(rare_ctd) + Lfact(allele_ctd - rare_ctd) - Lfact(allele_ctd); + double curr_hets = obs_hets; + double curr_homr = obs_homr; + double curr_homc = obs_homc; + const double starting_lnprob_other_component = curr_hets * kLn2 - Lfact(curr_hets) - Lfact(curr_homr) - Lfact(curr_homc); + const double starting_lnprob = common_lnprob_component + starting_lnprob_other_component; + const double midp_d = u31tod(midp); + if (ln_thresh <= starting_lnprob - midp_d * kLn2) { + return 0; + } + const double max_homr = S_CAST(double, rare_ct / 2); + if (ln_thresh > starting_lnprob + log(max_homr + 1 - midp_d * 0.5)) { + return 1; + } + + const double maf = rare_ctd / allele_ctd; + const double modal_nhet = rare_ctd * (1 - maf); + const double c_minus_r = curr_homc - curr_homr; + // This should be in (0.5, 2^30]. + const double tail_thresh = exp(ln_thresh - starting_lnprob); + double tailp = 1 - midp_d * 0.5; + double lastp = 1; + if (curr_hets > modal_nhet) { + // No center-sum (or p=1) code path, since it doesn't make sense to choose + // a HWE threshold that makes these relevant; we should have already + // returned 0. + // (So we can't assume obs_hets >= 2.) + curr_hets += 2; + while (curr_homr > 0.5) { + lastp *= (4 * curr_homr * curr_homc) / (curr_hets * (curr_hets - 1)); + const double preaddp = tailp; + tailp += lastp; + if (tailp <= preaddp) { + break; + } + if (tailp >= tail_thresh) { + return 0; + } + curr_hets += 2; + curr_homr -= 1; + curr_homc -= 1; + } + { + const double delta = 0.5 * (curr_hets + S_CAST(double, obs_hets)) - modal_nhet; + curr_homr = 0.5 * (curr_homr + S_CAST(double, obs_homr)) + delta; + curr_homr = S_CAST(double, S_CAST(int32_t, curr_homr + 0.5)); + if (curr_homr > max_homr) { + curr_homr = max_homr; + } + } + const double max_diff = starting_lnprob_other_component * (kSmallEpsilon / 8); + while (1) { + curr_hets = rare_ctd - curr_homr * 2; + curr_homc = curr_homr + c_minus_r; + const double lnprob_other_component = curr_hets * kLn2 - Lfact(curr_hets) - Lfact(curr_homr) - Lfact(curr_homc); + const double lnprob_diff = lnprob_other_component - starting_lnprob_other_component; + if (lnprob_diff > max_diff) { + if (curr_homr >= max_homr) { + return 1; + } + const double ll_deriv = log(curr_hets * (curr_hets - 1) / (4 * (curr_homr + 1) * (curr_homc + 1))); + curr_homr += 1 - S_CAST(int64_t, (1 - kSmallEpsilon) * lnprob_diff / ll_deriv); + if (curr_homr > max_homr) { + curr_homr = max_homr; + } + } else if (lnprob_diff > -62 * kLn2) { + lastp = exp(lnprob_diff); + break; + } else { + const double ll_deriv = log((curr_hets + 2) * (curr_hets + 1) / (4 * curr_homr * curr_homc)); + curr_homr -= S_CAST(int64_t, lnprob_diff / ll_deriv); + if (curr_homr < 0) { + curr_homr = 0; + } + } + } + double lastp_tail = lastp; + if (lastp < 1 - kSmallEpsilon) { + double curr_homr_center = curr_homr; + double curr_homc_center = curr_homc; + double curr_hets_center = curr_hets; + while (1) { + tailp += lastp; + if (tailp >= tail_thresh) { + return 0; + } + curr_hets_center += 2; + lastp *= (4 * curr_homr_center * curr_homc_center) / (curr_hets_center * (curr_hets_center - 1)); + if (lastp >= 1 - kSmallEpsilon) { + break; + } + curr_homr_center -= 1; + curr_homc_center -= 1; + } + } + if (lastp < 1 + kSmallEpsilon) { + tailp += lastp * (1 - midp_d * 0.5); + if (tailp >= tail_thresh) { + return 0; + } + } + // We're down to one tail that can be tightly bounded by a geometric + // series. (ratio is always decreasing) + // c + cr + cr^2 + ... = c/(1-r) + curr_homr += 1; + curr_homc += 1; + const double cur_ratio = (curr_hets * (curr_hets - 1)) / (4 * curr_homr * curr_homc); + lastp_tail *= cur_ratio; + const double remaining_ceil = lastp_tail / (1 - cur_ratio); + if (tailp + remaining_ceil < tail_thresh) { + return 1; + } + while (1) { + const double preaddp = tailp; + tailp += lastp_tail; + if (tailp <= preaddp) { + return 1; + } + if (tailp >= tail_thresh) { + return 0; + } + curr_hets -= 2; + curr_homr += 1; + curr_homc += 1; + lastp_tail *= (curr_hets * (curr_hets - 1)) / (4 * curr_homr * curr_homc); + } + // unreachable + } + while (curr_hets > 1.5) { + curr_homr += 1; + curr_homc += 1; + lastp *= curr_hets * (curr_hets - 1) / (4 * curr_homr * curr_homc); + curr_hets -= 2; + const double preaddp = tailp; + tailp += lastp; + if (tailp <= preaddp) { + break; + } + if (tailp >= tail_thresh) { + return 0; + } + } + { + const double delta = modal_nhet - 0.5 * (curr_hets + S_CAST(double, obs_hets)); + curr_homr = 0.5 * (curr_homr + S_CAST(double, obs_homr)) - delta; + curr_homr = S_CAST(double, S_CAST(int32_t, curr_homr + 0.5)); + if (curr_homr < 0) { + curr_homr = 0; + } + } + const double max_diff = starting_lnprob_other_component * (kSmallEpsilon / 8); + while (1) { + curr_hets = rare_ctd - curr_homr * 2; + curr_homc = curr_homr + c_minus_r; + const double lnprob_other_component = curr_hets * kLn2 - Lfact(curr_hets) - Lfact(curr_homr) - Lfact(curr_homc); + const double lnprob_diff = lnprob_other_component - starting_lnprob_other_component; + if (lnprob_diff > max_diff) { + if (curr_homr <= 0) { + return 1; + } + const double ll_deriv = log(4 * curr_homr * curr_homc / ((curr_hets + 2) * (curr_hets + 1))); + curr_homr += S_CAST(int64_t, (1 - kSmallEpsilon) * lnprob_diff / ll_deriv) - 1; + if (curr_homr < 0) { + curr_homr = 0; + } + } else if (lnprob_diff > -62 * kLn2) { + lastp = exp(lnprob_diff); + break; + } else { + const double ll_deriv = log(4 * (curr_homr + 1) * (curr_homc + 1) / (curr_hets * (curr_hets - 1))); + curr_homr += S_CAST(int64_t, lnprob_diff / ll_deriv); + if (curr_homr > max_homr) { + curr_homr = max_homr; + } + } + } + double lastp_tail = lastp; + if (lastp < 1 - kSmallEpsilon) { + double curr_homr_center = curr_homr; + double curr_homc_center = curr_homc; + double curr_hets_center = curr_hets; + while (1) { + tailp += lastp; + if (tailp >= tail_thresh) { + return 0; + } + curr_homr_center += 1; + curr_homc_center += 1; + lastp *= curr_hets_center * (curr_hets_center - 1) / (4 * curr_homr_center * curr_homc_center); + if (lastp >= 1 - kSmallEpsilon) { + break; + } + curr_hets_center -= 2; + } + } + if (lastp < 1 + kSmallEpsilon) { + tailp += lastp * (1 - midp_d * 0.5); + if (tailp >= tail_thresh) { + return 0; + } + } + curr_hets += 2; + const double cur_ratio = 4 * curr_homr * curr_homc / (curr_hets * (curr_hets - 1)); + lastp_tail *= cur_ratio; + const double remaining_ceil = lastp_tail / (1 - cur_ratio); + if (tailp + remaining_ceil < tail_thresh) { + return 1; + } + while (1) { + const double preaddp = tailp; + tailp += lastp_tail; + if (tailp <= preaddp) { + return 1; + } + if (tailp >= tail_thresh) { + return 0; + } + curr_homr -= 1; + curr_homc -= 1; + curr_hets += 2; + lastp_tail *= 4 * curr_homr * curr_homc / (curr_hets * (curr_hets - 1)); + } +} + // 2^{-40} for now, since 2^{-44} was too small on real data static const double kExactTestEpsilon2 = 0.0000000000009094947017729282379150390625; @@ -2685,12 +3327,13 @@ uint32_t HweFirstRow(double hetab, double homa, double homb, double* tailp_ptr, return 0; } +// HweXchrP() is licensed as GPL 2+. double HweXchrP(int32_t female_hets, int32_t female_hom1, int32_t female_hom2, int32_t male1, int32_t male2, uint32_t midp) { // See Graffelman J, Weir BS (2016) Testing for Hardy-Weinberg equilibrium at // biallelic genetic markers on the X chromosome. // Evaluation strategy is similar to fisher23(). if ((!male1) && (!male2)) { - return HweP(female_hets, female_hom1, female_hom2, midp); + return exp(HweLnP(female_hets, female_hom1, female_hom2, midp)); } // 1. Determine relative tail vs. center masses for the male1/male2-unchanged // slice. @@ -2855,6 +3498,482 @@ double HweXchrP(int32_t female_hets, int32_t female_hom1, int32_t female_hom2, i return (tailp - ((1 - kExactTestEpsilon2) * kExactTestBias * 0.5) * S_CAST(int32_t, tie_ct)) / (tailp + centerp); } +/* +void HweLnFirstRow(double hetab, double homa, double homb, double* tailp_ptr, double* starting_lnprob_ptr, uint32_t* tie_ct_ptr, double* orig_base_lnprobl_ptr, double* orig_base_lnprobr_ptr, double* orig_saved_lhets_ptr, double* orig_saved_lhoma_ptr, double* orig_saved_lhomb_ptr, double* orig_saved_rhets_ptr, double* orig_saved_rhoma_ptr, double* orig_saved_rhomb_ptr) { + const double sample_ctd = hetab + homa + homb; + const double a_ctd = hetab + 2 * homa; + const double b_ctd = hetab + 2 * homb; + const double allele_ctd = sample_ctd * 2; + const double common_lnprob_component = Lfact(sample_ctd) + Lfact(a_ctd) + Lfact(b_ctd) - Lfact(allele_ctd); + const double starting_lnprob_other_component = hetab * kLn2 - Lfact(hetab) - Lfact(homa) - Lfact(homb); + const double starting_lnprob = common_lnprob_component + starting_lnprob_other_component; + *starting_lnprob_ptr = starting_lnprob; + double lastp = 1; + uint32_t tie_ct = 1; + double tmp_hets = hetab; + if (hetab * hetab > 4 * homa * homb) { + // Incrementing hetab decreases likelihood from this point on. + *orig_base_lnprobr_ptr = 0; + *orig_saved_rhets_ptr = hetab; + *orig_saved_rhoma_ptr = homa; + *orig_saved_rhomb_ptr = homb; + if (starting_lnprob > -10 * kLn2) { + // Sum center instead of tail. + double centerp = 0; + double tmp_homa = homa; + double tmp_homb = homb; + while (tmp_hets > 1.5) { + tmp_homa += 1; + tmp_homb += 1; + lastp *= (tmp_hets * (tmp_hets - 1)) / (4 * tmp_homa * tmp_homb); + tmp_hets -= 2; + if (lastp < 1 + kSmallEpsilon) { + tie_ct += (lastp > 1 - kSmallEpsilon); + break; + } + centerp += lastp; + } + *tie_ct_ptr = tie_ct; + *orig_saved_lhets_ptr = tmp_hets; + *orig_saved_lhoma_ptr = tmp_homa; + *orig_saved_lhomb_ptr = tmp_homb; + *orig_base_lnprobl_ptr = log(lastp); + *tailp_ptr = (1 - centerp) * exp(-starting_lnprob); + return; + } + double* orig_saved_lhomr_ptr = orig_saved_lhoma_ptr; + double* orig_saved_lhomc_ptr = orig_saved_lhomb_ptr; + double rare_ctd = a_ctd; + double c_minus_r = b_ctd - a_ctd; + double tmp_homr = homa; + double tmp_homc = homb; + if (c_minus_r < 0.0) { + orig_saved_lhomr_ptr = orig_saved_lhomb_ptr; + orig_saved_lhomc_ptr = orig_saved_lhoma_ptr; + rare_ctd = b_ctd; + c_minus_r = -c_minus_r; + tmp_homr = homb; + tmp_homc = homa; + } + const double orig_homr = tmp_homr; + double tailp = 1; + while (tmp_homr > 0.5) { + tmp_hets += 2; + lastp *= (4 * tmp_homr * tmp_homc) / (tmp_hets * (tmp_hets - 1)); + const double preaddp = tailp; + tailp += lastp; + if (tailp <= preaddp) { + break; + } + tmp_homr -= 1; + tmp_homc -= 1; + } + const double maf = rare_ctd / allele_ctd; + const double modal_nhet = rare_ctd * (1 - maf); + const double max_homr = S_CAST(double, S_CAST(int32_t, rare_ctd * 0.5)); + { + const double delta = 0.5 * (tmp_hets + hetab) - modal_nhet; + // Round and clamp. + tmp_homr = 0.5 * (tmp_homr + orig_homr) + delta; + tmp_homr = S_CAST(double, S_CAST(int32_t, tmp_homr + 0.5)); + if (tmp_homr > max_homr) { + tmp_homr = max_homr; + } + } + const double max_diff = starting_lnprob_other_component * (kSmallEpsilon / 8); + while (1) { + tmp_hets = rare_ctd - tmp_homr * 2; + tmp_homc = tmp_homr + c_minus_r; + const double lnprob_other_component = tmp_hets * kLn2 - Lfact(tmp_hets) - Lfact(tmp_homr) - Lfact(tmp_homc); + const double lnprob_diff = lnprob_other_component - starting_lnprob_other_component; + if (lnprob_diff > max_diff) { + if (tmp_homr >= max_homr) { + // All terms on this tail are larger than the starting term. Exit. + assert(tmp_homr == max_homr); + *tie_ct_ptr = 1; + *orig_saved_lhets_ptr = tmp_hets; + *orig_saved_lhomr_ptr = tmp_homr; + *orig_saved_lhomc_ptr = tmp_homc; + // exp(lnprob_diff) can be too large to fit in a double, so we make + // this return value a log-probability. + *orig_base_lnprobl_ptr = lnprob_diff; + *tailp_ptr = tailp; + return; + } + const double ll_deriv = log(tmp_hets * (tmp_hets - 1) / (4 * (tmp_homr + 1) * (tmp_homc + 1))); + // Round absolute value up, to guarantee that we make progress. + // (lnprob_diff is positive and ll_deriv is negative.) + // This may overshoot. But the function is guaranteed to terminate + // because we never overshoot (and we do always make progress on each + // step) once we're on the other side. + tmp_homr += 1 - S_CAST(int64_t, (1 - kSmallEpsilon) * lnprob_diff / ll_deriv); + if (tmp_homr > max_homr) { + tmp_homr = max_homr; + } + } else if (lnprob_diff > -62 * kLn2) { + lastp = exp(lnprob_diff); + break; + } else { + const double ll_deriv = log((tmp_hets + 2) * (tmp_hets + 1) / (4 * tmp_homr * tmp_homc)); + // Round down, to guarantee we don't overshoot. + // We're guaranteed to make progress, since lnprob_diff >= 62 * log(2) + // and sample_ct < 2^31. + tmp_homr -= S_CAST(int64_t, lnprob_diff / ll_deriv); + if (tmp_homr < 0) { + tmp_homr = 0; + } + } + } + // Sum toward center, until lastp >= 1. + double lastp_tail = lastp; + { + double tmp_homr_center = tmp_homr; + double tmp_homc_center = tmp_homc; + double tmp_hets_center = tmp_hets; + // TODO: make sure we don't blow past mode due to floating-point error, + // and add the same term twice + while (lastp < 1 - kSmallEpsilon) { + tailp += lastp; + tmp_hets_center += 2; + lastp *= (4 * tmp_homr_center * tmp_homc_center) / (tmp_hets_center * (tmp_hets_center - 1)); + tmp_homr_center -= 1; + tmp_homc_center -= 1; + } + if (lastp < 1 + kSmallEpsilon) { + tailp += lastp; + ++tie_ct; + } + *tie_ct_ptr = tie_ct; + *orig_saved_lhets_ptr = tmp_hets_center; + *orig_saved_lhomr_ptr = tmp_homr_center; + *orig_saved_lhomc_ptr = tmp_homc_center; + *orig_base_lnprobl_ptr = log(lastp); + } + // Sum away from center, until sums stop changing. + while (1) { + tmp_homr += 1; + tmp_homc += 1; + lastp_tail *= (tmp_hets * (tmp_hets - 1)) / (4 * tmp_homr * tmp_homc); + const double preaddp = tailp; + tailp += lastp_tail; + if (tailp <= preaddp) { + break; + } + tmp_hets -= 2; + } + *tailp_ptr = tailp; + return; + } + // Decrementing hetab decreases likelihood from this point on. + *orig_base_lnprobl_ptr = 0; + *orig_saved_lhets_ptr = hetab; + *orig_saved_lhoma_ptr = homa; + *orig_saved_lhomb_ptr = homb; + if (starting_lnprob > -10 * kLn2) { + const double tmp_hets_stop = MAXV(a_ctd, b_ctd) - 1.5; + double centerp = 0; + double tmp_homa = homa; + double tmp_homb = homb; + while (tmp_hets < tmp_hets_stop) { + tmp_hets += 2; + lastp *= 4 * tmp_homa * tmp_homb / (tmp_hets * (tmp_hets - 1)); + tmp_homa -= 1; + tmp_homb -= 1; + if (lastp < 1 + kSmallEpsilon) { + tie_ct += (lastp > 1 - kSmallEpsilon); + break; + } + centerp += lastp; + } + *tie_ct_ptr = tie_ct; + *orig_saved_rhets_ptr = tmp_hets; + *orig_saved_rhoma_ptr = tmp_homa; + *orig_saved_rhomb_ptr = tmp_homb; + *orig_base_lnprobr_ptr = log(lastp); + *tailp_ptr = (1 - centerp) * exp(-starting_lnprob); + return; + } + double* orig_saved_rhomr_ptr = orig_saved_rhoma_ptr; + double* orig_saved_rhomc_ptr = orig_saved_rhomb_ptr; + double rare_ctd = a_ctd; + double c_minus_r = b_ctd - a_ctd; + double tmp_homr = homa; + double tmp_homc = homb; + if (c_minus_r < 0.0) { + orig_saved_rhomr_ptr = orig_saved_rhomb_ptr; + orig_saved_rhomc_ptr = orig_saved_rhoma_ptr; + rare_ctd = b_ctd; + c_minus_r = -c_minus_r; + tmp_homr = homb; + tmp_homc = homa; + } + double orig_homr = tmp_homr; + double tailp = 1; + while (tmp_hets > 1.5) { + tmp_homr += 1; + tmp_homc += 1; + lastp *= tmp_hets * (tmp_hets - 1) / (4 * tmp_homr * tmp_homc); + tmp_hets -= 2; + const double preaddp = tailp; + tailp += lastp; + if (tailp <= preaddp) { + break; + } + } + // Jump to other tail. + // TODO + { + const double delta = modal_nhet - 0.5 * (tmp_hets + S_CAST(double, obs_hets)); + tmp_homr = 0.5 * (tmp_homr + S_CAST(double, obs_homr)) - delta; + // Round and clamp. + tmp_homr = S_CAST(double, S_CAST(int32_t, tmp_homr + 0.5)); + if (tmp_homr < 0) { + tmp_homr = 0; + } + } + const double max_diff = starting_lnprob_other_component * (kSmallEpsilon / 8); + const double max_homr = S_CAST(double, rare_ct / 2); + while (1) { + tmp_hets = rare_ctd - tmp_homr * 2; + tmp_homc = tmp_homr + c_minus_r; + const double lnprob_other_component = tmp_hets * kLn2 - Lfact(tmp_hets) - Lfact(tmp_homr) - Lfact(tmp_homc); + const double lnprob_diff = lnprob_other_component - starting_lnprob_other_component; + if (lnprob_diff > max_diff) { + if (tmp_homr <= 0) { + // All terms on this tail are larger than the starting term. Exit. + if (midp) { + tailp -= 0.5; + } + return starting_lnprob + log(tailp); + } + const double ll_deriv = log(4 * tmp_homr * tmp_homc / ((tmp_hets + 2) * (tmp_hets + 1))); + tmp_homr += S_CAST(int64_t, (1 - kSmallEpsilon) * lnprob_diff / ll_deriv) - 1; + if (tmp_homr < 0) { + tmp_homr = 0; + } + } else if (lnprob_diff > -62 * kLn2) { + lastp = exp(lnprob_diff); + break; + } else { + const double ll_deriv = log(4 * (tmp_homr + 1) * (tmp_homc + 1) / (tmp_hets * (tmp_hets - 1))); + tmp_homr += S_CAST(int64_t, lnprob_diff / ll_deriv); + if (tmp_homr > max_homr) { + tmp_homr = max_homr; + } + } + } + // Sum toward center, until lastp >= 1. + double lastp_tail = lastp; + if (lastp < 1 - kSmallEpsilon) { + double tmp_homr_center = tmp_homr; + double tmp_homc_center = tmp_homc; + double tmp_hets_center = tmp_hets; + while (1) { + tailp += lastp; + tmp_homr_center += 1; + tmp_homc_center += 1; + lastp *= tmp_hets_center * (tmp_hets_center - 1) / (4 * tmp_homr_center * tmp_homc_center); + if (lastp >= 1 - kSmallEpsilon) { + break; + } + tmp_hets_center -= 2; + } + } + if (lastp < 1 + kSmallEpsilon) { + tailp += lastp; + ++tie_ct; + } + // Sum away from center, until sums stop changing. + while (1) { + tmp_hets += 2; + lastp_tail *= 4 * tmp_homr * tmp_homc / (tmp_hets * (tmp_hets - 1)); + const double preaddp = tailp; + tailp += lastp_tail; + if (tailp <= preaddp) { + break; + } + tmp_homr -= 1; + tmp_homc -= 1; + } + if (midp) { + tailp -= S_CAST(double, tie_ct) * 0.5; + } + return starting_lnprob + log(tailp); +} + */ + +/* +double HweXchrLnP(int32_t female_hets, int32_t female_hom1, int32_t female_hom2, int32_t male1, int32_t male2, uint32_t midp) { + // See Graffelman J, Weir BS (2016) Testing for Hardy-Weinberg equilibrium at + // biallelic genetic markers on the X chromosome. + // Evaluation strategy is similar to fisher23(). + if ((!male1) && (!male2)) { + return HweLnP(female_hets, female_hom1, female_hom2, midp); + } + // 1. Determine relative tail vs. center masses for the male1/male2-unchanged + // slice. + double cur_female_hetd = S_CAST(double, female_hets); + double cur_female_hom1d = S_CAST(double, female_hom1); + double cur_female_hom2d = S_CAST(double, female_hom2); + double n1 = cur_female_hetd + 2 * cur_female_hom1d; + double n2 = cur_female_hetd + 2 * cur_female_hom2d; + double tailp; + double starting_lnprob; + uint32_t tie_ct; + // "left" = low hets side, "right" = high hets side + double orig_base_probl; + double orig_base_probr; + double orig_saved_lhets; + double orig_saved_lhom1; + double orig_saved_lhom2; + double orig_saved_rhets; + double orig_saved_rhom1; + double orig_saved_rhom2; + if (HweLnFirstRow(cur_female_hetd, cur_female_hom1d, cur_female_hom2d, &tailp, &starting_lnprob, &tie_ct, &orig_base_probl, &orig_base_probr, &orig_saved_lhets, &orig_saved_lhom1, &orig_saved_lhom2, &orig_saved_rhets, &orig_saved_rhom1, &orig_saved_rhom2)) { + return 0.0; + } + // a "row" holds male1/male2 constant. + const double orig_row_prob = tailp + centerp; + n1 += male1; + n2 += male2; + for (uint32_t male1_decreasing = 0; male1_decreasing != 2; ++male1_decreasing) { + double cur_male1 = male1; + double cur_male2 = male2; + double row_prob = orig_row_prob; + double cur_lhets = orig_saved_lhets; + double cur_lhom1 = orig_saved_lhom1; + double cur_lhom2 = orig_saved_lhom2; + double cur_rhets = orig_saved_rhets; + double cur_rhom1 = orig_saved_rhom1; + double cur_rhom2 = orig_saved_rhom2; + double base_probl = orig_base_probl; + double base_probr = orig_base_probr; + uint32_t iter_ct; + if (male1_decreasing) { + iter_ct = 2 * female_hom2 + female_hets; + if (iter_ct > S_CAST(uint32_t, male1)) { + iter_ct = male1; + } + } else { + iter_ct = 2 * female_hom1 + female_hets; + if (iter_ct > S_CAST(uint32_t, male2)) { + iter_ct = male2; + } + } + for (uint32_t iter_idx = 0; iter_idx != iter_ct; ++iter_idx) { + if (male1_decreasing) { + const double old_male1 = cur_male1; + const double old_female2 = n2 - cur_male2; + cur_male2 += 1; + cur_male1 -= 1; + // row likelihood is ((n1 choose male1) * (n2 choose male2)) / + // ((n1 + n2) choose (male1 + male2)) + row_prob *= (old_male1 * old_female2) / (cur_male2 * (n1 - cur_male1)); + // bugfix (19 Apr 2017): We cannot move to the right of the mode here. + // Otherwise, if the mode itself is more probable than our initial + // table, but the table to the immediate right of the mode is not, + // we'll fail to count the mode. + // ("right" = high het count, "left" = low het count.) + if (cur_lhets != 0.0) { + cur_lhom1 += 1; + base_probl *= (old_male1 * cur_lhets) / (2 * cur_male2 * cur_lhom1); + cur_lhets -= 1; + } else { + cur_lhets += 1; + base_probl *= (2 * old_male1 * cur_lhom2) / (cur_male2 * cur_lhets); + cur_lhom2 -= 1; + } + } else { + const double old_male2 = cur_male2; + const double old_female1 = n1 - cur_male1; + cur_male1 += 1; + cur_male2 -= 1; + row_prob *= (old_male2 * old_female1) / (cur_male1 * (n2 - cur_male2)); + if (cur_lhets != 0.0) { + cur_lhom2 += 1; + base_probl *= (old_male2 * cur_lhets) / (2 * cur_male1 * cur_lhom2); + cur_lhets -= 1; + } else { + cur_lhets += 1; + base_probl *= (2 * old_male2 * cur_lhom1) / (cur_male1 * cur_lhets); + cur_lhom1 -= 1; + } + } + double tail_incr1; + if (HweXchrPTailsum(0, &base_probl, &cur_lhets, &cur_lhom1, &cur_lhom2, &tie_ct, &tail_incr1)) { + // all tables in this row, and all subsequent rows, are less probable + // than the initial table. + double cur_female1 = n1 - cur_male1; + double cur_female2 = n2 - cur_male2; + if (male1_decreasing) { + while (1) { + const double preaddp = tailp; + tailp += row_prob; + if (tailp == preaddp) { + break; + } + cur_male2 += 1; + cur_female1 += 1; + row_prob *= (cur_male1 * cur_female2) / (cur_male2 * cur_female1); + cur_male1 -= 1; + cur_female2 -= 1; + } + } else { + while (1) { + const double preaddp = tailp; + tailp += row_prob; + if (tailp == preaddp) { + break; + } + cur_male1 += 1; + cur_female2 += 1; + row_prob *= (cur_male2 * cur_female1) / (cur_male1 * cur_female2); + cur_male2 -= 1; + cur_female1 -= 1; + } + } + break; + } + tailp += tail_incr1; + if (male1_decreasing) { + const double old_male1 = cur_male1 + 1; + if (cur_rhom2 != 0.0) { + cur_rhets += 1; + base_probr *= (2 * old_male1 * cur_rhom2) / (cur_male2 * cur_rhets); + cur_rhom2 -= 1; + } else { + cur_rhom1 += 1; + base_probr *= (old_male1 * cur_rhets) / (2 * cur_male2 * cur_rhom1); + cur_rhets -= 1; + } + } else { + const double old_male2 = cur_male2 + 1; + if (cur_rhom1 != 0.0) { + cur_rhets += 1; + base_probr *= (2 * old_male2 * cur_rhom1) / (cur_male1 * cur_rhets); + cur_rhom1 -= 1; + } else { + cur_rhom2 += 1; + base_probr *= (old_male2 * cur_rhets) / (2 * cur_male1 * cur_rhom2); + cur_rhets -= 1; + } + } + double tail_incr2 = 0.0; // maybe-uninitialized warning + HweXchrPTailsum(1, &base_probr, &cur_rhets, &cur_rhom1, &cur_rhom2, &tie_ct, &tail_incr2); + tailp += tail_incr2; + centerp += row_prob - tail_incr1 - tail_incr2; + if (centerp > DBL_MAX) { + return 0; + } + } + } + if (!midp) { + return tailp / (tailp + centerp); + } + return (tailp - ((1 - kExactTestEpsilon2) * kExactTestBias * 0.5) * S_CAST(int32_t, tie_ct)) / (tailp + centerp); +} +*/ + #ifdef __cplusplus } #endif diff --git a/2.0/include/plink2_stats.h b/2.0/include/plink2_stats.h index 1d0c692d..4c3a4c53 100644 --- a/2.0/include/plink2_stats.h +++ b/2.0/include/plink2_stats.h @@ -58,17 +58,42 @@ HEADER_INLINE double ZscoreToLnP(double zz) { return ChisqToLnP(zz * zz, 1); } -double HweP(int32_t obs_hets, int32_t obs_hom1, int32_t obs_hom2, uint32_t midp); +// Assumes xx is a nonnegative integer. +double Lfact(double xx); -// returns 0 if close enough to Hardy-Weinberg equilibrium +// HweP() has been replaced by HweLnP(). HweThresh() and HweThreshMidp() have +// been replaced by HweThreshLn(). + +double HweLnP(int32_t obs_hets, int32_t obs_hom1, int32_t obs_hom2, uint32_t midp); + +// these return 0 if close enough to Hardy-Weinberg equilibrium uint32_t HweThresh(int32_t obs_hets, int32_t obs_hom1, int32_t obs_hom2, double thresh); uint32_t HweThreshMidp(int32_t obs_hets, int32_t obs_hom1, int32_t obs_hom2, double thresh); +uint32_t HweThreshLnMain(int32_t obs_hets, int32_t obs_hom1, int32_t obs_hom2, uint32_t midp, double ln_thresh); + +HEADER_INLINE uint32_t HweThreshLn(int32_t obs_hets, int32_t obs_hom1, int32_t obs_hom2, uint32_t midp, double thresh, double ln_thresh) { + // kLnNormalMin = -708.3964185... + if (ln_thresh > -708.396) { + if (!midp) { + return HweThresh(obs_hets, obs_hom1, obs_hom2, thresh); + } else { + return HweThreshMidp(obs_hets, obs_hom1, obs_hom2, thresh); + } + } + return HweThreshLnMain(obs_hets, obs_hom1, obs_hom2, midp, ln_thresh); +} + double FisherExact2x2P(uint32_t m11, uint32_t m12, uint32_t m21, uint32_t m22, uint32_t midp); double HweXchrP(int32_t female_hets, int32_t female_hom1, int32_t female_hom2, int32_t male1, int32_t male2, uint32_t midp); +HEADER_INLINE double HweXchrLnP(int32_t female_hets, int32_t female_hom1, int32_t female_hom2, int32_t male1, int32_t male2, uint32_t midp) { + const double pval = HweXchrP(female_hets, female_hom1, female_hom2, male1, male2, midp); + return (pval == 0)? -750 : log(pval); +} + #ifdef __cplusplus } #endif diff --git a/2.0/plink2.cc b/2.0/plink2.cc index cf7b5d47..7d9f818d 100644 --- a/2.0/plink2.cc +++ b/2.0/plink2.cc @@ -72,7 +72,7 @@ static const char ver_str[] = "PLINK v2.00a6" #elif defined(USE_AOCL) " AMD" #endif - " (5 Jan 2024)"; + " (3 Feb 2024)"; static const char ver_str2[] = // include leading space if day < 10, so character length stays the same " " @@ -367,7 +367,7 @@ typedef struct Plink2CmdlineStruct { double vif_thresh; double mind_thresh; double geno_thresh; - double hwe_thresh; + double hwe_ln_thresh; double mach_r2_min; double mach_r2_max; double minimac3_r2_min; @@ -472,12 +472,12 @@ typedef struct Plink2CmdlineStruct { } Plink2Cmdline; // er, probably time to just always initialize this... -uint32_t SingleVariantLoaderIsNeeded(const char* king_cutoff_fprefix, Command1Flags command_flags1, MakePlink2Flags make_plink2_flags, RmDupMode rmdup_mode, double hwe_thresh) { +uint32_t SingleVariantLoaderIsNeeded(const char* king_cutoff_fprefix, Command1Flags command_flags1, MakePlink2Flags make_plink2_flags, RmDupMode rmdup_mode, double hwe_ln_thresh) { return (command_flags1 & (kfCommand1Exportf | kfCommand1MakeKing | kfCommand1GenoCounts | kfCommand1LdPrune | kfCommand1Validate | kfCommand1Pca | kfCommand1MakeRel | kfCommand1Glm | kfCommand1Score | kfCommand1Ld | kfCommand1Hardy | kfCommand1Sdiff | kfCommand1PgenDiff | kfCommand1Clump | kfCommand1Vcor)) || ((command_flags1 & kfCommand1MakePlink2) && (make_plink2_flags & kfMakePgen)) || ((command_flags1 & kfCommand1KingCutoff) && (!king_cutoff_fprefix)) || (rmdup_mode != kRmDup0) || - (hwe_thresh != 0.0); + (hwe_ln_thresh != -DBL_MAX); } @@ -578,15 +578,15 @@ uint32_t VariantMissingDosageCtsAreNeeded(Command1Flags command_flags1, MiscFlag // can simplify --geno-counts all-biallelic case, but let's first make sure the // general case works for multiallelic variants -uint32_t RawGenoCtsAreNeeded(Command1Flags command_flags1, MiscFlags misc_flags, double hwe_thresh) { +uint32_t RawGenoCtsAreNeeded(Command1Flags command_flags1, MiscFlags misc_flags, double hwe_ln_thresh) { // Keep this in sync with --error-on-freq-calc. return (command_flags1 & kfCommand1GenoCounts) || - ((misc_flags & kfMiscNonfounders) && ((command_flags1 & kfCommand1Hardy) || (hwe_thresh != 0.0))); + ((misc_flags & kfMiscNonfounders) && ((command_flags1 & kfCommand1Hardy) || (hwe_ln_thresh != -DBL_MAX))); } -uint32_t FounderRawGenoCtsAreNeeded(Command1Flags command_flags1, MiscFlags misc_flags, double hwe_thresh) { +uint32_t FounderRawGenoCtsAreNeeded(Command1Flags command_flags1, MiscFlags misc_flags, double hwe_ln_thresh) { // Keep this in sync with --error-on-freq-calc. - return (!(misc_flags & kfMiscNonfounders)) && ((command_flags1 & kfCommand1Hardy) || (hwe_thresh != 0.0)); + return (!(misc_flags & kfMiscNonfounders)) && ((command_flags1 & kfCommand1Hardy) || (hwe_ln_thresh != -DBL_MAX)); } uint32_t InfoReloadIsNeeded(Command1Flags command_flags1, PvarPsamFlags pvar_psam_flags, ExportfFlags exportf_flags, RmDupMode rmdup_mode) { @@ -1090,7 +1090,7 @@ PglErr Plink2Core(const Plink2Cmdline* pcp, MakePlink2Flags make_plink2_flags, c pgfi.gflags &= ~kfPgenGlobalAllNonref; } - if (SingleVariantLoaderIsNeeded(king_cutoff_fprefix, pcp->command_flags1, make_plink2_flags, pcp->rmdup_mode, pcp->hwe_thresh)) { + if (SingleVariantLoaderIsNeeded(king_cutoff_fprefix, pcp->command_flags1, make_plink2_flags, pcp->rmdup_mode, pcp->hwe_ln_thresh)) { // ugly kludge, probably want to add pgenlib_internal support for this // hybrid use pattern FILE* shared_ff_copy = pgfi.shared_ff; @@ -1906,7 +1906,7 @@ PglErr Plink2Core(const Plink2Cmdline* pcp, MakePlink2Flags make_plink2_flags, c x_start = cip->chr_fo_vidx_start[x_chr_fo_idx]; const uint32_t x_end = cip->chr_fo_vidx_start[x_chr_fo_idx + 1]; x_len = x_end - x_start; - if (x_len && ((pcp->command_flags1 & kfCommand1Hardy) || (pcp->hwe_thresh != 0.0)) && (!AllBitsAreZero(variant_include, x_start, x_end))) { + if (x_len && ((pcp->command_flags1 & kfCommand1Hardy) || (pcp->hwe_ln_thresh != -DBL_MAX)) && (!AllBitsAreZero(variant_include, x_start, x_end))) { if (nonfounders) { hwe_x_probs_needed = (sample_ct > nosex_ct); } else { @@ -1979,7 +1979,7 @@ PglErr Plink2Core(const Plink2Cmdline* pcp, MakePlink2Flags make_plink2_flags, c // [0] = homref ct, [1] = het ref-altx total, [2] = nonref diploid // total // use unfiltered indexes, since we remove more variants later - if (RawGenoCtsAreNeeded(pcp->command_flags1, pcp->misc_flags, pcp->hwe_thresh)) { + if (RawGenoCtsAreNeeded(pcp->command_flags1, pcp->misc_flags, pcp->hwe_ln_thresh)) { if (unlikely(BIGSTACK_ALLOC_STD_ARRAY(uint32_t, 3, raw_variant_ct, &raw_geno_cts))) { goto Plink2Core_ret_NOMEM; } @@ -1996,7 +1996,7 @@ PglErr Plink2Core(const Plink2Cmdline* pcp, MakePlink2Flags make_plink2_flags, c } } } - if (FounderRawGenoCtsAreNeeded(pcp->command_flags1, pcp->misc_flags, pcp->hwe_thresh)) { + if (FounderRawGenoCtsAreNeeded(pcp->command_flags1, pcp->misc_flags, pcp->hwe_ln_thresh)) { if ((founder_ct == sample_ct) && raw_geno_cts) { founder_raw_geno_cts = raw_geno_cts; founder_x_male_geno_cts = x_male_geno_cts; @@ -2134,14 +2134,14 @@ PglErr Plink2Core(const Plink2Cmdline* pcp, MakePlink2Flags make_plink2_flags, c EnforceGenoThresh(cip, (pcp->misc_flags & kfMiscGenoDosage)? variant_missing_dosage_cts : variant_missing_hc_cts, geno_hh_missing? variant_hethap_cts : nullptr, sample_ct, male_ct, geno_hh_missing? first_hap_uidx : 0x7fffffff, pcp->geno_thresh, variant_include, &variant_ct); } - if ((pcp->command_flags1 & kfCommand1Hardy) || (pcp->hwe_thresh != 0.0)) { + if ((pcp->command_flags1 & kfCommand1Hardy) || (pcp->hwe_ln_thresh != -DBL_MAX)) { if (cip->haploid_mask[0] & 1) { if (unlikely(pcp->command_flags1 & kfCommand1Hardy)) { logerrputs("Error: --hardy is pointless on an all-haploid genome.\n"); goto Plink2Core_ret_INCONSISTENT_INPUT; } - // could make hwe_thresh non-const and set it to 0.0 earlier on in - // this case + // could make hwe_ln_thresh non-const and set it to -DBL_MAX + // earlier on in this case logerrputs("Warning: --hwe has no effect since entire genome is haploid.\n"); } else { STD_ARRAY_PTR_DECL(uint32_t, 3, hwe_geno_cts) = nonfounders? raw_geno_cts : founder_raw_geno_cts; @@ -2206,14 +2206,14 @@ PglErr Plink2Core(const Plink2Cmdline* pcp, MakePlink2Flags make_plink2_flags, c } } - double* hwe_x_pvals = nullptr; - if (hwe_x_ct && ((pcp->hwe_thresh != 0.0) || (pcp->hardy_flags & kfHardyColP))) { + double* hwe_x_ln_pvals = nullptr; + if (hwe_x_ct && ((pcp->hwe_ln_thresh != -DBL_MAX) || (pcp->hardy_flags & kfHardyColP))) { // support suppression of --hardy p column (with a gigantic // dataset, maybe it's reasonable to stick to femalep, etc.) uint32_t hwe_midp; if (pcp->command_flags1 & kfCommand1Hardy) { hwe_midp = (pcp->hardy_flags / kfHardyMidp) & 1; - if (pcp->hwe_thresh != 0.0) { + if (pcp->hwe_ln_thresh != -DBL_MAX) { const uint32_t hwe_midp2 = (pcp->misc_flags / kfMiscHweMidp) & 1; if (unlikely(hwe_midp != hwe_midp2)) { // could support this efficiently, but why bother... @@ -2224,13 +2224,13 @@ PglErr Plink2Core(const Plink2Cmdline* pcp, MakePlink2Flags make_plink2_flags, c } else { hwe_midp = (pcp->misc_flags / kfMiscHweMidp) & 1; } - reterr = ComputeHweXPvals(variant_include, allele_idx_offsets, hwe_geno_cts, hwe_x_male_geno_cts, hwe_x_nosex_geno_cts, x_knownsex_xgeno_cts, x_male_xgeno_cts, x_start, hwe_x_ct, x_xallele_ct, hwe_midp, pcp->max_thread_ct, &hwe_x_pvals); + reterr = ComputeHweXLnPvals(variant_include, allele_idx_offsets, hwe_geno_cts, hwe_x_male_geno_cts, hwe_x_nosex_geno_cts, x_knownsex_xgeno_cts, x_male_xgeno_cts, x_start, hwe_x_ct, x_xallele_ct, hwe_midp, pcp->max_thread_ct, &hwe_x_ln_pvals); if (unlikely(reterr)) { goto Plink2Core_ret_1; } } if (pcp->command_flags1 & kfCommand1Hardy) { - reterr = HardyReport(variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, nonref_flags, hwe_geno_cts, autosomal_xgeno_cts, hwe_x_male_geno_cts, hwe_x_nosex_geno_cts, x_knownsex_xgeno_cts, x_male_xgeno_cts, hwe_x_pvals, variant_ct, hwe_x_ct, max_allele_slen, pgfi.gflags, pcp->output_min_ln, pcp->hardy_flags, pcp->max_thread_ct, nonfounders, outname, outname_end); + reterr = HardyReport(variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, nonref_flags, hwe_geno_cts, autosomal_xgeno_cts, hwe_x_male_geno_cts, hwe_x_nosex_geno_cts, x_knownsex_xgeno_cts, x_male_xgeno_cts, hwe_x_ln_pvals, variant_ct, hwe_x_ct, max_allele_slen, pgfi.gflags, pcp->output_min_ln, pcp->hardy_flags, pcp->max_thread_ct, nonfounders, outname, outname_end); if (unlikely(reterr)) { goto Plink2Core_ret_1; } @@ -2238,14 +2238,14 @@ PglErr Plink2Core(const Plink2Cmdline* pcp, MakePlink2Flags make_plink2_flags, c continue; } } - if (pcp->hwe_thresh != 0.0) { - // assumes no filtering between hwe_x_pvals[] computation and + if (pcp->hwe_ln_thresh != -DBL_MAX) { + // assumes no filtering between hwe_x_ln_pvals[] computation and // here - EnforceHweThresh(cip, allele_idx_offsets, hwe_geno_cts, autosomal_xgeno_cts, hwe_x_male_geno_cts, hwe_x_nosex_geno_cts, x_knownsex_xgeno_cts, x_male_xgeno_cts, hwe_x_pvals, pcp->misc_flags, pcp->hwe_thresh, nonfounders, variant_include, &variant_ct); + EnforceHweThresh(cip, allele_idx_offsets, hwe_geno_cts, autosomal_xgeno_cts, hwe_x_male_geno_cts, hwe_x_nosex_geno_cts, x_knownsex_xgeno_cts, x_male_xgeno_cts, hwe_x_ln_pvals, pcp->misc_flags, pcp->hwe_ln_thresh, nonfounders, variant_include, &variant_ct); } } } - // raw_geno_cts/founder_raw_geno_cts/hwe_x_pvals no longer needed + // raw_geno_cts/founder_raw_geno_cts/hwe_x_ln_pvals no longer needed BigstackReset(bigstack_mark_geno_cts); if ((pcp->min_maf != 0.0) || (pcp->max_maf != 1.0) || pcp->min_allele_ddosage || (pcp->max_allele_ddosage != (~0LLU))) { @@ -3746,7 +3746,7 @@ int main(int argc, char** argv) { pc.vif_thresh = 50.0; pc.mind_thresh = 1.0; pc.geno_thresh = 1.0; - pc.hwe_thresh = 0.0; + pc.hwe_ln_thresh = -DBL_MAX; pc.mach_r2_min = 0.0; pc.mach_r2_max = 0.0; pc.minimac3_r2_min = 0.0; @@ -6352,6 +6352,8 @@ int main(int argc, char** argv) { pc.hardy_flags |= kfHardyZs; } else if (strequal_k(cur_modif, "midp", cur_modif_slen)) { pc.hardy_flags |= kfHardyMidp; + } else if (strequal_k(cur_modif, "log10", cur_modif_slen)) { + pc.hardy_flags |= kfHardyLog10; } else if (strequal_k(cur_modif, "redundant", cur_modif_slen)) { pc.hardy_flags |= kfHardyRedundant; } else if (likely(StrStartsWith(cur_modif, "cols=", cur_modif_slen))) { @@ -6388,17 +6390,17 @@ int main(int argc, char** argv) { } else if (!strcmp(cur_modif, "keep-fewhet")) { pc.misc_flags |= kfMiscHweKeepFewhet; } else { - if (unlikely((pc.hwe_thresh != 0.0) || (!ScantokDouble(cur_modif, &pc.hwe_thresh)))) { + if (unlikely((pc.hwe_ln_thresh != -DBL_MAX) || (!ScantokLn(cur_modif, &pc.hwe_ln_thresh)))) { logerrputs("Error: Invalid --hwe argument sequence.\n"); goto main_ret_INVALID_CMDLINE_A; } - if (unlikely((pc.hwe_thresh < 0.0) || (pc.hwe_thresh >= 1.0))) { + if (unlikely(pc.hwe_ln_thresh >= 0.0)) { snprintf(g_logbuf, kLogbufSize, "Error: Invalid --hwe threshold '%s' (must be in [0, 1)).\n", cur_modif); goto main_ret_INVALID_CMDLINE_WWA; } } } - if ((pc.misc_flags & kfMiscHweMidp) && (pc.hwe_thresh >= 0.5)) { + if ((pc.misc_flags & kfMiscHweMidp) && (pc.hwe_ln_thresh >= -kLn2)) { logerrputs("Error: --hwe threshold must be smaller than 0.5 when using mid-p adjustment.\n"); } pc.filter_flags |= kfFilterPvarReq; diff --git a/2.0/plink2_filter.cc b/2.0/plink2_filter.cc index 23f6505e..2c8fa81c 100644 --- a/2.0/plink2_filter.cc +++ b/2.0/plink2_filter.cc @@ -3791,18 +3791,18 @@ void EnforceGenoThresh(const ChrInfo* cip, const uint32_t* variant_missing_cts, *variant_ct_ptr -= removed_ct; } -void EnforceHweThresh(const ChrInfo* cip, const uintptr_t* allele_idx_offsets, const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_raw_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, autosomal_xgeno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_x_male_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_x_nosex_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, x_knownsex_xgeno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, x_male_xgeno_cts), const double* hwe_x_pvals, MiscFlags misc_flags, double hwe_thresh, uint32_t nonfounders, uintptr_t* variant_include, uint32_t* variant_ct_ptr) { +void EnforceHweThresh(const ChrInfo* cip, const uintptr_t* allele_idx_offsets, const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_raw_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, autosomal_xgeno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_x_male_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_x_nosex_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, x_knownsex_xgeno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, x_male_xgeno_cts), const double* hwe_x_ln_pvals, MiscFlags misc_flags, double hwe_ln_thresh, uint32_t nonfounders, uintptr_t* variant_include, uint32_t* variant_ct_ptr) { uint32_t prefilter_variant_ct = *variant_ct_ptr; const uint32_t midp = (misc_flags / kfMiscHweMidp) & 1; const uint32_t keep_fewhet = (misc_flags / kfMiscHweKeepFewhet) & 1; - hwe_thresh *= 1 - kSmallEpsilon; + hwe_ln_thresh *= 1 + kSmallEpsilon; + const double hwe_thresh = exp(hwe_ln_thresh); uint32_t removed_ct = 0; uint32_t min_obs = UINT32_MAX; uint32_t max_obs = 0; uint32_t male_a1_ct = 0; uint32_t male_ax_ct = 0; - const double* hwe_x_pvals_iter = hwe_x_pvals; - const double hwe_thresh_recip = (1 + 4 * kSmallEpsilon) / hwe_thresh; + const double* hwe_x_ln_pvals_iter = hwe_x_ln_pvals; uintptr_t autosomal_xgeno_idx = 0; uintptr_t x_xgeno_idx = 0; uint32_t x_skip_code = UINT32_MAX; @@ -3811,7 +3811,7 @@ void EnforceHweThresh(const ChrInfo* cip, const uintptr_t* allele_idx_offsets, c if (XymtExists(cip, kChrOffsetX, &x_code)) { const uint32_t x_chr_fo_idx = cip->chr_idx_to_foidx[x_code]; x_start = cip->chr_fo_vidx_start[x_chr_fo_idx]; - if (!hwe_x_pvals) { + if (!hwe_x_ln_pvals) { x_skip_code = x_code; // only set this if we're skipping chrX prefilter_variant_ct -= PopcountBitRange(variant_include, x_start, cip->chr_fo_vidx_start[x_chr_fo_idx + 1]); } @@ -3870,11 +3870,7 @@ void EnforceHweThresh(const ChrInfo* cip, const uintptr_t* allele_idx_offsets, c } } pval_computed = 1; - if (midp) { - test_failed = HweThreshMidp(het_a1_ct, hom_a1_ct, two_ax_ct, hwe_thresh); - } else { - test_failed = HweThresh(het_a1_ct, hom_a1_ct, two_ax_ct, hwe_thresh); - } + test_failed = HweThreshLn(het_a1_ct, hom_a1_ct, two_ax_ct, midp, hwe_thresh, hwe_ln_thresh); if (test_failed) { break; } @@ -3906,22 +3902,21 @@ void EnforceHweThresh(const ChrInfo* cip, const uintptr_t* allele_idx_offsets, c const uint32_t female_obs_ct = hom_a1_ct + het_a1_ct + two_ax_ct; cur_obs_ct = female_obs_ct + male_a1_ct + male_ax_ct; pval_computed = 1; - double joint_pval = *hwe_x_pvals_iter++; + double joint_ln_pval = *hwe_x_ln_pvals_iter++; for (uint32_t xallele_idx = 0; ; ++xallele_idx) { - test_failed = (joint_pval < hwe_thresh); + test_failed = (joint_ln_pval < hwe_ln_thresh); if (test_failed && keep_fewhet && (het_a1_ct * S_CAST(uint64_t, het_a1_ct) < (4LLU * hom_a1_ct) * two_ax_ct)) { // female-only retest - if (joint_pval != 0.0) { - joint_pval *= hwe_thresh_recip; + double joint_pval; + if (joint_ln_pval != -750) { + joint_ln_pval = joint_ln_pval - hwe_ln_thresh; + joint_pval = exp(joint_ln_pval); } else { - // keep the variant iff female-only p-value also underflows + // temporary: preserve preexisting behavior + joint_ln_pval = -1022 * kLn2; joint_pval = kDblNormalMin; } - if (midp) { - test_failed = !HweThreshMidp(het_a1_ct, hom_a1_ct, two_ax_ct, joint_pval); - } else { - test_failed = !HweThresh(het_a1_ct, hom_a1_ct, two_ax_ct, joint_pval); - } + test_failed = !HweThreshLn(het_a1_ct, hom_a1_ct, two_ax_ct, midp, joint_pval, joint_ln_pval); } // bugfix (27 Jun 2020): don't clobber previous allele-test failure if // variant is multiallelic @@ -3937,10 +3932,10 @@ void EnforceHweThresh(const ChrInfo* cip, const uintptr_t* allele_idx_offsets, c het_a1_ct -= cur_male_xgeno_cts[1]; } two_ax_ct = female_obs_ct - hom_a1_ct - het_a1_ct; - joint_pval = hwe_x_pvals_iter[xallele_idx]; + joint_ln_pval = hwe_x_ln_pvals_iter[xallele_idx]; } x_xgeno_idx += xallele_ct; - hwe_x_pvals_iter = &(hwe_x_pvals_iter[xallele_ct]); + hwe_x_ln_pvals_iter = &(hwe_x_ln_pvals_iter[xallele_ct]); } if (test_failed) { ClearBit(variant_uidx, variant_include); diff --git a/2.0/plink2_filter.h b/2.0/plink2_filter.h index 5f0238a3..f9c017b2 100644 --- a/2.0/plink2_filter.h +++ b/2.0/plink2_filter.h @@ -102,7 +102,7 @@ PglErr MindFilter(const uint32_t* sample_missing_cts, const uint32_t* sample_het void EnforceGenoThresh(const ChrInfo* cip, const uint32_t* variant_missing_cts, const uint32_t* variant_hethap_cts, uint32_t sample_ct, uint32_t male_ct, uint32_t first_hap_uidx, double geno_thresh, uintptr_t* variant_include, uint32_t* variant_ct_ptr); -void EnforceHweThresh(const ChrInfo* cip, const uintptr_t* allele_idx_offsets, const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_raw_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, autosomal_xgeno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_x_male_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_x_nosex_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, x_knownsex_xgeno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, x_male_xgeno_cts), const double* hwe_x_pvals, MiscFlags misc_flags, double hwe_thresh, uint32_t nonfounders, uintptr_t* variant_include, uint32_t* variant_ct_ptr); +void EnforceHweThresh(const ChrInfo* cip, const uintptr_t* allele_idx_offsets, const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_raw_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, autosomal_xgeno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_x_male_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_x_nosex_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, x_knownsex_xgeno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, x_male_xgeno_cts), const double* hwe_x_ln_pvals, MiscFlags misc_flags, double hwe_ln_thresh, uint32_t nonfounders, uintptr_t* variant_include, uint32_t* variant_ct_ptr); // main loop has a bitwise Nref-or-Alt1 check ENUM_U31_DEF_START() diff --git a/2.0/plink2_help.cc b/2.0/plink2_help.cc index c52f6b11..e3918ad5 100644 --- a/2.0/plink2_help.cc +++ b/2.0/plink2_help.cc @@ -723,13 +723,14 @@ PglErr DispHelp(const char* const* argvk, uint32_t param_ct) { " The default is chrom,maybeprovref,nmiss,nobs,fmiss.\n\n" ); HelpPrint("hardy\0", &help_ctrl, 1, -" --hardy ['zs'] ['midp'] ['redundant'] ['cols=']\n" +" --hardy ['zs'] ['midp'] ['log10'] ['redundant'] ['cols=']\n" " Hardy-Weinberg exact test p-value report(s).\n" " * By default, only founders are considered; change this with --nonfounders.\n" " * chrX is now omitted from the main .hardy report. Instead,\n" " (if present) it gets its own .hardy.x report based on the\n" " method described in Graffelman J, Weir BS (2016) Hardy-Weinberg\n" " equilibrium and the X chromosome.\n" +" * The 'log10' modifier causes p-values to be reported in -log10(p) form.\n" " * For variants with k alleles where k>2, k separate 'biallelic' tests are\n" " performed, each reported on its own line. However, biallelic variants\n" " are normally reported on a single line, since the counts/frequencies\n" @@ -757,7 +758,7 @@ PglErr DispHelp(const char* const* argvk, uint32_t param_ct) { " hetfreq: Observed and expected het-A1 frequencies.\n" " sexaf: Female and male A1 observed allele frequencies (chrX only).\n" " femalep: Female-only p/midp-value (chrX only).\n" -" p: Hardy-Weinberg equilibrium exact test p/midp-value.\n" +" p: Hardy-Weinberg equilibrium exact test p/midp-value (or -log10(p)).\n" " The default is chrom,maybeprovref,ax,gcounts,hetfreq,sexaf,p.\n\n" ); HelpPrint("het\0", &help_ctrl, 1, diff --git a/2.0/plink2_import.cc b/2.0/plink2_import.cc index 28948abd..a45e9090 100644 --- a/2.0/plink2_import.cc +++ b/2.0/plink2_import.cc @@ -1866,6 +1866,8 @@ VcfParseErr VcfConvertUnphasedMultiallelicLine(const VcfImportBaseContext* vibcp } } } else { + // don't check halfcall_mode == kVcfHalfCallMissing yet because + // we want error_on_polyploid to work cur_geno = ctow(second_allele_first_char) - 48; if (unlikely(cur_geno >= 10)) { return kVcfParseInvalidGt; @@ -1906,6 +1908,9 @@ VcfParseErr VcfConvertUnphasedMultiallelicLine(const VcfImportBaseContext* vibcp cur_geno = 2; } } + } else { + // bugfix (3 Feb 2024) + cur_geno = 3; } } } @@ -2417,6 +2422,8 @@ VcfParseErr VcfConvertPhasedMultiallelicLine(const VcfImportBaseContext* vibcp, } } } else { + // don't check halfcall_mode == kVcfHalfCallMissing yet because + // we want error_on_polyploid to work cur_geno = ctow(second_allele_first_char) - 48; if (unlikely(cur_geno >= 10)) { return kVcfParseInvalidGt; @@ -2459,6 +2466,9 @@ VcfParseErr VcfConvertPhasedMultiallelicLine(const VcfImportBaseContext* vibcp, cur_geno = 2; } } + } else { + // bugfix (3 Feb 2024) + cur_geno = 3; } } } diff --git a/2.0/plink2_ld.cc b/2.0/plink2_ld.cc index f127c6e9..22179abf 100644 --- a/2.0/plink2_ld.cc +++ b/2.0/plink2_ld.cc @@ -5198,9 +5198,13 @@ PglErr LdConsole(const uintptr_t* variant_include, const ChrInfo* cip, const cha const uintptr_t* cur_genovec = pgvs[var_idx].genovec; STD_ARRAY_DECL(uint32_t, 4, genocounts); GenoarrCountFreqsUnsafe(cur_genovec, founder_ct, genocounts); - double hwe_pval; + write_iter = strcpya_k(g_logbuf, " "); + write_iter = strcpya(write_iter, ld_console_varids[var_idx]); + write_iter = strcpya_k(write_iter, ": "); + double hwe_ln_pval; if (!is_xs[var_idx]) { - hwe_pval = HweP(genocounts[1], genocounts[0], genocounts[2], hwe_midp); + hwe_ln_pval = HweLnP(genocounts[1], genocounts[0], genocounts[2], hwe_midp); + write_iter = lntoa_g(hwe_ln_pval, write_iter); } else { STD_ARRAY_DECL(uint32_t, 4, male_genocounts); GenoarrCountSubsetFreqs(cur_genovec, sex_male_collapsed_interleaved, founder_ct, x_male_ct, male_genocounts); @@ -5212,9 +5216,16 @@ PglErr LdConsole(const uintptr_t* variant_include, const ChrInfo* cip, const cha genocounts[1] -= nosex_genocounts[1]; genocounts[2] -= nosex_genocounts[2]; } - hwe_pval = HweXchrP(genocounts[1], genocounts[0] - male_genocounts[0], genocounts[2] - male_genocounts[2], male_genocounts[0], male_genocounts[2], hwe_midp); + hwe_ln_pval = HweXchrLnP(genocounts[1], genocounts[0] - male_genocounts[0], genocounts[2] - male_genocounts[2], male_genocounts[0], male_genocounts[2], hwe_midp); + // temporary + if (hwe_ln_pval != -750) { + write_iter = lntoa_g(hwe_ln_pval, write_iter); + } else { + *write_iter++ = '0'; + } } - logprintf(" %s: %g\n", ld_console_varids[var_idx], hwe_pval); + memcpy_k(write_iter, "\n", 2); + logputsb(); } } logputs("\n"); diff --git a/2.0/plink2_misc.cc b/2.0/plink2_misc.cc index 11db7d3f..dcf3a5be 100644 --- a/2.0/plink2_misc.cc +++ b/2.0/plink2_misc.cc @@ -4762,7 +4762,7 @@ PglErr GetMultiallelicMarginalCounts(const uintptr_t* founder_info, const uintpt return reterr; } -typedef struct ComputeHweXPvalsCtxStruct { +typedef struct ComputeHweXLnPvalsCtxStruct { const uintptr_t* variant_include; const uintptr_t* allele_idx_offsets; const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_raw_geno_cts); @@ -4777,10 +4777,10 @@ typedef struct ComputeHweXPvalsCtxStruct { uint32_t hwe_midp; uint32_t hwe_x_ct; - double* hwe_x_pvals; -} ComputeHweXPvalsCtx; + double* hwe_x_ln_pvals; +} ComputeHweXLnPvalsCtx; -void ComputeHweXPvalsMain(uintptr_t tidx, uintptr_t thread_ct, ComputeHweXPvalsCtx* ctx) { +void ComputeHweXLnPvalsMain(uintptr_t tidx, uintptr_t thread_ct, ComputeHweXLnPvalsCtx* ctx) { const uintptr_t* variant_include = ctx->variant_include; const uintptr_t* allele_idx_offsets = ctx->allele_idx_offsets; const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_raw_geno_cts) = ctx->founder_raw_geno_cts; @@ -4797,7 +4797,7 @@ void ComputeHweXPvalsMain(uintptr_t tidx, uintptr_t thread_ct, ComputeHweXPvalsC uint32_t variant_idx = (hwe_x_ct * S_CAST(uint64_t, tidx)) / thread_ct; uintptr_t xgeno_idx = ctx->extra_aidx_starts[tidx]; - double* hwe_x_pvals_iter = &(ctx->hwe_x_pvals[variant_idx + xgeno_idx]); + double* hwe_x_ln_pvals_iter = &(ctx->hwe_x_ln_pvals[variant_idx + xgeno_idx]); uintptr_t variant_uidx_base; uintptr_t cur_bits; BitIter1Start(variant_include, ctx->variant_uidx_starts[tidx], &variant_uidx_base, &cur_bits); @@ -4830,7 +4830,7 @@ void ComputeHweXPvalsMain(uintptr_t tidx, uintptr_t thread_ct, ComputeHweXPvalsC female_1copy_ct -= cur_nosex_geno_cts[1]; female_0copy_ct -= cur_nosex_geno_cts[2]; } - *hwe_x_pvals_iter++ = HweXchrP(female_1copy_ct, female_2copy_ct, female_0copy_ct, male_1copy_ct, male_0copy_ct, hwe_midp); + *hwe_x_ln_pvals_iter++ = HweXchrLnP(female_1copy_ct, female_2copy_ct, female_0copy_ct, male_1copy_ct, male_0copy_ct, hwe_midp); if (allele_idx_offsets) { const uint32_t allele_ct = allele_idx_offsets[variant_uidx + 1] - allele_idx_offsets[variant_uidx]; if (allele_ct != 2) { @@ -4847,7 +4847,7 @@ void ComputeHweXPvalsMain(uintptr_t tidx, uintptr_t thread_ct, ComputeHweXPvalsC male_0copy_ct = male_obs_ct - male_1copy_ct - male_hethap_ct; } female_0copy_ct = female_obs_ct - female_2copy_ct - female_1copy_ct; - *hwe_x_pvals_iter++ = HweXchrP(female_1copy_ct, female_2copy_ct, female_0copy_ct, male_1copy_ct, male_0copy_ct, hwe_midp); + *hwe_x_ln_pvals_iter++ = HweXchrLnP(female_1copy_ct, female_2copy_ct, female_0copy_ct, male_1copy_ct, male_0copy_ct, hwe_midp); ++xgeno_idx; } } @@ -4868,26 +4868,26 @@ void ComputeHweXPvalsMain(uintptr_t tidx, uintptr_t thread_ct, ComputeHweXPvalsC } } -THREAD_FUNC_DECL ComputeHweXPvalsThread(void* raw_arg) { +THREAD_FUNC_DECL ComputeHweXLnPvalsThread(void* raw_arg) { ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg); - ComputeHweXPvalsCtx* ctx = S_CAST(ComputeHweXPvalsCtx*, arg->sharedp->context); - ComputeHweXPvalsMain(arg->tidx, GetThreadCt(arg->sharedp) + 1, ctx); + ComputeHweXLnPvalsCtx* ctx = S_CAST(ComputeHweXLnPvalsCtx*, arg->sharedp->context); + ComputeHweXLnPvalsMain(arg->tidx, GetThreadCt(arg->sharedp) + 1, ctx); THREAD_RETURN; } -PglErr ComputeHweXPvals(const uintptr_t* variant_include, const uintptr_t* allele_idx_offsets, const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_raw_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_x_male_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_x_nosex_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, x_knownsex_xgeno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, x_male_xgeno_cts), uint32_t x_start, uint32_t hwe_x_ct, uintptr_t x_xallele_ct, uint32_t hwe_midp, uint32_t calc_thread_ct, double** hwe_x_pvals_ptr) { +PglErr ComputeHweXLnPvals(const uintptr_t* variant_include, const uintptr_t* allele_idx_offsets, const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_raw_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_x_male_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_x_nosex_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, x_knownsex_xgeno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, x_male_xgeno_cts), uint32_t x_start, uint32_t hwe_x_ct, uintptr_t x_xallele_ct, uint32_t hwe_midp, uint32_t calc_thread_ct, double** hwe_x_ln_pvals_ptr) { unsigned char* bigstack_mark = g_bigstack_base; PglErr reterr = kPglRetSuccess; ThreadGroup tg; PreinitThreads(&tg); - ComputeHweXPvalsCtx ctx; + ComputeHweXLnPvalsCtx ctx; { assert(hwe_x_ct); - if (unlikely(bigstack_alloc_d(hwe_x_ct + x_xallele_ct, hwe_x_pvals_ptr))) { - goto ComputeHweXPvals_ret_NOMEM; + if (unlikely(bigstack_alloc_d(hwe_x_ct + x_xallele_ct, hwe_x_ln_pvals_ptr))) { + goto ComputeHweXLnPvals_ret_NOMEM; } bigstack_mark = g_bigstack_base; - ctx.hwe_x_pvals = *hwe_x_pvals_ptr; + ctx.hwe_x_ln_pvals = *hwe_x_ln_pvals_ptr; if (calc_thread_ct > hwe_x_ct) { calc_thread_ct = hwe_x_ct; @@ -4895,7 +4895,7 @@ PglErr ComputeHweXPvals(const uintptr_t* variant_include, const uintptr_t* allel if (unlikely(SetThreadCt0(calc_thread_ct - 1, &tg) || bigstack_alloc_u32(calc_thread_ct, &ctx.variant_uidx_starts) || bigstack_alloc_w(calc_thread_ct, &ctx.extra_aidx_starts))) { - goto ComputeHweXPvals_ret_NOMEM; + goto ComputeHweXLnPvals_ret_NOMEM; } // possible todo: extra-allele-based load balancer ComputeUidxStartPartition(variant_include, hwe_x_ct, calc_thread_ct, x_start, ctx.variant_uidx_starts); @@ -4922,22 +4922,22 @@ PglErr ComputeHweXPvals(const uintptr_t* variant_include, const uintptr_t* allel fputs("0%", stdout); fflush(stdout); if (calc_thread_ct > 1) { - SetThreadFuncAndData(ComputeHweXPvalsThread, &ctx, &tg); + SetThreadFuncAndData(ComputeHweXLnPvalsThread, &ctx, &tg); DeclareLastThreadBlock(&tg); if (unlikely(SpawnThreads(&tg))) { - goto ComputeHweXPvals_ret_THREAD_CREATE_FAIL; + goto ComputeHweXLnPvals_ret_THREAD_CREATE_FAIL; } } - ComputeHweXPvalsMain(calc_thread_ct - 1, calc_thread_ct, &ctx); + ComputeHweXLnPvalsMain(calc_thread_ct - 1, calc_thread_ct, &ctx); JoinThreads0(&tg); fputs("\b\b", stdout); logputs("done.\n"); } while (0) { - ComputeHweXPvals_ret_NOMEM: + ComputeHweXLnPvals_ret_NOMEM: reterr = kPglRetNomem; break; - ComputeHweXPvals_ret_THREAD_CREATE_FAIL: + ComputeHweXLnPvals_ret_THREAD_CREATE_FAIL: reterr = kPglRetThreadCreateFail; break; } @@ -4946,7 +4946,7 @@ PglErr ComputeHweXPvals(const uintptr_t* variant_include, const uintptr_t* allel return reterr; } -PglErr HardyReport(const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* nonref_flags, const STD_ARRAY_PTR_DECL(uint32_t, 3, hwe_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, autosomal_xgeno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 3, hwe_x_male_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 3, hwe_x_nosex_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, x_knownsex_xgeno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, x_male_xgeno_cts), const double* hwe_x_pvals, uint32_t variant_ct, uint32_t hwe_x_ct, uint32_t max_allele_slen, PgenGlobalFlags gflags, double output_min_ln, HardyFlags hardy_flags, uint32_t max_thread_ct, uint32_t nonfounders, char* outname, char* outname_end) { +PglErr HardyReport(const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* nonref_flags, const STD_ARRAY_PTR_DECL(uint32_t, 3, hwe_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, autosomal_xgeno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 3, hwe_x_male_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 3, hwe_x_nosex_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, x_knownsex_xgeno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, x_male_xgeno_cts), const double* hwe_x_ln_pvals, uint32_t variant_ct, uint32_t hwe_x_ct, uint32_t max_allele_slen, PgenGlobalFlags gflags, double output_min_ln, HardyFlags hardy_flags, uint32_t max_thread_ct, uint32_t nonfounders, char* outname, char* outname_end) { unsigned char* bigstack_mark = g_bigstack_base; char* cswritep = nullptr; CompressStreamState css; @@ -5017,6 +5017,7 @@ PglErr HardyReport(const uintptr_t* variant_include, const ChrInfo* cip, const u } } } + const uint32_t report_neglog10p = (hardy_flags / kfHardyLog10) & 1; const uint32_t ax_col = hardy_flags & kfHardyColAx; const uint32_t gcounts = hardy_flags & (kfHardyColGcounts | kfHardyColGcount1col); const uint32_t gcount_1col = hardy_flags & kfHardyColGcount1col; @@ -5074,11 +5075,13 @@ PglErr HardyReport(const uintptr_t* variant_include, const ChrInfo* cip, const u } if (p_col) { *cswritep++ = '\t'; + if (report_neglog10p) { + cswritep = strcpya_k(cswritep, "NEG_LOG10_"); + } if (midp) { - cswritep = strcpya_k(cswritep, "MIDP"); - } else { - *cswritep++ = 'P'; + cswritep = strcpya_k(cswritep, "MID"); } + *cswritep++ = 'P'; } AppendBinaryEoln(&cswritep); uintptr_t variant_uidx_base = 0; @@ -5206,8 +5209,13 @@ PglErr HardyReport(const uintptr_t* variant_include, const ChrInfo* cip, const u if (p_col) { // possible todo: multithread this *cswritep++ = '\t'; - const double hwe_p = HweP(het_a1_ct, hom_a1_ct, two_ax_ct, midp); - cswritep = dtoa_g(MAXV(hwe_p, output_min_p), cswritep); + const double hwe_ln_p = HweLnP(het_a1_ct, hom_a1_ct, two_ax_ct, midp); + if (report_neglog10p) { + const double reported_val = (-kRecipLn10) * hwe_ln_p; + cswritep = dtoa_g(reported_val, cswritep); + } else { + cswritep = lntoa_g(MAXV(hwe_ln_p, output_min_ln), cswritep); + } } AppendBinaryEoln(&cswritep); if (unlikely(Cswrite(&css, &cswritep))) { @@ -5294,19 +5302,23 @@ PglErr HardyReport(const uintptr_t* variant_include, const ChrInfo* cip, const u const uint32_t femalep_col = hardy_flags & kfHardyColFemalep; if (femalep_col) { cswritep = strcpya_k(cswritep, "\tFEMALE_ONLY_"); + if (report_neglog10p) { + cswritep = strcpya_k(cswritep, "NEG_LOG10_"); + } if (midp) { - cswritep = strcpya_k(cswritep, "MIDP"); - } else { - *cswritep++ = 'P'; + cswritep = strcpya_k(cswritep, "MID"); } + *cswritep++ = 'P'; } if (p_col) { *cswritep++ = '\t'; + if (report_neglog10p) { + cswritep = strcpya_k(cswritep, "NEG_LOG10_"); + } if (midp) { - cswritep = strcpya_k(cswritep, "MIDP"); - } else { - *cswritep++ = 'P'; + cswritep = strcpya_k(cswritep, "MID"); } + *cswritep++ = 'P'; } AppendBinaryEoln(&cswritep); fputs("--hardy: Writing chrX results...", stdout); @@ -5416,8 +5428,8 @@ PglErr HardyReport(const uintptr_t* variant_include, const ChrInfo* cip, const u male_ax_ct = male_obs_ct - male_a1_ct - male_hethap_ct; } female_two_ax_ct = female_obs_ct - female_hom_a1_ct - female_het_a1_ct; - // Correct to increment this before looking up hwe_x_pvals[] (and - // to not increment on a1_idx == 0). + // Correct to increment this before looking up hwe_x_ln_pvals[] + // (and to not increment on a1_idx == 0). ++xgeno_idx; } if (gcounts) { @@ -5454,14 +5466,35 @@ PglErr HardyReport(const uintptr_t* variant_include, const ChrInfo* cip, const u } if (femalep_col) { *cswritep++ = '\t'; - const double female_hwe_p = HweP(female_het_a1_ct, female_hom_a1_ct, female_two_ax_ct, midp); - cswritep = dtoa_g(MAXV(female_hwe_p, output_min_p), cswritep); + const double female_hwe_ln_p = HweLnP(female_het_a1_ct, female_hom_a1_ct, female_two_ax_ct, midp); + if (report_neglog10p) { + const double reported_val = (-kRecipLn10) * female_hwe_ln_p; + cswritep = dtoa_g(reported_val, cswritep); + } else { + cswritep = lntoa_g(MAXV(female_hwe_ln_p, output_min_ln), cswritep); + } } if (p_col) { *cswritep++ = '\t'; // bugfix (27 Jun 2020): forgot to correct this for multiallelic // variants - cswritep = dtoa_g(MAXV(hwe_x_pvals[variant_idx + xgeno_idx], output_min_p), cswritep); + + // temporary + const double ln_pval = hwe_x_ln_pvals[variant_idx + xgeno_idx]; + if (report_neglog10p) { + if (ln_pval != -750) { + const double reported_val = (-kRecipLn10) * ln_pval; + cswritep = dtoa_g(reported_val, cswritep); + } else { + cswritep = strcpya_k(cswritep, "inf"); + } + } else { + if (ln_pval != -750) { + cswritep = lntoa_g(MAXV(ln_pval, output_min_ln), cswritep); + } else { + cswritep = dtoa_g(output_min_p, cswritep); + } + } } AppendBinaryEoln(&cswritep); if (unlikely(Cswrite(&css, &cswritep))) { diff --git a/2.0/plink2_misc.h b/2.0/plink2_misc.h index 14b1cba1..c27bb171 100644 --- a/2.0/plink2_misc.h +++ b/2.0/plink2_misc.h @@ -180,22 +180,23 @@ FLAGSET_DEF_START() kfHardy0, kfHardyZs = (1 << 0), kfHardyMidp = (1 << 1), - kfHardyRedundant = (1 << 2), - - kfHardyColChrom = (1 << 3), - kfHardyColPos = (1 << 4), - kfHardyColRef = (1 << 5), - kfHardyColAlt1 = (1 << 6), - kfHardyColAlt = (1 << 7), - kfHardyColMaybeprovref = (1 << 8), - kfHardyColProvref = (1 << 9), - kfHardyColAx = (1 << 10), - kfHardyColGcounts = (1 << 11), - kfHardyColGcount1col = (1 << 12), - kfHardyColHetfreq = (1 << 13), - kfHardyColSexaf = (1 << 14), - kfHardyColFemalep = (1 << 15), - kfHardyColP = (1 << 16), + kfHardyLog10 = (1 << 2), + kfHardyRedundant = (1 << 3), + + kfHardyColChrom = (1 << 4), + kfHardyColPos = (1 << 5), + kfHardyColRef = (1 << 6), + kfHardyColAlt1 = (1 << 7), + kfHardyColAlt = (1 << 8), + kfHardyColMaybeprovref = (1 << 9), + kfHardyColProvref = (1 << 10), + kfHardyColAx = (1 << 11), + kfHardyColGcounts = (1 << 12), + kfHardyColGcount1col = (1 << 13), + kfHardyColHetfreq = (1 << 14), + kfHardyColSexaf = (1 << 15), + kfHardyColFemalep = (1 << 16), + kfHardyColP = (1 << 17), kfHardyColDefault = (kfHardyColChrom | kfHardyColMaybeprovref | kfHardyColAx | kfHardyColGcounts | kfHardyColHetfreq | kfHardyColSexaf | kfHardyColP), kfHardyColAll = ((kfHardyColP * 2) - kfHardyColChrom) FLAGSET_DEF_END(HardyFlags); @@ -398,7 +399,7 @@ PglErr WriteMissingnessReports(const uintptr_t* sample_include, const SampleIdIn PglErr GetMultiallelicMarginalCounts(const uintptr_t* founder_info, const uintptr_t* sex_nm, const uintptr_t* sex_male, const uintptr_t* variant_include, const ChrInfo* cip, const uintptr_t* allele_idx_offsets, const STD_ARRAY_PTR_DECL(uint32_t, 3, hwe_geno_cts), uint32_t raw_sample_ct, uint32_t autosomal_variant_ct, uint32_t autosomal_xallele_ct, uint32_t hwe_x_ct, uint32_t x_xallele_ct, PgenReader* simple_pgrp, STD_ARRAY_PTR_DECL(uint32_t, 2, x_male_xgeno_cts), STD_ARRAY_PTR_DECL(uint32_t, 2, autosomal_xgeno_cts), STD_ARRAY_PTR_DECL(uint32_t, 2, x_knownsex_xgeno_cts)); -PglErr ComputeHweXPvals(const uintptr_t* variant_include, const uintptr_t* allele_idx_offsets, const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_raw_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_x_male_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_x_nosex_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, x_knownsex_xgeno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, x_male_xgeno_cts), uint32_t x_start, uint32_t hwe_x_ct, uintptr_t x_xallele_ct, uint32_t hwe_midp, uint32_t calc_thread_ct, double** hwe_x_pvals_ptr); +PglErr ComputeHweXLnPvals(const uintptr_t* variant_include, const uintptr_t* allele_idx_offsets, const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_raw_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_x_male_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 3, founder_x_nosex_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, x_knownsex_xgeno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, x_male_xgeno_cts), uint32_t x_start, uint32_t hwe_x_ct, uintptr_t x_xallele_ct, uint32_t hwe_midp, uint32_t calc_thread_ct, double** hwe_x_ln_pvals_ptr); PglErr HardyReport(const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* nonref_flags, const STD_ARRAY_PTR_DECL(uint32_t, 3, hwe_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, autosomal_xgeno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 3, hwe_x_male_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 3, hwe_x_nosex_geno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, x_knownsex_xgeno_cts), const STD_ARRAY_PTR_DECL(uint32_t, 2, x_male_xgeno_cts), const double* hwe_x_pvals, uint32_t variant_ct, uint32_t hwe_x_ct, uint32_t max_allele_slen, PgenGlobalFlags gflags, double output_min_ln, HardyFlags hardy_flags, uint32_t max_thread_ct, uint32_t nonfounders, char* outname, char* outname_end);