Skip to content

Commit

Permalink
Change variable names and update docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
szhan committed Mar 7, 2024
1 parent 2a266c0 commit 7905759
Showing 1 changed file with 38 additions and 38 deletions.
76 changes: 38 additions & 38 deletions python/tests/beagle_numba.py
Original file line number Diff line number Diff line change
Expand Up @@ -815,8 +815,8 @@ def compute_individual_scores(
Assume that all sites are biallelic. Otherwise, the calculation below is incorrect.
Note 0 refers to the REF allele and 1 the ALT allele.
Genotype probabilities are: P(RR), P(RA or AR), P(AA).
Allele dosages are: RR = 0, RA or AR = 1, AA = 2.
Unphased genotype (or dosage) probabilities are: P(RR), P(RA or AR), P(AA).
Dosages of the ALT allele are: RR = 0, RA or AR = 1, AA = 2.
In BEAGLE 4.1 output,
GP: "Estimated Genotype Probability", and
Expand All @@ -826,17 +826,17 @@ def compute_individual_scores(
:param numpy.ndarray allele_probs_1: Imputed allele probabilities for haplotype 1.
:param numpy.ndarray alleles_2: Imputed alleles for haplotype 2.
:param numpy.ndarray allele_probs_2: Imputed allele probabilities for haplotype 2.
:param int ref: Designated REF allele (ACGT encoding).
:return: Genotype probabilities and allele dosages.
:param int ref: Specified REF allele (ACGT encoding).
:return: Dosage probabilities and dosage scores.
:rtype: tuple(numpy.ndarray, numpy.ndarray)
"""
n = len(alleles_1) # Number of individuals.
assert len(alleles_2) == n, "Lengths of alleles differ."
assert n > 0, "There must be at least one individual."
assert len(allele_probs_1) == n, "Lengths of alleles and probabilities differ."
assert len(allele_probs_2) == n, "Lengths of alleles and probabilities differ."
gt_probs = np.zeros((n, 3), dtype=np.float64)
dosages = np.zeros(n, dtype=np.float64)
dosage_probs = np.zeros((n, 3), dtype=np.float64)
dosage_scores = np.zeros(n, dtype=np.float64)
for i in range(n):
ap_hap1_ref = (
allele_probs_1[i] if alleles_1[i] == ref else 1 - allele_probs_1[i]
Expand All @@ -846,19 +846,19 @@ def compute_individual_scores(
allele_probs_2[i] if alleles_2[i] == ref else 1 - allele_probs_2[i]
)
ap_hap2_alt = 1 - ap_hap2_ref
gt_probs[i, 0] = ap_hap1_ref * ap_hap2_ref # P(RR)
gt_probs[i, 1] = ap_hap1_ref * ap_hap2_alt # P(RA)
gt_probs[i, 1] += ap_hap1_alt * ap_hap2_ref # P(AR)
gt_probs[i, 2] = ap_hap1_alt * ap_hap2_alt # P(AA)
dosages[i] = gt_probs[i, 1] + 2 * gt_probs[i, 2]
return (gt_probs, dosages)
dosage_probs[i, 0] = ap_hap1_ref * ap_hap2_ref # P(RR)
dosage_probs[i, 1] = ap_hap1_ref * ap_hap2_alt # P(RA)
dosage_probs[i, 1] += ap_hap1_alt * ap_hap2_ref # P(AR)
dosage_probs[i, 2] = ap_hap1_alt * ap_hap2_alt # P(AA)
dosage_scores[i] = dosage_probs[i, 1] + 2 * dosage_probs[i, 2]
return (dosage_probs, dosage_scores)


# Site-level data.
def compute_allelic_r_squared(gt_probs):
def compute_allelic_r_squared(dosage_probs):
"""
Compute the estimated allelic R^2 at a position from the posterior genotype
probabilities of a set of diploid individuals.
Compute the estimated allelic R^2 at a position from the unphased genotype
(or dosage) probabilities of a set of diploid individuals.
Assume that site is biallelic. Otherwise, the calculation below is incorrect.
Note that 0 refers to REF allele and 1 the ALT allele.
Expand All @@ -874,18 +874,18 @@ def compute_allelic_r_squared(gt_probs):
See the formulation in the Appendix 1 of Browning & Browning (2009).
Am J Hum Genet. 84(2): 210–223. doi: 10.1016/j.ajhg.2009.01.005.
:param np.ndarray gt_probs: Genotype probabilities.
:return: Dosage probabilities and dosage scores.
:return: Estimated allelic R-squared.
:rtype: float
"""
_MIN_R2_DEN = 1e-8
n = len(gt_probs) # Number of individuals.
n = len(dosage_probs) # Number of individuals.
assert n > 0, "There must be at least one individual."
assert gt_probs.shape[1] == 3, "Three genotypes are considered."
assert dosage_probs.shape[1] == 3, "Three genotypes are considered."
f = 1 / n
z = np.argmax(gt_probs, axis=1) # Most likely imputed genotypes.
u = gt_probs[:, 1] + 2 * gt_probs[:, 2] # E[X | y_i]
w = gt_probs[:, 1] + 4 * gt_probs[:, 2] # E[X^2 | y_i]
z = np.argmax(dosage_probs, axis=1) # Most likely imputed dosage.
u = dosage_probs[:, 1] + 2 * dosage_probs[:, 2] # E[X | y_i]
w = dosage_probs[:, 1] + 4 * dosage_probs[:, 2] # E[X^2 | y_i]
cov = np.sum(z * u) - np.sum(z) * np.sum(u) * f
var_best = np.sum(z**2) - np.sum(z) ** 2 * f
var_exp = np.sum(w) - np.sum(u) ** 2 * f
Expand All @@ -895,10 +895,10 @@ def compute_allelic_r_squared(gt_probs):
return allelic_rsq


def compute_dosage_r_squared(gt_probs):
def compute_dosage_r_squared(dosage_probs):
"""
Compute the dosage R^2 for a position from the posterior genotype probabilities
of a set of diploid individuals.
Compute the dosage R^2 for a position from the unphased genotype (or dosage)
probabilities of a set of diploid individuals.
Assume that site is biallelic. Otherwise, the calculation below is incorrect.
Note that 0 refers to REF allele and 1 the ALT allele.
Expand All @@ -907,17 +907,17 @@ def compute_dosage_r_squared(gt_probs):
between estimated REF dose [P(RA) + 2 * P(RR)] and true REF dose".
See `doseR2` in `R2Estimator.java` of the BEAGLE 4.1 source code.
:param np.ndarray gt_probs: Genotype probabilities.
:return: Dosage probabilities and dosage scores.
:return: Dosage R-squared.
:rtype: float
"""
_MIN_R2_DEN = 1e-8
n = len(gt_probs) # Number of individuals.
n = len(dosage_probs) # Number of individuals.
assert n > 0, "There must be at least one individual."
assert gt_probs.shape[1] == 3, "Three genotypes are considered."
assert dosage_probs.shape[1] == 3, "Three genotypes are considered."
f = 1 / n
u = gt_probs[:, 1] + 2 * gt_probs[:, 2] # E[X | y_i].
w = gt_probs[:, 1] + 4 * gt_probs[:, 2] # E[X^2 | y_i].
u = dosage_probs[:, 1] + 2 * dosage_probs[:, 2] # E[X | y_i].
w = dosage_probs[:, 1] + 4 * dosage_probs[:, 2] # E[X^2 | y_i].
c = np.sum(u) ** 2 * f
num = np.sum(u**2) - c
if num < 0:
Expand All @@ -935,8 +935,8 @@ def compute_allele_frequency(
allele,
):
"""
Estimate the frequency of an allele at a position from the posterior
allele probabilities of a set of diploid individuals.
Estimate the frequency of a specified allele at a position from allele probabilities
of a set of diploid individuals.
Assume that site is biallelic. Otherwise, the calculation below is incorrect.
Expand Down Expand Up @@ -968,7 +968,7 @@ def compute_allele_frequency(
return est_af


def write_vcf(impdata, out_file, *, chr_name="1", print_gp=False, num_digits=2):
def write_vcf(impdata, out_file, *, chr_name="1", print_gp=False, decimals=2):
"""
Print imputation results in VCF format, following the output of BEAGLE 4.1.
Expand All @@ -978,7 +978,7 @@ def write_vcf(impdata, out_file, *, chr_name="1", print_gp=False, num_digits=2):
:param str out_file: Path to output VCF file.
:param str chr_name: Chromosome name (default = "1").
:param bool print_gp: Print genotype probabilities if True (default = False).
:param int num_digits: Number of decimal places to print (default = 2).
:param int decimals: Number of decimal places to print (default = 2).
:return: None
:rtype: None
"""
Expand Down Expand Up @@ -1052,9 +1052,9 @@ def write_vcf(impdata, out_file, *, chr_name="1", print_gp=False, num_digits=2):
ar2 = compute_allelic_r_squared(gt_probs)
dr2 = compute_dosage_r_squared(gt_probs)
af = compute_allele_frequency(a1, ap1, a2, ap2, allele=1)
ar2 = round(ar2, num_digits)
dr2 = round(dr2, num_digits)
af = round(af, num_digits)
ar2 = round(ar2, decimals)
dr2 = round(dr2, decimals)
af = round(af, decimals)
info_str = f"AR2={ar2};DR2={dr2};AF={af}"
if is_imputed:
info_str += ";" + "IMP"
Expand All @@ -1071,11 +1071,11 @@ def write_vcf(impdata, out_file, *, chr_name="1", print_gp=False, num_digits=2):
gt_a1 = "0" if a1[j] == REF else "1"
gt_a2 = "0" if a2[j] == REF else "1"
data_str += gt_a1 + "|" + gt_a2 + ":"
data_str += str(round(dosages[j], num_digits))
data_str += str(round(dosages[j], decimals))
if print_gp:
data_str += ":"
data_str += ",".join(
[str(round(gt_probs[j, k], num_digits)) for k in range(3)]
[str(round(gt_probs[j, k], decimals)) for k in range(3)]
)
if j < impdata.num_individuals - 1:
data_str += "\t"
Expand Down

0 comments on commit 7905759

Please sign in to comment.