Update docstrings

szhan · Aug 20, 2023 · 8da32cd · 8da32cd
1 parent 94a1d0d
commit 8da32cd
Showing 1 changed file with 50 additions and 37 deletions.
diff --git a/python/tests/test_beagle.py b/python/tests/test_beagle.py
@@ -1,5 +1,17 @@
 """
 Implementation of the BEAGLE 4.1 algorithm to impute genotypes.
+
+This implementation takes the following inputs:
+1. Reference haplotypes.
+2. Target haplotypes.
+3. Genotyped markers (genotyped sites in the target haplotypes).
+4. Imputed markers (UNgenotyped sites in the target haplotypes).
+
+The reference and target haplotype matrices are of the same size.
+
+For simplicity, we assume that:
+1. All the sites biallelic (0/1 encoding).
+2. The genetic map is defined by equating 1 Mbp to 1 cM.
 """
 import numpy as np
 
@@ -9,7 +21,7 @@ def convert_to_genetic_map_position(pos):
     Convert genomic site positions (bp) to
     genetic map positions (cM).
 
-    In BEAGLE 4.1, when a genetic map is not specified,
+    When a genetic map is not specified,
     it is assumed that 1 Mbp = 1cM.
 
     This trivial function is meant for documentation.
@@ -23,25 +35,23 @@ def convert_to_genetic_map_position(pos):
 
 def get_weights(ref_pos, target_pos):
     """
-    Get weights for each target marker to be imputed.
+    Get weights for each imputed marker.
 
     These weights are used in linear interpolation of
-    HMM state probabilities at target markers
-    during genotype imputation.
+    HMM state probabilities at imputed markers.
 
-    In BB2016, a weight between ref. markers m and m + 1
-    at target marker x is denoted lambda_m,x.
+    In BB2016, a weight between genotyped markers m and m + 1
+    at imputed marker x is denoted lambda_m,x.
 
     lambda_m,x = [g(m + 1) - g(x)] / [g(m + 1) - g(m)],
     where g(i) = genetic map position of marker i.
 
-    In BEAGLE 4.1, there is the concept of aggregated
-    markers, which are markers that are close together
-    (within a specified max genetic distance). Here,
-    each marker is treated separately.
+    BEAGLE uses aggregated markers, which are clusters of markers
+    that are close together (within a specified max genetic distance).
+    Here, each marker is treated separately.
 
-    :param numpy.ndarray: Site positions of ref. markers (bp).
-    :param numpy.ndarray: Site positions of target markers (bp).
+    :param numpy.ndarray ref_pos: Site positions of genotyped markers.
+    :param numpy.ndarray target_pos: Site positions of imputed markers.
     :return: Weights used in linear interpolation.
     :rtype: numpy.ndarray
     """
@@ -69,14 +79,15 @@ def get_weights(ref_pos, target_pos):
 
 def get_mismatch_prob(pos, miscall_rate=0.0001):
     """
-    Set mismatch probabilities as per BEAGLE 4.1.
+    Set mismatch probabilities at genotyped markers.
 
-    In BEAGLE 4.1, the mismatch probability is dominated by
+    In BEAGLE, the mismatch probability is dominated by
     the allelic miscall rate. By default, it is set to 0.0001.
+    It is also capped at 0.5.
 
-    :param numpy.ndarray pos: Site positions (bp).
+    :param numpy.ndarray pos: Site positions.
     :param float miscall_rate: Allelic miscall rate.
-    :return: Mismatch probabilities, one per site.
+    :return: Mismatch probabilities.
     :rtype: numpy.ndarray
     """
     assert 0 <= miscall_rate <= 0.5
@@ -86,15 +97,16 @@ def get_mismatch_prob(pos, miscall_rate=0.0001):
 
 def get_switch_prob(pos, ref_h, ne=1e6):
     """
-    Set switch probabilities as per BEAGLE 4.1.
+    Set switch probabilities at genotyped markers.
 
     ref_h is a panel of reference haplotypes of size m x h,
-    where m is the number of sites and h is the number of haplotypes.
+    where m is the number of genotyped sites
+    and h is the number of reference haplotypes.
 
-    :param numpy.ndarray pos: Genomic site positions.
-    :param numpy.ndarray ref_h: Panel of reference haplotypes.
+    :param numpy.ndarray pos: Site positions.
+    :param numpy.ndarray ref_h: Reference haplotypes.
     :param float ne: Effective population size.
-    :return: Switch probabilities, one per site.
+    :return: Switch probabilities.
     :rtype: numpy.ndarray
     """
     assert pos[0] > 0, "Site positions are not 1-based."
@@ -109,18 +121,18 @@ def get_switch_prob(pos, ref_h, ne=1e6):
 
 def compute_forward_probability_matrix(ref_h, query_h, rho, mu):
     """
-    Implement the forwards algorithm in BEAGLE 4.1.
+    Implement the forwards algorithm.
 
     ref_h is a panel of reference haplotypes of size m x h,
     where m is the number of genotyped markers
     and h is the number of haplotypes.
 
-    query_h contains only genotyped markers.
+    query_h contains only genotyped markers (length m).
 
-    :param numpy.ndarray ref_h: Panel of reference haplotypes.
+    :param numpy.ndarray ref_h: Reference haplotypes.
     :param numpy.ndarray query_h: One query haplotype.
-    :param numpy.ndarray rho: Switch probabilities, one per site.
-    :param numpy.ndarray mu: Mismatch probabilities, one per site.
+    :param numpy.ndarray rho: Switch probabilities.
+    :param numpy.ndarray mu: Mismatch probabilities.
     :return: Forward probability matrix.
     :rtype: numpy.ndarray
     """
@@ -154,15 +166,15 @@ def compute_forward_probability_matrix(ref_h, query_h, rho, mu):
 
 def compute_backward_probability_matrix(ref_h, query_h, rho, mu):
     """
-    Implement the backwards algorithm of BEAGLE 4.1.
+    Implement the backwards algorithm.
 
-    In BEAGLE 4.1, the values are kept one site at a time.
+    In BEAGLE, the values are kept one site at a time.
     Here, we keep the values at all the sites.
 
-    :param numpy.ndarray ref_h: Panel of reference haplotypes.
+    :param numpy.ndarray ref_h: Reference haplotypes.
     :param numpy.ndarray query_h: One query haplotype.
-    :param numpy.ndarray rho: Switch probability for each site.
-    :param numpy.ndarray mu: Mismatch probability for each site.
+    :param numpy.ndarray rho: Switch probabilities.
+    :param numpy.ndarray mu: Mismatch probabilities.
     :return: Backward probability matrix.
     :rtype: numpy.ndarray
     """
@@ -193,17 +205,18 @@ def compute_backward_probability_matrix(ref_h, query_h, rho, mu):
 
 def compute_state_probability_matrix(fm, bm, ref_h, query_h, rho, mu):
     """
-    Compute HMM state probability matrix.
+    Implement the forwards-backwards algorithm to
+    compute HMM state probability matrix.
 
-    A HMM state is an index of a reference haplotype.
+    A state is an index of a reference haplotype.
 
     :param numpy.ndarray fm: Forward probability matrix.
     :param numpy.ndarray bm: Backward probability matrix.
-    :param numpy.ndarray ref_h: Panel of reference haplotypes.
+    :param numpy.ndarray ref_h: Reference haplotypes.
     :param numpy.ndarray query_h: One query haplotype.
-    :param numpy.ndarray rho: Switch probability for each site.
-    :param numpy.ndarray mu: Mismatch probability for each site.
-    :return: HMM state probability matrix.
+    :param numpy.ndarray rho: Switch probabilities.
+    :param numpy.ndarray mu: Mismatch probabilities.
+    :return: State probability matrix.
     :rtype: numpy.ndarray
     """
     assert ref_h.shape[0] == len(query_h)