Skip to content

Commit

Permalink
Merge pull request #104 from xin-huang/revision
Browse files Browse the repository at this point in the history
Update docstrings and add type hint
  • Loading branch information
xin-huang authored Apr 16, 2024
2 parents f506725 + d28cfac commit f5fd232
Show file tree
Hide file tree
Showing 23 changed files with 1,294 additions and 735 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9"]
python-version: ["3.9", "3.10"]
max-parallel: 5

steps:
Expand Down
2 changes: 1 addition & 1 deletion build-env.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ dependencies:
- numpy
- pip
- pytest-cov
- python==3.8
- python==3.9
- scipy
- snakemake==7.1.1
- pip:
Expand Down
163 changes: 109 additions & 54 deletions dadi_cli/BestFit.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,51 @@
import glob, sys
import numpy as np
from typing import Optional
from dadi_cli.Models import get_model
from dadi_cli.Pdfs import get_dadi_pdf_params


def get_bestfit_params(path, lbounds, ubounds, output, delta, Nclose=3, Nbest=100):
def get_bestfit_params(
path: str,
lbounds: list[float],
ubounds: list[float],
output: str,
delta: float,
Nclose: int = 3,
Nbest: int = 100
) -> Optional[np.array]:
"""
Description:
Obtains bestfit parameters.
Arguments:
path str: Path to results of inference.
lbounds list: Lower bounds of the optimized parameters.
ubounds list: Upper bounds of the optimized parameters.
output str: Name of the output file.
delta float: Max percentage difference for log-likelihoods compared to the best optimization
log-likelihood to be considered convergent.
Nclose int: Number of best-fit results to be considered convergent.
Nbest int: Number of best-fit results to be displayed.
Obtains best-fit parameters from optimization results, filters them based on log-likelihood
differences, boundaries, and ranks them, finally outputs the results to a specified file.
Parameters
----------
path : str
Path to results of inference.
lbounds : list[float]
Lower bounds of the optimized parameters.
ubounds : list[float]
Upper bounds of the optimized parameters.
output : str
Name of the output file where results will be written.
delta : float
Maximum difference for log-likelihoods compared to the best optimization
log-likelihood to be considered convergent.
Nclose : int, optional
Number of best-fit results to be considered convergent (default is 3).
Nbest : int, optional
Number of best-fit results to be displayed (default is 100).
Returns
-------
np.array | None
Array of close enough results if the convergence criteria are met, None otherwise.
Raises
------
ValueError
If no files are found at the specified path or if an incorrect path naming convention is used.
"""
files = glob.glob(path)
if files == []:
Expand All @@ -26,23 +54,21 @@ def get_bestfit_params(path, lbounds, ubounds, output, delta, Nclose=3, Nbest=10
)
res, comments = [], []

for f in files:
fid = open(f, "r")
for line in fid.readlines():
if line.startswith("#"):
if line.startswith("# Log(likelihood)"):
params = line.rstrip()
else:
comments.append(line.rstrip())
continue
# Parse numerical result
try:

res.append([float(_) for _ in line.rstrip().split()])
except ValueError:
# Ignore lines with a parsing error
pass
fid.close()
for file in files:
with open(file, "r") as fid:
for line in fid.readlines():
if line.startswith("#"):
if line.startswith("# Log(likelihood)"):
params = line.rstrip()
else:
comments.append(line.rstrip())
continue
# Parse numerical result
try:
res.append([float(_) for _ in line.rstrip().split()])
except ValueError:
# Ignore lines with a parsing error
pass

if len(res) == 0:
print("No optimization results found.")
Expand All @@ -58,7 +84,7 @@ def get_bestfit_params(path, lbounds, ubounds, output, delta, Nclose=3, Nbest=10
try:
opt_ll = res[0][0]
except IndexError:
return None
return

# Filter out those results within delta threshold
close_enough = res[1 - (opt_ll / res[:, 0]) <= delta]
Expand Down Expand Up @@ -91,43 +117,72 @@ def get_bestfit_params(path, lbounds, ubounds, output, delta, Nclose=3, Nbest=10
return close_enough


def close2boundaries(params, lbounds, ubounds):
def close2boundaries(
params: list[float],
lbounds: list[float],
ubounds: list[float],
threshold: float = 0.01
) -> bool:
"""
Description:
Helper function for determining whether a parameter is close to the boundaries.
Determines whether any parameter is close to its boundaries within a specified threshold.
Parameters
----------
params : list[float]
Inferred parameters.
lbounds : list[float]
Lower bounds for the parameters.
ubounds : list[float]
Upper bounds for the parameters.
threshold : float, optional
Proportion of the boundary range that defines closeness (default is 0.01).
Returns
-------
bool
True if any parameter is within the threshold of its boundaries, False otherwise.
Arguments:
params list: Inferred parameters.
lbounds list: Lower bounds for the parameters.
ubounds list: Upper bounds for the parameters.
Returns:
is_close2boundaries: True, if any parameter is close to the boundaries;
False, otherwise.
"""
is_close2boundaries = False
for i in range(len(params)):
if ubounds[i] is not None and lbounds[i] is not None:
bound_range = ubounds[i] - lbounds[i]
if (params[i] - lbounds[i]) / bound_range < 0.01 or (
if (params[i] - lbounds[i]) / bound_range < threshold or (
ubounds[i] - params[i]
) / bound_range < 0.01:
) / bound_range < threshold:
is_close2boundaries = True
return is_close2boundaries


def boundary_filter(res, ubounds, lbounds):
def boundary_filter(
res: np.array,
ubounds: list[float],
lbounds: list[float]
) -> np.array:
"""
Description:
Helper function to filter out results where the params are outside the boundaries.
Arguments:
res numpy.array: Inference results stored as an array.
lbounds list: Lower bounds for the parameters.
ubounds list: Upper bounds for the parameters.
Filters inference results to exclude those where any parameter is outside specified boundaries.
Parameters
----------
res : np.array
Inference results stored as an array, where each row represents an inference result and
columns correspond to different parameters.
lbounds : list[float]
Lower bounds for each parameter.
ubounds : list[float]
Upper bounds for each parameter.
Returns
-------
np.array
Array with rows filtered based on boundaries.
Raises
------
ValueError
If the number of upper boundaries does not match the number of lower boundaries,
or if the number of boundaries does not match the number of parameters.
Returns:
res: Numpy array with values filtered based on boundaries
"""
# Filter out results where the params are outside the boundaries
# Doing this before getting opt_ll so it doesn't throw off convergence check
Expand Down
73 changes: 45 additions & 28 deletions dadi_cli/GenerateCache.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,35 +7,54 @@


def generate_cache(
func,
grids,
popt,
gamma_bounds,
gamma_pts,
additional_gammas,
output,
sample_sizes,
cpus,
gpus,
dimensionality,
):
func: callable,
grids: list[int],
popt: str,
gamma_bounds: list[str],
gamma_pts: int,
additional_gammas: list[float],
output: str,
sample_sizes: list[int],
cpus: int,
gpus: int,
dimensionality: int,
) -> None:
"""
Description:
Generates caches of frequency spectra for DFE inference.
Generates caches of frequency spectra for DFE inference using demographic models.
Parameters
----------
func : callable
A callable demographic model function from DFE.DemogSelModels.
grids : list[int]
Grid sizes for the frequency spectrum calculation.
popt : str
File name containing demographic parameters for inference.
gamma_bounds : list[str]
Range of population-scaled selection coefficients, specified as strings.
gamma_pts : int
Number of grid points for gamma integration.
additional_gammas : list[float]
List of additional gamma values to cache.
output : str
Output file name where the cache will be saved.
sample_sizes : list[int]
List of population sample sizes.
cpus : int
Number of CPUs to utilize.
gpus : int
Number of GPUs to utilize.
dimensionality : int
Dimensionality of the frequency spectrum (must be 1 or 2).
Raises
------
ValueError
If the dimensionality is not 1 or 2.
Arguments:
func function: dadi demographic models.
grids list: Grid sizes.
popt str: Name of the file containing demographic parameters for the inference.
gamma_bounds list: Range of population-scaled selection coefficients to cache.
gamma_pts int: Number of gamma grid points over which to integrate.
additional_gammas list: Additional positive population-scaled selection coefficients to cache for.
output str: Name of the output file.
sample_sizes list: Sample sizes of populations.
cpus int: Number of CPUs to use in cache generation.
gpus int: Number of GPUs to use in cache generation.
dimensionality int: Dimensionality of the frequency spectrum.
"""
if dimensionality not in [1, 2]:
raise ValueError(f"Invalid dimensionality {dimensionality}. Only 1 or 2 are accepted.")

if func is not getattr(DFE.DemogSelModels, 'equil'):
popt, theta = get_opts_and_theta(popt, gen_cache=True)
Expand Down Expand Up @@ -69,8 +88,6 @@ def generate_cache(
cpus=cpus,
gpus=gpus
)
else:
raise ValueError("--dimensionality only accepts 1 or 2.")

if (spectra.spectra < 0).sum() > 0:
print(
Expand Down
Loading

0 comments on commit f5fd232

Please sign in to comment.