Skip to content

Commit

Permalink
Merge pull request #104 from xin-huang/revision
Browse files Browse the repository at this point in the history
Update docstrings and add type hint
  • Loading branch information
xin-huang authored Apr 16, 2024
2 parents f506725 + d28cfac commit f5fd232
Show file tree
Hide file tree
Showing 23 changed files with 1,294 additions and 735 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9"]
python-version: ["3.9", "3.10"]
max-parallel: 5

steps:
Expand Down
2 changes: 1 addition & 1 deletion build-env.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ dependencies:
- numpy
- pip
- pytest-cov
- python==3.8
- python==3.9
- scipy
- snakemake==7.1.1
- pip:
Expand Down
163 changes: 109 additions & 54 deletions dadi_cli/BestFit.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,51 @@
import glob, sys
import numpy as np
from typing import Optional
from dadi_cli.Models import get_model
from dadi_cli.Pdfs import get_dadi_pdf_params


def get_bestfit_params(path, lbounds, ubounds, output, delta, Nclose=3, Nbest=100):
def get_bestfit_params(
path: str,
lbounds: list[float],
ubounds: list[float],
output: str,
delta: float,
Nclose: int = 3,
Nbest: int = 100
) -> Optional[np.array]:
"""
Description:
Obtains bestfit parameters.
Arguments:
path str: Path to results of inference.
lbounds list: Lower bounds of the optimized parameters.
ubounds list: Upper bounds of the optimized parameters.
output str: Name of the output file.
delta float: Max percentage difference for log-likelihoods compared to the best optimization
log-likelihood to be considered convergent.
Nclose int: Number of best-fit results to be considered convergent.
Nbest int: Number of best-fit results to be displayed.
Obtains best-fit parameters from optimization results, filters them based on log-likelihood
differences, boundaries, and ranks them, finally outputs the results to a specified file.
Parameters
----------
path : str
Path to results of inference.
lbounds : list[float]
Lower bounds of the optimized parameters.
ubounds : list[float]
Upper bounds of the optimized parameters.
output : str
Name of the output file where results will be written.
delta : float
Maximum difference for log-likelihoods compared to the best optimization
log-likelihood to be considered convergent.
Nclose : int, optional
Number of best-fit results to be considered convergent (default is 3).
Nbest : int, optional
Number of best-fit results to be displayed (default is 100).
Returns
-------
np.array | None
Array of close enough results if the convergence criteria are met, None otherwise.
Raises
------
ValueError
If no files are found at the specified path or if an incorrect path naming convention is used.
"""
files = glob.glob(path)
if files == []:
Expand All @@ -26,23 +54,21 @@ def get_bestfit_params(path, lbounds, ubounds, output, delta, Nclose=3, Nbest=10
)
res, comments = [], []

for f in files:
fid = open(f, "r")
for line in fid.readlines():
if line.startswith("#"):
if line.startswith("# Log(likelihood)"):
params = line.rstrip()
else:
comments.append(line.rstrip())
continue
# Parse numerical result
try:

res.append([float(_) for _ in line.rstrip().split()])
except ValueError:
# Ignore lines with a parsing error
pass
fid.close()
for file in files:
with open(file, "r") as fid:
for line in fid.readlines():
if line.startswith("#"):
if line.startswith("# Log(likelihood)"):
params = line.rstrip()
else:
comments.append(line.rstrip())
continue
# Parse numerical result
try:
res.append([float(_) for _ in line.rstrip().split()])
except ValueError:
# Ignore lines with a parsing error
pass

if len(res) == 0:
print("No optimization results found.")
Expand All @@ -58,7 +84,7 @@ def get_bestfit_params(path, lbounds, ubounds, output, delta, Nclose=3, Nbest=10
try:
opt_ll = res[0][0]
except IndexError:
return None
return

# Filter out those results within delta threshold
close_enough = res[1 - (opt_ll / res[:, 0]) <= delta]
Expand Down Expand Up @@ -91,43 +117,72 @@ def get_bestfit_params(path, lbounds, ubounds, output, delta, Nclose=3, Nbest=10
return close_enough


def close2boundaries(params, lbounds, ubounds):
def close2boundaries(
params: list[float],
lbounds: list[float],
ubounds: list[float],
threshold: float = 0.01
) -> bool:
"""
Description:
Helper function for determining whether a parameter is close to the boundaries.
Determines whether any parameter is close to its boundaries within a specified threshold.
Parameters
----------
params : list[float]
Inferred parameters.
lbounds : list[float]
Lower bounds for the parameters.
ubounds : list[float]
Upper bounds for the parameters.
threshold : float, optional
Proportion of the boundary range that defines closeness (default is 0.01).
Returns
-------
bool
True if any parameter is within the threshold of its boundaries, False otherwise.
Arguments:
params list: Inferred parameters.
lbounds list: Lower bounds for the parameters.
ubounds list: Upper bounds for the parameters.
Returns:
is_close2boundaries: True, if any parameter is close to the boundaries;
False, otherwise.
"""
is_close2boundaries = False
for i in range(len(params)):
if ubounds[i] is not None and lbounds[i] is not None:
bound_range = ubounds[i] - lbounds[i]
if (params[i] - lbounds[i]) / bound_range < 0.01 or (
if (params[i] - lbounds[i]) / bound_range < threshold or (
ubounds[i] - params[i]
) / bound_range < 0.01:
) / bound_range < threshold:
is_close2boundaries = True
return is_close2boundaries


def boundary_filter(res, ubounds, lbounds):
def boundary_filter(
res: np.array,
ubounds: list[float],
lbounds: list[float]
) -> np.array:
"""
Description:
Helper function to filter out results where the params are outside the boundaries.
Arguments:
res numpy.array: Inference results stored as an array.
lbounds list: Lower bounds for the parameters.
ubounds list: Upper bounds for the parameters.
Filters inference results to exclude those where any parameter is outside specified boundaries.
Parameters
----------
res : np.array
Inference results stored as an array, where each row represents an inference result and
columns correspond to different parameters.
lbounds : list[float]
Lower bounds for each parameter.
ubounds : list[float]
Upper bounds for each parameter.
Returns
-------
np.array
Array with rows filtered based on boundaries.
Raises
------
ValueError
If the number of upper boundaries does not match the number of lower boundaries,
or if the number of boundaries does not match the number of parameters.
Returns:
res: Numpy array with values filtered based on boundaries
"""
# Filter out results where the params are outside the boundaries
# Doing this before getting opt_ll so it doesn't throw off convergence check
Expand Down
73 changes: 45 additions & 28 deletions dadi_cli/GenerateCache.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,35 +7,54 @@


def generate_cache(
func,
grids,
popt,
gamma_bounds,
gamma_pts,
additional_gammas,
output,
sample_sizes,
cpus,
gpus,
dimensionality,
):
func: callable,
grids: list[int],
popt: str,
gamma_bounds: list[str],
gamma_pts: int,
additional_gammas: list[float],
output: str,
sample_sizes: list[int],
cpus: int,
gpus: int,
dimensionality: int,
) -> None:
"""
Description:
Generates caches of frequency spectra for DFE inference.
Generates caches of frequency spectra for DFE inference using demographic models.
Parameters
----------
func : callable
A callable demographic model function from DFE.DemogSelModels.
grids : list[int]
Grid sizes for the frequency spectrum calculation.
popt : str
File name containing demographic parameters for inference.
gamma_bounds : list[str]
Range of population-scaled selection coefficients, specified as strings.
gamma_pts : int
Number of grid points for gamma integration.
additional_gammas : list[float]
List of additional gamma values to cache.
output : str
Output file name where the cache will be saved.
sample_sizes : list[int]
List of population sample sizes.
cpus : int
Number of CPUs to utilize.
gpus : int
Number of GPUs to utilize.
dimensionality : int
Dimensionality of the frequency spectrum (must be 1 or 2).
Raises
------
ValueError
If the dimensionality is not 1 or 2.
Arguments:
func function: dadi demographic models.
grids list: Grid sizes.
popt str: Name of the file containing demographic parameters for the inference.
gamma_bounds list: Range of population-scaled selection coefficients to cache.
gamma_pts int: Number of gamma grid points over which to integrate.
additional_gammas list: Additional positive population-scaled selection coefficients to cache for.
output str: Name of the output file.
sample_sizes list: Sample sizes of populations.
cpus int: Number of CPUs to use in cache generation.
gpus int: Number of GPUs to use in cache generation.
dimensionality int: Dimensionality of the frequency spectrum.
"""
if dimensionality not in [1, 2]:
raise ValueError(f"Invalid dimensionality {dimensionality}. Only 1 or 2 are accepted.")

if func is not getattr(DFE.DemogSelModels, 'equil'):
popt, theta = get_opts_and_theta(popt, gen_cache=True)
Expand Down Expand Up @@ -69,8 +88,6 @@ def generate_cache(
cpus=cpus,
gpus=gpus
)
else:
raise ValueError("--dimensionality only accepts 1 or 2.")

if (spectra.spectra < 0).sum() > 0:
print(
Expand Down
Loading

0 comments on commit f5fd232

Please sign in to comment.