From d28cfac3f0a1eb99a38fae034f14f685777db71f Mon Sep 17 00:00:00 2001 From: xin-huang Date: Sun, 14 Apr 2024 17:32:13 +0200 Subject: [PATCH] Update docstrings and add type hint --- .github/workflows/build.yml | 2 +- build-env.yml | 2 +- dadi_cli/BestFit.py | 163 ++++++++---- dadi_cli/GenerateCache.py | 73 +++--- dadi_cli/GenerateFs.py | 149 +++++++---- dadi_cli/InferDFE.py | 130 ++++++---- dadi_cli/InferDM.py | 195 +++++++++------ dadi_cli/Models.py | 131 ++++++---- dadi_cli/Pdfs.py | 258 +++++++++---------- dadi_cli/Plot.py | 39 ++- dadi_cli/SimulateFs.py | 165 +++++++++---- dadi_cli/Stat.py | 355 ++++++++++++++++----------- dadi_cli/utilities.py | 102 +++++--- mkdocs-env.yml | 2 +- setup.py | 2 +- tests/example_data/example_models.py | 2 +- tests/test_GenerateCache.py | 21 ++ tests/test_GenerateFs.py | 38 +++ tests/test_InferDFE.py | 20 ++ tests/test_Models.py | 23 +- tests/test_Pdfs.py | 113 +++++---- tests/test_Plot.py | 3 + tests/test_SimulateFs.py | 41 +++- 23 files changed, 1294 insertions(+), 735 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b9174124..7cc595b1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9"] + python-version: ["3.9", "3.10"] max-parallel: 5 steps: diff --git a/build-env.yml b/build-env.yml index e6b6462d..35a00bce 100644 --- a/build-env.yml +++ b/build-env.yml @@ -14,7 +14,7 @@ dependencies: - numpy - pip - pytest-cov - - python==3.8 + - python==3.9 - scipy - snakemake==7.1.1 - pip: diff --git a/dadi_cli/BestFit.py b/dadi_cli/BestFit.py index 08375ee9..48de872b 100644 --- a/dadi_cli/BestFit.py +++ b/dadi_cli/BestFit.py @@ -1,23 +1,51 @@ import glob, sys import numpy as np +from typing import Optional from dadi_cli.Models import get_model from dadi_cli.Pdfs import get_dadi_pdf_params -def get_bestfit_params(path, lbounds, ubounds, output, delta, Nclose=3, Nbest=100): +def get_bestfit_params( + path: str, + lbounds: list[float], + ubounds: list[float], + output: str, + delta: float, + Nclose: int = 3, + Nbest: int = 100 +) -> Optional[np.array]: """ - Description: - Obtains bestfit parameters. - - Arguments: - path str: Path to results of inference. - lbounds list: Lower bounds of the optimized parameters. - ubounds list: Upper bounds of the optimized parameters. - output str: Name of the output file. - delta float: Max percentage difference for log-likliehoods compared to the best optimization - log-likliehood to be consider convergent. - Nclose int: Number of best-fit results to be consider convergent. - Nbest int: Number of best-fit results to be displayed. + Obtains best-fit parameters from optimization results, filters them based on log-likelihood + differences, boundaries, and ranks them, finally outputs the results to a specified file. + + Parameters + ---------- + path : str + Path to results of inference. + lbounds : list[float] + Lower bounds of the optimized parameters. + ubounds : list[float] + Upper bounds of the optimized parameters. + output : str + Name of the output file where results will be written. + delta : float + Maximum difference for log-likelihoods compared to the best optimization + log-likelihood to be considered convergent. + Nclose : int, optional + Number of best-fit results to be considered convergent (default is 3). + Nbest : int, optional + Number of best-fit results to be displayed (default is 100). 
+ + Returns + ------- + np.array | None + Array of close enough results if the convergence criteria are met, None otherwise. + + Raises + ------ + ValueError + If no files are found at the specified path or if an incorrect path naming convention is used. + """ files = glob.glob(path) if files == []: @@ -26,23 +54,21 @@ def get_bestfit_params(path, lbounds, ubounds, output, delta, Nclose=3, Nbest=10 ) res, comments = [], [] - for f in files: - fid = open(f, "r") - for line in fid.readlines(): - if line.startswith("#"): - if line.startswith("# Log(likelihood)"): - params = line.rstrip() - else: - comments.append(line.rstrip()) - continue - # Parse numerical result - try: - - res.append([float(_) for _ in line.rstrip().split()]) - except ValueError: - # Ignore lines with a parsing error - pass - fid.close() + for file in files: + with open(file, "r") as fid: + for line in fid.readlines(): + if line.startswith("#"): + if line.startswith("# Log(likelihood)"): + params = line.rstrip() + else: + comments.append(line.rstrip()) + continue + # Parse numerical result + try: + res.append([float(_) for _ in line.rstrip().split()]) + except ValueError: + # Ignore lines with a parsing error + pass if len(res) == 0: print("No optimization results found.") @@ -58,7 +84,7 @@ def get_bestfit_params(path, lbounds, ubounds, output, delta, Nclose=3, Nbest=10 try: opt_ll = res[0][0] except IndexError: - return None + return # Filter out those results within delta threshold close_enough = res[1 - (opt_ll / res[:, 0]) <= delta] @@ -91,43 +117,72 @@ def get_bestfit_params(path, lbounds, ubounds, output, delta, Nclose=3, Nbest=10 return close_enough -def close2boundaries(params, lbounds, ubounds): +def close2boundaries( + params: list[float], + lbounds: list[float], + ubounds: list[float], + threshold: float = 0.01 +) -> bool: """ - Description: - Helper function for detemining whether a parameter is close to the boundaries. + Determines whether any parameter is close to its boundaries within a specified threshold. + + Parameters + ---------- + params : list[float] + Inferred parameters. + lbounds : list[float] + Lower bounds for the parameters. + ubounds : list[float] + Upper bounds for the parameters. + threshold : float, optional + Proportion of the boundary range that defines closeness (default is 0.01). + + Returns + ------- + bool + True if any parameter is within the threshold of its boundaries, False otherwise. - Arguments: - params list: Inferred parameters. - lbounds list: Lower bounds for the parameters. - ubounds list: Upper bounds for the parameters. - - Returns: - is_close2boundaries: True, if any parameter is close to the boundaries; - False, otherwise. """ is_close2boundaries = False for i in range(len(params)): if ubounds[i] is not None and lbounds[i] is not None: bound_range = ubounds[i] - lbounds[i] - if (params[i] - lbounds[i]) / bound_range < 0.01 or ( + if (params[i] - lbounds[i]) / bound_range < threshold or ( ubounds[i] - params[i] - ) / bound_range < 0.01: + ) / bound_range < threshold: is_close2boundaries = True return is_close2boundaries -def boundary_filter(res, ubounds, lbounds): +def boundary_filter( + res: np.array, + ubounds: list[float], + lbounds: list[float] +) -> np.array: """ - Description: - Helper function to filter out results where the params are outside the boundaries. - - Arguments: - res numpy.array: Inference results stored as an array. - lbounds list: Lower bounds for the parameters. - ubounds list: Upper bounds for the parameters. 
+ Filters inference results to exclude those where any parameter is outside specified boundaries. + + Parameters + ---------- + res : np.array + Inference results stored as an array, where each row represents an inference result and + columns correspond to different parameters. + lbounds : list[float] + Lower bounds for each parameter. + ubounds : list[float] + Upper bounds for each parameter. + + Returns + ------- + np.array + Array with rows filtered based on boundaries. + + Raises + ------ + ValueError + If the number of upper boundaries does not match the number of lower boundaries, + or if the number of boundaries does not match the number of parameters. - Returns: - res: Numpy array with values filtered based on boundaries """ # Filter out results where the params are outside the boundaries # Doing this before getting opt_ll so it doesn't throw off convergence check diff --git a/dadi_cli/GenerateCache.py b/dadi_cli/GenerateCache.py index ce0ef832..ac1bfab0 100644 --- a/dadi_cli/GenerateCache.py +++ b/dadi_cli/GenerateCache.py @@ -7,35 +7,54 @@ def generate_cache( - func, - grids, - popt, - gamma_bounds, - gamma_pts, - additional_gammas, - output, - sample_sizes, - cpus, - gpus, - dimensionality, -): + func: callable, + grids: list[int], + popt: str, + gamma_bounds: list[str], + gamma_pts: int, + additional_gammas: list[float], + output: str, + sample_sizes: list[int], + cpus: int, + gpus: int, + dimensionality: int, +) -> None: """ - Description: - Generates caches of frequency spectra for DFE inference. + Generates caches of frequency spectra for DFE inference using demographic models. + + Parameters + ---------- + func : callable + A callable demographic model function from DFE.DemogSelModels. + grids : list[int] + Grid sizes for the frequency spectrum calculation. + popt : str + File name containing demographic parameters for inference. + gamma_bounds : list[str] + Range of population-scaled selection coefficients, specified as strings. + gamma_pts : int + Number of grid points for gamma integration. + additional_gammas : list[float] + List of additional gamma values to cache. + output : str + Output file name where the cache will be saved. + sample_sizes : list[int] + List of population sample sizes. + cpus : int + Number of CPUs to utilize. + gpus : int + Number of GPUs to utilize. + dimensionality : int + Dimensionality of the frequency spectrum (must be 1 or 2). + + Raises + ------ + ValueError + If the dimensionality is not 1 or 2. - Arguments: - func function: dadi demographic models. - grids list: Grid sizes. - popt str: Name of the file containing demographic parameters for the inference. - gamma_bounds list: Range of population-scaled selection coefficients to cache. - gamma_pts int: Number of gamma grid points over which to integrate. - additional_gammas list: Additional positive population-scaled selection coefficients to cache for. - output str: Name of the output file. - sample_sizes list: Sample sizes of populations. - cpus int: Number of CPUs to use in cache generation. - gpus int: Number of GPUs to use in cache generation. - dimensionality int: Dimensionality of the frequency spectrum. """ + if dimensionality not in [1, 2]: + raise ValueError(f"Invalid dimensionality {dimensionality}. 
Only 1 or 2 are accepted.") if func is not getattr(DFE.DemogSelModels, 'equil'): popt, theta = get_opts_and_theta(popt, gen_cache=True) @@ -69,8 +88,6 @@ def generate_cache( cpus=cpus, gpus=gpus ) - else: - raise ValueError("--dimensionality only accepts 1 or 2.") if (spectra.spectra < 0).sum() > 0: print( diff --git a/dadi_cli/GenerateFs.py b/dadi_cli/GenerateFs.py index 3e77e41f..ed7fb87d 100644 --- a/dadi_cli/GenerateFs.py +++ b/dadi_cli/GenerateFs.py @@ -1,43 +1,62 @@ -import dadi -import random +import dadi, random def generate_fs( - vcf, - output, - pop_ids, - pop_info, - projections, - subsample, - polarized, - marginalize_pops, - bootstrap, - chunk_size, - masking, - seed, -): + vcf: str, + output: str, + pop_ids: list[str], + pop_info: str, + projections: list[int], + subsample: bool, + polarized: bool, + marginalize_pops: list[str], + bootstrap: int, + chunk_size: int, + masking: str, + seed: int, +) -> None: """ - Description: - Generates a frequency spectrum from a VCF file. - - Arguments: - vcf str: Name of the VCF file containing genotype data. - output str: Name of the output file containing frequency spectra. - pop_ids list: List of population ids. - pop_info str: Name of the file containing population information. - projections list: List of sample sizes after projection. - subsample bool: If True, spectrum is generated with sub-samples; - Otherwise, spectrum is generated with all the samples. - polarized bool: If True, unfolded spectrum is generated; - Otherwise, folded spectrum is generated. - margnialize_pops list: List of population ids to remove from the frequency spectrum. - bootstrap int: Times to perform bootstrapping. - chunk_size int: Chunk size to divide the genomes for bootstrapping. - masking str: If masking == 'singleton', singletons in each population are masked; - If masking == 'shared', singletons in each population and shared across populations are masked; - Otherwise, singletons are not masked. - seed int: Seed for generating random numbers. + Generates a frequency spectrum from a VCF file using various settings for data manipulation and analysis. + + Parameters + ---------- + vcf : str + Name of the VCF file containing genotype data. + output : str + Name of the output file containing frequency spectra. + pop_ids : list[str] + List of population ids. + pop_info : str + Name of the file containing population information. + projections : list[int] + List of sample sizes after projection. + subsample : bool + If True, spectrum is generated with sub-samples; otherwise, spectrum is generated with all samples. + polarized : bool + If True, unfolded spectrum is generated; otherwise, folded spectrum is generated. + marginalize_pops : list[str] + List of population ids to remove from the frequency spectrum. If None, no populations are marginalized. + bootstrap : int + Number of times to perform bootstrapping. If None, bootstrapping is not performed. + chunk_size : int + Chunk size to divide the genomes for bootstrapping. + masking : str + Determines the masking method for singletons: + 'singleton' - Masks singletons in each population, + 'shared' - Masks singletons in each population and those shared across populations, + '' - No masking is applied. + seed : int + Seed for generating random numbers. If None, a random seed is used. + + Raises + ------ + ValueError + If the lengths of `pop_ids` and `projections` do not match. 
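+
+    Examples
+    --------
+    A minimal, illustrative sketch of a single-population run; the VCF,
+    population-info, and output paths below are hypothetical placeholders,
+    not files shipped with dadi-cli:
+
+    >>> generate_fs(
+    ...     vcf="variants.vcf", output="pop1.fs", pop_ids=["pop1"],
+    ...     pop_info="popinfo.txt", projections=[20], subsample=False,
+    ...     polarized=True, marginalize_pops=None, bootstrap=None,
+    ...     chunk_size=0, masking="", seed=None,
+    ... )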
+ """ + if len(pop_ids) != len(projections): + raise ValueError("The lengths of `pop_ids` and `projections` must match.") + if subsample: subsample_dict = {} for i in range(len(pop_ids)): @@ -52,33 +71,42 @@ def generate_fs( fs = dadi.Spectrum.from_data_dict( dd, pop_ids=pop_ids, projections=projections, polarized=polarized ) - if marginalize_pops != None: + if marginalize_pops is not None: fs = _marginalized_fs(fs, marginalize_pops, pop_ids) if masking != "": _mask_entries(fs, masking) fs.to_file(output) else: - if seed != None: + if seed is not None: random.seed(seed) fragments = dadi.Misc.fragment_data_dict(dd, chunk_size) bootstrap_list = dadi.Misc.bootstraps_from_dd_chunks( fragments, bootstrap, pop_ids, projections, polarized ) for fs, b in zip(bootstrap_list, range(len(bootstrap_list))): - if marginalize_pops != None: + if marginalize_pops is not None: fs = _marginalized_fs(fs, marginalize_pops, pop_ids) if masking != "": _mask_entries(fs, masking) fs.to_file(output + ".bootstrapping." + str(b) + ".fs") -def _mask_entries(fs, masking): + +def _mask_entries(fs: dadi.Spectrum, masking: str) -> None: """ - Description: - Helper function for masking singletons in a frequency spectrum. + Masks entries in the frequency spectrum based on the number of populations and the specified masking strategy. + + Parameters + ---------- + fs : dadi.Spectrum + The frequency spectrum object, which contains genotype frequency data across populations. + masking : str + Masking shared singletons if masking == 'shared'. + + Raises + ------ + ValueError + If the frequency spectrum contains more than 3 populations, as the current implementation does not support more. - Arguments: - fs dadi.Spectrum: Frequency spectrum. - masking str: Masking shared singletons if masking == 'shared'. """ if len(fs.sample_sizes) == 1: fs.mask[1] = True @@ -111,19 +139,34 @@ def _mask_entries(fs, masking): ) -def _marginalized_fs(fs, marginalize_pops, pop_ids): +def _marginalized_fs(fs: dadi.Spectrum, marginalize_pops: list[str], + pop_ids: list[str]) -> dadi.Spectrum: """ - Description: - Helper function for generating a marginalized frequency spectrum. + Generates a marginalized frequency spectrum by removing specified populations. + + Parameters + ---------- + fs : dadi.Spectrum + Frequency spectrum for marginalization. + marginalize_pops : list[str] + List of population ids to remove from the frequency spectrum. + pop_ids : list[str] + List of all the population ids in the frequency spectrum. - Arguments: - fs dadi.Spectrum: Frequency spectrum for marginalization. - marginalize_pops list: List of population ids to remove from the frequency spectrum. - pop_ids list: List of all the population ids in the frequency spectrum. + Returns + ------- + dadi.Spectrum + Marginalized frequency spectrum. + + Raises + ------ + ValueError + If any of the populations in marginalize_pops are not in pop_ids. - Returns: - mfs dadi.Spectrum: Marginalized frequency spectrum. 
""" + if not set(marginalize_pops).issubset(set(pop_ids)): + raise ValueError("All populations to marginalize must be in the list of population ids.") + marginalize_list = [pop_ids.index(pop) for pop in marginalize_pops] mfs = fs.marginalize(marginalize_list) return mfs diff --git a/dadi_cli/InferDFE.py b/dadi_cli/InferDFE.py index 36358bb1..e07c1e4f 100644 --- a/dadi_cli/InferDFE.py +++ b/dadi_cli/InferDFE.py @@ -8,71 +8,99 @@ def infer_dfe( - fs, - cache1d, - cache2d, - sele_dist, - sele_dist2, - theta, - p0, - upper_bounds, - lower_bounds, - fixed_params, - misid, - cuda, - maxeval, - maxtime, - bestfits=None, - seed=None, -): + fs: dadi.Spectrum, + cache1d: dadi.DFE.Cache1D_mod, + cache2d: dadi.DFE.Cache2D_mod, + sele_dist: str, + sele_dist2: str, + theta: float, + p0: list[float], + upper_bounds: list[float], + lower_bounds: list[float], + fixed_params: list[float], + misid: bool, + cuda: bool, + maxeval: int, + maxtime: int, + bestfits: list[float] = None, + seed: int = None, +) -> tuple[float, list[float], float]: """ - Description: - DFE inference. - - Arguments: - fs dadi.Spectrum: Frequency spectrum. - cache1d dadi.DFE.Cache1D_mod: Caches of 1d frequency spectra for DFE inference. - cache2d dadi.DFE.Cache2D_mod: Caches of 2d frequency spectra for DFE inference. - sele_dist str: Name of the 1d PDF function for modeling DFEs. - sele_dist2 str: Name of the 2d PDF function for modeling DFEs. - theta float: Population-scaled mutation rate inferred from the demographic model without selection. - p0 list: Initial parameter values for inference. - upper_bounds list: Upper bounds of the optimized parameters. - lower_bounds list: Lower bounds of the optimized parameters. - fixed_params list: Fixed parameters during the inference. - misid bool: If True, add a parameter for modeling ancestral state misidentification when data are polarized. - cuda bool: If True, use GPU to speed up calculation; - Otherwise, use CPU to do calculation. - maxeval int: Max number of parameter set evaluations tried for optimizing demography. - maxtime int: Max amount of time for optimizing demography. - seed int: Seed for generating random numbers. - - Returns: - ll_model float: Log(likelihood) of the inferred model. - popt list: Optimized parameters. - theta float: Population-scaled mutation rate inferred from the demographic model with selection. + Performs Distribution of Fitness Effects (DFE) inference using either 1D or 2D frequency spectra, or a mixture of both. + + Parameters + ---------- + fs : dadi.Spectrum + Frequency spectrum from which to infer the DFE. + cache1d : dadi.DFE.Cache1D_mod + Caches of 1D frequency spectra for DFE inference. + cache2d : dadi.DFE.Cache2D_mod + Caches of 2D frequency spectra for DFE inference. + sele_dist : str + Identifier for the 1D Probability Density Function (PDF) used for modeling DFEs. + sele_dist2 : str + Identifier for the 2D PDF used for modeling DFEs. + theta : float + Population-scaled mutation rate inferred from the demographic model without selection. + p0 : list[float] + Initial parameter values for the inference. + upper_bounds : list[float] + Upper bounds of the parameters for optimization. + lower_bounds : list[float] + Lower bounds of the parameters for optimization. + fixed_params : list[float] + Parameters that are held fixed during the optimization. Use `None` for parameters that are free to vary. + misid : bool + If True, incorporates a parameter to model ancestral state misidentification. 
+ cuda : bool + If True, enables GPU acceleration for calculations. Otherwise, calculations are performed using the CPU. + maxeval : int + Maximum number of parameter set evaluations attempted during optimization. + maxtime : int + Maximum time allowed for optimization, in seconds. + bestfits : list[float], optional + List of best-fit parameters from previous runs to use as initial values. Randomly selects from these if provided. + seed : int, optional + Seed for random number generation, affecting the randomization of initial parameters and any stochastic processes. + + Returns + ------- + ll_model : float + Log-likelihood of the inferred model. + popt : list[float] + Optimized parameters. + theta : float + Population-scaled mutation rate inferred from the demographic model with selection. + + Raises + ------ + ValueError + If neither `cache1d` nor `cache2d` is provided, an error is raised. + """ + if not cache1d and not cache2d: + raise ValueError("At least one of cache1d or cache2d must be provided.") # Randomize starting parameter values - if seed != None: + if seed is not None: np.random.seed(seed) - if bestfits != None: + if bestfits is not None: p0 = bestfits[np.random.randint(len(bestfits))%10] - if cache1d != None: + if cache1d is not None: func = cache1d.integrate - if cache2d != None: + if cache2d is not None: func = cache2d.integrate - if sele_dist != None: + if sele_dist is not None: sele_dist = get_dadi_pdf(sele_dist) - if sele_dist2 != None: + if sele_dist2 is not None: sele_dist2 = get_dadi_pdf(sele_dist2) - if (sele_dist == None) and (sele_dist2 != None): + if (sele_dist is None) and (sele_dist2 is not None): sele_dist = sele_dist2 - if (cache1d != None) and (cache2d != None): + if (cache1d is not None) and (cache2d is not None): func = dadi.DFE.Cache2D_mod.mixture func_args = [cache1d, cache2d, sele_dist, sele_dist2, theta] else: @@ -107,7 +135,7 @@ def infer_dfe( ) # Get expected SFS for MLE - if (cache1d != None) and (cache2d != None): + if (cache1d is not None) and (cache2d is not None): model = func(popt, None, cache1d, cache2d, sele_dist, sele_dist2, theta, None) else: model = func(popt, None, sele_dist, theta, None) diff --git a/dadi_cli/InferDM.py b/dadi_cli/InferDM.py index b6acee24..21c26ffd 100644 --- a/dadi_cli/InferDM.py +++ b/dadi_cli/InferDM.py @@ -5,55 +5,77 @@ def infer_demography( - fs, - func, - p0, - pts_l, - upper_bounds, - lower_bounds, - fixed_params, - misid, - cuda, - maxeval, - maxtime, - bestfits=None, - seed=None, -): + fs: dadi.Spectrum, + func: callable, + p0: list[float], + pts_l: tuple[int], + upper_bounds: list[float], + lower_bounds: list[float], + fixed_params: list[float], + misid: bool, + cuda: bool, + maxeval: int, + maxtime: int, + bestfits: list[float] = None, + seed: int = None, +) -> tuple[float, list[float], float]: """ - Description: - Demographic inference. - - Arguments: - fs dadi.Spectrum: Frequence spectrum. - func function: Demographic model for modeling. - p0 list: Initial parameter values for inference. - pts_l tuple: Grid sizes for modeling. - upper_bounds list: Upper bounds of the optimized parameters. - lower_bounds list: Lower bounds of the optimized parameters. - fixed_params list: Fixed parameters during the inference. - misid bool: If True, add a parameter for modeling ancestral state misidentification when data are polarized. - cuda bool: If True, use GPU to speed up calculation; - Otherwise, use CPU to do calculation. - maxeval int: Max number of parameter set evaluations tried for optimization. 
- maxtime int: Max amount of time for optimization. - bestfits list: Best-fit parameters. - seed int: Seed for generating random numbers. - - Returns: - ll_model float: Log(likelihood) of the inferred model. - popt list: Optimized parameters. - theta float: Population-scaled mutation rate inferred from the demographic model. + Performs demographic inference using a frequency spectrum and a demographic model. + + Parameters + ---------- + fs : dadi.Spectrum + Frequency spectrum for demographic inference. + func : callable + Demographic model function used for modeling. + p0 : list[float] + Initial parameter values for inference. + pts_l : tuple[int], optional + Grid sizes used in the demographic model. + upper_bounds : list[float] + Upper bounds of the optimized parameters. + lower_bounds : list[float] + Lower bounds of the optimized parameters. + fixed_params : list[float] + Parameters that are held fixed during the inference. + misid : bool + If True, incorporates a parameter to model ancestral state misidentification. + cuda : bool + If True, uses GPU acceleration for calculations; otherwise, uses CPU. + maxeval : int + Maximum number of parameter set evaluations during optimization. + maxtime : int + Maximum amount of time (in seconds) allowed for optimization. + bestfits : list[float], optional + List of best-fit parameters from previous runs to use as initial values. + seed : int, optional + Seed for random number generation, affecting initial parameter perturbation. + + Returns + ------- + float + Log likelihood of the inferred demographic model. + list[float] + List of optimized parameters. + float + Population-scaled mutation rate (theta) inferred from the demographic model. + + Raises + ------ + nlopt.RoundoffLimited + If optimization fails due to roundoff errors, likely indicating issues with parameter bounds or initial values. + """ # Set seed for starting parameter values when using WorkQueue - if seed != None: + if seed is not None: np.random.seed(seed) # TODO: Need to consider appropriate rtol & atol values, and whether these maxeval are appropriate if cuda: dadi.cuda_enabled(True) - if bestfits != None: + if bestfits is not None: p0 = bestfits[np.random.randint(len(bestfits))%10] if misid: @@ -100,49 +122,65 @@ def infer_demography( return ll_model, popt, theta - def infer_global_opt( - fs, - func, - p0, - pts_l, - upper_bounds, - lower_bounds, - fixed_params, - misid, - cuda, - maxeval, - maxtime, - global_algorithm, - seed=None, -): + fs: dadi.Spectrum, + func: callable, + p0: list[float], + pts_l: tuple[int], + upper_bounds: list[float], + lower_bounds: list[float], + fixed_params: list[float], + misid: bool, + cuda: bool, + maxeval: int, + maxtime: int, + global_algorithm: str, + seed: int = None, +) -> tuple[float, list[float], float]: """ - Description: - Demographic inference with global optimization. - - Arguments: - fs dadi.Spectrum: Frequence spectrum. - func function: Demographic model for modeling. - p0 list: Initial parameter values for inference. - pts_l tuple: Grid sizes for modeling. - upper_bounds list: Upper bounds of the optimized parameters. - lower_bounds list: Lower bounds of the optimized parameters. - fixed_params list: Fixed parameters during the inference. - misid bool: If True, add a parameter for modeling ancestral state misidentification when data are polarized. - cuda bool: If True, use GPU to speed up calculation; - Otherwise, use CPU to do calculation. - maxeval int: Max number of parameter set evaluations tried for optimization. 
- maxtime int: Max amount of time for optimization. - global_algorithm str: Algorithm for global optimization. - seed int: Seed for generating random numbers. - - Returns: - ll_model float: Log(likelihood) of the inferred model. - popt list: Optimized parameters. - theta float: Population-scaled mutation rate inferred from the demographic model. + Performs demographic inference using global optimization techniques. + + Parameters + ---------- + fs : dadi.Spectrum + Frequency spectrum from which to infer demographics. + func : callable + Demographic model function to be fitted. + p0 : list[float] + Initial parameter values for the optimization. + pts_l : tuple[int] + Grid sizes used in the numerical integration of the demographic model. + upper_bounds : list[float] + Upper bounds for the parameters during optimization. + lower_bounds : list[float] + Lower bounds for the parameters during optimization. + fixed_params : list[float] + Parameters that are held fixed during the optimization; None for parameters that are free. + misid : bool + If True, incorporate a model for ancestral state misidentification. + cuda : bool + Enable GPU acceleration for calculations. + maxeval : int + Maximum number of evaluations for the optimizer. + maxtime : int + Maximum time (in seconds) allowed for optimization. + global_algorithm : str + Name of the global optimization algorithm to use. + seed : int, optional + Random seed for reproducibility. + + Returns + ------- + float + Log likelihood of the best fit model. + list[float] + Optimized parameter values. + float + Population-scaled mutation rate (theta) based on the best fit model. + """ # Randomize starting parameter values - if seed != None: + if seed is not None: np.random.seed(seed) if cuda: @@ -189,4 +227,5 @@ def infer_global_opt( model = func_ex(popt, fs.sample_sizes, pts_l) ll_global = dadi.Inference.ll_multinom(model, fs) theta = dadi.Inference.optimal_sfs_scaling(model, fs) + return ll_global, popt, theta diff --git a/dadi_cli/Models.py b/dadi_cli/Models.py index 3ec69f69..5b6c86b4 100644 --- a/dadi_cli/Models.py +++ b/dadi_cli/Models.py @@ -1,8 +1,5 @@ -import dadi +import dadi, importlib, os, sys import dadi.DFE as DFE -import sys -import os -import importlib from inspect import getmembers, isfunction duplicated_models = ["snm", "bottlegrowth"] @@ -36,33 +33,56 @@ sele_models.remove(m) -def get_model(model_name, model_file=None): +def get_model( + model_name: str, + model_file: str = None +) -> tuple[callable, list[str]]: """ - Description: - Obtains a demographic model and its parameters. + Obtains a demographic model and its parameters from either a predefined catalog or a specified file. - Arguments: - model_name str: Name of the demographic model. - model_file str: Path and name of the file containing customized models. + Parameters + ---------- + model_name : str + Name of the demographic model. + model_file : str, optional + Path and name of the file containing customized models. Defaults to None, which uses predefined models. + + Returns + ------- + tuple + A tuple containing: + - function: The demographic model function for modeling. + - list[str]: List of parameter names required by the model. + + Raises + ------ + ImportError + If the specified model file cannot be imported. + ValueError + If the demographic model or its parameter names cannot be found or are undefined. - Returns: - func function: Demographic model for modeling. - params list: List of parameters. 
""" model_name0 = model_name - if model_file != None: + if model_file is not None: # If the user has the model folder in their PATH - try: - func = getattr(importlib.import_module(model_file), model_name) + #try: + # func = getattr(importlib.import_module(model_file), model_name) # If the user does not have the model folder in their PATH we add it # This currently can mess with the User's PATH while running dadi-cli - except: - model_file = os.path.abspath(model_file) - model_path = os.path.dirname(model_file) - model_file = os.path.basename(model_file) - model_file = os.path.splitext(model_file)[0] + #except: + # model_file = os.path.abspath(model_file) + # model_path = os.path.dirname(model_file) + # model_file = os.path.basename(model_file) + # model_file = os.path.splitext(model_file)[0] + # sys.path.append(model_path) + # func = getattr(importlib.import_module(model_file), model_name) + try: + module_name = os.path.splitext(os.path.basename(model_file))[0] + model_path = os.path.dirname(os.path.abspath(model_file)) sys.path.append(model_path) - func = getattr(importlib.import_module(model_file), model_name) + func = getattr(importlib.import_module(module_name), model_name) + except ImportError as e: + raise ImportError(f"Failed to import module: {model_file}") from e elif model_name in oned_models: func = getattr(dadi.Demographics1D, model_name) elif model_name in twod_models: @@ -74,20 +94,29 @@ def get_model(model_name, model_file=None): else: raise ValueError(f"Cannot find model: {model_name}.") - try: - params = func.__param_names__ - except: + if not hasattr(func, '__param_names__'): raise ValueError( f"Demographic model needs a .__param_names__ attribute!\nAdd one by adding the line {model_name0}.__param_name__ = [LIST_OF_PARAMS]\nReplacing LIST_OF_PARAMS with the names of the parameters as strings." ) - return func, params + return func, func.__param_names__ -def print_built_in_models(): +def print_built_in_models() -> None: """ - Description: - Outputs built-in models in dadi. + Prints the names of all built-in demographic models available in dadi, categorized by their dimensions + and whether they incorporate selection. + + The output includes: + - 1D demographic models + - 2D demographic models including those from Portik et al. (2017) + - 3D demographic models including those from Portik et al. (2017) + - Models that incorporate selection + + Notes + ----- + 3D models are available only in dadi versions >= 2.3.2. + """ print("Built-in 1D dadi demographic models:") for m in oned_models: @@ -100,10 +129,11 @@ def print_built_in_models(): print() print("Built-in 3D dadi and Portik et al. (2017) demographic models:") - for m in threed_models: - print(f"- {m}") - if threed_models == []: - print("- dadi version is < 2.3.2, upgdate for simple access to 3D models added to dadi.") + if threed_models: + for m in threed_models: + print(f"- {m}") + else: + print("- dadi version is < 2.3.2, please update for simple access to 3D models added to dadi.") print() print("Built-in demographic models with selection:") @@ -111,22 +141,31 @@ def print_built_in_models(): print(f"- {m}") -def print_built_in_model_details(model_name): +def print_built_in_model_details(model_name: str) -> None: """ - Description: - Outputs details of built-in models. + Prints detailed documentation for a specified built-in demographic model. + + Parameters + ---------- + model_name : str + The name of the built-in model to retrieve details for. 
+ + Raises + ------ + ValueError + If the model cannot be found or the model documentation is not accessible. - Arguments: - model_name str: Name of the built-in model. """ - func, params = get_model(model_name) - model_doc = func.__doc__ - model_doc_new = "" - for ele in model_doc.split("\n"): - if "ns:" not in ele and "pts:" not in ele and "n1" not in ele: - model_doc_new += "\t" + ele.strip() + "\n" - model_doc_new = model_doc_new.strip() try: + func, params = get_model(model_name) + model_doc = func.__doc__ + model_doc_new = "" + for ele in model_doc.split("\n"): + if "ns:" not in ele and "pts:" not in ele and "n1" not in ele: + model_doc_new += "\t" + ele.strip() + "\n" + model_doc_new = model_doc_new.strip() print(f"- {model_name}:\n\n\t{model_doc_new}\n") - except: + except AttributeError: + raise ValueError(f"Documentation for the model '{model_name}' is not available.") + except Exception as e: raise ValueError(f"Cannot find model: {model_name}.") diff --git a/dadi_cli/Pdfs.py b/dadi_cli/Pdfs.py index b4f274d8..a3722236 100644 --- a/dadi_cli/Pdfs.py +++ b/dadi_cli/Pdfs.py @@ -1,17 +1,26 @@ import dadi.DFE as DFE -def get_dadi_pdf(pdf): +def get_dadi_pdf(pdf: str) -> callable: """ - Description: - Obtains a built-in probability density function for - modeling distribution of fitness effects in dadi. + Obtains a built-in probability density function for modeling the distribution + of fitness effects in dadi. - Arguments: - pdf string: Name of the probability density function. + Parameters + ---------- + pdf : str + Name of the probability density function. + + Returns + ------- + callable + The probability density function from DFE.PDFs module. + + Raises + ------ + ValueError + If the specified probability density function name is not recognized. - Returns: - func DFE.PDFs: PDF for modeling. """ if pdf == "beta": func = DFE.PDFs.beta @@ -30,145 +39,148 @@ def get_dadi_pdf(pdf): elif "mixture" in pdf: func = DFE.mixture else: - raise Exception("Probability density function " + pdf + " is not available!") + raise ValueError("Probability density function " + pdf + " is not available!") return func -def get_dadi_pdf_params(pdf): - """ - Description: - Obtains a list of parameters for a given built-in probability density function - in dadi. - - Arguments: - pdf string: Name of the probability density function. - - Returns: - params list: List of parameters. - """ - if pdf == "beta": - params = ["alpha", "beta"] - elif pdf == "biv_sym_ind_gamma": - params = ["shape", "scale"] - elif pdf == "biv_asym_ind_gamma": - params = ["shape1", "scale1", "shape2", "scale2"] - elif pdf == "biv_sym_lognormal": - params = ["log_mu", "log_sigma", "rho"] - elif pdf == "biv_asym_lognormal": - params = ["log_mu1", "log_sigma1", "log_mu2", "log_sigma2", "rho"] - elif pdf == "exponential": - params = ["scale"] - elif pdf == "gamma": - params = ["shape", "scale"] - elif pdf == "lognormal": - params = ["log_mu", "log_sigma"] - elif pdf == "mixture_gamma": - params = ["shape", "scale", "w"] - elif pdf == "mixture_lognormal": - params = ["log_mu", "log_sigma", "rho", "w"] - elif pdf == "normal": - params = ["mu", "sigma"] +def get_dadi_pdf_params(pdf: str) -> list[str]: + """ + Obtains a list of parameters for a given built-in probability density function in dadi. + + Parameters + ---------- + pdf : str + Name of the probability density function. + + Returns + ------- + list[str] + List of parameter names associated with the specified PDF. 
+ + Raises + ------ + ValueError + If the specified probability density function name is not recognized. + + """ + params_dict = { + "beta": ["alpha", "beta"], + "biv_sym_ind_gamma": ["shape", "scale"], + "biv_asym_ind_gamma": ["shape1", "scale1", "shape2", "scale2"], + "biv_sym_lognormal": ["log_mu", "log_sigma", "rho"], + "biv_asym_lognormal": ["log_mu1", "log_sigma1", "log_mu2", "log_sigma2", "rho"], + "exponential": ["scale"], + "gamma": ["shape", "scale"], + "lognormal": ["log_mu", "log_sigma"], + "mixture_gamma": ["shape", "scale", "w"], + "mixture_lognormal": ["log_mu", "log_sigma", "rho", "w"], + "normal": ["mu", "sigma"] + } + + if pdf in params_dict: + return params_dict[pdf] else: - raise Exception("Probability density function " + pdf + " is not available!") + raise ValueError(f"Probability density function '{pdf}' is not available!") return params -def print_available_pdfs(): +def print_available_pdfs() -> None: """ - Description: - Prints out available built-in probability density functions in dadi. + Prints out available built-in probability density functions in dadi. + + This function lists all the PDFs that can be used with the dadi library for demographic modeling, + providing a quick reference for users to know what options are available. + """ + pdfs = [ + "beta", "biv_ind_gamma", "biv_lognormal", + "exponential", "gamma", "lognormal", + "normal", "mixture" + ] + print("Available probability density functions:") - print("- beta") - print("- biv_ind_gamma") - print("- biv_lognormal") - print("- exponential") - print("- gamma") - print("- lognormal") - print("- normal") - print("- mixture") + for pdf in pdfs: + print(f"- {pdf}") -def print_pdf_details(pdf_name): +def print_pdf_details(pdf_name: str) -> None: """ - Description: - Prints out the details of a given built-in probability density function - in dadi. + Prints out the details of a given built-in probability density function in dadi. - Arguments: - pdf_name string: Name of the probability density function. - """ + Parameters + ---------- + pdf_name : str + Name of the probability density function. - beta = """ - Beta probability density function. + Raises + ------ + ValueError + If the specified probability density function name is not recognized. - params = [alpha, beta] """ - biv_ind_gamma = """ - Bivariate independent gamma probability density function. + pdf_details = { + "beta" : """ + Beta probability density function. - If len(params) == 2, then params = [alpha, beta] by assuming alpha and beta are equal in the two populations - If len(params) == 4, then params = [alpha1, alpha2, beta1, beta2] - If len(params) == 3 or 5, then the last parameter is ignored - """ - biv_lognormal = """ - Bivariate lognormal probability density function. + params = [alpha, beta] + """, - If len(params) == 3, then params = [mu, sigma, rho] by assuming mu and sigma are equal in the two populations - If len(params) == 5, then params = [mu1, mu2, sigma1, sigma2, rho] - """ - exponential = """ - Exponential probability density function. + "biv_ind_gamma" : """ + Bivariate independent gamma probability density function. - params = [scale] - """ - gamma = """ - Gamma probability density function. 
+ If len(params) == 2, then params = [alpha, beta] by assuming alpha and beta are equal in the two populations + If len(params) == 4, then params = [alpha1, alpha2, beta1, beta2] + If len(params) == 3 or 5, then the last parameter is ignored + """, - params = [alpha, beta] = [shape, scale] - """ - lognormal = """ - Lognormal probability density function. + "biv_lognormal" : """ + Bivariate lognormal probability density function. - params = [log_mu, log_sigma] - """ - normal = """ - Normal probability density function. + If len(params) == 3, then params = [mu, sigma, rho] by assuming mu and sigma are equal in the two populations + If len(params) == 5, then params = [mu1, mu2, sigma1, sigma2, rho] + """, - params = [mu, sigma] - """ - mixture = """ - Weighted summation of 1d and 2d distributions that share parameters. - The 1d distribution is equivalent to assuming selection coefficients are - perfectly correlated. - - params: Parameters for potential optimization. - It is assumed that last parameter is the weight for the 2d dist. - The second-to-last parameter is assumed to be the correlation - coefficient for the 2d distribution. - The remaining parameters as assumed to be shared between the - 1d and 2d distributions. - """ + "exponential" : """ + Exponential probability density function. + + params = [scale] + """, + + "gamma" : """ + Gamma probability density function. + + params = [alpha, beta] = [shape, scale] + """, + + "lognormal" : """ + Lognormal probability density function. + + params = [log_mu, log_sigma] + """, + + "normal" : """ + Normal probability density function. + + params = [mu, sigma] + """, + + "mixture" : """ + Weighted summation of 1d and 2d distributions that share parameters. + The 1d distribution is equivalent to assuming selection coefficients are + perfectly correlated. + + params: Parameters for potential optimization. + It is assumed that last parameter is the weight for the 2d dist. + The second-to-last parameter is assumed to be the correlation + coefficient for the 2d distribution. + The remaining parameters as assumed to be shared between the + 1d and 2d distributions. + """, + } - if pdf_name == "beta": - print("- beta:\n" + beta) - elif pdf_name == "biv_ind_gamma": - print("- biv_ind_gamma:\n" + biv_ind_gamma) - elif pdf_name == "biv_lognormal": - print("- biv_lognormal:\n" + biv_lognormal) - elif pdf_name == "exponential": - print("- exponential:\n" + exponential) - elif pdf_name == "gamma": - print("- gamma:\n" + gamma) - elif pdf_name == "lognormal": - print("- lognormal:\n" + lognormal) - elif pdf_name == "normal": - print("- normal:\n" + normal) - elif pdf_name == "mixture": - print("- mixture:\n" + mixture) + if pdf_name in pdf_details: + print(f"- {pdf_name}:\n{pdf_details[pdf_name]}") else: - raise Exception( - "Probability density function " + pdf_name + " is not available!" - ) + raise ValueError(f"Probability density function '{pdf_name}' is not available!") diff --git a/dadi_cli/Plot.py b/dadi_cli/Plot.py index edbe362c..b00153d1 100644 --- a/dadi_cli/Plot.py +++ b/dadi_cli/Plot.py @@ -7,22 +7,40 @@ from dadi_cli.Pdfs import get_dadi_pdf from dadi_cli.utilities import pts_l_func + fig = plt.figure(figsize=(8, 6)) -def plot_single_sfs(fs, projections, output, vmin): +def plot_single_sfs( + fs: str, + output: str, + vmin: float, + projections: list[int] = None, +) -> None: """ - Description: - Plots 1d or 2d frequency spectrum. + Plots 1D, 2D, or 3D frequency spectrum based on the number of sample sizes in the given dadi Spectrum file. 
+ + Parameters + ---------- + fs : str + Path to the file containing the frequency spectrum data from dadi. + output : str + File path where the plot will be saved. The format of the saved plot is determined by the extension of the file path. + vmin : float + Minimum value in the colorbar. This sets the lower limit for color scaling in 2D and 3D plots. + projections : list[int], optional + Sample sizes after projection which may be used to adjust plots. If not provided, the original sample sizes from the spectrum file are used. + + Raises + ------ + AttributeError + If an outdated version of dadi is used that does not support required plotting functions, specifically 3D plotting. + ValueError + If the number of sample sizes (populations) in the spectrum exceeds three, as plotting of more than three populations is not supported by dadi-cli. - Arguments: - fs str: Name of the file containing frequency spectrum from dadi. - projections list: Sample sizes after projection. - output str: Name of the output file. - vmin float: Minimum value in the colorbar. """ fs = dadi.Spectrum.from_file(fs) - if projections == None: + if projections is None: projections = fs.sample_sizes if len(fs.sample_sizes) == 1: fig = plt.figure(219033) @@ -37,7 +55,8 @@ def plot_single_sfs(fs, projections, output, vmin): except AttributeError: raise AttributeError("Update to dadi 2.3.3 to use plot_3d_pairwise function") if len(fs.sample_sizes) > 3: - raise Exception("dadi-cli does not support plotting a single fs with more than three populations") + raise ValueError("dadi-cli does not support plotting a single fs with more than three populations") + fig.savefig(output) diff --git a/dadi_cli/SimulateFs.py b/dadi_cli/SimulateFs.py index b5b74259..72e0684e 100644 --- a/dadi_cli/SimulateFs.py +++ b/dadi_cli/SimulateFs.py @@ -7,26 +7,43 @@ def simulate_demography( - model, model_file, p0, ns, pts_l, misid, output, inference_file -): + model: str, + model_file: str, + p0: list[float], + ns: int, + pts_l: tuple[int], + misid: bool, + output: str, + inference_file: bool +) -> None: """ - Description: - Simulates fequency spectrum from a demographic model. - - Arguments: - model str: Name of the demographic model. - model_file str: Path and name of the file containing customized models. - p0 list: Initial parameter values for inference. - ns int: Sample size. - pts_l tuple: Grid sizes for modeling. - misid bool: If True, add a parameter for modeling ancestral state misidentification when data are polarized. - output str: Name of the output file. - inference_file bool: If True, outputs the inference results to generate caches. + Simulates frequency spectrum from a demographic model and optionally output inference files. + + Parameters + ---------- + model : str + Name of the demographic model. + model_file : str + Path and name of the file containing customized models. + p0 : list[float] + Initial parameter values for inference. + ns : int + Sample size. + pts_l : tuple[int] + Grid sizes for modeling. If None, a default is calculated based on ns. + misid : bool + If True, adds a parameter for modeling ancestral state misidentification. + output : str + Name of the output file. + inference_file : bool + If True, outputs the inference results to generate caches. 
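+
+    Examples
+    --------
+    An illustrative sketch with a built-in 1D model; the parameter values
+    and output name are placeholders rather than recommended settings:
+
+    >>> simulate_demography(
+    ...     model="two_epoch", model_file=None, p0=[2.0, 0.1], ns=[20],
+    ...     pts_l=None, misid=False, output="two_epoch.simulated.fs",
+    ...     inference_file=False,
+    ... )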
+ """ - if pts_l == None: + if pts_l is None: pts_l = pts_l_func(ns) # fs = dadi.Numerics.make_extrap_func(get_dadi_model_func(model, model_file))(popt, fs.sample_sizes, pts_l)*theta func, params = get_model(model, model_file) + if misid: param_names = func.__param_names__ func = dadi.Numerics.make_anc_state_misid_func(func) @@ -34,65 +51,105 @@ def simulate_demography( func_ex = dadi.Numerics.make_extrap_func(func) fs = func_ex(p0, ns, pts_l) fs.to_file(output) + if inference_file: - fi = open(output + ".SimulateDM.pseudofit", "w") - fi.write( - "# Ran SimulateDM\n# This is a fake inference output results to generate caches\n# Log(likelihood) " - + "\t".join(func.__param_names__) - + "\ttheta0\n" - "-0\t" + "\t".join([str(ele) for ele in p0]) + "\t1" - ) - fi.close() + with open(f"{output}.SimulateDM.pseudofit", "w") as fi: + fi.write( + "# Ran SimulateDM\n# This is a fake inference output results to generate caches\n# Log(likelihood) " + + "\t".join(func.__param_names__) + + "\ttheta0\n" + + "-0\t" + "\t".join([str(ele) for ele in p0]) + "\t1" + ) -def simulate_demes(demes_file, ns, pts_l, pop_ids, output): +def simulate_demes( + demes_file: str, + ns: int, + pts_l: tuple[int], + pop_ids: list[str], + output: str, +) -> None: """ - Description: - Simulates frequency spectrum from a DEMES format file. - - Arguments - demes_file str: Name of the DEME format file. - ns int: Sample size. - pts_l tuple: Grid sizes for modeling. - pop_ids list: Names of the populations in the model. - output str: Name of the output file. + Simulates frequency spectrum from a DEMES format file using `dadi`. + + Parameters + ---------- + demes_file : str + Path to the DEMES format file. + ns : int + Sample size for each population. + pts_l : tuple[int] + Grid sizes for numerical integration modeling. If None, which triggers automatic grid size determination based on `ns`. + pop_ids : list[str] + Names of the populations to be sampled within the DEMES model. + output : str + Path and name of the output file where the frequency spectrum is saved. + """ - if pts_l == None: + if pts_l is None: pts_l = pts_l_func(ns) + fs = dadi.Spectrum.from_demes( demes_file, sampled_demes=pop_ids, sample_sizes=ns, pts=pts_l ) + fs.to_file(output) -def simulate_dfe(p0, cache1d, cache2d, sele_dist, sele_dist2, ratio, misid, output): +def simulate_dfe( + p0: list[str], + cache1d: dadi.DFE.Cache1D_mod, + cache2d: dadi.DFE.Cache2D_mod, + sele_dist: str, + sele_dist2: str, + ratio: float, + misid: bool, + output: str +) -> None: """ - Description: - Simulates frequency spectrum from a DFE model. - - Arguments: - p0 list: Initial parameter values for inference. - cache1d dadi.DFE.Cache1D_mod: Caches of 1d frequency spectra for DFE inference. - cache2d dadi.DFE.Cache2D_mod: Caches of 2d frequency spectra for DFE inference. - sele_dist str: Name of the 1d PDF function for modeling DFEs. - sele_dist2 str: Name of the 2d PDF function for modeling DFEs. - ratio float: Ratio of synonymous to non-synonymous mutations. - misid bool: If True, add a parameter for modeling ancestral state misidentification when data are polarized. - output str: Name of the output file. + Simulates frequency spectrum from a Distribution of Fitness Effects (DFE) model. + + Parameters + ---------- + p0 : list[str] + Initial parameter values for inference. + cache1d : dadi.DFE.Cache1D_mod + Caches of 1D frequency spectra for DFE inference. + cache2d : dadi.DFE.Cache2D_mod + Caches of 2D frequency spectra for DFE inference. 
+ sele_dist : str + Name of the 1D Probability Density Function (PDF) for modeling DFEs. + sele_dist2 : str + Name of the 2D PDF for modeling DFEs. + ratio : float + Ratio of synonymous to non-synonymous mutations. + misid : bool + If True, adds a parameter for modeling ancestral state misidentification when data are polarized. + output : str + Name of the output file where the simulated frequency spectrum is saved. + + Raises + ------ + ValueError + If neither `cache1d` nor `cache2d` is provided, an error is raised. + """ - if cache1d != None: + if not cache1d and not cache2d: + raise ValueError("At least one of cache1d or cache2d must be provided.") + + if cache1d is not None: func = cache1d.integrate - if cache2d != None: + if cache2d is not None: func = cache2d.integrate - if sele_dist != None: + if sele_dist is not None: sele_dist = get_dadi_pdf(sele_dist) - if sele_dist2 != None: + if sele_dist2 is not None: sele_dist2 = get_dadi_pdf(sele_dist2) - if (sele_dist == None) and (sele_dist2 != None): + if (sele_dist is None) and (sele_dist2 is not None): sele_dist = sele_dist2 - if (cache1d != None) and (cache2d != None): + if (cache1d is not None) and (cache2d is not None): func = dadi.DFE.Cache2D_mod.mixture func_args = [cache1d, cache2d, sele_dist, sele_dist2, ratio] else: @@ -102,7 +159,7 @@ def simulate_dfe(p0, cache1d, cache2d, sele_dist, sele_dist2, ratio, misid, outp func = dadi.Numerics.make_anc_state_misid_func(func) print(p0, None, sele_dist, ratio, None) # Get expected SFS for MLE - if (cache1d != None) and (cache2d != None): + if (cache1d is not None) and (cache2d is not None): fs = func(p0, None, cache1d, cache2d, sele_dist, sele_dist2, ratio, None) else: fs = func(p0, None, sele_dist, ratio, None) diff --git a/dadi_cli/Stat.py b/dadi_cli/Stat.py index 1a0e69cd..7d369f92 100644 --- a/dadi_cli/Stat.py +++ b/dadi_cli/Stat.py @@ -3,28 +3,51 @@ import dadi.DFE as DFE import glob, pickle import numpy as np +from typing import Optional from dadi_cli.Models import get_model from dadi_cli.Pdfs import get_dadi_pdf from dadi_cli.utilities import get_opts_and_theta, convert_to_None def godambe_stat_demograpy( - fs, func, grids, output, bootstrap_dir, demo_popt, fixed_params, nomisid, logscale, eps_l -): + fs: str, + func: callable, + grids: tuple[int], + output: str, + bootstrap_dir: str, + demo_popt: str, + fixed_params: list[Optional[float]], + nomisid: bool, + logscale: bool, + eps_l: list[float], +) -> None: """ - Description: - Godambe statistics for demographic inference. - - Arguments: - fs str: Name of the file containing frequency spectrum from dadi. - func function: Function for modeling demography. - grids tuple: Grid sizes for modeling. - output str: Name of the output file. - bootstrap_dir str: Name of the directory containing bootstrapped frequency spectra. - demo_popt str: Name of the file containg the best-fit demographic parameters. - fixed_params list: List of the fixed parameters. - nomisid bool: If False, add a parameter for modeling ancestral state misidentification when data are polarized. - logscale bool: If True, calculate statistics using log scale. + Calculates Godambe statistics for demographic inference, estimating parameter uncertainties + using bootstrapped frequency spectra. + + Parameters + ---------- + fs : str + Path to the file containing the frequency spectrum. + func : callable + Demographic model function to be used for simulations. + grids : tuple[int] + Grid sizes for numerical integration. 
+ output : str + File path for writing the output results. + bootstrap_dir : str + Directory containing bootstrapped frequency spectra files. + demo_popt : str + Path to the file containing the best-fit demographic parameters. + fixed_params : list[Optional[float]] + List of fixed parameters where `None` indicates parameters to be inferred. + nomisid : bool + If False, includes modeling of ancestral state misidentification. + logscale : bool + If True, statistics are calculated on a logarithmic scale. + eps_l : list[float] + List of epsilon values for numerical differentiation when calculating the Godambe matrix. + """ # We want the best fits from the demograpic fit. # We set the second argument to true, since we want @@ -48,84 +71,106 @@ def gfunc(free_params, ns, pts): params = _convert_free_params(free_params, fixed_params) return func_ex(params, ns, pts) - f = open(output, "w") - for eps in eps_l: - popt = np.array(free_params) - uncerts_adj = dadi.Godambe.GIM_uncert( - gfunc, grids, all_boot, popt, fs, multinom=True, log=logscale, eps=eps - ) - # The uncertainty for theta is predicted, so we slice the - # the uncertainties for just the parameters. - uncerts_adj = uncerts_adj[:-1] - - f.write( - "Estimated 95% uncerts (theta adj), with step size " - + str(eps) - + "): {0}".format(1.96 * uncerts_adj) - + "\n" - ) - if logscale: + with open(output, "w") as f: + for eps in eps_l: + popt = np.array(free_params) + uncerts_adj = dadi.Godambe.GIM_uncert( + gfunc, grids, all_boot, popt, fs, multinom=True, log=logscale, eps=eps + ) + # The uncertainty for theta is predicted, so we slice the + # the uncertainties for just the parameters. + uncerts_adj = uncerts_adj[:-1] + f.write( - "Lower bounds of 95% confidence interval : {0}".format( - np.exp(np.log(popt) - 1.96 * uncerts_adj) - ) + "Estimated 95% uncerts (theta adj), with step size " + + str(eps) + + "): {0}".format(1.96 * uncerts_adj) + "\n" ) - f.write( - "Upper bounds of 95% confidence interval : {0}".format( - np.exp(np.log(popt) + 1.96 * uncerts_adj) + if logscale: + f.write( + "Lower bounds of 95% confidence interval : {0}".format( + np.exp(np.log(popt) - 1.96 * uncerts_adj) + ) + + "\n" ) - + "\n\n" - ) - else: - f.write( - "Lower bounds of 95% confidence interval : {0}".format( - popt - 1.96 * uncerts_adj + f.write( + "Upper bounds of 95% confidence interval : {0}".format( + np.exp(np.log(popt) + 1.96 * uncerts_adj) + ) + + "\n\n" ) - + "\n" - ) - f.write( - "Upper bounds of 95% confidence interval : {0}".format( - popt + 1.96 * uncerts_adj + else: + f.write( + "Lower bounds of 95% confidence interval : {0}".format( + popt - 1.96 * uncerts_adj + ) + + "\n" + ) + f.write( + "Upper bounds of 95% confidence interval : {0}".format( + popt + 1.96 * uncerts_adj + ) + + "\n\n" ) - + "\n\n" - ) - f.close() def godambe_stat_dfe( - fs, - cache1d, - cache2d, - sele_dist, - sele_dist2, - output, - bootstrap_syn_dir, - bootstrap_non_dir, - dfe_popt, - fixed_params, - nomisid, - logscale, - eps_l, -): + fs: str, + cache1d: str, + cache2d: str, + sele_dist: str, + sele_dist2: str, + output: str, + bootstrap_syn_dir: str, + bootstrap_non_dir: str, + dfe_popt: str, + fixed_params: list[Optional[float]], + nomisid: bool, + logscale: bool, + eps_l: list[float], +) -> None: """ - Description: - Godambe statistics for DFE inference. - - Argument: - fs str: Name of the file containing frequency spectrum from dadi. - cache1d str: Name of the file containing 1d frequency spectra cache from dadi. 
- cache2d str: Name of the file containing 2d frequency spectra cache from dadi. - sele_dist str: Name of the 1d PDF for modeling DFE. - sele_dist2 str: Name of the 2d PDF for modeling DFE. - output str: Name of the output file. - bootstrap_syn_dir str: Name of the directory containing bootstrapped frequency spectra for synonymous mutations. - bootstrap_non_dir str: Name of the directory containing bootstrapped frequency spectra for non-synonymous mutations. - dfe_popt str: Name of the file containing the best-fit DFE parameters. - fixed_params list: List of the fixed parameters. - nomisid bool: If False, add a parameter for modeling ancestral state misidentification when data are polarized. - logscale bool: If True, calculate statistics using log scale. + Calculates Godambe statistics for DFE inference using cached frequency spectra and bootstrapping. + + Parameters + ---------- + fs : str + Path to the file containing the frequency spectrum. + cache1d : str + Path to the file containing 1D frequency spectra cache. + cache2d : str + Path to the file containing 2D frequency spectra cache. + sele_dist : str + Name of the 1D PDF for modeling DFE. + sele_dist2 : str + Name of the 2D PDF for modeling DFE. + output : str + File path for the output results. + bootstrap_syn_dir : str + Directory containing bootstrapped frequency spectra for synonymous mutations. + bootstrap_non_dir : str + Directory containing bootstrapped frequency spectra for non-synonymous mutations. + dfe_popt : str + Path to the file containing the best-fit DFE parameters. + fixed_params : list[Optional[float]] + List of fixed parameters where `None` indicates parameters to be inferred. + nomisid : bool + If False, includes a parameter for modeling ancestral state misidentification. + logscale : bool + If True, calculations are done on a logarithmic scale. + eps_l : list[float] + List of epsilon values for numerical differentiation when calculating the Godambe matrix. + + Raises + ------ + ValueError + If neither `cache1d` nor `cache2d` is provided, an error is raised. 
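+
+    Examples
+    --------
+    A minimal, illustrative call. The file and directory names below are
+    hypothetical, and the spectrum, cache, and bootstrap files are assumed
+    to already exist on disk:
+
+    >>> godambe_stat_dfe(
+    ...     fs="non_syn.fs",
+    ...     cache1d="cache_1d.bpkl",
+    ...     cache2d=None,
+    ...     sele_dist="lognormal",
+    ...     sele_dist2=None,
+    ...     output="dfe.godambe.ci",
+    ...     bootstrap_syn_dir="bootstraps_syn",
+    ...     bootstrap_non_dir="bootstraps_non",
+    ...     dfe_popt="dfe.InferDFE.bestfits",
+    ...     fixed_params=[None, None],
+    ...     nomisid=True,
+    ...     logscale=False,
+    ...     eps_l=[0.1, 0.01, 0.001],
+    ... )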
+ """ + if not cache1d and not cache2d: + raise ValueError("At least one of cache1d or cache2d must be provided.") + dfe_popt, theta = get_opts_and_theta(dfe_popt) fixed_params = convert_to_None(fixed_params, len(dfe_popt)) free_params = _free_params(dfe_popt, fixed_params) @@ -142,21 +187,21 @@ def godambe_stat_dfe( boot_fs = dadi.Spectrum.from_file(f) all_syn_boot.append(boot_fs) - if cache1d != None: + if cache1d is not None: s1 = pickle.load(open(cache1d, "rb")) sfunc = s1.integrate - if cache2d != None: + if cache2d is not None: s2 = pickle.load(open(cache2d, "rb")) sfunc = s2.integrate - if sele_dist2 != None: + if sele_dist2 is not None: sele_dist2 = get_dadi_pdf(sele_dist2) - if sele_dist != None: + if sele_dist is not None: sele_dist = get_dadi_pdf(sele_dist) else: sele_dist = sele_dist2 - if (cache1d != None) and (cache2d != None): + if (cache1d is not None) and (cache2d is not None): mfunc = dadi.DFE.mixture if not nomisid: mfunc = dadi.Numerics.make_anc_state_misid_func(mfunc) @@ -178,7 +223,7 @@ def func(free_params, ns, pts): exterior_int=True, ) - elif (cache1d != None) or (cache2d != None): + elif (cache1d is not None) or (cache2d is not None): if not nomisid: sfunc = dadi.Numerics.make_anc_state_misid_func(sfunc) @@ -186,71 +231,79 @@ def func(free_params, ns, pts): params = _convert_free_params(free_params, fixed_params) return sfunc(params, None, sele_dist, theta, None, exterior_int=True) - f = open(output, "w") - for eps in eps_l: - free_params = np.array(free_params) - boot_theta_adjusts = [b.sum() / fs.sum() for b in all_syn_boot] - uncerts_adj = dadi.Godambe.GIM_uncert( - func, - [], - all_non_boot, - free_params, - fs, - multinom=False, - log=logscale, - eps=eps, - boot_theta_adjusts=boot_theta_adjusts, - ) - - f.write( - "Estimated 95% uncerts (theta adj), with step size " - + str(eps) - + "): {0}".format(1.96 * uncerts_adj) - + "\n" - ) - if logscale: + with open(output, "w") as f: + for eps in eps_l: + free_params = np.array(free_params) + boot_theta_adjusts = [b.sum() / fs.sum() for b in all_syn_boot] + uncerts_adj = dadi.Godambe.GIM_uncert( + func, + [], + all_non_boot, + free_params, + fs, + multinom=False, + log=logscale, + eps=eps, + boot_theta_adjusts=boot_theta_adjusts, + ) + f.write( - "Lower bounds of 95% confidence interval : {0}".format( - np.exp(np.log(free_params) - 1.96 * uncerts_adj) - ) + "Estimated 95% uncerts (theta adj), with step size " + + str(eps) + + "): {0}".format(1.96 * uncerts_adj) + "\n" ) - f.write( - "Upper bounds of 95% confidence interval : {0}".format( - np.exp(np.log(free_params) + 1.96 * uncerts_adj) + if logscale: + f.write( + "Lower bounds of 95% confidence interval : {0}".format( + np.exp(np.log(free_params) - 1.96 * uncerts_adj) + ) + + "\n" ) - + "\n\n" - ) - else: - f.write( - "Lower bounds of 95% confidence interval : {0}".format( - free_params - 1.96 * uncerts_adj + f.write( + "Upper bounds of 95% confidence interval : {0}".format( + np.exp(np.log(free_params) + 1.96 * uncerts_adj) + ) + + "\n\n" ) - + "\n" - ) - f.write( - "Upper bounds of 95% confidence interval : {0}".format( - free_params + 1.96 * uncerts_adj + else: + f.write( + "Lower bounds of 95% confidence interval : {0}".format( + free_params - 1.96 * uncerts_adj + ) + + "\n" + ) + f.write( + "Upper bounds of 95% confidence interval : {0}".format( + free_params + 1.96 * uncerts_adj + ) + + "\n\n" ) - + "\n\n" - ) - f.close() # Because we don't want confidence intervals estimated # on fixed parameters, we find the free parameters # that we will pass into 
Godambe.
-def _free_params(dfe_popt, fixed_params):
+def _free_params(
+    dfe_popt: list[float],
+    fixed_params: list[Optional[float]]
+) -> list[float]:
     """
-    Description:
-        Helper function for finding the free parameters for Godambe.
+    Identifies and returns the free parameters for Godambe statistics calculation
+    based on a list of best-fit parameters and a corresponding list of fixed indicators.
 
-    Arguments:
-        dfe_popt list: List of the best-fit parameters for DFE inference.
-        fixed_params list: List of the fixed parameters.
+    Parameters
+    ----------
+    dfe_popt : list[float]
+        List of the best-fit parameters for DFE inference.
+    fixed_params : list[Optional[float]]
+        List indicating which parameters are fixed. `None` signifies that a parameter is free.
+
+    Returns
+    -------
+    list[float]
+        List of the free parameters, extracted from `dfe_popt` based on `fixed_params`.
 
-    Returns:
-        free_params list: List of the free parameters.
     """
     free_params = []
     free_index = {}
@@ -269,17 +322,25 @@ def _free_params(dfe_popt, fixed_params):
 # Because the free params need to be passed into Godambe and func but also
 # need to contain any fixed parameters used to generate the model or DFE
 # we use the free parameters to recreate the optimal parameters.
-def _convert_free_params(free_params, fixed_params):
+def _convert_free_params(
+    free_params: list[float],
+    fixed_params: list[Optional[float]]
+) -> list[float]:
     """
-    Description:
-        Helper function for recreating the optimal parameters.
+    Recreates the complete list of optimal parameters by merging free parameters with their fixed counterparts.
+
+    Parameters
+    ----------
+    free_params : list[float]
+        List of the free parameters, i.e., the parameters that have been optimized.
+    fixed_params : list[Optional[float]]
+        List indicating which parameters are fixed (specified as actual values) or free (specified as None).
 
-    Arguments:
-        free_params list: List of the free parameters.
-        fixed_params list: List of the fixed parameters.
+    Returns
+    -------
+    list[float]
+        List of the optimal parameters, combining free and fixed parameters into a single parameter vector.
 
-    Returns:
-        params list: List of the optimal parameters.
     """
     params = []
     ii = 0
diff --git a/dadi_cli/utilities.py b/dadi_cli/utilities.py
index 2a70ce1a..48a6899a 100644
--- a/dadi_cli/utilities.py
+++ b/dadi_cli/utilities.py
@@ -1,50 +1,74 @@
 import dadi
 import numpy as np
+from typing import Optional
 
 
-def pts_l_func(sample_sizes):
+def pts_l_func(
+    sample_sizes: list[int]
+) -> tuple[int]:
     """
-    Description:
-        Calculates plausible grid sizes for modeling a frequency spectrum.
+    Calculates plausible grid sizes for modeling a frequency spectrum based on the maximum sample size.
 
-    Arguments:
-        sample_sizes list: Sample sizes.
+    Parameters
+    ----------
+    sample_sizes : list[int]
+        List of sample sizes for which to calculate the grid sizes.
+
+    Returns
+    -------
+    tuple[int]
+        Calculated grid sizes for numerical integration, slightly increasing with each step to ensure
+        coverage and accuracy.
 
-    Returns:
-        grid_sizes tuple: Grid sizes for modeling.
     """
     n = max(sample_sizes)
     grid_sizes = (int(n * 1.1) + 2, int(n * 1.2) + 4, int(n * 1.3) + 6)
     return grid_sizes
 
 
-def cache_pts_l_func(sample_sizes):
+def cache_pts_l_func(
+    sample_sizes: list[int]
+) -> tuple[int]:
     """
-    Description:
-        Calculates plausible grid sizes for modeling a frequency spectrum.
+    Calculates plausible grid sizes for modeling a frequency spectrum by scaling up the maximum sample size.
+
+    Parameters
+    ----------
+    sample_sizes : list[int]
+        List of sample sizes for which to calculate the grid sizes.
 
-    Arguments:
-        sample_sizes list: Sample sizes.
+    Returns
+    -------
+    tuple[int]
+        A tuple of integers representing the grid sizes for modeling. These sizes are scaled versions
+        of the maximum sample size, incremented by small constants to ensure computational robustness.
 
-    Returns:
-        grid_sizes tuple: Grid sizes for modeling.
     """
     n = max(sample_sizes)
    grid_sizes = (int(n * 2.2) + 2, int(n * 2.4) + 4, int(n * 2.6) + 6)
     return grid_sizes
 
 
-def convert_to_None(inference_input, p0_len):
+def convert_to_None(
+    inference_input: list[float],
+    p0_len: int
+) -> list[Optional[float]]:
     """
-    Description:
-        Converts -1 in input parameters into None.
+    Converts '-1' values in a list of input parameters to 'None', typically used to
+    signify missing or undefined values for parameter inference.
 
-    Arguments:
-        inference_input list: Input parameters for inference.
-        p0_len int: Length of the initial parameters.
+    Parameters
+    ----------
+    inference_input : list[float]
+        List of input parameters for inference, where '-1' signifies a value to be converted to 'None'.
+    p0_len : int
+        Expected length of the list of initial parameters. Ensures the output list matches this length.
+
+    Returns
+    -------
+    list[Optional[float]]
+        A new list of input parameters where '-1' values have been replaced with 'None'.
 
-    Returns:
-        inference_input list: Converted input parameters.
     """
     if inference_input == -1:
         inference_input = [inference_input] * p0_len
@@ -54,18 +78,32 @@ def convert_to_None(inference_input, p0_len):
     return inference_input
 
 
-def get_opts_and_theta(filename, gen_cache=False):
+def get_opts_and_theta(
+    filename: str,
+    gen_cache: bool = False
+) -> tuple[list[float], float]:
     """
-    Description:
-        Obtains optimized parameters and theta.
-
-    Arguments:
-        filename str: Name of the file.
-        gen_cache bool: Make True for generating a cache to remove misid parameter when present.
+    Parses a file to obtain optimized parameters and the population-scaled mutation rate (theta).
+
+    Parameters
+    ----------
+    filename : str
+        The path to the file containing the optimization results.
+    gen_cache : bool, optional
+        If True, prepares the parameters for cache generation by excluding the
+        misidentification parameter, if present.
+
+    Returns
+    -------
+    tuple[list[float], float]
+        A tuple containing a list of the optimized parameters and the value of theta.
+        If 'gen_cache' is True and 'misid' is found, 'misid' is excluded from the returned parameters.
+
+    Notes
+    -----
+    The file should contain lines starting with '# Converged' to indicate convergence,
+    '# Log(likelihood)' followed by parameter names, and other lines with numerical values
+    for parameters. The function checks for convergence and reads the parameters accordingly.
 
-    Returns:
-        opts list: Optimized parameters.
-        theta float: Population-scaled mutation rate.
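+
+    Examples
+    --------
+    A sketch of typical usage. The file name is hypothetical and is assumed to
+    follow the format described in the Notes above:
+
+    >>> popt, theta = get_opts_and_theta("example.two_epoch.demo.params.InferDM.bestfits")
+    >>> # With gen_cache=True, a misidentification parameter ('misid'), if present,
+    >>> # is dropped from the returned parameter list.
+    >>> popt_no_misid, theta = get_opts_and_theta(
+    ...     "example.two_epoch.demo.params.InferDM.bestfits", gen_cache=True
+    ... )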
""" opts = [] param_names = [] diff --git a/mkdocs-env.yml b/mkdocs-env.yml index e8429001..ea96030f 100644 --- a/mkdocs-env.yml +++ b/mkdocs-env.yml @@ -18,7 +18,7 @@ dependencies: - numpy - pip - pytest-cov - - python==3.8 + - python==3.9 - python-markdown-math - scipy - dadi diff --git a/setup.py b/setup.py index 0acd79f5..3d0f4674 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ classifiers=[ "License :: OSI Approved :: Apache Software License", "Programming Language :: Python", - "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", ], packages=["dadi_cli"], include_package_data=True, diff --git a/tests/example_data/example_models.py b/tests/example_data/example_models.py index 5b77a4de..5104b3bc 100644 --- a/tests/example_data/example_models.py +++ b/tests/example_data/example_models.py @@ -104,4 +104,4 @@ def split_mig_fix_T_one_s(params, ns, pts): """ nu1, nu2, m, gamma = params return split_mig_fix_T_sel([nu1, nu2, m, gamma, gamma], ns, pts) -split_mig_fix_T_one_s.__param_names__ = ["nu1", "nu2", "m", "gamma"] \ No newline at end of file +split_mig_fix_T_one_s.__param_names__ = ["nu1", "nu2", "m", "gamma"] diff --git a/tests/test_GenerateCache.py b/tests/test_GenerateCache.py index 0f4cab92..e47222e4 100644 --- a/tests/test_GenerateCache.py +++ b/tests/test_GenerateCache.py @@ -35,6 +35,27 @@ def test_generate_cache_code(capsys): assert len(s.gammas) == 50 +def test_generate_cache_failure(): + with pytest.raises(ValueError) as excinfo: + dimensionality=10 + + generate_cache( + func=DFE.DemogSelModels.two_epoch_sel, + grids=[120, 140, 160], + popt="./tests/example_data/example.two_epoch.demo.params.InferDM.bestfits", + gamma_bounds=[1e-4, 2000], + gamma_pts=50, + output="./tests/test_results/cache_large_two_epoch_code.bpkl", + sample_sizes=[20], + additional_gammas=[], + cpus=None, + gpus=0, + dimensionality=dimensionality, + ) + + assert f"Invalid dimensionality {dimensionality}. Only 1 or 2 are accepted." in str(excinfo.value) + + @pytest.mark.skip() def test_generate_cache_bash(capsys): import subprocess diff --git a/tests/test_GenerateFs.py b/tests/test_GenerateFs.py index f1340a33..c845ee35 100644 --- a/tests/test_GenerateFs.py +++ b/tests/test_GenerateFs.py @@ -103,6 +103,26 @@ def test_generate_fs(data): assert dadi_cli_fs.sample_sizes[1] == 20 +def test_generate_fs_failure(data): + with pytest.raises(ValueError) as excinfo: + generate_fs( + vcf=pytest.vcf, + output=pytest.unfolded_output, + pop_ids=["YRI", "CEU"], + pop_info=pytest.pop_info, + projections=[216], + marginalize_pops=None, + subsample=False, + polarized=True, + bootstrap=None, + chunk_size=None, + masking="", + seed=None, + ) + + assert "The lengths of `pop_ids` and `projections` must match." in str(excinfo.value) + + def test_generate_fs_subsample(data): pop_ids, projections = ["YRI", "CEU"], [216, 198] generate_fs( @@ -337,3 +357,21 @@ def test_generate_fs_marginalize(data): dadi_fs = dadi_fs.marginalize([1]) assert np.allclose(dadi_cli_fs, dadi_fs) + + with pytest.raises(ValueError) as excinfo: + generate_fs( + vcf=pytest.vcf, + output=pytest.marginalize_CEU_output, + pop_ids=["YRI", "CEU"], + pop_info=pytest.pop_info, + projections=[216, 198], + marginalize_pops=["CHB"], + subsample=False, + polarized=True, + bootstrap=None, + chunk_size=None, + masking="", + seed=None, + ) + + assert "All populations to marginalize must be in the list of population ids." 
in str(excinfo.value) diff --git a/tests/test_InferDFE.py b/tests/test_InferDFE.py index 347956b6..d4072e7b 100644 --- a/tests/test_InferDFE.py +++ b/tests/test_InferDFE.py @@ -48,6 +48,26 @@ def test_infer_dfe_code(): maxtime, ) + with pytest.raises(ValueError) as excinfo: + infer_dfe( + fs=fs, + cache1d=None, + cache2d=None, + sele_dist=sele_dist, + sele_dist2=sele_dist2, + theta=theta, + p0=p0, + upper_bounds=upper_bounds, + lower_bounds=lower_bounds, + fixed_params=fixed_params, + misid=misid, + cuda=cuda, + maxeval=maxeval, + maxtime=maxtime, + ) + + assert "At least one of cache1d or cache2d must be provided." in str(excinfo.value) + @pytest.mark.skip() def test_InferDFE_bash(capsys): diff --git a/tests/test_Models.py b/tests/test_Models.py index d952fe8a..27c3c453 100644 --- a/tests/test_Models.py +++ b/tests/test_Models.py @@ -176,21 +176,30 @@ def test_print_built_in_model_details(capfd, model_list): assert str(e_info.value) == "Cannot find model: mixture." -def test_custome_model_import(capfd): - import tests.example_data.example_models as custom + +def test_custom_model_import(capfd): + # Test importing a non-existing file + model_file = 'tests/non_existing_model' + with pytest.raises(ImportError) as excinfo: + get_model('hallo', model_file) + + assert f"Failed to import module: {model_file}" in str(excinfo.value) + + model_file = 'tests/example_data/example_models' custome_model_list = ['three_epoch_bottleneck', 'split_mig_fix_T', 'split_mig_fix_T_sel'] + # Test importing good custom models for custome_model in custome_model_list: - get_model(custome_model, 'tests/example_data/example_models') + get_model(custome_model, model_file) out, err = capfd.readouterr() assert out == "" and err == "" + # Test importing a custom model without .__param_names__ attribute with pytest.raises(ValueError) as e_info: - get_model('split_no_mig_missing_param_names', 'tests/example_data/example_models') + get_model('split_no_mig_missing_param_names', model_file) assert str(e_info.value) == "Demographic model needs a .__param_names__ attribute!\nAdd one by adding the line split_no_mig_missing_param_names.__param_name__ = [LIST_OF_PARAMS]\nReplacing LIST_OF_PARAMS with the names of the parameters as strings." + # Test importing a custom model not in example_models with pytest.raises(AttributeError) as e_info: - get_model('haha', 'tests/example_data/example_models') + get_model('haha', model_file) assert str(e_info.value) == "module 'example_models' has no attribute 'haha'" - - diff --git a/tests/test_Pdfs.py b/tests/test_Pdfs.py index a9bdbd10..ed620643 100644 --- a/tests/test_Pdfs.py +++ b/tests/test_Pdfs.py @@ -67,70 +67,97 @@ def test_print_available_pdfs(capfd): + "- mixture\n" ) +@pytest.fixture +def pdf_details(): + return { + "beta" : """ + Beta probability density function. -def test_print_pdf_details(capfd): + params = [alpha, beta] + """, + + "biv_ind_gamma" : """ + Bivariate independent gamma probability density function. + + If len(params) == 2, then params = [alpha, beta] by assuming alpha and beta are equal in the two populations + If len(params) == 4, then params = [alpha1, alpha2, beta1, beta2] + If len(params) == 3 or 5, then the last parameter is ignored + """, + + "biv_lognormal" : """ + Bivariate lognormal probability density function. 
+ + If len(params) == 3, then params = [mu, sigma, rho] by assuming mu and sigma are equal in the two populations + If len(params) == 5, then params = [mu1, mu2, sigma1, sigma2, rho] + """, + + "exponential" : """ + Exponential probability density function. + + params = [scale] + """, + + "gamma" : """ + Gamma probability density function. + + params = [alpha, beta] = [shape, scale] + """, + + "lognormal" : """ + Lognormal probability density function. + + params = [log_mu, log_sigma] + """, + + "normal" : """ + Normal probability density function. + + params = [mu, sigma] + """, + + "mixture" : """ + Weighted summation of 1d and 2d distributions that share parameters. + The 1d distribution is equivalent to assuming selection coefficients are + perfectly correlated. + + params: Parameters for potential optimization. + It is assumed that last parameter is the weight for the 2d dist. + The second-to-last parameter is assumed to be the correlation + coefficient for the 2d distribution. + The remaining parameters as assumed to be shared between the + 1d and 2d distributions. + """, + } + + +def test_print_pdf_details(capfd, pdf_details): print_pdf_details("beta") out, err = capfd.readouterr() - assert ( - out - == "- beta:\n\n" - + " Beta probability density function.\n\n params = [alpha, beta]\n" - + " \n" - ) + assert pdf_details["beta"] in out print_pdf_details("biv_ind_gamma") out, err = capfd.readouterr() - assert ( - out - == "- biv_ind_gamma:\n\n" - + " Bivariate independent gamma probability density function.\n\n If len(params) == 2, then params = [alpha, beta] by assuming alpha and beta are equal in the two populations\n If len(params) == 4, then params = [alpha1, alpha2, beta1, beta2]\n If len(params) == 3 or 5, then the last parameter is ignored\n" - + " \n" - ) + assert pdf_details["biv_ind_gamma"] in out print_pdf_details("biv_lognormal") out, err = capfd.readouterr() - assert ( - out - == "- biv_lognormal:\n\n" - + " Bivariate lognormal probability density function.\n\n If len(params) == 3, then params = [mu, sigma, rho] by assuming mu and sigma are equal in the two populations\n If len(params) == 5, then params = [mu1, mu2, sigma1, sigma2, rho]\n" - + " \n" - ) + assert pdf_details["biv_lognormal"] in out print_pdf_details("exponential") out, err = capfd.readouterr() - assert ( - out - == "- exponential:\n\n" - + " Exponential probability density function.\n\n params = [scale]\n" - + " \n" - ) + assert pdf_details["exponential"] in out print_pdf_details("gamma") out, err = capfd.readouterr() - assert ( - out - == "- gamma:\n\n" - + " Gamma probability density function.\n\n params = [alpha, beta] = [shape, scale]\n" - + " \n" - ) + assert pdf_details["gamma"] in out print_pdf_details("lognormal") out, err = capfd.readouterr() - assert ( - out - == "- lognormal:\n\n" - + " Lognormal probability density function.\n\n params = [log_mu, log_sigma]\n" - + " \n" - ) + assert pdf_details["lognormal"] in out print_pdf_details("normal") out, err = capfd.readouterr() - assert ( - out - == "- normal:\n\n" - + " Normal probability density function.\n\n params = [mu, sigma]\n" - + " \n" - ) + assert pdf_details["normal"] in out with pytest.raises(Exception) as e_info: print_pdf_details("haha") diff --git a/tests/test_Plot.py b/tests/test_Plot.py index f40a6250..414ad746 100644 --- a/tests/test_Plot.py +++ b/tests/test_Plot.py @@ -43,6 +43,7 @@ def files(): pytest.fs3d_sampled = "tests/example_data/ooa_10000_sampled.fs" pytest.fs3d_popt = "tests/example_data/example.ooa.donni.pseudofit" + def 
test_plot_single_sfs(files): plot_single_sfs( fs=pytest.fs1_1d, @@ -112,6 +113,7 @@ def test_plot_comparison(files): resid_range=3, ) + def test_plot_fitted_demography(files): plot_fitted_demography( fs=pytest.fs1_1d, @@ -164,6 +166,7 @@ def test_plot_fitted_demography(files): resid_range=3, ) + def test_plot_fitted_dfe(files): # plot_fitted_dfe(fs, cache1d=pytest.fs1d_cache1d, cache2d=None, demo_popt=pytest.fs1d_demo_popt, sele_popt, projections, pdf, pdf2, nomisid, output, vmin, resid_range) plot_fitted_dfe( diff --git a/tests/test_SimulateFs.py b/tests/test_SimulateFs.py index 93562569..810d3a94 100644 --- a/tests/test_SimulateFs.py +++ b/tests/test_SimulateFs.py @@ -26,7 +26,16 @@ def test_simulate_demography_code(): misid = False output = "tests/test_results/simulate_two_epoch.fs" inference_file = False - simulate_demography(model, model_file, p0, ns, pts_l, misid, output, inference_file) + simulate_demography( + model=model, + model_file=model_file, + p0=p0, + ns=ns, + pts_l=pts_l, + misid=misid, + output=output, + inference_file=inference_file + ) dadi_cli_fs = dadi.Spectrum.from_file(output) dadi_fs = dadi.Numerics.make_extrap_func(dadi.Demographics1D.two_epoch)( p0, ns, pts_l @@ -42,7 +51,16 @@ def test_simulate_custom_demography_code(): misid = False output = "tests/test_results/simulate_three_epoch_bottleneck.fs" inference_file = False - simulate_demography(model, model_file, p0, ns, pts_l, misid, output, inference_file) + simulate_demography( + model=model, + model_file=model_file, + p0=p0, + ns=ns, + pts_l=pts_l, + misid=misid, + output=output, + inference_file=inference_file + ) dadi_cli_fs = dadi.Spectrum.from_file(output) from tests.example_data.example_models import three_epoch_bottleneck dadi_fs = dadi.Numerics.make_extrap_func(three_epoch_bottleneck)( @@ -59,7 +77,16 @@ def test_simulate_demography_misid_code(): misid = True output = "tests/test_results/simulate_two_epoch_with_misid.fs" inference_file = True - simulate_demography(model, model_file, p0, ns, pts_l, misid, output, inference_file) + simulate_demography( + model=model, + model_file=model_file, + p0=p0, + ns=ns, + pts_l=pts_l, + misid=misid, + output=output, + inference_file=inference_file + ) dadi_cli_fs = dadi.Spectrum.from_file(output) dadi_fs = dadi.Numerics.make_extrap_func( dadi.Numerics.make_anc_state_misid_func(dadi.Demographics1D.two_epoch) @@ -138,7 +165,13 @@ def test_simulate_demes_code(): pts_l = [20, 30, 40] pop_ids = ["YRI", "CEU", "CHB"] output = "tests/test_results/demes_gutenkunst_ooa_simulation.fs" - simulate_demes(demes_file, ns, pts_l, pop_ids, output) + simulate_demes( + demes_file=demes_file, + ns=ns, + pts_l=pts_l, + pop_ids=pop_ids, + output=output + ) dadi_cli_fs = dadi.Spectrum.from_file(output) dadi_fs = dadi.Spectrum.from_demes( demes_file, sampled_demes=pop_ids, sample_sizes=ns, pts=pts_l