From bf69147f1e149d07d52adb6d407e5ec8a5e91ed3 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 2 Jun 2023 15:10:00 -0700 Subject: [PATCH] feat: simphenotype `--environment` option (#216) --- docs/commands/simphenotype.rst | 15 ++++++++-- docs/formats/haplotypes.rst | 8 ++++-- docs/project_info/contributing.rst | 9 ++++++ haptools/__main__.py | 16 ++++++++++- haptools/data/genotypes.py | 4 +-- haptools/data/haplotypes.py | 7 ++++- haptools/sim_phenotype.py | 45 ++++++++++++++++++++---------- tests/test_simphenotype.py | 28 ++++++++++++++++++- 8 files changed, 107 insertions(+), 25 deletions(-) diff --git a/docs/commands/simphenotype.rst b/docs/commands/simphenotype.rst index 01c82cee..2d8cf36e 100644 --- a/docs/commands/simphenotype.rst +++ b/docs/commands/simphenotype.rst @@ -14,6 +14,7 @@ Usage haptools simphenotype \ --replications INT \ + --environment FLOAT \ --heritability FLOAT \ --prevalence FLOAT \ --normalize \ @@ -45,15 +46,23 @@ where .. math:: - \sigma^2 = Var[\sum_j \beta_j \vec{Z_j}] * (\frac 1 {h^2} - 1) + \sigma^2 = v (\frac 1 {h^2} - 1) -The heritability :math:`h^2` is user-specified, but if it is not provided, then :math:`\sigma^2` will be computed purely from the effect sizes, instead: +The variable :math:`v` can be specified via the ``--environment`` parameter. When not provided, :math:`v` is inferred from the variance of the genotypes: + +.. math:: + + v = Var[\sum_j \beta_j \vec{Z_j}] + +The heritability :math:`h^2` can be specified via the ``--heritability`` parameter and defaults to 0.5 when not provided. + +When both :math:`v` and :math:`h^2` aren't provided, :math:`\sigma^2` is computed purely from the effect sizes, instead: .. math:: \sigma^2 = \Biggl \lbrace {1 - \sum \beta_j^2 \quad \quad {\sum \beta_j^2 \le 1} \atop 0 \quad \quad \quad \quad \quad \text{ otherwise }} -If a prevalence for the disease is specified, the final :math:`\vec{y}` value will be thresholded to produce a binary case/control trait with the desired fraction of diseased individuals. +If a prevalence for the disease is specified via the ``--prevalence`` parameter, the final :math:`\vec{y}` is thresholded to produce a binary case/control trait with the desired fraction of diseased individuals. Input ~~~~~ diff --git a/docs/formats/haplotypes.rst b/docs/formats/haplotypes.rst index 3e891391..4d5e93e2 100644 --- a/docs/formats/haplotypes.rst +++ b/docs/formats/haplotypes.rst @@ -325,13 +325,17 @@ You can download an example header with a *beta* extra field from `tests/data/si +++++++++++++ No extra fields are required here. +.. _formats-haplotypes-changelog: + Changelog ~~~~~~~~~ v0.2.0 ------ -Support for tandem repeats in the specification via a new 'R' line type that has similar fields to the 'H' line type. +Support for tandem repeats in the specification via a new 'R' line type. See `PR #209 `_. + +Also, ``.hap`` files no longer need to be sorted by their first field in order to be indexed. See `PR #208 `_. We have updated the recommended ``sort`` command to reflect this. The new command wraps ``sort`` in a call to ``awk`` to ensure header lines are kept at the beginning of the file. -Also, ``.hap`` files no longer need to be sorted by their first field in order to be indexed. We have updated the recommended ``sort`` command to reflect this. The new command wraps ``sort`` in a call to ``awk`` to ensure header lines are kept at the beginning of the file. +All v0.1.0 ``.hap`` files can be automatically updated to v0.2.0 by simply bumping the listed version number. v0.1.0 ------ diff --git a/docs/project_info/contributing.rst b/docs/project_info/contributing.rst index c8c34ab0..5b413a41 100644 --- a/docs/project_info/contributing.rst +++ b/docs/project_info/contributing.rst @@ -181,6 +181,15 @@ For new commands After you add a new command, you should make sure to create tests for it in the `tests/ directory `_. You should also create a new page in the *Commands* section of our documentation with sections for a short description, an abbreviated usage, example commands, and a detailed usage (which is auto-generated). You can refer to :ref:`the index command ` as an example. To ensure your new documentation page appears in our table of contents, add the name of the page to the list at the bottom of our `index.rst file `_. +----------------------------- +Modifying the ``.hap`` format +----------------------------- +If you modify the :doc:`.hap file format `, you should bump the version number, which is listed at the top of the `haptools/data/haplotypes.py `_ module and follows `semantic versioning `_. + +Please describe any modifications or new features in :doc:`the .hap docs ` and in the :ref:`Changelog at the bottom of that page `. + +After bumping the version number, you should also update all ``.hap`` and ``.hap.gz`` files in the `tests/data/ directory `_ to use the new version number. + .. _code-check-instructions: ----------- diff --git a/haptools/__main__.py b/haptools/__main__.py index f8f30afd..39cf3322 100755 --- a/haptools/__main__.py +++ b/haptools/__main__.py @@ -326,12 +326,19 @@ def simgenotype( show_default=True, help="Number of rounds of simulation to perform", ) +@click.option( + "--environment", + type=click.FloatRange(min=0), + default=None, + show_default=True, + help="Variance of environmental term; inferred if not specified", +) @click.option( "-h", "--heritability", type=click.FloatRange(min=0, max=1), default=None, - show_default=True, + show_default="0.5", help="Trait heritability", ) @click.option( @@ -448,6 +455,7 @@ def simphenotype( genotypes: Path, haplotypes: Path, replications: int = 1, + environment: float = None, heritability: float = None, prevalence: float = None, normalize: bool = True, @@ -498,11 +506,17 @@ def simphenotype( else: ids = None + if heritability is None and environment is None and not normalize: + # in this case, the assumptions of the model break + # The variances of both sides of the equation will no longer properly sum to 1 + log.error("A --heritability value should be specified with --no-normalize") + # Run simulation simulate_pt( genotypes, haplotypes, replications, + environment, heritability, prevalence, normalize, diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 32622a44..f8ea318d 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -201,7 +201,7 @@ def read( " contig name matches! For example, double-check the 'chr' prefix." ) # transpose the GT matrix so that samples are rows and variants are columns - self.log.info(f"Transposing genotype matrix of size {self.data.shape}.") + self.log.info(f"Transposing genotype matrix of size {self.data.shape}") self.data = self.data.transpose((1, 0, 2)) def _variant_arr(self, record: Variant): @@ -1455,7 +1455,7 @@ def write(self): # write the psam and pvar files self.write_samples() self.write_variants() - self.log.debug(f"Transposing genotype matrix of size {self.data.shape}.") + self.log.debug(f"Transposing genotype matrix of size {self.data.shape}") # transpose the data b/c pgenwriter expects things in "variant-major" order # (ie where variants are rows instead of samples) data = self.data.transpose((1, 0, 2))[:, :, :2] diff --git a/haptools/data/haplotypes.py b/haptools/data/haplotypes.py index 6f311dc5..14f0cb15 100644 --- a/haptools/data/haplotypes.py +++ b/haptools/data/haplotypes.py @@ -13,6 +13,10 @@ from .genotypes import GenotypesVCF +# the current version of the hap format spec +HAP_VERSION = "0.2.0" + + @dataclass class Extra: """ @@ -748,6 +752,8 @@ class Haplotypes(Data): >>> haps = haplotypes.data # a dictionary of Haplotype objects """ + version = HAP_VERSION + def __init__( self, fname: Path | str, @@ -762,7 +768,6 @@ def __init__( # otherwise, the write() method might create unsorted files self.types = {"H": haplotype, "V": variant, "R": repeat} self.type_ids = None - self.version = "0.2.0" @classmethod def load( diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py index 4f3a5a63..1ed53ab8 100644 --- a/haptools/sim_phenotype.py +++ b/haptools/sim_phenotype.py @@ -151,6 +151,7 @@ def run( heritability: float = None, prevalence: float = None, normalize: bool = True, + environment: float = None, ) -> npt.NDArray: """ Simulate phenotypes for an entry in the Genotypes object @@ -174,6 +175,9 @@ def run( normalize: bool, optional If True, normalize the genotypes before using them to simulate the phenotypes. Otherwise, use the raw values. + environment: float, optional + The variance (aka strength) of the environmental contribution to the trait. + This is inferred from the betas if it isn't specified. Returns ------- @@ -183,10 +187,9 @@ def run( # extract the relevant haplotype info from the Haplotype objects ids = [effect.id for effect in effects] betas = np.array([effect.beta for effect in effects]) - gts = self.gens.subset(variants=ids).data[:, :, :2].sum(axis=2) - - self.log.debug(f"Extracting haplotype genotypes for haps: {ids}") self.log.debug(f"Beta values are {betas}") + self.log.debug(f"Extracting haplotype genotypes for haps: {ids}") + gts = self.gens.subset(variants=ids).data[:, :, :2].sum(axis=2) if normalize: gts = self.normalize_gts(gts, ids) @@ -196,22 +199,30 @@ def run( # generate the genetic component pt = (betas * gts).sum(axis=1) # compute the heritability - if heritability is None: + if heritability is None and environment is None: self.log.debug("Computing heritability as the sum of the squared betas") heritability = np.power(betas, 2).sum() if heritability > 1: + self.log.warning( + "Variance of error term exceeds 1. Check your betas! Capping at 1 " + "for now." + ) heritability = 1 # compute the environmental effect noise = 1 - heritability else: - # compute the environmental effect - noise = np.var(pt) - if noise == 0: - self.log.warning( - "Your genotypes have a variance of 0. Creating artificial noise..." - ) - noise = 1 - # TODO: handle a heritability of 0 somehow + noise = environment + if environment is None: + # compute the environmental effect + noise = np.var(pt) + if noise == 0: + self.log.warning( + "Your genotypes have 0 variance. Creating artificial noise..." + ) + noise = 1 + elif heritability is None: + heritability = 0.5 + # TODO: handle a heritability of 0 somehow? noise *= np.reciprocal(heritability) - 1 self.log.info(f"Adding environmental component {noise} for h^2 {heritability}") # finally, add everything together to get the simulated phenotypes @@ -286,6 +297,7 @@ def simulate_pt( genotypes: Path, haplotypes: Path, num_replications: int = 1, + environment: float = None, heritability: float = None, prevalence: float = None, normalize: bool = True, @@ -359,6 +371,9 @@ def simulate_pt( repeats: Path, optional The path to a genotypes file containing tandem repeats. This is only necessary when simulating both haplotypes *and* repeats as causal effects + environment: float, optional + The variance (aka strength) of the environmental term. This will be inferred if + it isn't specified. seed: int, optional Seed for random processes output : Path, optional @@ -372,7 +387,7 @@ def simulate_pt( load_as_haps = True # either load SNPs from the snplist file or load haps/repeats from the hap file if haplotypes.suffix == ".snplist": - log.info("Loading SNP effects") + log.info("Loading from .snplist") with open(haplotypes) as snplist_file: effects = map(Effect.from_hap_spec, snplist_file.readlines()) if haplotype_ids is None: @@ -381,7 +396,7 @@ def simulate_pt( else: effects = list(filter(lambda e: e.id in haplotype_ids, effects)) else: - log.info("Loading haplotypes") + log.info("Loading from .hap") hp = Haplotypes(haplotypes, haplotype=Haplotype, repeat=Repeat, log=log) hp.read(region=region, haplotypes=haplotype_ids) effects = hp.data.values() @@ -440,6 +455,6 @@ def simulate_pt( log.info("Simulating phenotypes") pt_sim = PhenoSimulator(gt, output=output, seed=seed, log=log) for i in range(num_replications): - pt_sim.run(effects, heritability, prevalence, normalize) + pt_sim.run(effects, heritability, prevalence, normalize, environment) log.info("Writing phenotypes") pt_sim.write() diff --git a/tests/test_simphenotype.py b/tests/test_simphenotype.py index 1768065c..f48e6d64 100644 --- a/tests/test_simphenotype.py +++ b/tests/test_simphenotype.py @@ -173,7 +173,7 @@ def test_one_hap_zero_noise_all_same_nonzero_heritability(self): expected.data = np.zeros((gts_shape[0], 1), dtype=expected.data.dtype) previous_std = np.inf - for h2 in (0, 0.5, 1): + for h2 in (0, 0.8, 1): pt_sim = PhenoSimulator(gts, seed=42) data = pt_sim.run(hps, heritability=h2) data = data[:, np.newaxis] @@ -276,6 +276,32 @@ def test_noise(self): diff3 = np.abs(phens.data[:, 3] - phens.data[:, 0]).sum() assert diff3 > diff2 + def test_user_noise(self): + gts = self._get_fake_gens() + hps = self._get_fake_haps()[:2] + expected = self._get_expected_phens() + + pt_sim = PhenoSimulator(gts, seed=42) + pt_sim.run(hps, environment=0) + pt_sim.run(hps, environment=0.04) + pt_sim.run(hps, environment=0.5) + pt_sim.run(hps, environment=0.7) + pt_sim.run(hps, environment=0.7, heritability=0.7) + + phens = pt_sim.phens + + # check the data and the generated phenotype object + assert phens.data.shape == (5, 5) + np.testing.assert_allclose(phens.data[:, 0], expected.data[:, 0]) + diff1 = np.abs(phens.data[:, 1] - phens.data[:, 0]).sum() + assert diff1 > 0 + diff2 = np.abs(phens.data[:, 2] - phens.data[:, 0]).sum() + assert diff2 > diff1 + diff3 = np.abs(phens.data[:, 3] - phens.data[:, 0]).sum() + assert diff3 > diff2 + diff4 = np.abs(phens.data[:, 4] - phens.data[:, 0]).sum() + assert diff4 < diff3 + def test_case_control(self): gts = self._get_fake_gens() hps = self._get_fake_haps()[:2]