From e21faaa98807eb9ad6eda4816381be944bf9fc8b Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Sun, 30 Oct 2022 13:17:58 +0000 Subject: [PATCH] Add `genotype_values()` method See https://github.com/tskit-dev/tsinfer/issues/739 --- python/CHANGELOG.rst | 6 ++++ python/tests/test_genotypes.py | 59 ++++++++++++++++++++++++++++++++++ python/tskit/genotypes.py | 29 +++++++++++++++++ 3 files changed, 94 insertions(+) diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index ecddf5b99d..41245b42a7 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -165,6 +165,12 @@ to ``None``, which is treated as ``True``. Previously, passing ``None`` would result in an error. (:user:`hyanwong`, :pr:`2609`, :issue:`2608`) +**Features** + + - Variants have a `states()` method that returns the genotypes as an + (inefficient) array of strings, rather than integer indexes, to + aid comparison of genetic variation (:user:`hyanwong`, :pr:`2617`) + -------------------- [0.5.3] - 2022-10-03 diff --git a/python/tests/test_genotypes.py b/python/tests/test_genotypes.py index 329867b600..8f1194d90a 100644 --- a/python/tests/test_genotypes.py +++ b/python/tests/test_genotypes.py @@ -655,6 +655,65 @@ def test_snipped_tree_sequence_mutations_over_isolated(self): assert non_missing_found assert missing_found + def get_missing_data_ts(self): + tables = tskit.TableCollection(1.0) + tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0) + tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0) + tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0) + s = tables.sites.add_row(0, "A") + tables.mutations.add_row(site=s, derived_state="B", node=1) + tables.mutations.add_row(site=s, derived_state="C", node=2) + s = tables.sites.add_row(0.5, "") + tables.mutations.add_row(site=s, derived_state="A long string", node=2) + return tables.tree_sequence() + + def test_states(self): + ts = self.get_missing_data_ts() + v_iter = ts.variants(isolated_as_missing=False) + v = next(v_iter) + assert np.array_equal(v.states(), np.array(["A", "B", "C"])) + v = next(v_iter) + assert np.array_equal(v.states(), np.array(["", "", "A long string"])) + # With no mssing data, it shouldn't matter if the missing string = an allele + assert np.array_equal( + v.states(missing_data_string=""), np.array(["", "", "A long string"]) + ) + + v_iter = ts.variants(isolated_as_missing=True) + v = next(v_iter) + assert np.array_equal(v.states(), np.array(["N", "B", "C"])) + v = next(v_iter) + assert np.array_equal(v.states(), np.array(["N", "N", "A long string"])) + assert np.array_equal( + v.states(missing_data_string="MISSING"), + np.array(["MISSING", "MISSING", "A long string"]), + ) + + @pytest.mark.parametrize("missing", [True, False]) + def test_states_haplotypes_equiv(self, missing): + ts = msprime.sim_ancestry(2, sequence_length=20, random_seed=1) + ts = msprime.sim_mutations(ts, rate=0.1, random_seed=1) + assert ts.num_sites > 5 + tables = ts.dump_tables() + tables.delete_intervals([[0, ts.site(4).position]]) + tables.sites.replace_with(ts.tables.sites) + ts = tables.tree_sequence() + states = np.array( + [v.states() for v in ts.variants(isolated_as_missing=missing)] + ) + for h1, h2 in zip(ts.haplotypes(isolated_as_missing=missing), states.T): + assert h1 == "".join(h2) + + @pytest.mark.parametrize("s", ["", "A long string", True, np.nan, 0, -1]) + def test_bad_states(self, s): + ts = self.get_missing_data_ts() + v_iter = ts.variants(isolated_as_missing=True) + v = next(v_iter) + v = next(v_iter) + match = "existing allele" if isinstance(s, str) else "not a string" + with pytest.raises(ValueError, match=match): + v.states(missing_data_string=s) + class TestLimitInterval: def test_simple_case(self, ts_fixture): diff --git a/python/tskit/genotypes.py b/python/tskit/genotypes.py index 239e135777..78aa525f6e 100644 --- a/python/tskit/genotypes.py +++ b/python/tskit/genotypes.py @@ -245,6 +245,35 @@ def copy(self) -> Variant: variant_copy._ll_variant = self._ll_variant.restricted_copy() return variant_copy + def states(self, missing_data_string=None) -> np.ndarray: + """ + Returns the allelic states at this site as an array of strings. + + .. warning:: + Using this method is inefficient compared to working with the + underlying integer representation of genotypes as returned by + the :attr:`~Variant.genotypes` property. + + :param str missing_data_string: A string that will be used to represent missing + data. If any normal allele contains this character, an error is raised. + Default: `None`, treated as `'N'`. + :return: An numpy array of strings of length ``num_sites``. + """ + if missing_data_string is None: + missing_data_string = "N" + elif not isinstance(missing_data_string, str): + # Must explicitly test here, otherwise we output a numpy object array + raise ValueError("Missing data string is not a string") + alleles = self.alleles + if alleles[-1] is None: + if missing_data_string in alleles: + raise ValueError( + "An existing allele is equal to the " + f"missing data string '{missing_data_string}'" + ) + alleles = alleles[:-1] + (missing_data_string,) + return np.array(alleles)[self.genotypes] + def counts(self) -> typing.Counter[str | None]: """ Returns a :class:`python:collections.Counter` object providing counts for each