From 16a84d1fb5e1dee335b13fa0628e9abf4e53453a Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 27 Nov 2024 11:21:45 -0800 Subject: [PATCH] fix: allow `simphenotype` to accept TR PGENs without `--repeats` (#263) --- docs/api/data.rst | 2 ++ docs/formats/genotypes.rst | 11 +++++++++++ haptools/sim_phenotype.py | 8 ++++++-- tests/test_simphenotype.py | 11 +++++++++++ 4 files changed, 30 insertions(+), 2 deletions(-) diff --git a/docs/api/data.rst b/docs/api/data.rst index f236af14..3aaa9df0 100644 --- a/docs/api/data.rst +++ b/docs/api/data.rst @@ -183,6 +183,8 @@ The following methods from the :class:`Genotypes` class are disabled, however. 1. ``check_biallelic`` 2. ``check_maf`` +The constructor of the :class:`GenotypesTR` class also includes a :code:`vcftype` parameter. This can be helpful when the type of the TR file cannot be inferred automatically. Refer to `the TRTools docs `_ for a list of accepted types. + .. _api-data-genotypestr: GenotypesPLINK diff --git a/docs/formats/genotypes.rst b/docs/formats/genotypes.rst index ad9be9e1..8f0935c0 100644 --- a/docs/formats/genotypes.rst +++ b/docs/formats/genotypes.rst @@ -37,3 +37,14 @@ To convert a VCF containing tandem repeats to PGEN, use this command, instead. .. code-block:: bash plink2 --vcf-half-call m --make-pgen 'pvar-cols=vcfheader,qual,filter,info' --vcf input.vcf --make-pgen --out output + +Tandem repeats +~~~~~~~~~~~~~~ +VCFs containing tandem repeats usually have a *type* indicating the tool from which they originated. We support whichever types are supported by `TRTools `_. + +We do our best to infer the *type* of a TR VCF automatically. However, there will be some instances when it cannot be inferred. +Users of TRTools know to specify :code:`--vcftype` in that situation. However, most haptools commands do not yet support the :code:`--vcftype` parameter. Instead, you can modify the header of your VCF to trick haptools into being able to infer the *type*. + +For example, if you're using HipSTR, you can add :code:`##command=hipstr...`. Refer to `this code in TRTools `_ for more details. + +Please note that all of this also applies to PVAR files created from TR VCFs. diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py index 133c36f1..a01f6fe4 100644 --- a/haptools/sim_phenotype.py +++ b/haptools/sim_phenotype.py @@ -405,8 +405,12 @@ def simulate_pt( # check if these are all repeat IDs, haplotype IDs, or a mix of them if len(hp.type_ids["R"]) >= len(haplotype_ids) and repeats is None: # if they're all repeat IDs or --repeats was specified - log.info("Loading TR genotypes") - gt = GenotypesTR(fname=genotypes, log=log) + if genotypes.suffix == ".pgen": + log.info("Loading TR genotypes from PGEN file") + gt = GenotypesPLINKTR(fname=genotypes, log=log, chunk_size=chunk_size) + else: + log.info("Loading TR genotypes from VCF/BCF file") + gt = GenotypesTR(fname=genotypes, log=log) load_as_haps = False else: # the genotypes variable must contain haplotype genotypes diff --git a/tests/test_simphenotype.py b/tests/test_simphenotype.py index 4e72f2a2..eae1f0e9 100644 --- a/tests/test_simphenotype.py +++ b/tests/test_simphenotype.py @@ -670,6 +670,17 @@ def test_repeat(self, capfd): assert captured.out assert result.exit_code == 0 + def test_repeat_pgen(self, capfd): + gt_file = DATADIR / "simple-tr.pgen" + hp_file = DATADIR / "simple_tr.hap" + + cmd = f"simphenotype --id 1:10114:GTT {gt_file} {hp_file}" + runner = CliRunner() + result = runner.invoke(main, cmd.split(" "), catch_exceptions=False) + captured = capfd.readouterr() + assert captured.out + assert result.exit_code == 0 + def test_repeat_with_hapgts(self, capfd): tmp_transform = Path("temp-transform.vcf") with open(tmp_transform, "w") as file: