Revision lowpass main (#123)

* Add readme section on LowPass integration * Updates to StatDM for required arguments and defaults when arguments are missing * Misc updates for InferDFE with LowPass
xin-huang · Nov 21, 2024 · ec18455 · ec18455
1 parent 73c7fa1
commit ec18455
Show file tree

Hide file tree

Showing 10 changed files with 59 additions and 10 deletions.
diff --git a/dadi_cli/Stat.py b/dadi_cli/Stat.py
@@ -6,7 +6,7 @@
 from typing import Optional
 from dadi_cli.Models import get_model
 from dadi_cli.Pdfs import get_dadi_pdf
-from dadi_cli.utilities import get_opts_and_theta, convert_to_None
+from dadi_cli.utilities import get_opts_and_theta, convert_to_None, pts_l_func
 
 
 def godambe_stat_demograpy(
@@ -53,6 +53,8 @@ def godambe_stat_demograpy(
     fixed_params = convert_to_None(fixed_params, len(demo_popt))
     free_params = _free_params(demo_popt, fixed_params)
     fs = dadi.Spectrum.from_file(fs)
+    if grids is None:
+        grids = pts_l_func(fs.sample_sizes)
     fs_boot_files = glob.glob(bootstrap_dir + "/*.fs")
     if fs_boot_files == []:
         raise ValueError(f"ERROR:\n{bootstrap_dir} is empty\n")

diff --git a/dadi_cli/parsers/infer_dfe_parsers.py b/dadi_cli/parsers/infer_dfe_parsers.py
@@ -166,7 +166,7 @@ def _run_infer_dfe(args: argparse.Namespace) -> None:
     else:
         cache2d = args.cache2d
 
-    if not np.all(cache_ns == fs.sample_sizes):
+    if not np.all(cache_ns == fs.sample_sizes) and args.cov_args == []:
         raise ValueError('Cache and frequencey spectrum do not have the same sample sizes')
 
     if args.maxeval == False:

diff --git a/dadi_cli/parsers/plot_parsers.py b/dadi_cli/parsers/plot_parsers.py
@@ -195,13 +195,6 @@ def add_plot_parsers(subparsers: argparse.ArgumentParser) -> None:
         default=0.1,
         help="Minimum value to be plotted in the frequency spectrum, default: 0.1.",
     )
-    parser.add_argument(
-        "--ratio",
-        type=positive_num,
-        default=False,
-        required=False,
-        help="Ratio for the nonsynonymous mutations to the synonymous mutations.",
-    )
     parser.add_argument(
         "--interactive",
         default=False,

diff --git a/dadi_cli/parsers/stat_dm_parsers.py b/dadi_cli/parsers/stat_dm_parsers.py
@@ -106,6 +106,7 @@ def add_stat_dm_parsers(subparsers: argparse.ArgumentParser) -> None:
     parser.add_argument(
         "--demo-popt",
         type=existed_file,
+        required=True,
         dest="demo_popt",
         help="File contains the bestfit demographic parameters, generated by `dadi-cli BestFit`.",
     )

diff --git a/docs/index.md b/docs/index.md
@@ -46,12 +46,13 @@ To get help information, users can use
 dadi-cli -h
 ```
 
-There are thirteen subcommands in `dadi-cli`:
+There are fourteen subcommands in `dadi-cli`:
 
 - GenerateFs 
 - GenerateCache 
 - InferDM 
 - InferDFE 
+- LowPass 
 - BestFit 
 - StatDM 
 - StatDFE 

diff --git a/docs/userguide/lowpass.md b/docs/userguide/lowpass.md
@@ -0,0 +1,38 @@
+# dadi.LowPass Integration
+
+dadi-cli can utilize the LowPass module of dadi (url). This enables the ability to correct models based on the coverage each sample has.
+
+## GenerateFs Input
+
+Before you use LowPass you'll want to make sure your VCF has the allele depth (AD) entry.
+<pre>
+FORMAT
+GT:<b>AD</b>:DP:GQ:PL
+</pre>
+The information from the AD entry is used for generating a Demographic model. Users can save the information in a pickled dictionary when running `GenerateFs` with the `--calc-coverage`:
+<pre>
+dadi-cli GenerateFs --vcf examples/data/mus.syn.subset.vcf.gz --pop-info examples/data/mouse.popfile.txt --pop-ids Mmd_IRA Mmd_FRA --projections 4 8 --polarized --output examples/results/lowpass/mus.syn.fs <b>--calc-coverage</b>
+dadi-cli GenerateFs --vcf examples/data/mus.nonsyn.subset.vcf.gz --pop-info examples/data/mouse.popfile.txt --pop-ids Mmd_IRA Mmd_FRA --projections 4 8 --polarized --output examples/results/lowpass/mus.nonsyn.fs <b>--calc-coverage</b>
+</pre>
+In addition to the `mus.syn.fs` and `mus.nonsyn.fs` SFS files, `mus.syn.fs.coverage.pickle` and `mus.nonsyn.fs.coverage.pickle` are generated. These files will be used for `InferDM` and `InferDFE` to preform the LowPass model correction.
+
+## InferDM with LowPass
+
+LowPass will correct the demographic and DFE models based on the coverage of samples, to use it users will need to use the `--coverage-model` flag, which `--coverage-model` takes the `.coverage.pickle` file and total number of haplotypes in the data for each population. There were 10 Mmd_IRA samples and 16 Mmd_FRA samples being requested from the `--pop-info` file from `GenerateFs`.
+<pre>
+dadi-cli InferDM --fs examples/results/lowpass/mus.syn.fs --model split_mig --p0 4.5 0.8 0.8 0.36 0.01 --lbounds 1e-5 1e-5 1e-5 1e-5 1e-5 --ubounds 10 10 1 10 1 --output-prefix examples/results/lowpass/mus.split_mig.lowpass --optimizations 20 --check-convergence 5 --grids 50 60 70 <b>--coverage-model examples/results/lowpass/mus.syn.fs.coverage.pickle 10 16<b>
+</pre>
+
+There are no extra parameters for the LowPass model correction, so results can look similar to non-LowPass corrected results.
+
+## InferDFE with LowPass
+
+InferDFE is largely the same as InferDM. When users run `GenerateCache`, user's will want to use the number of haplotypes in the data for `--sample-sizes` instead of the sample sizes in the SFS file. Given that there were 10 Mmd_IRA samples and 16 Mmd_FRA haplotypes users would use `--sample-sizes 10 16` instead of `--sample-sizes 4 8`:
+<pre>
+dadi-cli GenerateCache --model split_mig_sel_single_gamma <b>--sample-sizes 10 16</b> --grids 50 60 70 --gamma-pts 20 --gamma-bounds 1e-4 100 --demo-popt examples/results/lowpass/mus.split_mig.lowpass.InferDM.bestfits --output examples/results/lowpass/mus.1d.cache
+</pre>
+
+When users run InferDFE they should use the `--coverage-model` flag with the similar inputs as `InferDM`, except using the `.coverage.pickle` file for the nonsynonymous data.
+<pre>
+dadi-cli InferDFE --fs examples/results/lowpass/mus.nonsyn.fs --demo-popt examples/results/lowpass/mus.split_mig.lowpass.InferDM.bestfits --cache1d examples/results/lowpass/mus.1d.cache --pdf1d lognormal --ratio 2.31 --p0 1 1 0.01 --lbounds 1e-5 1e-5 1e-5 --ubounds 30 10 1 --optimizations 20 --check-convergence 5 --output-prefix examples/results/lowpass/mus.lognormal.lowpass <b>--coverage-model examples/results/lowpass/mus.nonsyn.fs.coverage.pickle 10 16</b>
+</pre>
diff --git a/examples/data/mouse.popfile.txt b/examples/data/mouse.popfile.txt
@@ -0,0 +1,13 @@
+Mmd_IRA1_AH15.variant3	Mmd_IRA
+Mmd_IRA2_AH23.variant3	Mmd_IRA
+Mmd_IRA5_JR2-F1C.variant3	Mmd_IRA
+Mmd_IRA7_JR7-F1C.variant3	Mmd_IRA
+Mmd_IRA8_JR8-F1A.variant3	Mmd_IRA
+Mmd_FRA1_14.variant	Mmd_FRA
+Mmd_FRA2_15B.variant	Mmd_FRA
+Mmd_FRA3_16B.variant	Mmd_FRA
+Mmd_FRA4_18B.variant	Mmd_FRA
+Mmd_FRA5_B2C.variant	Mmd_FRA
+Mmd_FRA6_C1.variant	Mmd_FRA
+Mmd_FRA7_E1.variant	Mmd_FRA
+Mmd_FRA8_F1B.variant	Mmd_FRA
diff --git a/examples/data/mus.nonsyn.subset.vcf.gz b/examples/data/mus.nonsyn.subset.vcf.gz
diff --git a/examples/data/mus.syn.subset.vcf.gz b/examples/data/mus.syn.subset.vcf.gz
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -33,6 +33,7 @@ nav:
         - Demographic inference: 'userguide/demog.md'
         - DFE inference: 'userguide/dfe.md'
         - Joint DFE inference: 'userguide/jdfe.md'
+        - LowPass sequence coverage correction: 'userguide/lowpass.md'
         - Statistical testing: 'userguide/stat.md'
         - Cloud computing: 'userguide/cloud.md'
         - Simulation: 'userguide/simulation.md'