adapt workflow to simulate STRs

gymrek-lab · Dec 5, 2024 · c4f1dcf · c4f1dcf
1 parent b291264
commit c4f1dcf
Show file tree

Hide file tree

Showing 6 changed files with 69 additions and 85 deletions.
diff --git a/analysis/config/config-simulations.yml b/analysis/config/config-simulations.yml
@@ -46,8 +46,10 @@ modes:
   pos: 19:45903859
   id: 19:45903859
   reps: 10
+  beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
  snp:
   pos: 19:45910672 # rs1046282
+  beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
  hap:
   alleles: [rs36046716:G, rs1046282:G] # 45892145, 45910672
   beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
@@ -80,10 +82,7 @@ min_maf: 0.05
 
 # Sample sizes to use
 # sample_size: [777, 800, 1600, 2400, 3200]
-sample_size: 777
-
-# A threshold on the pvalues of the haplotypes
-out_thresh: 5e-3
+sample_size: 3200
 
 # Whether to include the causal variant in the set of genotypes provided to the
 # finemapping methods. Set this to true if you're interested in seeing how the

diff --git a/analysis/workflow/Snakefile b/analysis/workflow/Snakefile
@@ -50,7 +50,8 @@ module genotypes:
     config: config
 
 use rule * from genotypes as genotypes_*
-config["gts_str_panel"] = rules.genotypes_subset_str.output.vcf
+if mode == "str":
+    config["gts_str_panel"] = config["str_panel"]
 if check_config("sample_size"):
     sampsize = config["sample_size"]
     config["gts_snp_panel"] = rules.genotypes_subset.output.pgen
@@ -94,24 +95,32 @@ if check_config("covar"):
     if "pheno_matrix" in covar_config:
         covar_file = rules.covariates_merge.output.covar
 
-if mode in ("snp", "hap", "ld_range"):
+if mode in ("str", "snp", "hap", "ld_range"):
     happler_config = {
         "pheno": rules.simulate_simphenotype.output.pheno,
         "hap_file": rules.simulate_transform.input.hap,
         "snp_panel": config["gts_snp_panel"],
-        "out": config["out"] + "/happler/"+mode+"/{beta}",
+        "out": config["out"] + "/happler/"+mode+"/beta_{beta}",
         "random": None,
         "covar": covar_file,
         "min_maf": check_config("min_maf", 0),
-        "out_thresh": check_config("out_thresh", 5e-8),
+        "out_thresh": 1, # artificially set to 1 to avoid filtering
     }
     if mode in ("hap", "ld_range"):
         happler_config["snps"] = []
+        happler_config["causal_gt"] = rules.simulate_transform.output
         if mode == "hap":
             happler_config["snps"] = config["modes"][mode]["alleles"]
-        happler_config["causal_gt"] = rules.simulate_transform.output
-    if mode == "ld_range":
-        happler_config["out"] = config["out"] + "/happler/"+mode+"/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}/rep_{rep}"
+        elif mode == "ld_range":
+            happler_config["out"] = config["out"] + "/happler/"+mode+"/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}"
+    elif mode in ("str", "snp"):
+        # also provide the SNP and STR IDs so that they can be used in the manhattan plot
+        happler_config["snps"] = (config["modes"][mode]["id"],)
+        # TODO: consider uncommenting this to compute LD with causal STR
+        # if mode == "str":
+        #     happler_config["causal_gt"] = config["gts_str_panel"]
+    if check_config("reps", place=config["modes"][mode]):
+        happler_config["out"] = happler_config["out"] + "/rep_{rep}"
 elif mode == "run":
     happler_config = {
         "pheno": config["modes"][mode]["pheno"],
@@ -125,8 +134,6 @@ elif mode == "run":
     }
     if check_config("SVs", place=config["modes"][mode]):
         happler_config["SVs"] = config["modes"][mode]["SVs"]
-elif "str" == mode:
-    pass
 else:
     raise ValueError("Unsupported operating 'mode' in config")
 
@@ -145,25 +152,6 @@ if mode == "ld_range" and config["modes"][mode]["random"]:
         config: happler_config
     use rule * from happler_random as happler_random_*
 
-# if mode in ("hap", "str", "ld_range"):
-#     finemappers_config = {
-#         "pheno": rules.simulate_simphenotype.output.pheno,
-#         "out": config["out"] + "/finemappers/"+mode+"/{beta}",
-#         "snp_panel": config["gts_snp_panel"],
-#     }
-#     if mode in ("hap", "ld_range"):
-#         finemappers_config["causal_gt"] = rules.simulate_transform.output.pgen
-#         if mode == "ld_range":
-#             finemappers_config["out"] = config["out"] + "/finemappers/"+mode+"/ld_{ld}/beta_{beta}/alpha_{alpha}"
-#     else:
-#         raise ValueError("Not yet implemented operating mode 'str' in config")
-#     module finemappers:
-#         snakefile: "rules/finemappers.smk"
-#         config: finemappers_config
-#     use rule * from finemappers as finemappers_*
-# else:
-#     raise ValueError(f"Unsupported operating mode '{mode}' in config")
-
 plots_config = {
     "out": config["out"] + "/plots",
     "mode": mode,
@@ -237,16 +225,22 @@ elif mode == "run":
 else:
     FINAL_OUTPUT = expand(
         [
+            rules.happler_cond_linreg.output.png,
             rules.happler_tree.output.png,
-            rules.happler_manhattan.output.png,
             # rules.finemappers_results.output.susie_pdf,
             rules.happler_results.output.susie_pdf,
-            rules.plots_params.output.png,
+            # rules.plots_params.output.png,
         ],
         ex=(("in",) if mode == "hap" else ("ex", "in")),
         beta=config["modes"]["hap"]["beta"],
         allow_missing=True,
     )
+    if check_config("reps", place=config["modes"][mode]):
+        FINAL_OUTPUT = expand(
+            FINAL_OUTPUT,
+            rep=range(config["modes"][mode]["reps"]),
+            allow_missing=True,
+        )
 
 # If '--config debug=true' is specified, we won't try to build the entire DAG
 # This can be helpful when you want to play around with a specific target output

diff --git a/analysis/workflow/rules/genotypes.smk b/analysis/workflow/rules/genotypes.smk
@@ -162,28 +162,6 @@ rule vcf2plink:
         "--threads {threads}{params.samps} --out {params.prefix} &>{log}"
 
 
-rule subset_str:
-    """ subset samples from an STR VCF """
-    input:
-        vcf=lambda wildcards: expand(config["str_panel"], chr=parse_locus(wildcards.locus)[0])[0],
-        vcf_idx=lambda wildcards: expand(config["str_panel"] + ".tbi", chr=parse_locus(wildcards.locus)[0]),
-        samples=rules.keep_samps.output.samples,
-    output:
-        vcf=out+"/strs.bcf",
-        vcf_idx=out+"/strs.bcf.csi",
-    resources:
-        runtime=30,
-    log:
-        logs + "/subset_str",
-    benchmark:
-        bench + "/subset_str",
-    conda:
-        "../envs/default.yml"
-    shell:
-        "bcftools view -S {input.samples} --write-index "
-        "-Ob -o {output.vcf} {input.vcf}"
-
-
 def subset_input():
     if check_config("phase_map") or check_config("exclude_samples") or not config["snp_panel"].endswith(".pgen"):
         return {

diff --git a/analysis/workflow/rules/happler.smk b/analysis/workflow/rules/happler.smk
@@ -68,7 +68,7 @@ rule run:
         out_thresh=check_config("out_thresh", 5e-8),
         keep_SNPs="--remove-SNPs " if not ("{rep}" in out) else "",
     output:
-        hap=out + "/happler.hap",
+        hap=ensure(out + "/happler.hap", non_empty=True),
         gz=out + "/happler.hap.gz",
         idx=out + "/happler.hap.gz.tbi",
         dot=out + "/happler.dot",

diff --git a/analysis/workflow/rules/simulate.smk b/analysis/workflow/rules/simulate.smk
@@ -24,19 +24,23 @@ def parse_locus(locus):
 
 hap_ld_range_output = out + "/create_ld_range/{num_haps}_haps/ld_{ld}/haplotype.hap"
 
+gts_panel = Path(
+    config["gts_str_panel"] if mode == "str" else config["gts_snp_panel"]
+)
+
 checkpoint create_hap_ld_range:
     """ create a hap file suitable for haptools transform and simphenotype """
     input:
         gts=Path(config["gts_snp_panel"]),
         gts_pvar=Path(config["gts_snp_panel"]).with_suffix(".pvar"),
         gts_psam=Path(config["gts_snp_panel"]).with_suffix(".psam"),
     params:
-        min_ld = config["modes"]["ld_range"]["min_ld"],
-        max_ld = config["modes"]["ld_range"]["max_ld"],
-        step = config["modes"]["ld_range"]["step"],
-        min_af = config["modes"]["ld_range"]["min_af"],
-        max_af = config["modes"]["ld_range"]["max_af"],
-        num_haps = "-n " + " -n ".join(map(str, config["modes"]["ld_range"]["num_haps"])),
+        min_ld = lambda wildcards: config["modes"]["ld_range"]["min_ld"],
+        max_ld = lambda wildcards: config["modes"]["ld_range"]["max_ld"],
+        step = lambda wildcards: config["modes"]["ld_range"]["step"],
+        min_af = lambda wildcards: config["modes"]["ld_range"]["min_af"],
+        max_af = lambda wildcards: config["modes"]["ld_range"]["max_af"],
+        num_haps = lambda wildcards: "-n " + " -n ".join(map(str, config["modes"]["ld_range"]["num_haps"])),
         out = lambda wildcards: expand(
             hap_ld_range_output, **wildcards, allow_missing=True,
         ),
@@ -61,13 +65,14 @@ checkpoint create_hap_ld_range:
 rule create_hap:
     """ create a hap file suitable for haptools transform and simphenotype """
     input:
-        gts=Path(config["gts_snp_panel"]).with_suffix(".pvar"),
+        gts=gts_panel.with_suffix(".pvar"),
     params:
         ignore="this", # the first parameter is always ignored for some reason
         chrom=lambda wildcards: parse_locus(wildcards.locus)[0],
         locus=lambda wildcards: wildcards.locus.split("_")[1].replace('-', '\t'),
         beta=0.99,
-        alleles=lambda wildcards: config["modes"]["hap"]["alleles"],
+        alleles=lambda wildcards: config["modes"]["hap"]["alleles"] if mode == "hap" else [],
+        repeat=lambda wildcards: config["modes"]["str"]["id"] if mode == "str" else 0,
     output:
         hap=out + "/haplotype.hap"
     resources:
@@ -113,13 +118,13 @@ rule simphenotype:
     """ use the hap file to create simulated phenotypes for the haplotype """
     input:
         hap=rules.transform.input.hap,
-        pgen=rules.transform.output.pgen,
-        pvar=rules.transform.output.pvar,
-        psam=rules.transform.output.psam,
+        pgen=rules.transform.output.pgen if mode != "str" else config["gts_str_panel"],
+        pvar=rules.transform.output.pvar if mode != "str" else Path(config["gts_str_panel"]).with_suffix(".pvar"),
+        psam=rules.transform.output.psam if mode != "str" else Path(config["gts_str_panel"]).with_suffix(".psam"),
     params:
         beta=lambda wildcards: wildcards.beta,
         seed = 42,
-        reps = config["modes"]["ld_range"]["reps"],
+        reps = lambda wildcards: config["modes"]["ld_range"]["reps"],
     output:
         pheno=out + "/{beta}.pheno",
     resources:

diff --git a/analysis/workflow/scripts/create_hap_file.sh b/analysis/workflow/scripts/create_hap_file.sh
@@ -2,23 +2,31 @@
 
 exec 2> "${snakemake_log}"
 
-echo -e "#\torderH\tbeta" > "${snakemake_output[hap]}"
-echo -e "#\tversion\t0.1.0" >> "${snakemake_output[hap]}"
-echo -e "#H\tbeta\t.2f\tEffect size in linear model" >> "${snakemake_output[hap]}"
+echo -e "#\tversion\t0.2.0" >> "${snakemake_output[hap]}"
+echo -e "#R\tbeta\t.2f\tEffect size in linear model" >> "${snakemake_output[hap]}"
 
-starts=()
-ends=()
-for snp in ${snakemake_params[4]}; do
-    snp_id="$(echo "$snp" | sed 's/\(.*\):/\1\t/')"
-    start_pos="$(grep -m1 "$(echo "$snp_id" | cut -f1)" "${snakemake_input[gts]}" | cut -f2)"
-    end_pos=$((start_pos+1))
-    echo -e "V\thap\t$start_pos\t$end_pos\t$snp_id" >> "${snakemake_output[hap]}"
-    starts+=("$start_pos")
-    ends+=("$end_pos")
-done
+if [ "${snakemake_params[repeat]}" -eq "0" ]; then
+    starts=()
+    ends=()
+    for snp in ${snakemake_params[4]}; do
+        snp_id="$(echo "$snp" | sed 's/\(.*\):/\1\t/')"
+        start_pos="$(grep -Ev '^#' -m1 "$(echo "$snp_id" | cut -f1)" "${snakemake_input[gts]}" | cut -f2)"
+        end_pos=$((start_pos+1))
+        echo -e "V\thap\t$start_pos\t$end_pos\t$snp_id" >> "${snakemake_output[hap]}"
+        starts+=("$start_pos")
+        ends+=("$end_pos")
+    done
 
-start_pos="$(echo "${starts[@]}" | tr ' ' '\n' | sort -n | head -n1)"
-end_pos="$(echo "${ends[@]}" | tr ' ' '\n' | sort -rn | head -n1)"
-echo -e "H\t${snakemake_params[1]}\t$start_pos\t$end_pos\thap\t${snakemake_params[3]}" >> "${snakemake_output[hap]}"
+    start_pos="$(echo "${starts[@]}" | tr ' ' '\n' | sort -n | head -n1)"
+    end_pos="$(echo "${ends[@]}" | tr ' ' '\n' | sort -rn | head -n1)"
+    echo -e "H\t${snakemake_params[1]}\t$start_pos\t$end_pos\thap\t${snakemake_params[3]}" >> "${snakemake_output[hap]}"
+else
+    for str in ${snakemake_params[5]}; do
+        str_id=$'\t'"$str"$'\t'
+        start_pos="$(grep --line-buffered -Ev '^#' "${snakemake_input[gts]}" | grep -Pm1 "${start_pos}${str_id}" | cut -f2)"
+        end_pos="$(grep --line-buffered -Ev '^#' "${snakemake_input[gts]}" | grep -Pm1 "${start_pos}${str_id}" | grep -Po '(?<=;END=)\d+(?=;PERIOD)')"
+        echo -e "R\t${snakemake_params[1]}\t$start_pos\t$end_pos\t$str\t${snakemake_params[3]}" >> "${snakemake_output[hap]}"
+    done
+fi
 
 LC_ALL=C sort -k1,4 -o "${snakemake_output[hap]}" "${snakemake_output[hap]}"