Skip to content

Commit

Permalink
adapt workflow to simulate STRs
Browse files Browse the repository at this point in the history
  • Loading branch information
aryarm committed Dec 5, 2024
1 parent b291264 commit c4f1dcf
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 85 deletions.
7 changes: 3 additions & 4 deletions analysis/config/config-simulations.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,10 @@ modes:
pos: 19:45903859
id: 19:45903859
reps: 10
beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
snp:
pos: 19:45910672 # rs1046282
beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
hap:
alleles: [rs36046716:G, rs1046282:G] # 45892145, 45910672
beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
Expand Down Expand Up @@ -80,10 +82,7 @@ min_maf: 0.05

# Sample sizes to use
# sample_size: [777, 800, 1600, 2400, 3200]
sample_size: 777

# A threshold on the pvalues of the haplotypes
out_thresh: 5e-3
sample_size: 3200

# Whether to include the causal variant in the set of genotypes provided to the
# finemapping methods. Set this to true if you're interested in seeing how the
Expand Down
54 changes: 24 additions & 30 deletions analysis/workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ module genotypes:
config: config

use rule * from genotypes as genotypes_*
config["gts_str_panel"] = rules.genotypes_subset_str.output.vcf
if mode == "str":
config["gts_str_panel"] = config["str_panel"]
if check_config("sample_size"):
sampsize = config["sample_size"]
config["gts_snp_panel"] = rules.genotypes_subset.output.pgen
Expand Down Expand Up @@ -94,24 +95,32 @@ if check_config("covar"):
if "pheno_matrix" in covar_config:
covar_file = rules.covariates_merge.output.covar

if mode in ("snp", "hap", "ld_range"):
if mode in ("str", "snp", "hap", "ld_range"):
happler_config = {
"pheno": rules.simulate_simphenotype.output.pheno,
"hap_file": rules.simulate_transform.input.hap,
"snp_panel": config["gts_snp_panel"],
"out": config["out"] + "/happler/"+mode+"/{beta}",
"out": config["out"] + "/happler/"+mode+"/beta_{beta}",
"random": None,
"covar": covar_file,
"min_maf": check_config("min_maf", 0),
"out_thresh": check_config("out_thresh", 5e-8),
"out_thresh": 1, # artificially set to 1 to avoid filtering
}
if mode in ("hap", "ld_range"):
happler_config["snps"] = []
happler_config["causal_gt"] = rules.simulate_transform.output
if mode == "hap":
happler_config["snps"] = config["modes"][mode]["alleles"]
happler_config["causal_gt"] = rules.simulate_transform.output
if mode == "ld_range":
happler_config["out"] = config["out"] + "/happler/"+mode+"/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}/rep_{rep}"
elif mode == "ld_range":
happler_config["out"] = config["out"] + "/happler/"+mode+"/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}"
elif mode in ("str", "snp"):
# also provide the SNP and STR IDs so that they can be used in the manhattan plot
happler_config["snps"] = (config["modes"][mode]["id"],)
# TODO: consider uncommenting this to compute LD with causal STR
# if mode == "str":
# happler_config["causal_gt"] = config["gts_str_panel"]
if check_config("reps", place=config["modes"][mode]):
happler_config["out"] = happler_config["out"] + "/rep_{rep}"
elif mode == "run":
happler_config = {
"pheno": config["modes"][mode]["pheno"],
Expand All @@ -125,8 +134,6 @@ elif mode == "run":
}
if check_config("SVs", place=config["modes"][mode]):
happler_config["SVs"] = config["modes"][mode]["SVs"]
elif "str" == mode:
pass
else:
raise ValueError("Unsupported operating 'mode' in config")

Expand All @@ -145,25 +152,6 @@ if mode == "ld_range" and config["modes"][mode]["random"]:
config: happler_config
use rule * from happler_random as happler_random_*

# if mode in ("hap", "str", "ld_range"):
# finemappers_config = {
# "pheno": rules.simulate_simphenotype.output.pheno,
# "out": config["out"] + "/finemappers/"+mode+"/{beta}",
# "snp_panel": config["gts_snp_panel"],
# }
# if mode in ("hap", "ld_range"):
# finemappers_config["causal_gt"] = rules.simulate_transform.output.pgen
# if mode == "ld_range":
# finemappers_config["out"] = config["out"] + "/finemappers/"+mode+"/ld_{ld}/beta_{beta}/alpha_{alpha}"
# else:
# raise ValueError("Not yet implemented operating mode 'str' in config")
# module finemappers:
# snakefile: "rules/finemappers.smk"
# config: finemappers_config
# use rule * from finemappers as finemappers_*
# else:
# raise ValueError(f"Unsupported operating mode '{mode}' in config")

plots_config = {
"out": config["out"] + "/plots",
"mode": mode,
Expand Down Expand Up @@ -237,16 +225,22 @@ elif mode == "run":
else:
FINAL_OUTPUT = expand(
[
rules.happler_cond_linreg.output.png,
rules.happler_tree.output.png,
rules.happler_manhattan.output.png,
# rules.finemappers_results.output.susie_pdf,
rules.happler_results.output.susie_pdf,
rules.plots_params.output.png,
# rules.plots_params.output.png,
],
ex=(("in",) if mode == "hap" else ("ex", "in")),
beta=config["modes"]["hap"]["beta"],
allow_missing=True,
)
if check_config("reps", place=config["modes"][mode]):
FINAL_OUTPUT = expand(
FINAL_OUTPUT,
rep=range(config["modes"][mode]["reps"]),
allow_missing=True,
)

# If '--config debug=true' is specified, we won't try to build the entire DAG
# This can be helpful when you want to play around with a specific target output
Expand Down
22 changes: 0 additions & 22 deletions analysis/workflow/rules/genotypes.smk
Original file line number Diff line number Diff line change
Expand Up @@ -162,28 +162,6 @@ rule vcf2plink:
"--threads {threads}{params.samps} --out {params.prefix} &>{log}"


rule subset_str:
""" subset samples from an STR VCF """
input:
vcf=lambda wildcards: expand(config["str_panel"], chr=parse_locus(wildcards.locus)[0])[0],
vcf_idx=lambda wildcards: expand(config["str_panel"] + ".tbi", chr=parse_locus(wildcards.locus)[0]),
samples=rules.keep_samps.output.samples,
output:
vcf=out+"/strs.bcf",
vcf_idx=out+"/strs.bcf.csi",
resources:
runtime=30,
log:
logs + "/subset_str",
benchmark:
bench + "/subset_str",
conda:
"../envs/default.yml"
shell:
"bcftools view -S {input.samples} --write-index "
"-Ob -o {output.vcf} {input.vcf}"


def subset_input():
if check_config("phase_map") or check_config("exclude_samples") or not config["snp_panel"].endswith(".pgen"):
return {
Expand Down
2 changes: 1 addition & 1 deletion analysis/workflow/rules/happler.smk
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ rule run:
out_thresh=check_config("out_thresh", 5e-8),
keep_SNPs="--remove-SNPs " if not ("{rep}" in out) else "",
output:
hap=out + "/happler.hap",
hap=ensure(out + "/happler.hap", non_empty=True),
gz=out + "/happler.hap.gz",
idx=out + "/happler.hap.gz.tbi",
dot=out + "/happler.dot",
Expand Down
29 changes: 17 additions & 12 deletions analysis/workflow/rules/simulate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -24,19 +24,23 @@ def parse_locus(locus):

hap_ld_range_output = out + "/create_ld_range/{num_haps}_haps/ld_{ld}/haplotype.hap"

gts_panel = Path(
config["gts_str_panel"] if mode == "str" else config["gts_snp_panel"]
)

checkpoint create_hap_ld_range:
""" create a hap file suitable for haptools transform and simphenotype """
input:
gts=Path(config["gts_snp_panel"]),
gts_pvar=Path(config["gts_snp_panel"]).with_suffix(".pvar"),
gts_psam=Path(config["gts_snp_panel"]).with_suffix(".psam"),
params:
min_ld = config["modes"]["ld_range"]["min_ld"],
max_ld = config["modes"]["ld_range"]["max_ld"],
step = config["modes"]["ld_range"]["step"],
min_af = config["modes"]["ld_range"]["min_af"],
max_af = config["modes"]["ld_range"]["max_af"],
num_haps = "-n " + " -n ".join(map(str, config["modes"]["ld_range"]["num_haps"])),
min_ld = lambda wildcards: config["modes"]["ld_range"]["min_ld"],
max_ld = lambda wildcards: config["modes"]["ld_range"]["max_ld"],
step = lambda wildcards: config["modes"]["ld_range"]["step"],
min_af = lambda wildcards: config["modes"]["ld_range"]["min_af"],
max_af = lambda wildcards: config["modes"]["ld_range"]["max_af"],
num_haps = lambda wildcards: "-n " + " -n ".join(map(str, config["modes"]["ld_range"]["num_haps"])),
out = lambda wildcards: expand(
hap_ld_range_output, **wildcards, allow_missing=True,
),
Expand All @@ -61,13 +65,14 @@ checkpoint create_hap_ld_range:
rule create_hap:
""" create a hap file suitable for haptools transform and simphenotype """
input:
gts=Path(config["gts_snp_panel"]).with_suffix(".pvar"),
gts=gts_panel.with_suffix(".pvar"),
params:
ignore="this", # the first parameter is always ignored for some reason
chrom=lambda wildcards: parse_locus(wildcards.locus)[0],
locus=lambda wildcards: wildcards.locus.split("_")[1].replace('-', '\t'),
beta=0.99,
alleles=lambda wildcards: config["modes"]["hap"]["alleles"],
alleles=lambda wildcards: config["modes"]["hap"]["alleles"] if mode == "hap" else [],
repeat=lambda wildcards: config["modes"]["str"]["id"] if mode == "str" else 0,
output:
hap=out + "/haplotype.hap"
resources:
Expand Down Expand Up @@ -113,13 +118,13 @@ rule simphenotype:
""" use the hap file to create simulated phenotypes for the haplotype """
input:
hap=rules.transform.input.hap,
pgen=rules.transform.output.pgen,
pvar=rules.transform.output.pvar,
psam=rules.transform.output.psam,
pgen=rules.transform.output.pgen if mode != "str" else config["gts_str_panel"],
pvar=rules.transform.output.pvar if mode != "str" else Path(config["gts_str_panel"]).with_suffix(".pvar"),
psam=rules.transform.output.psam if mode != "str" else Path(config["gts_str_panel"]).with_suffix(".psam"),
params:
beta=lambda wildcards: wildcards.beta,
seed = 42,
reps = config["modes"]["ld_range"]["reps"],
reps = lambda wildcards: config["modes"]["ld_range"]["reps"],
output:
pheno=out + "/{beta}.pheno",
resources:
Expand Down
40 changes: 24 additions & 16 deletions analysis/workflow/scripts/create_hap_file.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,31 @@

exec 2> "${snakemake_log}"

echo -e "#\torderH\tbeta" > "${snakemake_output[hap]}"
echo -e "#\tversion\t0.1.0" >> "${snakemake_output[hap]}"
echo -e "#H\tbeta\t.2f\tEffect size in linear model" >> "${snakemake_output[hap]}"
echo -e "#\tversion\t0.2.0" >> "${snakemake_output[hap]}"
echo -e "#R\tbeta\t.2f\tEffect size in linear model" >> "${snakemake_output[hap]}"

starts=()
ends=()
for snp in ${snakemake_params[4]}; do
snp_id="$(echo "$snp" | sed 's/\(.*\):/\1\t/')"
start_pos="$(grep -m1 "$(echo "$snp_id" | cut -f1)" "${snakemake_input[gts]}" | cut -f2)"
end_pos=$((start_pos+1))
echo -e "V\thap\t$start_pos\t$end_pos\t$snp_id" >> "${snakemake_output[hap]}"
starts+=("$start_pos")
ends+=("$end_pos")
done
if [ "${snakemake_params[repeat]}" -eq "0" ]; then
starts=()
ends=()
for snp in ${snakemake_params[4]}; do
snp_id="$(echo "$snp" | sed 's/\(.*\):/\1\t/')"
start_pos="$(grep -Ev '^#' -m1 "$(echo "$snp_id" | cut -f1)" "${snakemake_input[gts]}" | cut -f2)"
end_pos=$((start_pos+1))
echo -e "V\thap\t$start_pos\t$end_pos\t$snp_id" >> "${snakemake_output[hap]}"
starts+=("$start_pos")
ends+=("$end_pos")
done

start_pos="$(echo "${starts[@]}" | tr ' ' '\n' | sort -n | head -n1)"
end_pos="$(echo "${ends[@]}" | tr ' ' '\n' | sort -rn | head -n1)"
echo -e "H\t${snakemake_params[1]}\t$start_pos\t$end_pos\thap\t${snakemake_params[3]}" >> "${snakemake_output[hap]}"
start_pos="$(echo "${starts[@]}" | tr ' ' '\n' | sort -n | head -n1)"
end_pos="$(echo "${ends[@]}" | tr ' ' '\n' | sort -rn | head -n1)"
echo -e "H\t${snakemake_params[1]}\t$start_pos\t$end_pos\thap\t${snakemake_params[3]}" >> "${snakemake_output[hap]}"
else
for str in ${snakemake_params[5]}; do
str_id=$'\t'"$str"$'\t'
start_pos="$(grep --line-buffered -Ev '^#' "${snakemake_input[gts]}" | grep -Pm1 "${start_pos}${str_id}" | cut -f2)"
end_pos="$(grep --line-buffered -Ev '^#' "${snakemake_input[gts]}" | grep -Pm1 "${start_pos}${str_id}" | grep -Po '(?<=;END=)\d+(?=;PERIOD)')"
echo -e "R\t${snakemake_params[1]}\t$start_pos\t$end_pos\t$str\t${snakemake_params[3]}" >> "${snakemake_output[hap]}"
done
fi

LC_ALL=C sort -k1,4 -o "${snakemake_output[hap]}" "${snakemake_output[hap]}"

0 comments on commit c4f1dcf

Please sign in to comment.