-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathconfig.yaml
162 lines (146 loc) · 5.38 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
####################################
# GLOBAL PARAMETERS #
####################################
# Settings grouped under `global_parameters`; every key below is a child of this mapping.
global_parameters:
  # NOTE(review): presumably seeds the random number generator so runs are reproducible - confirm against the consumer
  random_seed: 123
  chromosome: 1  # "all" or a number from 1 to 22
  # The plain scalar `none` is loaded as the string "none", not as YAML null.
  superpopulation: none  # "none" or a specific superpopulation (AFR, AMR, EAS, EUR, CSA, MID)
  memory: 8000  # amount of memory available (in MB) for memory-intensive commands
  batchsize: 10000  # batchsize for writing plink output during genotype generation
####################################
# FILEPATHS #
####################################
# - the chromosome number can be given as a wildcard by specifying {chromosome} in the filepath
# - the superpopulation can be given as a wildcard by specifying {superpopulation} in the filepath
# Input/output locations and external tool names.
# A literal `{chromosome}` (or `{superpopulation}`) inside a path is a wildcard for the chromosome number
# (or superpopulation).
filepaths:
  # Where results are written and how output files are prefixed.
  general:
    output_dir: data/outputs/test
    output_prefix: test_chr-{chromosome}
  # Inputs used for genotype data; "raw" and "processed" refer to the directories the paths point into.
  genotype:
    vcf_input_raw: data/inputs/raw/1KG+HGDP/1KG+HGDP.chr{chromosome}.hapmap.final.vcf.gz
    vcf_input_processed: data/inputs/processed/1KG+HGDP/1KG+HGDP.chr{chromosome}.hapmap.final.recode.vcf
    vcf_metadata: data/inputs/processed/1KG+HGDP/1KG+HGDP.chr{chromosome}.metadata
    popfile_raw: data/inputs/processed/1KG+HGDP/merged_pop_adjusted.tsv
    popfile_processed: data/inputs/processed/1KG+HGDP/merged_pop.tsv
    variant_list: data/inputs/processed/1KG+HGDP/hapmap_variant_list_chr{chromosome}.txt
    remove_list: data/inputs/processed/1KG+HGDP/remove.txt
    rsid_list: data/inputs/processed/1KG+HGDP/rsid_map_list_chr{chromosome}.txt
    genetic_mapfile: data/inputs/raw/1KG+HGDP/genetic_maps/chr{chromosome}.interpolated_genetic_map
    genetic_distfile: data/inputs/processed/1KG+HGDP/1KG+HGDP.chr{chromosome}.hapmap.distfile
    mutation_mapfile: data/inputs/raw/1KG+HGDP/mutation_maps/atlas.chr{chromosome}.csv
    mutation_agefile: data/inputs/processed/1KG+HGDP/1KG+HGDP.chr{chromosome}.hapmap.agefile
    hap1_matrix: data/inputs/processed/1KG+HGDP/1KG+HGDP.chr{chromosome}.hapmap.h1
    hap2_matrix: data/inputs/processed/1KG+HGDP/1KG+HGDP.chr{chromosome}.hapmap.h2
  # Inputs used for phenotype data.
  phenotype:
    causal_list: data/inputs/processed/1KG+HGDP/Test.CausalList
    reference: data/inputs/processed/1KG+HGDP/Africa.Annot
    # NOTE(review): nesting under `phenotype` is inferred from key order - confirm against the consumer.
    # The plain scalar `none` is loaded as the string "none", not as YAML null.
    plink_override: none  # can set to a value if using pre-simulated genetics input
  # External programs; bare names here.
  # NOTE(review): presumably resolved through PATH, and a full path may also be accepted - confirm
  software:
    plink: plink
    plink2: plink2
    king: king
    vcftools: vcftools
    mapthin: mapthin
    phenoalg: phenoalg
####################################
# GENOTYPE DATA #
####################################
# Sample composition and per-superpopulation parameters for genotype data.
# NOTE(review): `default`, `rho` and `Ne` are placed by inference from key order and comments - confirm against
# the consumer.
genotype_data:
  samples:
    use_default: true  # setting this to true will ignore the custom population groups
    # Each list entry is one population group: an id, a sample count, and a list of superpopulation entries.
    # NOTE(review): the `populations` numbers look like percentages of `nsamples` (admix_pop is 50/50 and
    # AFR_pop lists 100 for 200 samples) - confirm against the consumer.
    custom:  # add your custom population groups below if using use_default=false
      - id: EUR_pop
        nsamples: 100
        populations:
          - EUR: 100
      - id: AFR_pop
        nsamples: 200
        populations:
          - AFR: 100
      - id: admix_pop
        nsamples: 100
        populations:
          - EUR: 50
          - AFR: 50
    default:
      nsamples: 1000  # used by the algorithm if use_default=true, otherwise custom population groups are used
  # recombination rate, one value per superpopulation
  rho:
    AFR: 0.77
    AMR: 0.80
    EAS: 0.58
    EUR: 0.68
    CSA: 0.73
    MID: 0.65
  # effective population size, one value per superpopulation
  Ne:
    AFR: 11900
    AMR: 10400
    EAS: 11700
    EUR: 11700
    CSA: 11500
    MID: 8100
####################################
# PHENOTYPE DATA #
####################################
# Parameters for phenotype data.
# Comma-separated values such as `0.1,0.1,...` are plain scalars, so a YAML parser loads each as a single string.
# NOTE(review): presumably the consumer splits them on commas - confirm
phenotype_data:
  nPopulation: 6
  nTrait: 1
  a: -0.4
  b: -1
  c: 0.5
  nComponent: 1
  # Six entries each, matching nPopulation above.
  # NOTE(review): "Propotion" is the spelling used by these keys; do not correct it unless the consumer is
  # changed in step.
  PropotionGeno: 0.1,0.1,0.1,0.1,0.1,0.1
  PropotionCovar: 0,0,0,0,0,0
  Prevalence: 0.5,0.5,0.5,0.5,0.5,0.5
  TraitCorr: 1
  # NOTE(review): presumably a flattened nPopulation x nPopulation matrix - confirm
  PopulationCorr: 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
  # NOTE(review): three weights are listed while nComponent is 1 - confirm that the extra values are ignored
  CompWeight: 1,5,10
  Causality:
    UseCausalList: false  # if true the algorithm will use the causal_list filepath
    Polygenicity: 0.005  # only required if UseCausalList is false
    Pleiotropy: 1  # only required if UseCausalList is false
####################################
# EVALUATION #
####################################
# Set to true if you want the script to calculate the metric
# Evaluation switches: set a metric to true to have it calculated, false to skip it.
evaluation:
  metrics:
    aats: true  # nearest neighbour adversarial accuracy
    kinship: true  # relatedness, including kinship density and IBS plots
    ld_corr: true  # linkage disequilibrium (LD) correlation matrix
    ld_decay: true  # linkage disequilibrium (LD) decay plot (and distance)
    maf: true  # minor allele frequency divergences
    pca: true  # principal components analysis
    gwas: false  # GWAS, manhattan plot and qqplot
####################################
# OPTIMISATION #
####################################
# Note that this code uses a single superpopulation and ignores custom population structures
# Optimisation settings: priors, the inference method to run, and the summary statistics to use.
optimisation:
  # prior distributions - specify lower/upper bounds for uniform priors
  priors:
    rho:
      uniform_lower: 0
      uniform_upper: 3
    Ne:
      uniform_lower: 0
      uniform_upper: 50000
  # inference type - simulation-based rejection ABC or emulation-based rejection ABC
  # Each method has its own `run` switch; here only the simulation-based one is enabled.
  simulation_rejection_ABC:
    run: true
    n_particles: 500
    threshold: 0.15
    max_iter: 500
    write_progress: true
  emulation_rejection_ABC:
    run: false
    n_particles: 500
    threshold: 0.15
    n_design_points: 50
    max_iter: 500
    write_progress: true
  # choice of summary statistic/s
  summary_statistics:
    ld_decay: true
    kinship: true