-
Notifications
You must be signed in to change notification settings - Fork 0
/
config.yaml~nobamutil
357 lines (303 loc) · 14.1 KB
/
config.yaml~nobamutil
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
##########################################################################################
# __ __ _ ____ _ ____ _ _ _____ #
# | \/ | / \ | _ \ / \ / ___| | | | ____| #
# | |\/| | / _ \ | |_) / _ \| | | |_| | _| #
# | | | |/ ___ \| __/ ___ \ |___| _ | |___ #
# |_| |_/_/ \_\_| /_/ \_\____|_| |_|_____| #
# #
##########################################################################################
##########################################################################################
# Samples
##########################################################################################
# Example samples file. Modify to map to your own samples.
# The path is written without a leading slash.
# NOTE(review): presumably resolved relative to the pipeline's working directory — confirm.
sample_file: config/samples_sarakina.tsv
# Some statistics (like depth of coverage) can be computed for BAM files that were obtained
# by other means (e.g., downloaded from public repositories). Note that in that case, mapache
# assumes that the data were mapped to the first reference genome listed in the config file.
## by default ("") the stats are computed on the final bam files from the mapping pipeline.
## To use on other pre-computed bam files,
## one can pass a list of bam files either in an external file (in this case the parameter
## 'external_sample' should contain the file name) in the format:
## SM Bam Genome
## ind1 bam1.bam hg19
## ind2 bam2.bam hg19
## ...
## or directly in the config file in the yaml format as:
## external_sample:
## hg19:
## ind1: path_to_bam1.bam
## ind2: path_to_bam2.bam
## ...
## if the delimiter used to read the sample file has to be adapted
#delim: "\s+"
##########################################################################################
# OUTPUT FOLDER
##########################################################################################
# By default, all outputs will be stored in a folder named "results".
# This configuration sets the output folder to 'results_sarakina' instead.
result_dir: 'results_sarakina'
#workdir:
##########################################################################################
# Reference genome
##########################################################################################
# List the genome(s) to which each sample will be mapped.
# Include meaningful names for each genome. Final BAM(s) file will be named after it
# (e.g., ind1.hg19.bam)
# Each entry maps a genome name (here: hs37d5) to the path of its FASTA file.
genome:
hs37d5: /home/ref_genomes/hs37d5.fa
##########################################################################################
# Mapping workflow
##########################################################################################
# ## genome/FASTA indexing
# This step will be run once per reference genome
# Resources (mem, threads, time) for the indexing step, plus one extra-parameter
# string per indexing tool. All *_params entries are left empty here.
# NOTE(review): units of mem/time are not stated in this section — presumably GB and hours; confirm.
indexing:
mem: 16
threads: 1
time: 2
bwa_params: ''
bowtie2_params: ''
samtools_params: ''
picard_params: ''
#-----------------------------------------------------------------------------------------
## FASTQ level
# To consider when assigning runtime and memory:
# These steps will be run once per FASTQ file specified in the "samples" file.
## subsampling (optional)
# Subsampling is disabled here (run: False); 'params' and 'number' are kept for when
# it is switched on.
# NOTE(review): 'number' is presumably the number of reads to keep — confirm.
subsampling:
run: False
params: '-s1'
number: 1000
# adapter_removal (optional)
# Adapter-removal settings. 'run' selects the tool (see the options on that line).
# params_adapterremoval holds one parameter string per label (DS_index8_8, ...).
# The first four labels differ only in the number of N bases in the adapters:
# 8 or 6 in --adapter1 and 8 or 7 in --adapter2, as reflected in the label names.
# DS_index8 sets --adapter1 only and passes neither --collapse nor --minalignmentlength.
# NOTE(review): presumably the labels are matched against a column of the samples file — confirm.
cleaning:
run: 'adapterremoval' # options: adapterremoval (default), fastp, False
params_adapterremoval:
DS_index8_8: '--minlength 30 --trimns --trimqualities --adapter1 AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNNNATCTCGTATGCCGTCTTCTGCTTG --adapter2 AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTNNNNNNNNGTGTAGATCTCGGTGGTCGCCGTATCATT --collapse --minalignmentlength 11'
DS_index8_7: '--minlength 30 --trimns --trimqualities --adapter1 AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNNNATCTCGTATGCCGTCTTCTGCTTG --adapter2 AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTNNNNNNNGTGTAGATCTCGGTGGTCGCCGTATCATT --collapse --minalignmentlength 11'
DS_index6_8: '--minlength 30 --trimns --trimqualities --adapter1 AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG --adapter2 AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTNNNNNNNNGTGTAGATCTCGGTGGTCGCCGTATCATT --collapse --minalignmentlength 11'
DS_index6_7: '--minlength 30 --trimns --trimqualities --adapter1 AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG --adapter2 AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTNNNNNNNGTGTAGATCTCGGTGGTCGCCGTATCATT --collapse --minalignmentlength 11'
DS_index8: '--minlength 30 --trimns --trimqualities --adapter1 AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNNNATCTCGTATGCCGTCTTCTGCTTG'
# NOTE(review): the accepted values of collapse_opt are not listed in this file — confirm against the pipeline.
collapse_opt: only_collapse
params_fastp: ''
threads: 24
mem: 8 ## in GB
time: 10
# mapping (compulsory)
# Mapping settings. 'mapper' selects the aligner (see the options on that line);
# the *_params entries hold extra parameters per aligner/sub-command. With
# mapper 'bwa_aln', only bwa_aln_params and bwa_samse_params are non-empty here.
# NOTE(review): 'pl' is presumably the platform written to the read group — confirm.
mapping:
mapper: 'bwa_aln' # options: bwa_aln, bwa_mem, bowtie2
bwa_aln_params: '-l 1024 -n 0.01 -o 2'
bwa_samse_params: '-n 3'
bwa_sampe_params: ''
bwa_mem_params: ''
bowtie2_params: ''
pl: 'ILLUMINA'
threads: 32
mem: 32
time: 36
## samtools_sort
# Resources (threads, mem, time) for the sorting step; no tool parameters are set here.
sorting:
threads: 4
mem: 16
time: 10
## samtools_filter
# Filtering is enabled here (run: True) with a mapping-quality value of 30.
# NOTE(review): presumably reads with MAPQ below 'mapq' are discarded — confirm.
filtering:
run: True
mapq: 30
threads: 4
mem: 16
time: 2
# Save low-quality and unmapped reads to an extra BAM file?
save_low_qual: False
#-----------------------------------------------------------------------------------------
## library level
# The configuration for the merging step applies to both
# merging mapped files per library and per sample
# Resources (threads, mem, time) for the merging step; the same values are used
# for merging per library and per sample.
merging:
threads: 4
mem: 16
time: 2
# ## extract_duplicates (optional)
# Duplicate handling. 'run' selects the method (see the options on that line); here
# markduplicates is selected and is passed '--REMOVE_DUPLICATES true'.
# params_dedup is only relevant when 'run' is set to dedup.
remove_duplicates:
run: markduplicates # (False (default), markduplicates, dedup)
params_markduplicates: '--REMOVE_DUPLICATES true'
params_dedup: '-m'
threads: 4 ## 1 for dedup
mem: 32
time: 24
# mapDamage2 (optional, not run by default)
# Do you wish to rescale base qualities in the mapped BAMs with mapDamage2?
# To compute only damage statistics (read length, deamination rates),
# without rescaling qualities, see the entry below under "stats"
# Rescaling of base qualities is disabled here (run: False); no extra parameters are set.
damage_rescale:
run: False
params: ''
time: 2
threads: 1 ## hardcoded at 1
mem: 8
# The bamutil step is disabled here (run: False).
# 'params' holds one value per label (UDG_2, nonUDG_6, nonUDG_10, default).
# NOTE(review): the values are presumably the number of bases to trim from read ends
# for each library type — confirm. No 'time' entry is given for this step.
bamutil:
run: False
params:
UDG_2: '2'
nonUDG_6: '6'
nonUDG_10: '10'
default: ''
threads: 1 ## hardcoded at 1
mem: 8
#-----------------------------------------------------------------------------------------
## sample level
# realign around indels with GATK
# Realignment around indels is enabled here (run: True); the other entries are its resources.
realign:
run: True
threads: 24
mem: 32
time: 8
# samtools_calmd (optional)
# This step is enabled here (run: True); the other entries are its resources.
compute_md:
run: True
threads: 4
mem: 16
time: 2
##########################################################################################
## Statistics for final bam files
##########################################################################################
# This section corresponds to the statistics reported by mapache.
# By default, it is run together with the mapping pipeline for all the combinations of
# input FASTQ files and genomes.
# Settings for the statistics/report step: FastQC runtime, optional Qualimap and
# MultiQC reports (both enabled here), and the layout of the plots in the html report.
stats:
# Default: run it for samples in samples file.
# If intended to compute stats on BAMs downloaded or mapped with other tools, see instructions above
fastqc_time: 4
# Output qualimap and multiqc reports?
qualimap: True
qualimap_mem: 8
multiqc: True
# specific to plots in the html report
plots:
# x_axis: Which value should be included as the variable on the x-axis?
# E.g.,
# sample vs DoC (and bars colored by sample name)
# genome vs DoC (and bars colored by genome name)
# We recommend:
# "sample" if n_samples > n_genomes
# "genome" if n_genomes > n_samples
# if set to "auto", mapache will follow the above recommendation
x_axis: auto # either "auto" (default), "sample" or "genome"
n_col: 1 # number of panels per row if multiple samples and genomes are used
# Width and height of plots, in inches
width: 18
height: 12
# This is to draw rectangles for the thresholds of the sex assignment;
# Adjust according to the thresholds/labels previously defined
# The value is a quoted string holding a named c(...) vector of hex colours, one per sex label.
sex_ribbons: 'c("XX"="#7e3075", "XY"="#003e83")'
# Hex colour, quoted so that '#' is not read as the start of a comment.
# NOTE(review): which plot element uses this colour is not stated here — confirm.
color: "#f2ad78"
##########################################################################################
## Analyses
##########################################################################################
# Damage statistics are computed with bamdamage here (run: 'bamdamage').
damage:
## damage statistics (read length, deamination rates)
run: 'bamdamage' # Options: ("False", "bamdamage", "mapDamage")
bamdamage_fraction: 10000
# This step can take a lot of time. We recommend to run it on a small subset of the mapped
# reads in order to get an estimate of the damage. The value can be an integer (number of reads)
# or a float (fraction of mapped reads)
bamdamage_params: ' --rlength 200 --plot_length 30'
# With --plot_length 30 the plots will include the first and last 30 bp.
# NOTE(review): this comment previously stated that the estimation uses the first 100 bp
# of each read, which does not match '--rlength 200' above — confirm the intended value.
# Depth-of-coverage settings for genome hs37d5 (enabled: run: True).
depth:
hs37d5:
run: True
# Mapache outputs the depth of coverage (DoC) for each of the genomes.
# If you also want the DoC to be reported for some chromosomes in the main stats table, list them below.
# Their names have to match their IDs in the FASTA file specified above.
# The names are quoted so that each one is read as a string.
chromosomes: ["X", "Y", "MT"]
# parameters specific to sex inference
# Sex-inference settings for genome hs37d5 (enabled: run: True).
sex_inference:
hs37d5:
run: True
# uncomment options below if you need to change any value
# List with the autosomes names. You can paste any python expression here, in single quotes ('), for example:
# '[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]'
# '[f"chr{x}" for x in range(1,23)]' # chr1,chr2,chr3,...,chr22.
# '[x for x in range(1,23)]' # 1,2,3,...,22.
# the outcome should be a python list
# NOTE(review): the value below is written as an unquoted YAML list rather than a quoted
# expression as in the examples above — confirm that the pipeline accepts both forms.
autosomes: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
sex_chr: X
# Possible options: XY, ZW, XO, ZO
system: XY
signif: 0.9
# These are the thresholds used to assign molecular sex, in R syntax.
# For humans, we expect a ratio of DoC(X)/DoC(genome) = 1 for females and DoC(X)/DoC(genome) = 0.5 for males.
# Note that sex will be assigned by considering the confidence intervals around the estimate for such ratio,
# which can be a bit lower or higher than 1 or 0.5.
# This option can be commented as the sex script comes with default values for different sex systems.
thresholds: list( "XX"=c(0.8, 1.2), "XY"=c(0, 0.6), "consistent with XX but not XY"=c(0.6, 1.2), "consistent with XY but not XX"=c(0, 0.8) )
# Imputation settings for genome hs37d5. Imputation is disabled here (run: False);
# the remaining entries are kept so that it can be switched on without further edits.
imputation:
hs37d5:
run: False
## The imputation is done using the first named reference genome 'genome/{name}/fasta'
gp_filter: [0.8]
## path_map and path_panel may specify a single file OR may contain the variable '{chr}' to specify it per chromosome
path_map: "/work/FAC/FBM/DBC/amalaspi/popgen/shared_ressources/genetic_maps_b37/chr{chr}.b37.gmap.gz"
path_panel: "/work/FAC/FBM/DBC/amalaspi/popgen/shared_ressources/1000Genomes/ALL.chr{chr}.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.vcf.gz"
# Imputation will be run on the chromosomes present in the panel
# The version of the panel and chromosome names should match those of
# the reference genome to which individual samples were mapped.
# The value is a list written as a quoted expression: every element must be separated
# by a comma, and non-numeric chromosome names must be quoted, otherwise the
# expression cannot be evaluated as a list.
# NOTE(review): confirm that map and panel files matching the '{chr}' patterns above
# exist for X, Y and MT; if not, restrict this list to the chromosomes that do.
chromosomes: '[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, "X", "Y", "MT"]'
#chromosomes: [20,21]
# NOTE(review): these two keys are spelled 'glimse' (not 'glimpse'); left unchanged because
# the key names expected by the consuming pipeline are not visible here — confirm.
glimse_chunk_params: ""
glimse_phase_params: ""
# Input BAM files can be listed individually under the "paths" keyword
# Alternatively, you can pass a plain text file (bam_list)
# with the paths in it
#bam_list: None
#paths:
# ind1: results/03_sample/03_final_sample/01_bam/ind1.hg19.bam
# ind2: results/03_sample/03_final_sample/01_bam/ind2.hg19.bam
# Per-step resources (threads / mem / time) for the imputation sub-steps.
bcftools_mpileup_threads: 1
bcftools_mpileup_mem: 4
bcftools_mpileup_time: 6
split_genome_mem: 2
split_genome_time: 4
extract_positions_mem: 4
extract_positions_time: 6
impute_phase_threads: 1
impute_phase_mem: 2
impute_phase_time: 2
ligate_chunks_mem: 8
##########################################################################################
# Job resubmission
##########################################################################################
# On an HPC system, a job might be cancelled/unfinished due to insufficient resources
# (time or memory) allocated to it.
# Global variables that define how much the requested resources grow after a job failure
# 0: no change between failures
# 1: doubling after each failure
# NOTE(review): on the scale above, 0.5 presumably means a 50% increase per failure — confirm.
memory_increment_ratio: 0.5
runtime_increment_ratio: 0.5
##########################################################################################
# Software
##########################################################################################
# Names under which Picard and GATK3 are referred to by the pipeline.
# NOTE(review): the keys end in '_jar' but the values are not paths to .jar files;
# presumably they are launcher names from the software environment — confirm.
software:
picard_jar: 'picard'
gatk3_jar: 'GenomeAnalysisTK'
# By default, mapache uses the software installed with the conda environment. Thus, no need
# to modify this section.
# If you want to use a different version of a software (for instance, one that can
# be loaded with module load module_name on your server), you can add the
# name and version of it in this section.
# For this to take effect, you need to invoke the pipeline with the option --use-envmodules
# One entry per tool: the string holds the name(s) and version of the module(s) to use
# for that tool (e.g. "gcc samtools/1.12"). Tools with an empty string have no module
# specified in this configuration.
envmodules:
samtools: "gcc samtools/1.12"
bowtie2: "gcc bowtie2/2.4.2"
bwa: "gcc bwa/0.7.17"
picard: "gcc picard/2.24.0"
gatk3: "gcc gatk/3.8-1"
fastqc: "gcc fastqc/0.11.9"
r: "gcc r/4.0.4"
adapterremoval: "gcc adapterremoval/2.3.2"
fastp: ""
mapdamage: ""
bedtools: "gcc bedtools2/2.29.2"
dedup: ""
seqtk: ""
qualimap: ""
multiqc: ""
glimpse: ""