# metaT QC workflow
version 1.0
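#
# Overview (informational comment): this workflow stages a single FASTQ (or interleaves a
# read-1/read-2 pair), runs BBTools rqcfilter2.sh against the RQCFilterData reference set,
# and packages the filtered reads, filter stats, and rRNA reads under a project prefix.
#
# Illustrative inputs JSON (a sketch only; the file paths and project ID below are
# placeholders, not values shipped with this workflow):
#
#   {
#     "metaTReadsQC.input_files": ["/path/to/reads_R1.fastq.gz", "/path/to/reads_R2.fastq.gz"],
#     "metaTReadsQC.proj": "nmdc:xxxxxxxx",
#     "metaTReadsQC.database": "/refdata/"
#   }
#
# The workflow can then be run with a WDL engine, for example:
#   miniwdl run rqcfilter.wdl -i inputs.json
# or
#   java -jar cromwell.jar run rqcfilter.wdl -i inputs.json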
workflow metaTReadsQC {
    input {
        Array[File?] input_files
        String database = "/refdata/"
        String proj
        Int rqc_mem = 180
        Int rqc_thr = 16
        Int interleave_mem = 10
        Boolean gcloud_env = false
        String prefix = sub(proj, ":", "_")
        String bbtools_container = "microbiomedata/bbtools@sha256:df16e8343661effe8060478d14a9b4620cf484efba0d4818a467276d4bb07a6d"
        String workflow_container = "microbiomedata/workflowmeta:1.1.1"
    }

    if (length(input_files) == 1) {
        call stage_single {
            input:
                container = bbtools_container,
                input_file = input_files[0]
        }
    }
    if (length(input_files) > 1) {
        call stage_interleave {
            input:
                container = bbtools_container,
                memory = interleave_mem,
                input_fastq1 = input_files[0],
                input_fastq2 = input_files[1]
        }
    }
    # Estimate RQC runtime at roughly an hour per compressed GB of input
    call rqcfilter as qc {
        input:
            input_fastq = if length(input_files) == 1 then stage_single.reads_fastq else stage_interleave.reads_fastq,
            database = database,
            gcloud_env = gcloud_env,
            memory = rqc_mem,
            threads = rqc_thr,
            container = bbtools_container
    }
    call make_info_file {
        input:
            info_file = qc.info_file,
            container = bbtools_container,
            prefix = prefix
    }
    call finish_rqc {
        input:
            container = workflow_container,
            prefix = prefix,
            filtered = qc.filtered,
            filtered_stats = qc.stat,
            filtered_stats2 = qc.stat2,
            filtered_ribo = qc.filtered_ribo
    }

    output {
        File filtered_final = finish_rqc.filtered_final
        File filtered_stats_final = finish_rqc.filtered_stats_final
        File filtered_stats2_final = finish_rqc.filtered_stats2_final
        File rqc_info = make_info_file.rqc_info
        File filtered_ribo_final = finish_rqc.filtered_ribo_final
    }
}
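
# Task: stage_single
# Fetches the single input FASTQ over HTTP(S) with wget, or hard/soft-links a local file,
# renaming it to raw.fastq.gz, and records the staging start time.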
task stage_single {
    input {
        String container
        String target = "raw.fastq.gz"
        File? input_file
    }

    command <<<
        set -euo pipefail
        if [ $(echo ~{input_file} | egrep -c "https*:") -gt 0 ]; then
            wget ~{input_file} -O ~{target}
        else
            ln ~{input_file} ~{target} || ln -s ~{input_file} ~{target}
        fi
        # Capture the start time
        date --iso-8601=seconds > start.txt
    >>>

    output {
        File reads_fastq = "~{target}"
        String start = read_string("start.txt")
    }

    runtime {
        memory: "1 GiB"
        cpu: 2
        maxRetries: 1
        docker: container
    }
}
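
# Task: stage_interleave
# Stages a read-1/read-2 FASTQ pair (wget for URLs, hard/soft link for local paths),
# interleaves them with BBTools reformat.sh, and re-checks pairing on the interleaved
# output with verifypaired=t.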
task stage_interleave {
    input {
        String container
        Int memory
        String target_reads_1 = "raw_reads_1.fastq.gz"
        String target_reads_2 = "raw_reads_2.fastq.gz"
        String output_interleaved = "raw_interleaved.fastq.gz"
        File? input_fastq1
        File? input_fastq2
    }

    command <<<
        set -euo pipefail
        if [ $(echo ~{input_fastq1} | egrep -c "https*:") -gt 0 ]; then
            wget ~{input_fastq1} -O ~{target_reads_1}
            wget ~{input_fastq2} -O ~{target_reads_2}
        else
            set +o pipefail
            ln ~{input_fastq1} ~{target_reads_1} || ln -s ~{input_fastq1} ~{target_reads_1}
            ln ~{input_fastq2} ~{target_reads_2} || ln -s ~{input_fastq2} ~{target_reads_2}
        fi
        reformat.sh -Xmx~{memory}G in1=~{target_reads_1} in2=~{target_reads_2} out=~{output_interleaved}
        # Capture the start time
        date --iso-8601=seconds > start.txt
        # Verify that read 1 and read 2 are properly paired in the interleaved output
        reformat.sh -Xmx~{memory}G verifypaired=t in=~{output_interleaved}
    >>>

    output {
        File reads_fastq = "~{output_interleaved}"
        String start = read_string("start.txt")
    }

    runtime {
        memory: "~{memory} GiB"
        cpu: 2
        maxRetries: 1
        docker: container
    }
}
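
# Task: rqcfilter
# Runs BBTools rqcfilter2.sh on the staged/interleaved FASTQ: adapter trimming, quality and
# artifact filtering, host (human/cat/dog/mouse) and microbial-contaminant removal, and rRNA
# separation. Also converts the resulting filterStats.txt into JSON.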
task rqcfilter {
    input {
        File? input_fastq
        String container
        String database
        String rqcfilterdata = database + "/RQCFilterData"
        Boolean gcloud_env = false
        String gcloud_db_path = "/cromwell_root/workflows_refdata/refdata/RQCFilterData"
        Int memory
        Int xmxmem = floor(memory * 0.85)
        Int threads
        String filename_outlog = "stdout.log"
        String filename_errlog = "stderr.log"
        String filename_stat = "filtered/filterStats.txt"
        String filename_stat2 = "filtered/filterStats2.txt"
        String filename_stat_json = "filtered/filterStats.json"
        String filename_reproduce = "filtered/reproduce.sh"
        String filename_ribo = "filtered/rRNA.fastq.gz"
    }
    command <<<
        export TIME="time result\ncmd:%C\nreal %es\nuser %Us \nsys %Ss \nmemory:%MKB \ncpu %P"
        set -euo pipefail
        # In the gcloud environment, link the mounted reference data to the expected path
        if ~{gcloud_env}; then
            dbdir=$(ls -d /mnt/*/*/RQCFilterData)
            if [ ! -z "$dbdir" ]; then
                ln "$dbdir" ~{gcloud_db_path} || ln -s "$dbdir" ~{gcloud_db_path}
            else
                echo "Cannot find gcloud refdb" 1>&2
            fi
        fi
        rqcfilter2.sh \
            barcodefilter=f \
            chastityfilter=f \
            clumpify=t \
            extend=f \
            jni=t \
            usejni=f \
            khist=t \
            log=status.log \
            maq=10 \
            maxns=1 \
            minlen=51 \
            mlf=0.33 \
            mtst=t \
            outribo=rRNA.fastq.gz \
            path=filtered \
            phix=t \
            pigz=t \
            qtrim=r \
            removecat=t \
            removedog=t \
            removehuman=t \
            removemicrobes=t \
            removemouse=t \
            removeribo=t \
            rna=t \
            rqcfilterdata=~{if gcloud_env then gcloud_db_path else rqcfilterdata} \
            sketch=t \
            stats=~{filename_stat} \
            trimfragadapter=t \
            trimpolyg=5 \
            trimq=0 \
            unpigz=t \
            ~{if (defined(memory)) then "-Xmx" + xmxmem + "G" else "-Xmx101077m"} \
            ~{if (defined(threads)) then "threads=" + threads else "threads=auto"} \
            in=~{input_fastq} \
            > >(tee -a ~{filename_outlog}) \
            2> >(tee -a ~{filename_errlog} >&2)
        # The input file is listed last so the resubmit.sh line is processed correctly; the remaining flags are roughly alphabetized
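        # Convert the key=value pairs in ~{filename_stat} into JSON at ~{filename_stat_json}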
        python <<CODE
import json
f = open("~{filename_stat}", 'r')
d = dict()
for line in f:
    if not line.rstrip():
        continue
    key, value = line.rstrip().split('=')
    d[key] = float(value) if 'Ratio' in key else int(value)
with open("~{filename_stat_json}", 'w') as outfile:
    json.dump(d, outfile)
CODE
    >>>
    output {
        File stdout = filename_outlog
        File stderr = filename_errlog
        File stat = filename_stat
        File stat2 = filename_stat2
        File info_file = filename_reproduce
        File filtered = "filtered/raw.anqrpht.fastq.gz"
        File filtered_ribo = filename_ribo
        File json_out = filename_stat_json
    }

    runtime {
        docker: container
        memory: "~{memory} GiB"
        maxRetries: 1
        cpu: threads
    }
}
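
# Task: make_info_file
# Extracts lines 2-5 of the reproduce.sh produced by rqcfilter2.sh, cleans them up, and
# appends a BBTools citation to build <prefix>_readsQC.info.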
task make_info_file {
    input {
        File info_file
        String prefix
        String container
    }

    command <<<
        set -euo pipefail
        sed -n 2,5p ~{info_file} 2>&1 | \
            sed -e 's/^#//; s/in=[^ ]*/in=filename.fastq.gz/; s/#//g; s/BBTools/BBTools(1)/g;' > \
            ~{prefix}_readsQC.info
        echo -e "\n(1) B. Bushnell: BBTools software package, http://bbtools.jgi.doe.gov/" >> \
            ~{prefix}_readsQC.info
        # perl -ne 's:in=/.*/(.*) :in=$1:; s/#//; s/BBTools/BBTools(1)/; print;' > \ # for container microbiomedata/workflowmeta:1.1.1
    >>>

    output {
        File rqc_info = "~{prefix}_readsQC.info"
    }

    runtime {
        memory: "1 GiB"
        cpu: 1
        maxRetries: 1
        docker: container
    }
}
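
# Task: finish_rqc
# Links the filtered reads, filter stats, and rRNA reads to project-prefixed filenames and
# converts the filter stats to JSON with /scripts/rqcstats.py from the workflowmeta container.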
task finish_rqc {
    input {
        File filtered_stats
        File filtered_stats2
        File filtered
        File filtered_ribo
        String container
        String prefix
    }

    command <<<
        set -euo pipefail
        end=$(date --iso-8601=seconds)
        # Generate QA objects
        ln ~{filtered} ~{prefix}_filtered.fastq.gz || ln -s ~{filtered} ~{prefix}_filtered.fastq.gz
        ln ~{filtered_stats} ~{prefix}_filterStats.txt || ln -s ~{filtered_stats} ~{prefix}_filterStats.txt
        ln ~{filtered_stats2} ~{prefix}_filterStats2.txt || ln -s ~{filtered_stats2} ~{prefix}_filterStats2.txt
        ln ~{filtered_ribo} ~{prefix}_rRNA.fastq.gz || ln -s ~{filtered_ribo} ~{prefix}_rRNA.fastq.gz
        # Generate stats but rename some fields until the script is fixed
        /scripts/rqcstats.py ~{filtered_stats} > stats.json
        ln stats.json ~{prefix}_qa_stats.json || ln -s stats.json ~{prefix}_qa_stats.json
    >>>

    output {
        File filtered_final = "~{prefix}_filtered.fastq.gz"
        File filtered_stats_final = "~{prefix}_filterStats.txt"
        File filtered_stats2_final = "~{prefix}_filterStats2.txt"
        File filtered_ribo_final = "~{prefix}_rRNA.fastq.gz"
    }

    runtime {
        docker: container
        memory: "1 GiB"
        cpu: 1
    }
}