forked from eresearchqut/ontvisc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.nf
971 lines (833 loc) · 34.8 KB
/
main.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
def helpMessage () {
log.info """
ONTViSc - ONT-based Viral Screening for Biosecurity
Marie-Emilie Gauthier 23/05/2023
Magda Antczak
Craig Windell
Roberto Barrero
Usage:
Run the command
nextflow run eresearchqut/ontvisc {optional arguments}...
Optional arguments:
-resume Resume a failed run
--outdir Path to save the output file
'results'
--samplesheet '[path/to/file]' Path to the csv file that contains the list of
samples to be analysed by this pipeline.
Default: 'index.csv'
Contents of samplesheet csv:
sampleid,sample_files
SAMPLE01,/user/folder/sample.fastq.gz
SAMPLE02,/user/folder/*.fastq.gz
sample_files can refer to a folder with a number of
files that will be merged in the pipeline
#### Pre-processing and QC options ####
--qc_only Only perform preliminary QC step using Nanoplot
--analysis_mode clustering, denovo_assembly, read_classification, map2ref
Default: ''
--adapter_trimming Run porechop step
[False]
--porechop_options Porechop_ABI options
[null]
--qual_filt Run quality filtering step
[False]
--chopper_options Chopper options
[null]
--host_filtering Run host filtering step using Minimap2
Default: false
--host_fasta Fasta file of nucleotide sequences to filter
[null]
--canu Use Canu for de novo assembly step
Default: false
--canu_options Canu options
Default: ''
--flye Use Flye for de novo assembly step
--flye_ont_mode Select from nano-raw, nano-corr, nano-hq
Default: 'nano-raw'
--flye_options Flye options
Default: ''
--rattle_clustering_options Rattle clustering options
Default: ''
--rattle_polishing_options Rattle polishing options
Default: ''
--final_primer_check Performs a final primer check after performing de novo assembly step
Default: false
""".stripIndent()
}
// Show help message
if (params.help) {
helpMessage()
exit 0
}
if (params.blastn_db != null) {
blastn_db_name = file(params.blastn_db).name
blastn_db_dir = file(params.blastn_db).parent
}
if (params.reference != null) {
reference_name = file(params.reference).name
reference_dir = file(params.reference).parent
}
if (params.kaiju_nodes != null & params.kaiju_dbname != null & params.kaiju_names != null) {
kaiju_dbs_dir = file(params.kaiju_nodes).parent
}
if (params.krkdb != null) {
krkdb_dir = file(params.krkdb).parent
}
switch (workflow.containerEngine) {
case "singularity":
bindbuild = "";
if (params.blastn_db != null) {
bindbuild = (bindbuild + "-B ${blastn_db_dir} ")
}
if (params.reference != null) {
bindbuild = (bindbuild + "-B ${reference_dir} ")
}
if (params.kaiju_nodes != null & params.kaiju_dbname != null & params.kaiju_names != null) {
bindbuild = (bindbuild + "-B ${kaiju_dbs_dir} ")
}
if (params.krkdb != null) {
bindbuild = (bindbuild + "-B ${krkdb_dir} ")
}
bindOptions = bindbuild;
break;
default:
bindOptions = "";
}
process MERGE {
//publishDir "${params.outdir}/${sampleid}/merge", pattern: '*.fastq.gz', mode: 'link'
tag "${sampleid}"
label 'setting_1'
input:
tuple val(sampleid), path(lanes)
output:
tuple val(sampleid), path("${sampleid}.fastq.gz"), emit: merged
script:
"""
cat ${lanes} > ${sampleid}.fastq.gz
"""
}
process QCREPORT {
publishDir "${params.outdir}/qc_report", mode: 'copy'
containerOptions "${bindOptions}"
input:
path multiqc_files
output:
path("run_qc_report_*txt")
script:
"""
seq_run_qc_report.py --host_filtering ${params.host_filtering} --adapter_trimming ${params.adapter_trimming} --quality_trimming ${params.qual_filt}
"""
}
process CUTADAPT {
publishDir "${params.outdir}/${sampleid}/assembly", pattern: '*_filtered.fa', mode: 'link'
publishDir "${params.outdir}/${sampleid}/assembly", pattern: '*_cutadapt.log', mode: 'link'
tag "${sampleid}"
label 'medium'
input:
tuple val(sampleid), path(sample)
output:
file("${sampleid}_cutadapt.log")
file("${sample.baseName}_filtered.fa")
tuple val(sampleid), path("${sample.baseName}_filtered.fa"), emit: trimmed
script:
"""
cutadapt -j ${task.cpus} -g "AAGCAGTGGTATCAACGCAGAGTACGCGGG;min_overlap=14" -a "CCCGCGTACTCTGCGTTGATACCACTGCTT;min_overlap=14" -o ${sample.baseName}_filtered.fa ${sample} > ${sampleid}_cutadapt.log
sed -i 's/len=[0-9]* reads/reads/' ${sample.baseName}_filtered.fa
sed -i 's/ trim=.*\$//' ${sample.baseName}_filtered.fa
"""
}
process CHOPPER {
publishDir "${params.outdir}/${sampleid}/preprocessing/chopper", pattern: '*_chopper.log', mode: 'link'
tag "${sampleid}"
label 'setting_3'
input:
tuple val(sampleid), path(sample)
output:
path("${sampleid}_chopper.log")
tuple val(sampleid), path("${sampleid}_filtered.fastq.gz"), emit: chopper_filtered_fq
script:
def chopper_options = (params.chopper_options) ? " ${params.chopper_options}" : ''
"""
gunzip -c ${sample} | chopper ${chopper_options} 2> ${sampleid}_chopper.log | gzip > ${sampleid}_filtered.fastq.gz
"""
}
/*
#recommended settings for CANU using metagnomics data
https://github.com/marbl/canu/issues/2079
genomeSize=20m
maxInputCoverage=10000 corOutCoverage=10000
corMhapSensitivity=high
corMinCoverage=0
redMemory=32 oeaMemory=32 batMemory=64
useGrid=false
minReadLength=200
minOverlapLength=50
maxThreads=4
minInputCoverage=0
stopOnLowCoverage=0
maxInputCoverage=10000 corOutCoverage=10000 corMhapSensitivity=high corMinCoverage=0 redMemory=32 oeaMemory=32 batMemory=64 useGrid=false minReadLength=200 minOverlapLength=50 maxThreads=4 minInputCoverage=0 stopOnLowCoverage=0
*/
process CANU {
publishDir "${params.outdir}/${sampleid}/assembly/canu", mode: 'copy', pattern: '{*.fasta,*.log}'
tag "${sampleid}"
label 'setting_8'
input:
tuple val(sampleid), path(fastq)
output:
path("${sampleid}_canu_assembly.fasta")
path("${sampleid}_canu.log")
tuple val(sampleid), path("${sampleid}_canu.fastq.gz"), path("${sampleid}_canu_assembly.fasta"), emit: assembly
tuple val(sampleid), path("${sampleid}_canu_assembly.fasta"), emit: assembly2
script:
def canu_options = (params.canu_options) ? " ${params.canu_options}" : ''
"""
canu -p ${sampleid} -d ${sampleid} \
genomeSize=${params.canu_genome_size} \
-nanopore ${fastq} ${canu_options} 2> ${sampleid}_canu.log
if [[ ! -s ${sampleid}/${sampleid}.contigs.fasta ]]
then
touch ${sampleid}_canu_assembly.fasta
else
cat ${sampleid}/${sampleid}.contigs.fasta ${sampleid}/${sampleid}.unassembled.fasta > ${sampleid}_canu_assembly.fasta
fi
cp ${fastq} ${sampleid}_canu.fastq.gz
"""
}
process FLYE {
publishDir "${params.outdir}/${sampleid}/assembly/flye", mode: 'copy', pattern: '{*.fasta,*.log}'
tag "${sampleid}"
label 'setting_9'
input:
tuple val(sampleid), path(fastq)
output:
path("${sampleid}_flye_assembly.fasta")
path("${sampleid}_flye.log")
tuple val(sampleid), path("${sampleid}_flye.fastq.gz"), path("${sampleid}_flye_assembly.fasta"), emit: assembly
tuple val(sampleid), path("${sampleid}_flye_assembly.fasta"), emit: assembly2
script:
def flye_options = (params.flye_options) ? " ${params.flye_options}" : ''
"""
flye ${flye_options} --out-dir outdir --threads ${task.cpus} --${params.flye_mode} ${fastq}
if [[ ! -s outdir/assembly.fasta ]]
then
touch ${sampleid}_flye_assembly.fasta
else
cp outdir/assembly.fasta ${sampleid}_flye_assembly.fasta
cp outdir/flye.log ${sampleid}_flye.log
fi
cp ${fastq} ${sampleid}_flye.fastq.gz
"""
}
/*
errorStrategy 'ignore'
*/
process BLASTN2REF {
publishDir "${params.outdir}/${sampleid}", mode: 'copy', pattern: '*/*/*txt'
tag "${sampleid}"
label 'setting_1'
containerOptions "${bindOptions}"
input:
tuple val(sampleid), path(assembly)
output:
path "*/*/${sampleid}_blastn_reference_vs_*.txt"
script:
"""
if [[ ${assembly} == *_assembly*.fa* ]] ;
then
if [[ ${assembly} == *canu_assembly*.fa* ]] ;
then
mkdir -p assembly/blast_to_ref
blastn -query ${assembly} -subject ${reference_dir}/${reference_name} -evalue 1e-3 -out ${sampleid}_blastn_reference_vs_canu_assembly_tmp.txt \
-outfmt '6 qseqid sacc length pident mismatch gapopen qstart qend qlen sstart send slen evalue bitscore qcovhsp qcovs' -max_target_seqs 5
echo "qseqid\tsacc\tlength\tpident\tmismatch\tgapopen\tqstart\tqend\tqlen\tsstart\tsend\tslen\tevalue\tbitscore\tqcovhsp\tqcovs" > header
cat header ${sampleid}_blastn_reference_vs_canu_assembly_tmp.txt > assembly/blast_to_ref/${sampleid}_blastn_reference_vs_canu_assembly.txt
elif [[ ${assembly} == *flye_assembly*.fasta ]] ;
then
mkdir -p assembly/blast_to_ref
blastn -query ${assembly} -subject ${reference_dir}/${reference_name} -evalue 1e-3 -out ${sampleid}_blastn_reference_vs_flye_assembly_tmp.txt \
-outfmt '6 qseqid sacc length pident mismatch gapopen qstart qend qlen sstart send slen evalue bitscore qcovhsp qcovs' -max_target_seqs 5
echo "qseqid\tsacc\tlength\tpident\tmismatch\tgapopen\tqstart\tqend\tqlen\tsstart\tsend\tslen\tevalue\tbitscore\tqcovhsp\tqcovs" > header
cat header ${sampleid}_blastn_reference_vs_flye_assembly_tmp.txt > assembly/blast_to_ref/${sampleid}_blastn_reference_vs_flye_assembly.txt
fi
elif [[ ${assembly} == *clustering.fasta ]] ;
then
mkdir -p clustering/blast_to_ref
blastn -query ${assembly} -subject ${reference_dir}/${reference_name} -evalue 1e-3 -out ${sampleid}_blastn_reference_vs_clustering_tmp.txt \
-outfmt '6 qseqid sacc length pident mismatch gapopen qstart qend qlen sstart send slen evalue bitscore qcovhsp qcovs' -max_target_seqs 5
echo "qseqid\tsacc\tlength\tpident\tmismatch\tgapopen\tqstart\tqend\tqlen\tsstart\tsend\tslen\tevalue\tbitscore\tqcovhsp\tqcovs" > header
cat header ${sampleid}_blastn_reference_vs_clustering_tmp.txt > clustering/blast_to_ref/${sampleid}_blastn_reference_vs_clustering_assembly.txt
fi
"""
}
process MINIMAP2_REF {
tag "${sampleid}"
label 'setting_2'
containerOptions "${bindOptions}"
input:
tuple val(sampleid), path(sample)
output:
tuple val(sampleid), file("${sampleid}_aln.sam"), emit: aligned_sample
script:
"""
minimap2 -ax map-ont --MD --sam-hit-only ${reference_dir}/${reference_name} ${sample} > ${sampleid}_aln.sam
"""
}
process SAMTOOLS {
publishDir "${params.outdir}/${sampleid}/mapping", mode: 'copy'
tag "${sampleid}"
label 'setting_2'
input:
tuple val(sampleid), path(sample)
output:
path "${sampleid}_aln.sorted.bam"
path "${sampleid}_aln.sorted.bam.bai"
path "${sampleid}_coverage.txt"
path "${sampleid}_histogram"
tuple val(sampleid), path("${sampleid}_aln.sorted.bam"), path("${sampleid}_aln.sorted.bam.bai"), emit: sorted_sample
script:
"""
samtools view -Sb ${sample} | samtools sort -o ${sampleid}_aln.sorted.bam
samtools index ${sampleid}_aln.sorted.bam
samtools coverage ${sampleid}_aln.sorted.bam > ${sampleid}_histogram.txt > ${sampleid}_coverage.txt
samtools coverage -A -w 50 ${sampleid}_aln.sorted.bam > ${sampleid}_histogram
"""
}
/*
process BAMCOVERAGE {
publishDir "${params.outdir}/${sampleid}/mapping", mode: 'link'
tag "${sampleid}"
label 'setting_11'
input:
tuple val(sampleid), path(bam), path(bai)
output:
path "${sampleid}.bw"
script:
"""
bamCoverage -b ${bam} -o ${sampleid}.bw
"""
}
process BAMCOVERAGE {
publishDir "${params.outdir}/${sampleid}/mapping", mode: 'link'
tag "${sampleid}"
label 'setting_11'
input:
tuple val(sampleid), path(bam), path(bai)
output:
path "${sampleid}.bamcov.txt"
script:
"""
bamcov -H ${bam} > ${sampleid}.bamcov.txt
"""
}
*/
process PORECHOP_ABI {
tag "${sampleid}"
publishDir "$params.outdir/${sampleid}/preprocessing/porechop", mode: 'link', pattern: '*_porechop.log'
label "setting_8"
containerOptions "${bindOptions}"
input:
tuple val(sampleid), path(sample)
output:
file("${sampleid}_porechop_trimmed.fastq.gz")
file("${sampleid}_porechop.log")
tuple val(sampleid), file("${sampleid}_porechop_trimmed.fastq.gz"), emit: porechopabi_trimmed_fq
script:
def porechop_options = (params.porechop_options) ? " ${params.porechop_options}" : ''
"""
if [[ ${params.porechop_custom_primers} == true ]]; then
porechop_abi -i ${sample} -t ${task.cpus} -o ${sampleid}_porechop_trimmed.fastq.gz --custom_adapters ${projectDir}/bin/adapters.txt ${porechop_options} > ${sampleid}_porechop.log
else
porechop_abi -i ${sample} -t ${task.cpus} -o ${sampleid}_porechop_trimmed.fastq.gz ${porechop_options} > ${sampleid}_porechop.log
fi
"""
}
//trims fastq read names after the first whitespace
process REFORMAT {
tag "${sampleid}"
label "setting_3"
publishDir "$params.outdir/${sampleid}/preprocessing", mode: 'copy'
input:
tuple val(sampleid), path(fastq)
output:
tuple val(sampleid), path("${sampleid}_preprocessed.fastq.gz"), emit: reformatted_fq
script:
"""
reformat.sh in=${fastq} out=${sampleid}_preprocessed.fastq.gz trd qin=33
"""
}
process CAP3 {
tag "${sampleid}"
label "setting_3"
time "3h"
publishDir "$params.outdir/$sampleid/clustering/cap3", mode: 'copy', pattern: '*_clustering.fasta'
publishDir "$params.outdir/$sampleid/clustering/rattle", mode: 'copy', pattern: '*_rattle.fasta'
input:
tuple val(sampleid), path(fasta)
output:
file("${sampleid}_rattle.fasta")
tuple val(sampleid), path("${sampleid}_clustering.fasta"), emit: scaffolds
script:
"""
cap3 ${fasta}
cat ${fasta}.cap.singlets ${fasta}.cap.contigs > ${sampleid}_clustering.fasta
cp ${fasta} ${sampleid}_rattle.fasta
"""
}
process EXTRACT_VIRAL_BLAST_HITS {
tag "${sampleid}"
label "setting_2"
publishDir "$params.outdir/$sampleid", mode: 'copy', pattern: '*/*/*txt'
containerOptions "${bindOptions}"
input:
tuple val(sampleid), path(blast_results)
output:
file "*/*/${sampleid}*_blastn_top_hits.txt"
file "*/*/${sampleid}*_blastn_top_viral_hits.txt"
file "*/*/${sampleid}*_blastn_top_viral_spp_hits.txt"
file "*/*/${sampleid}*_queryid_list_with_viral_match.txt"
file "*/*/${sampleid}*_viral_spp_abundance.txt"
script:
"""
if [[ ${blast_results} == *clustering_blastn.bls ]] ;
then
mkdir -p clustering/blastn
cat ${blast_results} > clustering/blastn/${sampleid}_blastn.txt
cd clustering/blastn
select_top_blast_hit.py --sample_name ${sampleid} --blastn_results ${sampleid}_blastn.txt --analysis_method clustering --mode ${params.blast_mode}
elif [[ ${blast_results} == *assembly*_blastn.bls ]] ;
then
mkdir -p assembly/blastn
cat ${blast_results} > assembly/blastn/${sampleid}_blastn.txt
cd assembly/blastn
select_top_blast_hit.py --sample_name ${sampleid} --blastn_results ${sampleid}_blastn.txt --analysis_method assembly --mode ${params.blast_mode}
fi
"""
}
process EXTRACT_VIRAL_BLAST_SPLIT_HITS {
tag "${sampleid}"
label "setting_2"
publishDir "$params.outdir/$sampleid/read_classification", mode: 'copy', pattern: 'homology_search/*txt'
containerOptions "${bindOptions}"
input:
tuple val(sampleid), path(blast_results)
output:
file "*/${sampleid}*_blastn_top_hits.txt"
file "*/${sampleid}*_blastn_top_viral_hits.txt"
file "*/${sampleid}*_blastn_top_viral_spp_hits.txt"
file "*/${sampleid}*_queryid_list_with_viral_match.txt"
file "*/${sampleid}*_viral_spp_abundance.txt"
script:
"""
mkdir homology_search
cat ${blast_results} > homology_search/${sampleid}_blastn.txt
cd homology_search
select_top_blast_hit.py --sample_name ${sampleid} --blastn_results ${sampleid}_blastn.txt --analysis_method read_classification --mode ${params.blast_mode}
"""
}
process CONCATENATE_FASTA {
tag "${sampleid}"
label "setting_2"
publishDir "${params.outdir}/${sampleid}", mode: 'link'
input:
tuple val(sampleid), path("${sampleid}_canu_assembly.fasta")
tuple val(sampleid), path("${sampleid}_cap3.fasta")
tuple val(sampleid), path("${sampleid}.fasta")
output:
file "${sampleid}_merged.fasta"
tuple val(sampleid), path("*_merged.fasta"), emit: assembly
script:
"""
seqtk seq -l0 ${sampleid}_canu_assembly.fasta > ${sampleid}_canu_assembly_1l.fasta
seqtk seq -l0 ${sampleid}_cap3.fasta > ${sampleid}_cap3_1l.fasta
seqtk seq -l0 ${sampleid}.fasta > ${sampleid}_1l.fasta
cat ${sampleid}_canu_assembly_1l.fasta ${sampleid}_cap3_1l.fasta ${sampleid}.fasta > ${sampleid}_merged.fasta
"""
}
/*
KAIJU notes
The default run mode is Greedy with three allowed mismatches.
The number of allowed mismatches can be changed using option -e.
In Greedy mode, matches are filtered by a minimum length and score and their E-value (similar to blastp)
The cutoffs for minimum required match length and match score can be changed using the options -m (default: 11) and -s (default: 65)
Minimum E-value can be adjusted with the option -E (default 0.01).
For fastest classification, , use MEM mode with option '-a mem' and multiple parallel threads (-z)
Greedy run mode yields a higher sensitivity compared with MEM mode.
For lowest memory usage use the proGenomes reference database. The number of parallel threads has only little impact on memory usage.
Further, the choice of the minimum required match length (-m) in MEM mode or match score (-s) in Greedy mode governs the trade-off between
sensitivity and precision of the classification. Please refer to the paper for a discussion on this topic.
Option -x enables filtering of query sequences containing low-complexity regions by using the SEG algorithm from the blast+ package.
It is enabled by default and can be disabled by the -X option. SEG filtering is always recommended in order to avoid
false positive taxon assignments that are caused by spurious matches due to simple repeat patterns or other sequencing noise.
The accuracy of the classification depends both on the choice of the reference database and the chosen options when running Kaiju.
These choices also affect the speed and memory usage of Kaiju.
For highest sensitivity, it is recommended to use the nr database (+eukaryotes) as a reference database because it is the most comprehensive
set of protein sequences. Alternatively, use proGenomes over Refseq for increased sensitivity.
NOTE, viroid will not be detected using this approach
Mandatory arguments:
-t FILENAME Name of nodes.dmp file
-f FILENAME Name of database (.fmi) file
-i FILENAME Name of input file containing reads in FASTA or FASTQ format
Optional arguments:
-j FILENAME Name of second input file for paired-end reads
-o FILENAME Name of output file. If not specified, output will be printed to STDOUT
-z INT Number of parallel threads for classification (default: 1)
-a STRING Run mode, either "mem" or "greedy" (default: greedy)
-e INT Number of mismatches allowed in Greedy mode (default: 3) #set to 1 in kodoja
-m INT Minimum match length (default: 11) #set to 15 in kodoja
-s INT Minimum match score in Greedy mode (default: 65) #set to 85 in Kodoja
-E FLOAT Minimum E-value in Greedy mode
-x Enable SEG low complexity filter (enabled by default)
-X Disable SEG low complexity filter
-p Input sequences are protein sequences
-v Enable verbose output
*/
process KAIJU {
publishDir "${params.outdir}/${sampleid}/read_classification/kaiju", mode: 'copy'
label 'setting_4'
containerOptions "${bindOptions}"
input:
tuple val(sampleid), path(fastq)
output:
tuple val(sampleid), path('*kaiju.krona'), emit: results
file "${sampleid}_kaiju_name.tsv"
file "${sampleid}_kaiju_summary*.tsv"
file "${sampleid}_kaiju.krona"
script:
"""
c1grep() { grep "\$@" || test \$? = 1; }
kaiju \
-z ${params.kaiju_threads} \
-t ${params.kaiju_nodes} \
-f ${params.kaiju_dbname} \
-o ${sampleid}_kaiju.tsv \
-i ${fastq} \
-v
kaiju-addTaxonNames -t ${params.kaiju_nodes} -n ${params.kaiju_names} -i ${sampleid}_kaiju.tsv -o ${sampleid}_kaiju_name.tsv
kaiju2table -e -t ${params.kaiju_nodes} -r species -n ${params.kaiju_names} -o ${sampleid}_kaiju_summary.tsv ${sampleid}_kaiju.tsv
kaiju2krona -t ${params.kaiju_nodes} -n ${params.kaiju_names} -i ${sampleid}_kaiju.tsv -o ${sampleid}_kaiju.krona
c1grep "taxon_id\\|virus\\|viroid\\|viricota\\|viridae\\|viriform\\|virales\\|virinae\\|viricetes\\|virae\\|viral" ${sampleid}_kaiju_summary.tsv > ${sampleid}_kaiju_summary_viral.tsv
awk -F'\\t' '\$2>=0.05' ${sampleid}_kaiju_summary_viral.tsv > ${sampleid}_kaiju_summary_viral_filtered.tsv
"""
}
process KRONA {
publishDir "${params.outdir}/${sampleid}/read_classification/kaiju", mode: 'link'
label 'setting_3'
containerOptions "${bindOptions}"
input:
tuple val(sampleid), path(krona_input)
output:
file "${sampleid}_krona.html"
script:
"""
ktImportText -o ${sampleid}_krona.html ${krona_input}
"""
}
/*
KRAKEN2
https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown
If you get the Loading database information...classify: Error reading in hash table error allocate more memory to the job. Keep incrementing the memory request until you see a status message in the job log like
Usage: kraken2 [options] <filename(s)>
Options:
--db NAME Name for Kraken 2 DB
(default: none)
--threads NUM Number of threads (default: 1)
--quick Quick operation (use first hit or hits)
--unclassified-out FILENAME
Print unclassified sequences to filename
--classified-out FILENAME
Print classified sequences to filename
--output FILENAME Print output to filename (default: stdout); "-" will
suppress normal output
--confidence FLOAT Confidence score threshold (default: 0.0); must be
in [0, 1].
--minimum-base-quality NUM
Minimum base quality used in classification (def: 0,
only effective with FASTQ input).
--report FILENAME Print a report with aggregrate counts/clade to file
--use-mpa-style With --report, format report output like Kraken 1's
kraken-mpa-report
--report-zero-counts With --report, report counts for ALL taxa, even if
counts are zero
--report-minimizer-data With --report, report minimizer and distinct minimizer
count information in addition to normal Kraken report
--memory-mapping Avoids loading database into RAM
--paired The filenames provided have paired-end reads
--use-names Print scientific names instead of just taxids
--gzip-compressed Input files are compressed with gzip
--bzip2-compressed Input files are compressed with bzip2
--minimum-hit-groups NUM
Minimum number of hit groups (overlapping k-mers
sharing the same minimizer) needed to make a call
(default: 3)
--help Print this message
--version Print version information
32 min using 4 cpus and 164 Gb of mem, test with 2 cpus instead
*/
process KRAKEN2 {
tag "${sampleid}"
label 'setting_5'
publishDir "$params.outdir/$sampleid/read_classification/kraken", mode: 'copy'
containerOptions "${bindOptions}"
input:
tuple val(sampleid), path(fastq)
output:
file("${sampleid}.kraken2")
file("${sampleid}_kraken_report.txt")
file("${sampleid}_unclassified.fastq")
tuple val(sampleid), path("${sampleid}_kraken_report.txt"), emit: results
script:
"""
kraken2 --db ${params.krkdb} \
--use-names \
--threads ${task.cpus} \
--report ${sampleid}_kraken_report.txt \
--report-minimizer-data \
--minimum-hit-groups 3 \
--unclassified-out ${sampleid}_unclassified.fastq \
${fastq} > ${sampleid}.kraken2
"""
}
process BRACKEN {
tag "${sampleid}"
label 'setting_2'
publishDir "$params.outdir/$sampleid/read_classification/kraken", mode: 'link'
containerOptions "${bindOptions}"
input:
tuple val(sampleid), path(kraken_report)
output:
file("${sampleid}_bracken_report*.txt")
script:
"""
c1grep() { grep "\$@" || test \$? = 1; }
est_abundance.py -i ${kraken_report} \
-k ${params.krkdb}/database50mers.kmer_distrib \
-t 1 \
-l S -o ${sampleid}_bracken_report.txt
c1grep "taxonomy_id\\|virus\\|viroid" ${sampleid}_bracken_report.txt > ${sampleid}_bracken_report_viral.txt
awk -F'\\t' '\$7>=0.0001' ${sampleid}_bracken_report_viral.txt > ${sampleid}_bracken_report_viral_filtered.txt
"""
}
process RATTLE {
publishDir "${params.outdir}/${sampleid}/clustering/rattle", mode: 'copy', pattern: 'transcriptome.fq'
tag "${sampleid}"
label 'setting_7'
containerOptions "${bindOptions}"
input:
tuple val(sampleid), path(fastq)
output:
file("transcriptome.fq")
tuple val(sampleid), path("transcriptome.fq"), emit: clusters
tuple val(sampleid), path(fastq), path("transcriptome.fq"), emit: clusters2
def rattle_polishing_options = (params.rattle_polishing_options) ? " ${params.rattle_polishing_options}" : ''
def rattle_clustering_options = (params.rattle_clustering_options) ? " ${params.rattle_clustering_options}" : ''
script:
"""
rattle cluster -i ${fastq} -t ${task.cpus} ${rattle_clustering_options} -o . --rna
rattle cluster_summary -i ${fastq} -c clusters.out > ${sampleid}_cluster_summary.txt
mkdir clusters
rattle extract_clusters -i ${fastq} -c clusters.out -l ${sampleid} -o clusters --fastq
rattle correct -i ${fastq} -c clusters.out -t ${task.cpus} -l ${sampleid}
rattle polish -i consensi.fq -t ${task.cpus} --summary ${rattle_polishing_options}
"""
}
include { MINIMAP2_ALIGN_RNA } from './modules.nf'
include { EXTRACT_READS as EXTRACT_READS_STEP1 } from './modules.nf'
include { EXTRACT_READS as EXTRACT_READS_STEP2 } from './modules.nf'
include { EXTRACT_READS as EXTRACT_READS_STEP3 } from './modules.nf'
include { MINIMAP2_ALIGN_DNA as MAP_BACK_TO_CONTIGS } from './modules.nf'
include { MINIMAP2_ALIGN_DNA as MAP_BACK_TO_CLUSTERS } from './modules.nf'
include { FASTQ2FASTA } from './modules.nf'
include { FASTQ2FASTA as FASTQ2FASTA_STEP1} from './modules.nf'
include { FASTQ2FASTA as FASTQ2FASTA_STEP2} from './modules.nf'
include { NANOPLOT as QC_PRE_DATA_PROCESSING } from './modules.nf'
include { NANOPLOT as QC_POST_DATA_PROCESSING } from './modules.nf'
include { BLASTN as READ_CLASSIFICATION_BLASTN } from './modules.nf'
include { BLASTN as ASSEMBLY_BLASTN } from './modules.nf'
include { BLASTN as CLUSTERING_BLASTN } from './modules.nf'
workflow {
if (params.samplesheet) {
Channel
.fromPath(params.samplesheet, checkIfExists: true)
.splitCsv(header:true)
.map{ row-> tuple((row.sampleid), file(row.sample_files)) }
.set{ ch_sample }
} else { exit 1, "Input samplesheet file not specified!" }
if ( params.analysis_mode == 'clustering' | params.analysis_mode == 'denovo_assembly' | (params.analysis_mode == 'read_classification' & params.megablast)) {
if (!params.blast_vs_ref) {
if ( params.blastn_db == null) {
error("Please provide the path to a blast database using the parameter --blastn_db.")
}
}
else if (params.blast_vs_ref ) {
if ( params.reference == null) {
error("Please provide the path to a reference fasta file with the parameter --reference.")
}
}
}
else if ( params.analysis_mode == 'map2ref' ) {
if ( params.reference == null) {
error("Please provide the path to a reference fasta file with the parameter --reference.")
}
}
if (params.merge) {
MERGE ( ch_sample )
QC_PRE_DATA_PROCESSING ( MERGE.out.merged )
fq = MERGE.out.merged
}
else {
fq = ch_sample
QC_PRE_DATA_PROCESSING ( fq )
}
// Data pre-processing
// Remove adapters uisng either PORECHOP_ABI or CUTADAPT
if (!params.qc_only) {
if (params.adapter_trimming) {
PORECHOP_ABI ( fq )
trimmed_fq = PORECHOP_ABI.out.porechopabi_trimmed_fq
}
else {
trimmed_fq = fq
}
// Quality filtering of reads using chopper
if (params.qual_filt) {
CHOPPER ( trimmed_fq)
filtered_fq = CHOPPER.out.chopper_filtered_fq
}
else { filtered_fq = trimmed_fq
}
REFORMAT( filtered_fq )
if ( params.qual_filt & params.adapter_trimming | !params.qual_filt & params.adapter_trimming | params.qual_filt & !params.adapter_trimming) {
QC_POST_DATA_PROCESSING ( filtered_fq )
}
if (params.host_filtering) {
if ( params.host_fasta == null) {
error("Please provide the path to a fasta file of host sequences that need to be filtered with the parameter --host_fasta.")
}
MINIMAP2_ALIGN_RNA ( REFORMAT.out.reformatted_fq, params.host_fasta )
EXTRACT_READS_STEP1 ( MINIMAP2_ALIGN_RNA.out.sequencing_ids )
final_fq = EXTRACT_READS_STEP1.out.unaligned_fq
}
else {
final_fq = REFORMAT.out.reformatted_fq
}
if ( ( params.qual_filt | params.adapter_trimming ) & params.host_filtering) {
ch_multiqc_files = Channel.empty()
ch_multiqc_files = ch_multiqc_files.mix(QC_PRE_DATA_PROCESSING.out.read_counts.collect().ifEmpty([]))
ch_multiqc_files = ch_multiqc_files.mix(EXTRACT_READS_STEP1.out.read_counts.collect().ifEmpty([]))
ch_multiqc_files = ch_multiqc_files.mix(QC_POST_DATA_PROCESSING.out.read_counts.collect().ifEmpty([]))
QCREPORT(ch_multiqc_files.collect())
}
else if ( params.host_filtering & !params.adapter_trimming & !params.qual_filt ) {
ch_multiqc_files = Channel.empty()
ch_multiqc_files = ch_multiqc_files.mix(QC_PRE_DATA_PROCESSING.out.read_counts.collect().ifEmpty([]))
ch_multiqc_files = ch_multiqc_files.mix(EXTRACT_READS_STEP1.out.read_counts.collect().ifEmpty([]))
QCREPORT(ch_multiqc_files.collect())
}
else if ( ( params.qual_filt | params.adapter_trimming ) & !params.host_filtering) {
ch_multiqc_files = Channel.empty()
ch_multiqc_files = ch_multiqc_files.mix(QC_PRE_DATA_PROCESSING.out.read_counts.collect().ifEmpty([]))
ch_multiqc_files = ch_multiqc_files.mix(QC_POST_DATA_PROCESSING.out.read_counts.collect().ifEmpty([]))
QCREPORT(ch_multiqc_files.collect())
}
/*
if (params.hybrid) {
if (params.canu) {
CANU( filtered_fq )
MAP_BACK_TO_ASSEMBLY ( CANU.out.assembly )
EXTRACT_READS_STEP2( MAP_BACK_TO_ASSEMBLY.out.sequencing_ids )
RATTLE ( EXTRACT_READS_STEP2.out.unaligned_fq )
FASTQ2FASTA_STEP1( RATTLE.out.clusters )
CAP3( FASTQ2FASTA_STEP1.out.fasta )
MAP_BACK_TO_CLUSTERS ( RATTLE.out.clusters2 )
EXTRACT_READS_STEP3 ( MAP_BACK_TO_CLUSTERS.out.sequencing_ids )
FASTQ2FASTA_STEP2( EXTRACT_READS_STEP3.out.unaligned_fq )
CONCATENATE_FASTA(CANU.out.assembly2, CAP3.out.contigs, FASTQ2FASTA_STEP2.out.fasta)
BLASTN_SPLIT( CONCATENATE_FASTA.out.assembly).splitFasta(by: 25000, file: true)
BLASTN_SPLIT.out.blast_results
.groupTuple()
.set { ch_blastresults }
EXTRACT_VIRAL_BLAST_HITS( ch_blastresults )
}
else if (params.flye) {
FLYE( filtered_fq )
MAP_BACK_TO_ASSEMBLY ( FLYE.out.assembly )
EXTRACT_READS_STEP2( MAP_BACK_TO_ASSEMBLY.out.sequencing_ids )
FASTQ2FASTA_STEP1( RATTLE.out.clusters )
CAP3( FASTQ2FASTA_STEP1.out.fasta )
BLASTN_SPLIT( FLYE.out.assembly.mix(CAP3.out.contigs).collect().splitFasta(by: 25000, file: true) )
}
}
*/
//perform clustering using Rattle
if ( params.analysis_mode == 'clustering' ) {
RATTLE ( final_fq )
FASTQ2FASTA( RATTLE.out.clusters )
CAP3( FASTQ2FASTA.out.fasta )
contigs = CAP3.out.scaffolds
if (params.blast_vs_ref) {
BLASTN2REF ( contigs )
}
else {
CLUSTERING_BLASTN ( contigs )
EXTRACT_VIRAL_BLAST_HITS ( CLUSTERING_BLASTN.out.blast_results )
}
}
//perform de novo assembly using either canu or flye
else if ( params.analysis_mode == 'denovo_assembly' ) {
if (params.canu) {
CANU ( final_fq )
contigs = CANU.out.assembly2
}
else if (params.flye) {
FLYE ( final_fq )
contigs = FLYE.out.assembly2
}
if (params.final_primer_check) {
CUTADAPT ( contigs )
contigs = CUTADAPT.out.trimmed
}
//limit blast homology search to a reference
if (params.blast_vs_ref) {
BLASTN2REF ( contigs )
}
//blast against a database
//else {
// BLASTN( contigs.splitFasta(by: 5000, file: true) )
// BLASTN.out.blast_results
// .groupTuple()
// .set { ch_blastresults }
else {
ASSEMBLY_BLASTN ( contigs )
EXTRACT_VIRAL_BLAST_HITS ( ASSEMBLY_BLASTN.out.blast_results )
}
}
else if ( params.analysis_mode == 'read_classification') {
//just perform direct read search
if (params.megablast) {
FASTQ2FASTA_STEP1( final_fq )
READ_CLASSIFICATION_BLASTN( FASTQ2FASTA_STEP1.out.fasta.splitFasta(by: 10000, file: true) )
READ_CLASSIFICATION_BLASTN.out.blast_results
.groupTuple()
.set { ch_blastresults }
EXTRACT_VIRAL_BLAST_SPLIT_HITS( ch_blastresults )
}
if (params.kaiju) {
KAIJU ( final_fq )
KRONA ( KAIJU.out.results)
}
if (params.kraken2) {
KRAKEN2 ( final_fq )
BRACKEN ( KRAKEN2.out.results )
}
}
//just perform direct alignment
else if ( params.analysis_mode == 'map2ref') {
MINIMAP2_REF ( final_fq )
SAMTOOLS ( MINIMAP2_REF.out.aligned_sample )
}
else {
error("Analysis mode (read_classification, clustering, denovo_assembly, map2ref) not specified with e.g. '--analysis_mode clustering' or via a detectable config file.") }
}
}