From c4b1f62444a04415423e1a051072cc1003e2d005 Mon Sep 17 00:00:00 2001 From: Simon Grimm <simonleandergrimm@gmail.com> Date: Mon, 28 Oct 2024 15:00:00 +0000 Subject: [PATCH 01/47] Updated subsetReads to optionally take in single reads. Added PROFILE back to run_dev_se.nf --- modules/local/subsetReads/main.nf | 35 +++++++++++++++++++++++++++++- subworkflows/local/profile/main.nf | 7 ++++-- workflows/run_dev_se.nf | 6 +++++ 3 files changed, 45 insertions(+), 3 deletions(-) diff --git a/modules/local/subsetReads/main.nf b/modules/local/subsetReads/main.nf index 6a16ef8d..6ef64936 100644 --- a/modules/local/subsetReads/main.nf +++ b/modules/local/subsetReads/main.nf @@ -52,7 +52,7 @@ process SUBSET_READS_PAIRED_MERGED { ''' } -// Subsample reads with seqtk with an autocomputed read fraction +// Subsample reads with seqtk with an autocomputed read fraction (paired-end) process SUBSET_READS_PAIRED_TARGET { label "seqtk" label "single" @@ -88,3 +88,36 @@ process SUBSET_READS_PAIRED_TARGET { echo "Output reads: $(zcat ${out1} | wc -l | awk '{ print $1/4 }')" ''' } + +// Subsample reads with seqtk with an autocomputed read fraction (single-end) +process SUBSET_READS_SINGLE_TARGET { + label "seqtk" + label "single" + input: + tuple val(sample), path(reads) + val readTarget + output: + tuple val(sample), path("${sample}_subset.${params.suffix}.gz") + shell: + ''' + # Define input/output + in=!{reads} + out=!{sample}_subset.!{params.suffix}.gz + # Count reads and compute target fraction + n_reads=$(zcat ${in} | wc -l | awk '{ print $1/4 }') + echo "Input reads: ${n_reads}" + echo "Target reads: !{readTarget}" + if (( ${n_reads} <= !{readTarget} )); then + echo "Target larger than input; returning all reads." + cp ${in} ${out} + else + frac=$(awk -v a=${n_reads} -v b=!{readTarget} 'BEGIN {result = b/a; print (result > 1) ? 1.0 : result}') + echo "Read fraction for subsetting: ${frac}" + # Carry out subsetting + seed=${RANDOM} + seqtk sample -s ${seed} ${in} ${frac} | gzip -c > ${out} + fi + # Count reads for validation + echo "Output reads: $(zcat ${out} | wc -l | awk '{ print $1/4 }')" + ''' +} diff --git a/subworkflows/local/profile/main.nf b/subworkflows/local/profile/main.nf index a27c7bc5..e9e8d878 100644 --- a/subworkflows/local/profile/main.nf +++ b/subworkflows/local/profile/main.nf @@ -6,7 +6,10 @@ | MODULES AND SUBWORKFLOWS | ***************************/ -include { SUBSET_READS_PAIRED_TARGET } from "../../../modules/local/subsetReads" addParams(suffix: "fastq") +include { + SUBSET_READS_${params.read_type == 'paired_end' ? 
'PAIRED' : 'SINGLE'}_TARGET as SUBSET_READS_TARGET +} from "../../../modules/local/subsetReads" addParams(suffix: "fastq") + include { BBDUK } from "../../../modules/local/bbduk" addParams(suffix: params.bbduk_suffix) include { TAXONOMY as TAXONOMY_RIBO } from "../../../subworkflows/local/taxonomy" addParams(dedup_rc: false, classification_level: "D", read_fraction: 1, kraken_memory: "${params.kraken_memory}") include { TAXONOMY as TAXONOMY_NORIBO } from "../../../subworkflows/local/taxonomy" addParams(dedup_rc: false, classification_level: "D", read_fraction: 1, kraken_memory: "${params.kraken_memory}") @@ -24,7 +27,7 @@ workflow PROFILE { ref_dir main: // Randomly subset reads to target number - subset_ch = SUBSET_READS_PAIRED_TARGET(reads_ch, n_reads) + subset_ch = SUBSET_READS_TARGET(reads_ch, n_reads) // Separate ribosomal reads ribo_path = "${ref_dir}/results/ribo-ref-concat.fasta.gz" ribo_ch = BBDUK(subset_ch, ribo_path, params.min_kmer_fraction, params.k) diff --git a/workflows/run_dev_se.nf b/workflows/run_dev_se.nf index e368cc06..52ba8eca 100644 --- a/workflows/run_dev_se.nf +++ b/workflows/run_dev_se.nf @@ -41,6 +41,9 @@ workflow RUN_DEV_SE { RAW(samplesheet, params.n_reads_trunc) CLEAN(RAW.out.reads, params.adapters) + // Taxonomic profiling + PROFILE(CLEAN.out.reads, kraken_db_path, params.n_reads_profile, params.ref_dir) + // Process output qc_ch = RAW.out.qc.concat(CLEAN.out.qc) PROCESS_OUTPUT(qc_ch) @@ -65,4 +68,7 @@ workflow RUN_DEV_SE { PROCESS_OUTPUT.out.adapt >> "results/qc" PROCESS_OUTPUT.out.qbase >> "results/qc" PROCESS_OUTPUT.out.qseqs >> "results/qc" + + PROFILE.out.bracken >> "results/taxonomy" + PROFILE.out.kraken >> "results/taxonomy" } \ No newline at end of file From 193dd22f896c51f0cf6bf1b6725a5195f8bbf88f Mon Sep 17 00:00:00 2001 From: Simon Grimm <simonleandergrimm@gmail.com> Date: Mon, 28 Oct 2024 15:59:00 +0000 Subject: [PATCH 02/47] Added BBDUK single read script. 
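
The string-interpolated process name introduced in PATCH 01
(SUBSET_READS_${...}_TARGET) is not something Nextflow can resolve:
include names must be static identifiers known at parse time. The diff
below therefore switches to top-level conditional includes, aliasing each
read-type-specific process to a common name so the workflow body stays
unchanged. A minimal sketch of the pattern, using this repo's paths:

    if (params.read_type == "single_end") {
        include { BBDUK_SINGLE as BBDUK } from "../../../modules/local/bbduk"
    } else if (params.read_type == "paired_end") {
        include { BBDUK_PAIRED as BBDUK } from "../../../modules/local/bbduk"
    }
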
--- modules/local/bbduk/main.nf | 30 +++++++++++++++++++++++++++++- subworkflows/local/profile/main.nf | 14 ++++++++++---- 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/modules/local/bbduk/main.nf b/modules/local/bbduk/main.nf index 413a2953..c0d78130 100644 --- a/modules/local/bbduk/main.nf +++ b/modules/local/bbduk/main.nf @@ -1,5 +1,5 @@ // Detection and removal of contaminant reads -process BBDUK { +process BBDUK_PAIRED { label "large" label "BBTools" input: @@ -30,6 +30,34 @@ process BBDUK { ''' } +process BBDUK_SINGLE { + label "large" + label "BBTools" + input: + tuple val(sample), path(reads) + path(contaminant_ref) + val(min_kmer_fraction) + val(k) + output: + tuple val(sample), path("${sample}_${params.suffix}_bbduk_pass.fastq.gz"), emit: reads + tuple val(sample), path("${sample}_${params.suffix}_bbduk_fail.fastq.gz"), emit: fail + tuple val(sample), path("${sample}_${params.suffix}_bbduk.stats.txt"), emit: log + shell: + ''' + # Define input/output + in=!{reads} + op=!{sample}_!{params.suffix}_bbduk_pass.fastq.gz + of=!{sample}_!{params.suffix}_bbduk_fail.fastq.gz + stats=!{sample}_!{params.suffix}_bbduk.stats.txt + ref=!{contaminant_ref} + io="in=${in} ref=${ref} out=${op} outm=${of} stats=${stats}" + # Define parameters + par="minkmerfraction=!{min_kmer_fraction} k=!{k} t=!{task.cpus} -Xmx30g" + # Execute + bbduk.sh ${io} ${par} + ''' +} + // Detection and removal of contaminant reads (use minkmerhits instead of minkmerfraction) process BBDUK_HITS { label "large" diff --git a/subworkflows/local/profile/main.nf b/subworkflows/local/profile/main.nf index e9e8d878..2483e79f 100644 --- a/subworkflows/local/profile/main.nf +++ b/subworkflows/local/profile/main.nf @@ -6,11 +6,17 @@ | MODULES AND SUBWORKFLOWS | ***************************/ -include { - SUBSET_READS_${params.read_type == 'paired_end' ? 
'PAIRED' : 'SINGLE'}_TARGET as SUBSET_READS_TARGET
-} from "../../../modules/local/subsetReads" addParams(suffix: "fastq")
-include { BBDUK } from "../../../modules/local/bbduk" addParams(suffix: params.bbduk_suffix)
+if (params.read_type == "single_end") {
+    include { SUBSET_READS_SINGLE_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads" addParams(suffix: "fastq")
+} else if (params.read_type == "paired_end") {
+    include { SUBSET_READS_PAIRED_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads" addParams(suffix: "fastq")
+}
+if (params.read_type == "single_end") {
+    include { BBDUK_SINGLE as BBDUK } from "../../../modules/local/bbduk" addParams(suffix: params.bbduk_suffix)
+} else if (params.read_type == "paired_end") {
+    include { BBDUK_PAIRED as BBDUK } from "../../../modules/local/bbduk" addParams(suffix: params.bbduk_suffix)
+}
 include { TAXONOMY as TAXONOMY_RIBO } from "../../../subworkflows/local/taxonomy" addParams(dedup_rc: false, classification_level: "D", read_fraction: 1, kraken_memory: "${params.kraken_memory}")
 include { TAXONOMY as TAXONOMY_NORIBO } from "../../../subworkflows/local/taxonomy" addParams(dedup_rc: false, classification_level: "D", read_fraction: 1, kraken_memory: "${params.kraken_memory}")

From 0e081f2ff1370e8288daaecbddbcc15446b06e1d Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Mon, 28 Oct 2024 18:16:13 +0000
Subject: [PATCH 03/47] Added subsetting for single reads.

---
 modules/local/subsetReads/main.nf | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/modules/local/subsetReads/main.nf b/modules/local/subsetReads/main.nf
index 6ef64936..4b766768 100644
--- a/modules/local/subsetReads/main.nf
+++ b/modules/local/subsetReads/main.nf
@@ -25,6 +25,30 @@ process SUBSET_READS_PAIRED {
     '''
 }
 
+// Subsample reads with seqtk (single-end)
+process SUBSET_READS_SINGLE {
+    label "seqtk"
+    label "single"
+    input:
+        tuple val(sample), path(reads)
+        val readFraction
+    output:
+        tuple val(sample), path("${sample}_subset.${params.suffix}.gz")
+    shell:
+        '''
+        # Define input/output
+        in=!{reads}
+        out=!{sample}_subset.!{params.suffix}.gz
+        # Count reads for validation
+        echo "Input reads: $(zcat ${in} | wc -l | awk '{ print $1/4 }')"
+        # Carry out subsetting
+        seed=${RANDOM}
+        seqtk sample -s ${seed} ${in} !{readFraction} | gzip -c > ${out}
+        # Count reads for validation
+        echo "Output reads: $(zcat ${out} | wc -l | awk '{ print $1/4 }')"
+        '''
+}
+
 // Subsample reads with seqtk (no sample name)
 process SUBSET_READS_PAIRED_MERGED {
     label "seqtk"

From 4e140aa65b7053cb85573aa7c50f06ccae2b29b5 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Mon, 28 Oct 2024 18:16:27 +0000
Subject: [PATCH 04/47] Added selection of single or paired end reads.
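
Same conditional-include pattern as in profile/main.nf, applied to the
hv_screen subworkflow. For reference, the single-end subsetting process
added in PATCH 03 reduces to the following shell steps (file name and
read fraction are illustrative):

    # Count input reads (FASTQ stores each read on 4 lines)
    zcat sample.fastq.gz | wc -l | awk '{ print $1/4 }'
    # Randomly subsample a fraction of the reads; -s fixes the seed, so
    # rerunning with the same seed reproduces the same subset
    seqtk sample -s ${RANDOM} sample.fastq.gz 0.1 | gzip -c > sample_subset.fastq.gz
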
--- subworkflows/local/hv_screen/main.nf | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/subworkflows/local/hv_screen/main.nf b/subworkflows/local/hv_screen/main.nf index 0edb7489..a4b77a43 100644 --- a/subworkflows/local/hv_screen/main.nf +++ b/subworkflows/local/hv_screen/main.nf @@ -11,7 +11,12 @@ if (params.read_type == "single_end") { } else if (params.read_type == "paired_end") { include { FASTP_PAIRED as FASTP } from "../../../modules/local/fastp" } -include { BBDUK } from "../../../modules/local/bbduk" addParams(suffix: params.bbduk_suffix) + +if (params.read_type == "single_end") { + include { BBDUK_SINGLE as BBDUK } from "../../../modules/local/bbduk" addParams(suffix: params.bbduk_suffix) +} else if (params.read_type == "paired_end") { + include { BBDUK_PAIRED as BBDUK } from "../../../modules/local/bbduk" addParams(suffix: params.bbduk_suffix) +} include { PROCESS_BOWTIE2_SAM_PAIRED } from "../../../modules/local/processBowtie2Sam" /*********** From d270d3f091890cb56adadf431d0380f99c8ff1a1 Mon Sep 17 00:00:00 2001 From: Simon Grimm <simonleandergrimm@gmail.com> Date: Mon, 28 Oct 2024 18:17:04 +0000 Subject: [PATCH 05/47] WIP edits to taxonomy main scripts (commenting, subselecting processes, if clauses in the workflow) --- subworkflows/local/taxonomy/main.nf | 89 +++++++++++++++++++---------- 1 file changed, 58 insertions(+), 31 deletions(-) diff --git a/subworkflows/local/taxonomy/main.nf b/subworkflows/local/taxonomy/main.nf index 6a8a7a7d..47022585 100644 --- a/subworkflows/local/taxonomy/main.nf +++ b/subworkflows/local/taxonomy/main.nf @@ -6,14 +6,18 @@ | MODULES AND SUBWORKFLOWS | ***************************/ -include { SUBSET_READS_PAIRED } from "../../../modules/local/subsetReads" addParams(suffix: "fastq") -include { BBMERGE } from "../../../modules/local/bbmerge" -include { SUMMARIZE_BBMERGE } from "../../../modules/local/summarizeBBMerge" -include { SUMMARIZE_DEDUP } from "../../../modules/local/summarizeDedup" -include { CLUMPIFY_PAIRED } from "../../../modules/local/clumpify" -include { JOIN_FASTQ } from "../../../modules/local/joinFastq" -include { CLUMPIFY_SINGLE } from "../../../modules/local/clumpify" -include { KRAKEN } from "../../../modules/local/kraken" addParams(mem: "${params.kraken_memory}") +if (params.read_type == "paired_end") { + include { SUBSET_READS_PAIRED as SUBSET_READS } from "../../../modules/local/subsetReads" addParams(suffix: "fastq") +} else if (params.read_type == "single_end") { + include { SUBSET_READS_SINGLE as SUBSET_READS } from "../../../modules/local/subsetReads" addParams(suffix: "fastq") +} +include { BBMERGE } from "../../../modules/local/bbmerge" // probalby skippable in single read version +include { SUMMARIZE_BBMERGE } from "../../../modules/local/summarizeBBMerge" // probalby skippable in single read version +include { SUMMARIZE_DEDUP } from "../../../modules/local/summarizeDedup" // probalby no change needed in single read version +include { CLUMPIFY_PAIRED } from "../../../modules/local/clumpify" // already has a single read version. +include { JOIN_FASTQ } from "../../../modules/local/joinFastq" // probably not needed in single read version +include { CLUMPIFY_SINGLE } from "../../../modules/local/clumpify" // already has a single read version. 
+include { KRAKEN } from "../../../modules/local/kraken" addParams(mem: "${params.kraken_memory}") // probs not changes needed include { LABEL_KRAKEN_REPORTS } from "../../../modules/local/labelKrakenReports" include { MERGE_TSVS as MERGE_KRAKEN_REPORTS } from "../../../modules/local/mergeTsvs" addParams(name: "kraken_reports") include { MERGE_TSVS as MERGE_BRACKEN } from "../../../modules/local/mergeTsvs" addParams(name: "bracken_reports") @@ -33,26 +37,38 @@ workflow TAXONOMY { if ( params.read_fraction == 1 ){ subset_ch = reads_ch } else { - subset_ch = SUBSET_READS_PAIRED(reads_ch, params.read_fraction) + subset_ch = SUBSET_READS(reads_ch, params.read_fraction) } - // Deduplicate reads (if applicable) - if ( params.dedup_rc ){ - paired_dedup_ch = CLUMPIFY_PAIRED(subset_ch) - } else { - paired_dedup_ch = subset_ch - } - // Prepare reads - merged_ch = BBMERGE(paired_dedup_ch) - // Only want to summarize the merged elements - summarize_bbmerge_ch = SUMMARIZE_BBMERGE(merged_ch.reads.map{sample, files -> [sample, files[0]]}) - joined_ch = JOIN_FASTQ(merged_ch.reads) - // Deduplicate reads (if applicable) - if ( params.dedup_rc ){ - dedup_ch = CLUMPIFY_SINGLE(joined_ch) - } else { - dedup_ch = joined_ch + if (params.read_type == "paired_end") { + // Deduplicate reads (if applicable) + if ( params.dedup_rc ){ + paired_dedup_ch = CLUMPIFY_PAIRED(subset_ch) + } else { + paired_dedup_ch = subset_ch + } + // Prepare reads + merged_ch = BBMERGE(paired_dedup_ch) + // Only want to summarize the merged elements + summarize_bbmerge_ch = SUMMARIZE_BBMERGE(merged_ch.reads.map{sample, files -> [sample, files[0]]}) + joined_ch = JOIN_FASTQ(merged_ch.reads) + // Deduplicate reads (if applicable) + if ( params.dedup_rc ){ + dedup_ch = CLUMPIFY_SINGLE(joined_ch) + } else { + dedup_ch = joined_ch + } + + } else if (params.read_type == "single_end") { + // Deduplicate reads (if applicable) + + if (params.dedup_rc) { + dedup_ch = CLUMPIFY_SINGLE(subset_ch) + } else { + dedup_ch = subset_ch + } } + // Summarize last of the output summarize_dedup_ch = SUMMARIZE_DEDUP(dedup_ch) @@ -64,10 +80,21 @@ workflow TAXONOMY { bracken_ch = BRACKEN(kraken_ch.report, kraken_db_ch, params.classification_level) bracken_label_ch = LABEL_BRACKEN_REPORTS(bracken_ch) bracken_merge_ch = MERGE_BRACKEN(bracken_label_ch.collect().ifEmpty([])) - emit: - kraken_output = kraken_ch.output - kraken_reports = kraken_merge_ch - bracken = bracken_merge_ch - bbmerge_summary = summarize_bbmerge_ch - dedup_summary = summarize_dedup_ch + + if (params.read_type == "paired_end") { + emit: + kraken_output = kraken_ch.output + kraken_reports = kraken_merge_ch + bracken = bracken_merge_ch + bbmerge_summary = summarize_bbmerge_ch + dedup_summary = summarize_dedup_ch + } + + else if (params.read_type == "single_end") { + emit: + kraken_output = kraken_ch.output + kraken_reports = kraken_merge_ch + bracken = bracken_merge_ch + dedup_summary = summarize_dedup_ch + } } From bad278746f103966c972b7b8c55556026e586dc9 Mon Sep 17 00:00:00 2001 From: Simon Grimm <simonleandergrimm@gmail.com> Date: Mon, 28 Oct 2024 18:17:28 +0000 Subject: [PATCH 06/47] Adding kraken path to run_dev_se.nf --- workflows/run_dev_se.nf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/workflows/run_dev_se.nf b/workflows/run_dev_se.nf index 52ba8eca..7dd99e88 100644 --- a/workflows/run_dev_se.nf +++ b/workflows/run_dev_se.nf @@ -38,6 +38,9 @@ workflow RUN_DEV_SE { .splitCsv(header: true) .map{row -> tuple(row.sample, file(row.fastq_1), file(row.fastq_2))} } + // Prepare Kraken DB + 
kraken_db_path = "${params.ref_dir}/results/kraken_db"
+// Preprocessing
     RAW(samplesheet, params.n_reads_trunc)
     CLEAN(RAW.out.reads, params.adapters)

From 79f922219b44b2a1a7d1719601af69465a55b84b Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Mon, 28 Oct 2024 18:18:03 +0000
Subject: [PATCH 07/47] Adding WIP changes to test/nextflow.config to run on Simon's S3.

---
 test/nextflow.config | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/nextflow.config b/test/nextflow.config
index 39150408..a63b3600 100644
--- a/test/nextflow.config
+++ b/test/nextflow.config
@@ -6,7 +6,7 @@ params {
     mode = "run"
 
     // Directories
-    base_dir = "s3://nao-mgs-wb/test-batch" // Parent for working and output directories (can be S3)
+    base_dir = "s3://nao-mgs-simon/test" // Parent for working and output directories (can be S3)
     ref_dir = "s3://nao-mgs-wb/index-20240714/output" // Reference/index directory (generated by index workflow)
 
     // Files
@@ -28,4 +28,4 @@ includeConfig "${projectDir}/configs/logging.config"
 includeConfig "${projectDir}/configs/containers.config"
 includeConfig "${projectDir}/configs/resources.config"
 includeConfig "${projectDir}/configs/profiles.config"
-process.queue = "will-batch-queue" // AWS Batch job queue
+process.queue = "simon-batch-queue" // AWS Batch job queue

From 50adde14a7c517eec541891ec8bc1c224c668283 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Thu, 7 Nov 2024 19:14:11 +0000
Subject: [PATCH 08/47] fixed wrong indent in run_dev_se.nf

---
 workflows/run_dev_se.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflows/run_dev_se.nf b/workflows/run_dev_se.nf
index 7dd99e88..c2690db7 100644
--- a/workflows/run_dev_se.nf
+++ b/workflows/run_dev_se.nf
@@ -40,7 +40,7 @@ workflow RUN_DEV_SE {
     }
     // Prepare Kraken DB
     kraken_db_path = "${params.ref_dir}/results/kraken_db"
-// Preprocessing
+    // Preprocessing
     RAW(samplesheet, params.n_reads_trunc)
     CLEAN(RAW.out.reads, params.adapters)

From daa6c3ebb0ed62e1b073151e897bf2be5788f45a Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Thu, 7 Nov 2024 19:16:33 +0000
Subject: [PATCH 09/47] Slight edit to inline comments in taxonomy/main.nf. Reverted emit statement to original version.
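
A workflow can declare only one emit: block, so the read-type-conditional
emit blocks from PATCH 05 are collapsed back into a single one. The
single-end branch instead satisfies the shared output interface by
supplying an empty channel for the merge summary it never produces,
roughly:

    if (params.read_type == "single_end") {
        // No BBMerge step for single-end reads
        summarize_bbmerge_ch = Channel.empty()
    }
    // (deduplication and Kraken steps unchanged)
    emit:
    bbmerge_summary = summarize_bbmerge_ch

Downstream consumers then see an empty bbmerge_summary channel on
single-end runs rather than a missing output.
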
--- subworkflows/local/taxonomy/main.nf | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/subworkflows/local/taxonomy/main.nf b/subworkflows/local/taxonomy/main.nf index 47022585..897c2a6b 100644 --- a/subworkflows/local/taxonomy/main.nf +++ b/subworkflows/local/taxonomy/main.nf @@ -60,8 +60,9 @@ workflow TAXONOMY { } } else if (params.read_type == "single_end") { + // No merging in single read version + summarize_bbmerge_ch = Channel.empty() // Deduplicate reads (if applicable) - if (params.dedup_rc) { dedup_ch = CLUMPIFY_SINGLE(subset_ch) } else { @@ -81,20 +82,13 @@ workflow TAXONOMY { bracken_label_ch = LABEL_BRACKEN_REPORTS(bracken_ch) bracken_merge_ch = MERGE_BRACKEN(bracken_label_ch.collect().ifEmpty([])) - if (params.read_type == "paired_end") { - emit: - kraken_output = kraken_ch.output - kraken_reports = kraken_merge_ch - bracken = bracken_merge_ch - bbmerge_summary = summarize_bbmerge_ch - dedup_summary = summarize_dedup_ch - } - else if (params.read_type == "single_end") { - emit: - kraken_output = kraken_ch.output - kraken_reports = kraken_merge_ch - bracken = bracken_merge_ch - dedup_summary = summarize_dedup_ch - } + emit: + kraken_output = kraken_ch.output + kraken_reports = kraken_merge_ch + bracken = bracken_merge_ch + bbmerge_summary = summarize_bbmerge_ch + dedup_summary = summarize_dedup_ch + + } From 39c797c9ce0c9d2644ec48a01c4802528799efbb Mon Sep 17 00:00:00 2001 From: Simon Grimm <simonleandergrimm@gmail.com> Date: Thu, 7 Nov 2024 19:28:07 +0000 Subject: [PATCH 10/47] Revert "Adding WIP changes to test/nextflow.config to run on Simon's S3." This reverts commit 79f922219b44b2a1a7d1719601af69465a55b84b. --- test/nextflow.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/nextflow.config b/test/nextflow.config index a63b3600..39150408 100644 --- a/test/nextflow.config +++ b/test/nextflow.config @@ -6,7 +6,7 @@ params { mode = "run" // Directories - base_dir = "s3://nao-mgs-simon/test" // Parent for working and output directories (can be S3) + base_dir = "s3://nao-mgs-wb/test-batch" // Parent for working and output directories (can be S3) ref_dir = "s3://nao-mgs-wb/index-20240714/output" // Reference/index directory (generated by index workflow) // Files @@ -28,4 +28,4 @@ includeConfig "${projectDir}/configs/logging.config" includeConfig "${projectDir}/configs/containers.config" includeConfig "${projectDir}/configs/resources.config" includeConfig "${projectDir}/configs/profiles.config" -process.queue = "simon-batch-queue" // AWS Batch job queue +process.queue = "will-batch-queue" // AWS Batch job queue From 2da99f1a21a7347deb934fa7b251ac769c998886 Mon Sep 17 00:00:00 2001 From: Simon Grimm <simonleandergrimm@gmail.com> Date: Thu, 7 Nov 2024 19:29:58 +0000 Subject: [PATCH 11/47] removed white space in two files --- subworkflows/local/profile/main.nf | 1 - subworkflows/local/taxonomy/main.nf | 1 - 2 files changed, 2 deletions(-) diff --git a/subworkflows/local/profile/main.nf b/subworkflows/local/profile/main.nf index 2483e79f..24906dbd 100644 --- a/subworkflows/local/profile/main.nf +++ b/subworkflows/local/profile/main.nf @@ -6,7 +6,6 @@ | MODULES AND SUBWORKFLOWS | ***************************/ - if (params.read_type == "single_end") { include { SUBSET_READS_SINGLE_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads" addParams(suffix: "fastq") } else if (params.read_type == "paired_end") { diff --git a/subworkflows/local/taxonomy/main.nf 
b/subworkflows/local/taxonomy/main.nf index 897c2a6b..c330e494 100644 --- a/subworkflows/local/taxonomy/main.nf +++ b/subworkflows/local/taxonomy/main.nf @@ -82,7 +82,6 @@ workflow TAXONOMY { bracken_label_ch = LABEL_BRACKEN_REPORTS(bracken_ch) bracken_merge_ch = MERGE_BRACKEN(bracken_label_ch.collect().ifEmpty([])) - emit: kraken_output = kraken_ch.output kraken_reports = kraken_merge_ch From 1d301159811013bca33bd2e64cc938f7257477b2 Mon Sep 17 00:00:00 2001 From: Simon Grimm <simonleandergrimm@gmail.com> Date: Thu, 7 Nov 2024 19:39:51 +0000 Subject: [PATCH 12/47] Removed unneeded comments in taxonomy/main.nf --- subworkflows/local/taxonomy/main.nf | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/subworkflows/local/taxonomy/main.nf b/subworkflows/local/taxonomy/main.nf index c330e494..2e5dcaeb 100644 --- a/subworkflows/local/taxonomy/main.nf +++ b/subworkflows/local/taxonomy/main.nf @@ -11,13 +11,13 @@ if (params.read_type == "paired_end") { } else if (params.read_type == "single_end") { include { SUBSET_READS_SINGLE as SUBSET_READS } from "../../../modules/local/subsetReads" addParams(suffix: "fastq") } -include { BBMERGE } from "../../../modules/local/bbmerge" // probalby skippable in single read version -include { SUMMARIZE_BBMERGE } from "../../../modules/local/summarizeBBMerge" // probalby skippable in single read version -include { SUMMARIZE_DEDUP } from "../../../modules/local/summarizeDedup" // probalby no change needed in single read version -include { CLUMPIFY_PAIRED } from "../../../modules/local/clumpify" // already has a single read version. -include { JOIN_FASTQ } from "../../../modules/local/joinFastq" // probably not needed in single read version -include { CLUMPIFY_SINGLE } from "../../../modules/local/clumpify" // already has a single read version. 
-include { KRAKEN } from "../../../modules/local/kraken" addParams(mem: "${params.kraken_memory}") // probs not changes needed +include { BBMERGE } from "../../../modules/local/bbmerge" +include { SUMMARIZE_BBMERGE } from "../../../modules/local/summarizeBBMerge" +include { SUMMARIZE_DEDUP } from "../../../modules/local/summarizeDedup" +include { CLUMPIFY_PAIRED } from "../../../modules/local/clumpify" +include { JOIN_FASTQ } from "../../../modules/local/joinFastq" +include { CLUMPIFY_SINGLE } from "../../../modules/local/clumpify" +include { KRAKEN } from "../../../modules/local/kraken" addParams(mem: "${params.kraken_memory}") include { LABEL_KRAKEN_REPORTS } from "../../../modules/local/labelKrakenReports" include { MERGE_TSVS as MERGE_KRAKEN_REPORTS } from "../../../modules/local/mergeTsvs" addParams(name: "kraken_reports") include { MERGE_TSVS as MERGE_BRACKEN } from "../../../modules/local/mergeTsvs" addParams(name: "bracken_reports") From d4b7239ad858e1e5ed848aecc50193ab883add31 Mon Sep 17 00:00:00 2001 From: Simon Grimm <simonleandergrimm@gmail.com> Date: Wed, 20 Nov 2024 19:43:06 +0000 Subject: [PATCH 13/47] dropped params.* --- modules/local/bbduk/main.nf | 13 +++++++------ modules/local/subsetReads/main.nf | 10 ++++++---- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/modules/local/bbduk/main.nf b/modules/local/bbduk/main.nf index de2c012b..804e4423 100644 --- a/modules/local/bbduk/main.nf +++ b/modules/local/bbduk/main.nf @@ -39,17 +39,18 @@ process BBDUK_SINGLE { path(contaminant_ref) val(min_kmer_fraction) val(k) + val(suffix) output: - tuple val(sample), path("${sample}_${params.suffix}_bbduk_pass.fastq.gz"), emit: reads - tuple val(sample), path("${sample}_${params.suffix}_bbduk_fail.fastq.gz"), emit: fail - tuple val(sample), path("${sample}_${params.suffix}_bbduk.stats.txt"), emit: log + tuple val(sample), path("${sample}_${suffix}_bbduk_pass.fastq.gz"), emit: reads + tuple val(sample), path("${sample}_${suffix}_bbduk_fail.fastq.gz"), emit: fail + tuple val(sample), path("${sample}_${suffix}_bbduk.stats.txt"), emit: log shell: ''' # Define input/output in=!{reads} - op=!{sample}_!{params.suffix}_bbduk_pass.fastq.gz - of=!{sample}_!{params.suffix}_bbduk_fail.fastq.gz - stats=!{sample}_!{params.suffix}_bbduk.stats.txt + op=!{sample}_!{suffix}_bbduk_pass.fastq.gz + of=!{sample}_!{suffix}_bbduk_fail.fastq.gz + stats=!{sample}_!{suffix}_bbduk.stats.txt ref=!{contaminant_ref} io="in=${in} ref=${ref} out=${op} outm=${of} stats=${stats}" # Define parameters diff --git a/modules/local/subsetReads/main.nf b/modules/local/subsetReads/main.nf index 5fd33d28..91b8f53d 100644 --- a/modules/local/subsetReads/main.nf +++ b/modules/local/subsetReads/main.nf @@ -33,13 +33,14 @@ process SUBSET_READS_SINGLE { input: tuple val(sample), path(reads) val readFraction + val suffix output: - tuple val(sample), path("${sample}_subset.${params.suffix}.gz") + tuple val(sample), path("${sample}_subset.${suffix}.gz") shell: ''' # Define input/output in=!{reads} - out=!{sample}_subset.!{params.suffix}.gz + out=!{sample}_subset.!{suffix}.gz # Count reads for validation echo "Input reads: $(zcat ${in} | wc -l | awk '{ print $1/4 }')" # Carry out subsetting @@ -123,13 +124,14 @@ process SUBSET_READS_SINGLE_TARGET { input: tuple val(sample), path(reads) val readTarget + val suffix output: - tuple val(sample), path("${sample}_subset.${params.suffix}.gz") + tuple val(sample), path("${sample}_subset.${suffix}.gz") shell: ''' # Define input/output in=!{reads} - 
out=!{sample}_subset.!{params.suffix}.gz + out=!{sample}_subset.!{suffix}.gz # Count reads and compute target fraction n_reads=$(zcat ${in} | wc -l | awk '{ print $1/4 }') echo "Input reads: ${n_reads}" From 487c6c50585b6eb274bdd14c08edf5d7754dda01 Mon Sep 17 00:00:00 2001 From: Simon Grimm <simonleandergrimm@gmail.com> Date: Wed, 20 Nov 2024 19:43:24 +0000 Subject: [PATCH 14/47] added selection of correct concat_group process in extractViralReads --- subworkflows/local/extractViralReads/main.nf | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/subworkflows/local/extractViralReads/main.nf b/subworkflows/local/extractViralReads/main.nf index 0b4f5b6a..81997075 100644 --- a/subworkflows/local/extractViralReads/main.nf +++ b/subworkflows/local/extractViralReads/main.nf @@ -27,7 +27,11 @@ include { COLLAPSE_VIRUS_READS } from "../../../modules/local/collapseVirusReads include { ADD_FRAG_DUP_TO_VIRUS_READS } from "../../../modules/local/addFragDupToVirusReads" include { MAKE_VIRUS_READS_FASTA } from "../../../modules/local/makeVirusReadsFasta" include { COUNT_VIRUS_CLADES } from "../../../modules/local/countVirusClades" -include { CONCAT_GROUP } from "../../../modules/local/concatGroup" +if (params.single_end) { + include { CONCAT_GROUP_SINGLE as CONCAT_GROUP } from "../../../modules/local/concatGroup" +} else { + include { CONCAT_GROUP_PAIRED as CONCAT_GROUP } from "../../../modules/local/concatGroup" +} /*********** | WORKFLOW | From dd71ea80c75b448d1ac99e222c2f90c096225ad3 Mon Sep 17 00:00:00 2001 From: Simon Grimm <simonleandergrimm@gmail.com> Date: Wed, 20 Nov 2024 19:43:42 +0000 Subject: [PATCH 15/47] added paired paired and single version for concat_group --- modules/local/concatGroup/main.nf | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/modules/local/concatGroup/main.nf b/modules/local/concatGroup/main.nf index 8544327a..cb67d5ef 100644 --- a/modules/local/concatGroup/main.nf +++ b/modules/local/concatGroup/main.nf @@ -1,5 +1,5 @@ // Copy a file to a new location with a custom path -process CONCAT_GROUP { +process CONCAT_GROUP_PAIRED { label "base" label "single" input: @@ -14,3 +14,19 @@ process CONCAT_GROUP { cat ${fastq_2_list.join(' ')} > ${group}_R2.fastq.gz """ } + + +process CONCAT_GROUP_SINGLE { + label "base" + label "single" + input: + tuple val(samples), path(fastq_list), val(group) + + output: + tuple val(group), path("${group}.fastq.gz") + + script: + """ + cat ${fastq_list.join(' ')} > ${group}.fastq.gz + """ +} From 329acb1112c4d881ca4a3bf3fd98a3cd1da57864 Mon Sep 17 00:00:00 2001 From: Simon Grimm <simonleandergrimm@gmail.com> Date: Wed, 20 Nov 2024 19:44:07 +0000 Subject: [PATCH 16/47] Adapted profile to be cleaner and take in changes introduced by Will. 
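
The shape of the grouping logic added below, single-end branch, with the
tuple contents annotated:

    subset_group_ch = group_ch.join(subset_ch, by: 0)       // (sample, group, reads)
        .map { sample, group, reads -> tuple(sample, reads, group) }
        .groupTuple(by: 2)                                  // ([samples], [reads], group)
    // Groups of one sample are already subsetted to the target number
    single_sample_groups = subset_group_ch.filter { it[0].size() == 1 }
        .map { samples, read_list, group -> tuple(group, [read_list[0]]) }

Multi-sample groups are concatenated with CONCAT_GROUP and subsetted
again, then mixed back together with the single-sample groups.
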
--- subworkflows/local/profile/main.nf | 56 +++++++++++++++++------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/subworkflows/local/profile/main.nf b/subworkflows/local/profile/main.nf index 1126a58a..b81799e3 100644 --- a/subworkflows/local/profile/main.nf +++ b/subworkflows/local/profile/main.nf @@ -6,27 +6,22 @@ | MODULES AND SUBWORKFLOWS | ***************************/ -if (params.read_type == "single_end") { - include { SUBSET_READS_SINGLE_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads" addParams(suffix: "fastq") -} else if (params.read_type == "paired_end") { - include { SUBSET_READS_PAIRED_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads" addParams(suffix: "fastq") +if (params.single_end) { + include { SUBSET_READS_SINGLE_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads" + include { BBDUK_SINGLE as BBDUK } from "../../../modules/local/bbduk" + include { CONCAT_GROUP_SINGLE as CONCAT_GROUP } from "../../../modules/local/concatGroup" + include { SUBSET_READS_SINGLE_TARGET; SUBSET_READS_SINGLE_TARGET as SUBSET_READS_TARGET_GROUP } from "../../../modules/local/subsetReads" +} else { + include { SUBSET_READS_PAIRED_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads" + include { SUBSET_READS_PAIRED_TARGET; SUBSET_READS_PAIRED_TARGET as SUBSET_READS_TARGET_GROUP } from "../../../modules/local/subsetReads" + include { BBDUK_PAIRED as BBDUK } from "../../../modules/local/bbduk" + include { CONCAT_GROUP_PAIRED as CONCAT_GROUP } from "../../../modules/local/concatGroup" } -if (params.read_type == "single_end") { - include { BBDUK_SINGLE as BBDUK } from "../../../modules/local/bbduk" addParams(suffix: params.bbduk_suffix) -} else if (params.read_type == "paired_end") { - include { BBDUK_PAIRED as BBDUK } from "../../../modules/local/bbduk" addParams(suffix: params.bbduk_suffix) -} -include { TAXONOMY as TAXONOMY_RIBO } from "../../../subworkflows/local/taxonomy" addParams(dedup_rc: false, classification_level: "D", read_fraction: 1, kraken_memory: "${params.kraken_memory}") -include { TAXONOMY as TAXONOMY_NORIBO } from "../../../subworkflows/local/taxonomy" addParams(dedup_rc: false, classification_level: "D", read_fraction: 1, kraken_memory: "${params.kraken_memory}") - -// FIX ABOVE -include { SUBSET_READS_PAIRED_TARGET; SUBSET_READS_PAIRED_TARGET as SUBSET_READS_PAIRED_TARGET_GROUP } from "../../../modules/local/subsetReads" -include { BBDUK } from "../../../modules/local/bbduk" +include { BBDUK_HITS } from "../../../modules/local/bbduk" include { TAXONOMY as TAXONOMY_RIBO } from "../../../subworkflows/local/taxonomy" include { TAXONOMY as TAXONOMY_NORIBO } from "../../../subworkflows/local/taxonomy" include { MERGE_TAXONOMY_RIBO } from "../../../modules/local/mergeTaxonomyRibo" -include { CONCAT_GROUP } from "../../../modules/local/concatGroup" /**************** | MAIN WORKFLOW | @@ -44,24 +39,34 @@ workflow PROFILE { bbduk_suffix kraken_memory grouping + single_end main: // Randomly subset reads to target number - subset_ch = SUBSET_READS_TARGET(reads_ch, n_reads) // DROP - subset_ch = SUBSET_READS_PAIRED_TARGET(reads_ch, n_reads, "fastq") + subset_ch = SUBSET_READS_TARGET(reads_ch, n_reads, "fastq") + if (grouping){ // Join samplesheet with trimmed_reads and update fastq files - subset_group_ch = group_ch.join(subset_ch, by: 0) - .map { sample, group, reads -> tuple(sample, reads[0], reads[1], group) } - .groupTuple(by: 3) + if (single_end) { + subset_group_ch = 
group_ch.join(subset_ch, by: 0) + .map { sample, group, reads -> tuple(sample, reads, group) } + .groupTuple(by: 2) + // Single-sample groups are already subsetted to target number + single_sample_groups = subset_group_ch.filter { it[0].size() == 1 } + .map { samples, read_list, group -> tuple(group, [read_list[0]]) } + + } else { + subset_group_ch = group_ch.join(subset_ch, by: 0) + .map { sample, group, reads -> tuple(sample, reads[0], reads[1], group) } + .groupTuple(by: 3) + single_sample_groups = subset_group_ch.filter { it[0].size() == 1 } + .map { samples, fwd_list, rev_list, group -> tuple(group, [fwd_list[0], rev_list[0]]) } + } // Split into multi-sample groups, these need to be subsetted to target number multi_sample_groups = subset_group_ch.filter { it[0].size() > 1 } - // These are already subsetted to target number - single_sample_groups = subset_group_ch.filter { it[0].size() == 1 } - .map { samples, fwd_list, rev_list, group -> tuple(group, [fwd_list[0], rev_list[0]]) } // Concatenate multi-sample groups grouped_samples = CONCAT_GROUP(multi_sample_groups) // Randomly subset multi-sample groups to target number - subset_grouped_ch = SUBSET_READS_PAIRED_TARGET_GROUP(grouped_samples, n_reads, "fastq") + subset_grouped_ch = SUBSET_READS_TARGET_GROUP(grouped_samples, n_reads, "fastq") // Mix with subsetted multi-sample group with already subsetted single-sample groups grouped_ch = subset_grouped_ch.mix(single_sample_groups) } else { @@ -79,6 +84,7 @@ workflow PROFILE { br_ribo = tax_ribo_ch.bracken.collectFile(name: "bracken_reports_ribo.tsv.gz") br_noribo = tax_noribo_ch.bracken.collectFile(name: "bracken_reports_noribo.tsv.gz") merge_ch = MERGE_TAXONOMY_RIBO(kr_ribo, kr_noribo, br_ribo, br_noribo) + emit: bracken = merge_ch.bracken kraken = merge_ch.kraken From ea74362aa9fea5ec147ff6c4fda79b4f445914da Mon Sep 17 00:00:00 2001 From: Simon Grimm <simonleandergrimm@gmail.com> Date: Wed, 20 Nov 2024 19:44:34 +0000 Subject: [PATCH 17/47] Cleaned up taxonomy and took into account changes of v2.5.0 --- subworkflows/local/taxonomy/main.nf | 69 +++++++++-------------------- 1 file changed, 22 insertions(+), 47 deletions(-) diff --git a/subworkflows/local/taxonomy/main.nf b/subworkflows/local/taxonomy/main.nf index a1da46c8..acfe39c2 100644 --- a/subworkflows/local/taxonomy/main.nf +++ b/subworkflows/local/taxonomy/main.nf @@ -6,20 +6,17 @@ | MODULES AND SUBWORKFLOWS | ***************************/ -if (params.read_type == "paired_end") { - include { SUBSET_READS_PAIRED as SUBSET_READS } from "../../../modules/local/subsetReads" addParams(suffix: "fastq") -} else if (params.read_type == "single_end") { - include { SUBSET_READS_SINGLE as SUBSET_READS } from "../../../modules/local/subsetReads" addParams(suffix: "fastq") -} - -// fix above +if (params.single_end) { + include { SUBSET_READS_SINGLE as SUBSET_READS } from "../../../modules/local/subsetReads" +} else { + include { SUBSET_READS_PAIRED as SUBSET_READS } from "../../../modules/local/subsetReads" //addParams(suffix: "fastq") + include { JOIN_FASTQ } from "../../../modules/local/joinFastq" + include { BBMERGE } from "../../../modules/local/bbmerge" + include { SUMMARIZE_BBMERGE } from "../../../modules/local/summarizeBBMerge" -include { SUBSET_READS_PAIRED } from "../../../modules/local/subsetReads" -include { BBMERGE } from "../../../modules/local/bbmerge" -include { SUMMARIZE_BBMERGE } from "../../../modules/local/summarizeBBMerge" +} include { SUMMARIZE_DEDUP } from "../../../modules/local/summarizeDedup" include { 
CLUMPIFY_PAIRED } from "../../../modules/local/clumpify" -include { JOIN_FASTQ } from "../../../modules/local/joinFastq" include { CLUMPIFY_SINGLE } from "../../../modules/local/clumpify" include { KRAKEN } from "../../../modules/local/kraken" include { LABEL_KRAKEN_REPORTS } from "../../../modules/local/labelKrakenReports" @@ -45,13 +42,23 @@ workflow TAXONOMY { if ( read_fraction == 1 ){ subset_ch = reads_ch } else { - // OLD VERSION below subset_ch = SUBSET_READS(reads_ch, params.read_fraction) } - if (params.read_type == "paired_end") { + if (params.single_end) { + // No merging in single read version + summarize_bbmerge_ch = Channel.empty() // Deduplicate reads (if applicable) - if ( params.dedup_rc ){ + if (params.dedup_rc) { + dedup_ch = CLUMPIFY_SINGLE(subset_ch) + } else { + dedup_ch = subset_ch + } + // No merging in single read version + summarize_bbmerge_ch = Channel.empty() + } else { + // Deduplicate reads (if applicable) + if ( dedup_rc ){ paired_dedup_ch = CLUMPIFY_PAIRED(subset_ch) } else { paired_dedup_ch = subset_ch @@ -62,44 +69,12 @@ workflow TAXONOMY { summarize_bbmerge_ch = SUMMARIZE_BBMERGE(merged_ch.reads.map{sample, files -> [sample, files[0]]}) joined_ch = JOIN_FASTQ(merged_ch.reads) // Deduplicate reads (if applicable) - if ( params.dedup_rc ){ + if ( dedup_rc ){ dedup_ch = CLUMPIFY_SINGLE(joined_ch) } else { dedup_ch = joined_ch } - - } else if (params.read_type == "single_end") { - // No merging in single read version - summarize_bbmerge_ch = Channel.empty() - // Deduplicate reads (if applicable) - if (params.dedup_rc) { - dedup_ch = CLUMPIFY_SINGLE(subset_ch) - } else { - dedup_ch = subset_ch - } -// NEW VERSION below - - subset_ch = SUBSET_READS_PAIRED(reads_ch, read_fraction, "fastq") - } - - // Deduplicate reads (if applicable) - if ( dedup_rc ){ - paired_dedup_ch = CLUMPIFY_PAIRED(subset_ch) - } else { - paired_dedup_ch = subset_ch - } - // Prepare reads - merged_ch = BBMERGE(paired_dedup_ch) - // Only want to summarize the merged elements - summarize_bbmerge_ch = SUMMARIZE_BBMERGE(merged_ch.reads.map{sample, files -> [sample, files[0]]}) - joined_ch = JOIN_FASTQ(merged_ch.reads) - // Deduplicate reads (if applicable) - if ( dedup_rc ){ - dedup_ch = CLUMPIFY_SINGLE(joined_ch) - } else { - dedup_ch = joined_ch } -// NEW VERSION // Summarize last of the output summarize_dedup_ch = SUMMARIZE_DEDUP(dedup_ch) From bbced47051a92bbff7fa1e3cc92da46ddcca55e0 Mon Sep 17 00:00:00 2001 From: Simon Grimm <simonleandergrimm@gmail.com> Date: Wed, 20 Nov 2024 19:45:37 +0000 Subject: [PATCH 18/47] Fixed single-end flags in run.nf --- workflows/run.nf | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/workflows/run.nf b/workflows/run.nf index 8c347ec3..a2c7b108 100644 --- a/workflows/run.nf +++ b/workflows/run.nf @@ -28,9 +28,6 @@ workflow RUN { start_time = new Date() start_time_str = start_time.format("YYYY-MM-dd HH:mm:ss z (Z)") - single_end = file(params.sample_sheet).readLines()[0].split(',').contains('fastq_2') ? 
false : true - - // Prepare samplesheet if ( params.grouping ) { samplesheet = Channel @@ -50,8 +47,8 @@ workflow RUN { // Prepare Kraken DB kraken_db_path = "${params.ref_dir}/results/kraken_db" // Preprocessing - RAW(samplesheet_ch, params.n_reads_trunc, "2", "4 GB", "raw_concat", single_end) - CLEAN(RAW.out.reads, params.adapters, "2", "4 GB", "cleaned", single_end) + RAW(samplesheet_ch, params.n_reads_trunc, "2", "4 GB", "raw_concat", params.single_end) + CLEAN(RAW.out.reads, params.adapters, "2", "4 GB", "cleaned", params.single_end) // Extract and count human-viral reads EXTRACT_VIRAL_READS(CLEAN.out.reads, group_ch, params.ref_dir, kraken_db_path, params.bt2_score_threshold, params.adapters, params.host_taxon, "3", "21", "viral", "${params.quality_encoding}", "${params.fuzzy_match_alignment_duplicates}", "${params.kraken_memory}", params.grouping) // Process intermediate output for chimera detection @@ -64,7 +61,7 @@ workflow RUN { BLAST_VIRAL(EXTRACT_VIRAL_READS.out.fasta, blast_db_path, blast_db_prefix, params.blast_viral_fraction, "32", "256 GB", "32 GB") } // Taxonomic profiling - PROFILE(CLEAN.out.reads, group_ch, kraken_db_path, params.n_reads_profile, params.ref_dir, "0.4", "27", "ribo", "${params.kraken_memory}", params.grouping) + PROFILE(CLEAN.out.reads, group_ch, kraken_db_path, params.n_reads_profile, params.ref_dir, "0.4", "27", "ribo", "${params.kraken_memory}", params.grouping, params.single_end) // Process output qc_ch = RAW.out.qc.concat(CLEAN.out.qc) PROCESS_OUTPUT(qc_ch) From 4a1edda332e4b27ab134d632e85b228ce510e2a5 Mon Sep 17 00:00:00 2001 From: Simon Grimm <simonleandergrimm@gmail.com> Date: Wed, 20 Nov 2024 19:46:32 +0000 Subject: [PATCH 19/47] added "params" to the single_end flags to run_dev_se.nf --- workflows/run_dev_se.nf | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/workflows/run_dev_se.nf b/workflows/run_dev_se.nf index 54983633..4ff59e8c 100644 --- a/workflows/run_dev_se.nf +++ b/workflows/run_dev_se.nf @@ -12,6 +12,7 @@ import java.time.LocalDateTime include { RAW } from "../subworkflows/local/raw" include { CLEAN } from "../subworkflows/local/clean" include { PROCESS_OUTPUT } from "../subworkflows/local/processOutput" +include { PROFILE } from "../subworkflows/local/profile" nextflow.preview.output = true /***************** @@ -24,8 +25,6 @@ workflow RUN_DEV_SE { start_time = new Date() start_time_str = start_time.format("YYYY-MM-dd HH:mm:ss z (Z)") - single_end = file(params.sample_sheet).readLines()[0].split(',').contains('fastq_2') ? 
false : true - // Prepare samplesheet if (single_end) { if (params.grouping) { @@ -60,14 +59,16 @@ workflow RUN_DEV_SE { group_ch = Channel.empty() } } + // Prepare Kraken DB + kraken_db_path = "${params.ref_dir}/results/kraken_db" // Preprocessing - RAW(samplesheet_ch, params.n_reads_trunc, "2", "4 GB", "raw_concat", single_end) - CLEAN(RAW.out.reads, params.adapters, "2", "4 GB", "cleaned", single_end) + RAW(samplesheet_ch, params.n_reads_trunc, "2", "4 GB", "raw_concat", params.single_end) + CLEAN(RAW.out.reads, params.adapters, "2", "4 GB", "cleaned", params.single_end) // Taxonomic profiling - PROFILE(CLEAN.out.reads, kraken_db_path, params.n_reads_profile, params.ref_dir) + PROFILE(CLEAN.out.reads, group_ch, kraken_db_path, params.n_reads_profile, params.ref_dir, "0.4", "27", "ribo", "${params.kraken_memory}", params.grouping, params.single_end) // Process output qc_ch = RAW.out.qc.concat(CLEAN.out.qc) @@ -95,4 +96,7 @@ workflow RUN_DEV_SE { PROCESS_OUTPUT.out.adapt >> "results" PROCESS_OUTPUT.out.qbase >> "results" PROCESS_OUTPUT.out.qseqs >> "results" + // Final results + PROFILE.out.bracken >> "results" + PROFILE.out.kraken >> "results" } From 753982d642c9e6f0ac55a10ceaca9d22721bbf12 Mon Sep 17 00:00:00 2001 From: Simon Grimm <simonleandergrimm@gmail.com> Date: Wed, 20 Nov 2024 19:47:29 +0000 Subject: [PATCH 20/47] Fixing run_dev_se.config file --- configs/run_dev_se.config | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/configs/run_dev_se.config b/configs/run_dev_se.config index d271d93f..9c90bb37 100644 --- a/configs/run_dev_se.config +++ b/configs/run_dev_se.config @@ -5,17 +5,22 @@ params { mode = "run_dev_se" - read_type = "single_end" // Directories base_dir = "s3://nao-mgs-simon/test_single_read" // Parent for working and output directories (can be S3) - ref_dir = "s3://nao-mgs-wb/index-20240714/output" // Reference/index directory (generated by index workflow) + ref_dir = "s3://nao-mgs-wb/index-20241113/output" // Reference/index directory (generated by index workflow) // Files sample_sheet = "${launchDir}/samplesheet.csv" // Path to library TSV adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming + // Whether the underlying data is paired-end or single-end + single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? 
false : true + + // println "Single end mode: ${single_end}" + // Numerical + grouping = false // Whether to group samples by 'group' column in samplesheet n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads) n_reads_profile = 1000000 // Number of reads per sample to run through taxonomic profiling bt2_score_threshold = 20 // Normalized score threshold for HV calling (typically 15 or 20) @@ -23,10 +28,12 @@ params { kraken_memory = "128 GB" // Memory needed to safely load Kraken DB quality_encoding = "phred33" // FASTQ quality encoding (probably phred33, maybe phred64) fuzzy_match_alignment_duplicates = 0 // Fuzzy matching the start coordinate of reads for identification of duplicates through alignment (0 = exact matching; options are 0, 1, or 2) + host_taxon = "vertebrate" } includeConfig "${projectDir}/configs/logging.config" includeConfig "${projectDir}/configs/containers.config" includeConfig "${projectDir}/configs/resources.config" includeConfig "${projectDir}/configs/profiles.config" -process.queue = "will-batch-queue" // AWS Batch job queue +includeConfig "${projectDir}/configs/output.config" +process.queue = "simon-batch-queue" // AWS Batch job queue From 5c3cb7333db5c937e558c9a970fc48ee0d43175c Mon Sep 17 00:00:00 2001 From: Simon Grimm <simonleandergrimm@gmail.com> Date: Wed, 20 Nov 2024 19:48:33 +0000 Subject: [PATCH 21/47] updated config in paired end test dataset --- test-paired-end/nextflow.config | 7 +++++-- test-single-read/nextflow.config | 6 +++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/test-paired-end/nextflow.config b/test-paired-end/nextflow.config index f29ccc30..3556d11b 100644 --- a/test-paired-end/nextflow.config +++ b/test-paired-end/nextflow.config @@ -5,7 +5,6 @@ params { mode = "run_dev_se" - // Directories base_dir = "s3://nao-mgs-simon/test_paired_end" // Parent for working and output directories (can be S3) ref_dir = "s3://nao-mgs-wb/index-20241113/output" // Reference/index directory (generated by index workflow) @@ -14,8 +13,12 @@ params { sample_sheet = "${launchDir}/samplesheet.csv" // Path to library TSV adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming + + // Whether the underlying data is paired-end or single-end + single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true + // Numerical - grouping = true // Whether to group samples by 'group' column in samplesheet + grouping = false // Whether to group samples by 'group' column in samplesheet // Numerical n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads) n_reads_profile = 1000000 // Number of reads per sample to run through taxonomic profiling diff --git a/test-single-read/nextflow.config b/test-single-read/nextflow.config index 3efa0181..a8be84d5 100644 --- a/test-single-read/nextflow.config +++ b/test-single-read/nextflow.config @@ -15,10 +15,10 @@ params { adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming // Whether the underlying data is paired-end or single-end - single_end = params.sample_sheet.toString().readLines()[0].contains('fastq_2') ? false : true + single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? 
false : true // Numerical - grouping = true // Whether to group samples by 'group' column in samplesheet + grouping = false // Whether to group samples by 'group' column in samplesheet n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads) n_reads_profile = 1000000 // Number of reads per sample to run through taxonomic profiling bt2_score_threshold = 20 // Normalized score threshold for HV calling (typically 15 or 20) @@ -34,4 +34,4 @@ includeConfig "${projectDir}/configs/containers.config" includeConfig "${projectDir}/configs/resources.config" includeConfig "${projectDir}/configs/profiles.config" includeConfig "${projectDir}/configs/output.config" -process.queue = "simon-batch-queue" // AWS Batch job queue \ No newline at end of file +process.queue = "simon-batch-queue" // AWS Batch job queue From 5c29398b021ca1cf05c0ef1c0e09802c834a921e Mon Sep 17 00:00:00 2001 From: Simon Grimm <simonleandergrimm@gmail.com> Date: Fri, 22 Nov 2024 19:52:54 +0000 Subject: [PATCH 22/47] added params to single end. --- workflows/run_dev_se.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/run_dev_se.nf b/workflows/run_dev_se.nf index 4ff59e8c..d628814e 100644 --- a/workflows/run_dev_se.nf +++ b/workflows/run_dev_se.nf @@ -26,7 +26,7 @@ workflow RUN_DEV_SE { start_time_str = start_time.format("YYYY-MM-dd HH:mm:ss z (Z)") // Prepare samplesheet - if (single_end) { + if (params.single_end) { if (params.grouping) { samplesheet = Channel .fromPath(params.sample_sheet) From 8fa88faa5be2e18d246f9d8396f4d6eaf2cf9c07 Mon Sep 17 00:00:00 2001 From: Simon Grimm <simonleandergrimm@gmail.com> Date: Sat, 30 Nov 2024 17:49:32 +0000 Subject: [PATCH 23/47] Dropped params.* for single_end. --- subworkflows/local/profile/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/profile/main.nf b/subworkflows/local/profile/main.nf index 8636bca0..d97d6eee 100644 --- a/subworkflows/local/profile/main.nf +++ b/subworkflows/local/profile/main.nf @@ -6,7 +6,7 @@ | MODULES AND SUBWORKFLOWS | ***************************/ -if (params.single_end) { +if (single_end) { include { SUBSET_READS_SINGLE_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads" include { BBDUK_SINGLE as BBDUK } from "../../../modules/local/bbduk" include { CONCAT_GROUP_SINGLE as CONCAT_GROUP } from "../../../modules/local/concatGroup" From abe95bf5bc6a5d928a671d8583f474938eaf8269 Mon Sep 17 00:00:00 2001 From: Simon Grimm <simonleandergrimm@gmail.com> Date: Mon, 2 Dec 2024 15:50:36 +0000 Subject: [PATCH 24/47] moved single_end parameter definition to .config files. 
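
Each config now sniffs the samplesheet header to decide the mode. The
ternary is redundant; an equivalent, slightly tighter form would be:

    // true when the samplesheet header has no fastq_2 column
    single_end = !(new File(params.sample_sheet).text.readLines()[0].contains('fastq_2'))

Defining this once in the config lets run.nf, run_dev_se.nf and the
subworkflows read params.single_end instead of recomputing it locally.
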
---
 configs/run.config                           |  4 +++
 configs/run_dev_se.config                    |  1 -
 modules/local/concatGroup/main.nf            |  2 +-
 subworkflows/local/extractViralReads/main.nf |  3 +-
 subworkflows/local/profile/main.nf           |  9 +++--
 subworkflows/local/taxonomy/main.nf          | 13 +++----
 test-data/single-end-samplesheet.csv         |  2 +-
 test-se/nextflow.config                      | 36 ++++++++++++++++++++
 tests/run.config                             |  3 ++
 tests/run_dev_se.config                      |  1 -
 workflows/run.nf                             | 11 +++---
 workflows/run_dev_se.nf                      | 11 +++---
 12 files changed, 68 insertions(+), 28 deletions(-)
 create mode 100644 test-se/nextflow.config

diff --git a/configs/run.config b/configs/run.config
index 98199092..7657476c 100644
--- a/configs/run.config
+++ b/configs/run.config
@@ -13,6 +13,10 @@ params {
     sample_sheet = "${launchDir}/samplesheet.csv" // Path to library TSV
     adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
 
+    // Whether the underlying data is paired-end or single-end
+    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
+
+
     // Numerical
     grouping = false // Whether to group samples by 'group' column in samplesheet
     n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads)
diff --git a/configs/run_dev_se.config b/configs/run_dev_se.config
index a8be84d5..5e8f11cc 100644
--- a/configs/run_dev_se.config
+++ b/configs/run_dev_se.config
@@ -5,7 +5,6 @@
 params {
     mode = "run_dev_se"
 
-
     // Directories
     base_dir = "s3://nao-mgs-simon/test_single_read" // Parent for working and output directories (can be S3)
     ref_dir = "s3://nao-mgs-wb/index-20241113/output" // Reference/index directory (generated by index workflow)
diff --git a/modules/local/concatGroup/main.nf b/modules/local/concatGroup/main.nf
index 54913c46..5bb687f3 100644
--- a/modules/local/concatGroup/main.nf
+++ b/modules/local/concatGroup/main.nf
@@ -1,5 +1,5 @@
 // Copy a file to a new location with a custom path
-process CONCAT_GROUP {
+process CONCAT_GROUP_PAIRED {
     label "coreutils"
     label "single"
     input:
diff --git a/subworkflows/local/extractViralReads/main.nf b/subworkflows/local/extractViralReads/main.nf
index facb4868..6520211e 100644
--- a/subworkflows/local/extractViralReads/main.nf
+++ b/subworkflows/local/extractViralReads/main.nf
@@ -52,6 +52,7 @@ workflow EXTRACT_VIRAL_READS {
         encoding
         fuzzy_match
         grouping
+        single_end
     main:
         // Get reference paths
         viral_genome_path = "${ref_dir}/results/virus-genomes-filtered.fasta.gz"
@@ -94,7 +95,7 @@ workflow EXTRACT_VIRAL_READS {
         human_bbm_ch = BBMAP_HUMAN(other_bt2_ch.reads_unconc, bbm_human_index_path, "human")
         other_bbm_ch = BBMAP_OTHER(human_bbm_ch.reads_unmapped, bbm_other_index_path, "other")
         // Run Kraken on filtered viral candidates
-        tax_ch = TAXONOMY(other_bbm_ch.reads_unmapped, kraken_db_ch, true, "F")
+        tax_ch = TAXONOMY(other_bbm_ch.reads_unmapped, kraken_db_ch, true, "F", single_end)
         // Process Kraken output and merge with Bowtie2 output across samples
         kraken_output_ch = PROCESS_KRAKEN_VIRAL(tax_ch.kraken_output, virus_db_path, host_taxon)
         bowtie2_kraken_merged_ch = MERGE_SAM_KRAKEN(kraken_output_ch.combine(bowtie2_sam_ch, by: 0))
diff --git a/subworkflows/local/profile/main.nf b/subworkflows/local/profile/main.nf
index d97d6eee..13bd2a1b 100644
--- a/subworkflows/local/profile/main.nf
+++ b/subworkflows/local/profile/main.nf
@@ -6,7 +6,8 @@
 | MODULES AND SUBWORKFLOWS |
 ***************************/
 
-if (single_end) {
+
+if (params.single_end) {
     include { SUBSET_READS_SINGLE_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads"
     include { BBDUK_SINGLE as BBDUK } from "../../../modules/local/bbduk"
     include { CONCAT_GROUP_SINGLE as CONCAT_GROUP } from "../../../modules/local/concatGroup"
@@ -40,6 +41,8 @@ workflow PROFILE {
         grouping
         single_end
     main:
+
+
         // Randomly subset reads to target number
         subset_ch = SUBSET_READS_TARGET(reads_ch, n_reads, "fastq")
@@ -75,8 +78,8 @@
         ribo_path = "${ref_dir}/results/ribo-ref-concat.fasta.gz"
         ribo_ch = BBDUK(grouped_ch, ribo_path, min_kmer_fraction, k, bbduk_suffix)
         // Run taxonomic profiling separately on ribo and non-ribo reads
-        tax_ribo_ch = TAXONOMY_RIBO(ribo_ch.fail, kraken_db_ch, false, "D")
-        tax_noribo_ch = TAXONOMY_NORIBO(ribo_ch.reads, kraken_db_ch, false, "D")
+        tax_ribo_ch = TAXONOMY_RIBO(ribo_ch.fail, kraken_db_ch, false, "D", single_end)
+        tax_noribo_ch = TAXONOMY_NORIBO(ribo_ch.reads, kraken_db_ch, false, "D", single_end)
         // Merge ribo and non-ribo outputs
         kr_ribo = tax_ribo_ch.kraken_reports.collectFile(name: "kraken_reports_ribo.tsv.gz")
         kr_noribo = tax_noribo_ch.kraken_reports.collectFile(name: "kraken_reports_noribo.tsv.gz")
diff --git a/subworkflows/local/taxonomy/main.nf b/subworkflows/local/taxonomy/main.nf
index 3b9937c2..ee4416af 100644
--- a/subworkflows/local/taxonomy/main.nf
+++ b/subworkflows/local/taxonomy/main.nf
@@ -35,22 +35,23 @@ workflow TAXONOMY {
         kraken_db_ch
         dedup_rc
         classification_level
+        single_end
     main:
-        if (params.single_end) {
+        if (single_end) {
             // No merging in single read version
             summarize_bbmerge_ch = Channel.empty()
             // Deduplicate reads (if applicable)
-            if (params.dedup_rc) {
-                dedup_ch = CLUMPIFY_SINGLE(subset_ch)
+            if (dedup_rc) {
+                dedup_ch = CLUMPIFY_SINGLE(reads_ch)
             } else {
-                dedup_ch = subset_ch
+                dedup_ch = reads_ch
             }
         } else {
             // Deduplicate reads (if applicable)
             if ( dedup_rc ){
-                paired_dedup_ch = CLUMPIFY_PAIRED(subset_ch)
+                paired_dedup_ch = CLUMPIFY_PAIRED(reads_ch)
             } else {
-                paired_dedup_ch = subset_ch
+                paired_dedup_ch = reads_ch
             }
             // Prepare reads
             merged_ch = BBMERGE(paired_dedup_ch)
diff --git a/test-data/single-end-samplesheet.csv b/test-data/single-end-samplesheet.csv
index 5b6a53f0..f59a9a86 100644
--- a/test-data/single-end-samplesheet.csv
+++ b/test-data/single-end-samplesheet.csv
@@ -1,2 +1,2 @@
 sample,fastq
-230926Esv_D23-14904-1,s3://nao-testing/gold-standard-test/raw/gold_standard_R1.fastq.gz
\ No newline at end of file
+gold_standard,s3://nao-testing/gold-standard-test/raw/gold_standard_R1.fastq.gz
\ No newline at end of file
diff --git a/test-se/nextflow.config b/test-se/nextflow.config
new file mode 100644
index 00000000..345229bc
--- /dev/null
+++ b/test-se/nextflow.config
@@ -0,0 +1,36 @@
+/************************************************
+| CONFIGURATION FILE FOR NAO VIRAL MGS WORKFLOW |
+************************************************/
+
+params {
+    mode = "run_dev_se"
+
+    // Directories
+    base_dir = "s3://nao-mgs-simon/test_single_read" // Parent for working and output directories (can be S3)
+    ref_dir = "s3://nao-testing/index-test/output" // Reference/index directory (generated by index workflow)
+
+    // Files
+    sample_sheet = "${launchDir}/single-end-samplesheet.csv" // Path to library TSV
+    adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
+
+    // Whether the underlying data is paired-end or single-end
+    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
+
+    // Numerical
+    grouping = false // Whether to group samples by 'group' column in samplesheet
+    n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads)
+    n_reads_profile = 1000000 // Number of reads per sample to run through taxonomic profiling
+    bt2_score_threshold = 20 // Normalized score threshold for HV calling (typically 15 or 20)
+    blast_hv_fraction = 0 // Fraction of putative HV reads to BLAST vs nt (0 = don't run BLAST)
+    kraken_memory = "128 GB" // Memory needed to safely load Kraken DB
+    quality_encoding = "phred33" // FASTQ quality encoding (probably phred33, maybe phred64)
+    fuzzy_match_alignment_duplicates = 0 // Fuzzy matching the start coordinate of reads for identification of duplicates through alignment (0 = exact matching; options are 0, 1, or 2)
+    host_taxon = "vertebrate"
+}
+
+includeConfig "${projectDir}/configs/logging.config"
+includeConfig "${projectDir}/configs/containers.config"
+includeConfig "${projectDir}/configs/resources.config"
+includeConfig "${projectDir}/configs/profiles.config"
+includeConfig "${projectDir}/configs/output.config"
+process.queue = "simon-batch-queue" // AWS Batch job queue
\ No newline at end of file
diff --git a/tests/run.config b/tests/run.config
index fc157874..b81f7342 100644
--- a/tests/run.config
+++ b/tests/run.config
@@ -11,6 +11,9 @@ params {
     base_dir = "./" // Parent for working and output directories (can be S3)
     ref_dir = "s3://nao-testing/index-test/output" // Reference/index directory (generated by index workflow)
 
+    // Whether the underlying data is paired-end or single-end
+    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
+
     // Files
     sample_sheet = "${projectDir}/test-data/samplesheet.csv" // Path to library TSV
     adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
diff --git a/tests/run_dev_se.config b/tests/run_dev_se.config
index 0c7133cd..95562d62 100644
--- a/tests/run_dev_se.config
+++ b/tests/run_dev_se.config
@@ -13,7 +13,6 @@ params {
     sample_sheet = "${launchDir}/single-end-samplesheet.csv" // Path to library TSV
     adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
 
-
     // Whether the underlying data is paired-end or single-end
     single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
diff --git a/workflows/run.nf b/workflows/run.nf
index ccba712c..c0aceceb 100644
--- a/workflows/run.nf
+++ b/workflows/run.nf
@@ -40,9 +40,6 @@ workflow RUN {
         }
     }
-    // Whether the underlying data is paired-end or single-end
-    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
-
     // Prepare samplesheet
     if ( params.grouping ) {
         samplesheet = Channel
             .fromPath(params.sample_sheet)
@@ -60,10 +57,10 @@ workflow RUN {
         group_ch = Channel.empty()
     }
     // Preprocessing
-    RAW(samplesheet_ch, params.n_reads_trunc, "2", "4 GB", "raw_concat", single_end)
-    CLEAN(RAW.out.reads, params.adapters, "2", "4 GB", "cleaned", single_end)
+    RAW(samplesheet_ch, params.n_reads_trunc, "2", "4 GB", "raw_concat", params.single_end)
+    CLEAN(RAW.out.reads, params.adapters, "2", "4 GB", "cleaned", params.single_end)
     // Extract and count human-viral reads
-    EXTRACT_VIRAL_READS(CLEAN.out.reads, group_ch, params.ref_dir, kraken_db_path, params.bt2_score_threshold, params.adapters, params.host_taxon, "1", "24", "viral", "${params.quality_encoding}", "${params.fuzzy_match_alignment_duplicates}", params.grouping)
+    EXTRACT_VIRAL_READS(CLEAN.out.reads, group_ch, params.ref_dir, kraken_db_path, params.bt2_score_threshold, params.adapters, params.host_taxon, "1", "24", "viral", "${params.quality_encoding}", "${params.fuzzy_match_alignment_duplicates}", params.grouping, params.single_end)
     // Process intermediate output for chimera detection
     raw_processed_ch = EXTRACT_VIRAL_READS.out.bbduk_match.join(RAW.out.reads, by: 0)
     EXTRACT_RAW_READS_FROM_PROCESSED(raw_processed_ch, "raw_viral_subset")
@@ -77,7 +74,7 @@ workflow RUN {
         blast_paired_ch = Channel.empty()
     }
     // Taxonomic profiling
-    PROFILE(CLEAN.out.reads, group_ch, kraken_db_path, params.n_reads_profile, params.ref_dir, "0.4", "27", "ribo", params.grouping, single_end)
+    PROFILE(CLEAN.out.reads, group_ch, kraken_db_path, params.n_reads_profile, params.ref_dir, "0.4", "27", "ribo", params.grouping, params.single_end)
     // Process output
     qc_ch = RAW.out.qc.concat(CLEAN.out.qc)
     PROCESS_OUTPUT(qc_ch)
diff --git a/workflows/run_dev_se.nf b/workflows/run_dev_se.nf
index c74fbd15..503edb6b 100644
--- a/workflows/run_dev_se.nf
+++ b/workflows/run_dev_se.nf
@@ -25,11 +25,8 @@ workflow RUN_DEV_SE {
     start_time = new Date()
     start_time_str = start_time.format("YYYY-MM-dd HH:mm:ss z (Z)")
 
-    // Whether the underlying data is paired-end or single-end
-    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
-
     // Prepare samplesheet
-    if (single_end) {
+    if (params.single_end) {
         if (params.grouping) {
             samplesheet = Channel
                 .fromPath(params.sample_sheet)
@@ -67,11 +64,11 @@ workflow RUN_DEV_SE {
 
     // Preprocessing
-    RAW(samplesheet_ch, params.n_reads_trunc, "2", "4 GB", "raw_concat", single_end)
-    CLEAN(RAW.out.reads, params.adapters, "2", "4 GB", "cleaned", single_end)
+    RAW(samplesheet_ch, params.n_reads_trunc, "2", "4 GB", "raw_concat", params.single_end)
+    CLEAN(RAW.out.reads, params.adapters, "2", "4 GB", "cleaned", params.single_end)
 
     // Taxonomic profiling
-    PROFILE(CLEAN.out.reads, group_ch, kraken_db_path, params.n_reads_profile, params.ref_dir, "0.4", "27", "ribo", "${params.kraken_memory}", params.grouping, single_end)
+    PROFILE(CLEAN.out.reads, group_ch, kraken_db_path, params.n_reads_profile, params.ref_dir, "0.4", "27", "ribo", params.grouping, params.single_end)
 
     // Process output
     qc_ch = RAW.out.qc.concat(CLEAN.out.qc)

From 15facd15c7fbf7506847f5e01430aaa5501001a7 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Mon, 2 Dec 2024 16:05:45 +0000
Subject: [PATCH 25/47] moved order of single_end definition.
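Nextflow config files are, to our understanding, evaluated top to bottom, so a
params entry can only dereference keys assigned above it. The single_end
detection reads params.sample_sheet, which in tests/run.config was previously
assigned further down the block. A minimal sketch of the two orderings
(illustrative config, not part of this repository):

    params {
        // Broken: params.sample_sheet has not been assigned yet at this point
        single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
        sample_sheet = "${projectDir}/test-data/samplesheet.csv"
    }

    params {
        sample_sheet = "${projectDir}/test-data/samplesheet.csv"
        // Works: sample_sheet is assigned before it is read
        single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
    }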
---
 tests/run.config | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/run.config b/tests/run.config
index b81f7342..2de10f16 100644
--- a/tests/run.config
+++ b/tests/run.config
@@ -11,13 +11,13 @@ params {
     base_dir = "./" // Parent for working and output directories (can be S3)
     ref_dir = "s3://nao-testing/index-test/output" // Reference/index directory (generated by index workflow)
 
-    // Whether the underlying data is paired-end or single-end
-    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
-
     // Files
     sample_sheet = "${projectDir}/test-data/samplesheet.csv" // Path to library TSV
     adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
 
+    // Whether the underlying data is paired-end or single-end
+    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
+
     // Numerical
     grouping = false // Whether to group samples by 'group' column in samplesheet
     n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads)

From 8f5afe86261dfee49796357f4c181fe293a5d353 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Tue, 3 Dec 2024 13:40:59 +0000
Subject: [PATCH 26/47] Reformatted tests/run_dev_se.config to be the same as
 tests/run.config

---
 configs/run.config      | 1 -
 tests/run_dev_se.config | 9 ++++-----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/configs/run.config b/configs/run.config
index 7657476c..b405bb17 100644
--- a/configs/run.config
+++ b/configs/run.config
@@ -16,7 +16,6 @@ params {
     // Whether the underlying data is paired-end or single-end
     single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
 
-
     // Numerical
     grouping = false // Whether to group samples by 'group' column in samplesheet
     n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads)
diff --git a/tests/run_dev_se.config b/tests/run_dev_se.config
index 95562d62..523c2f82 100644
--- a/tests/run_dev_se.config
+++ b/tests/run_dev_se.config
@@ -10,7 +10,7 @@ params {
     ref_dir = "s3://nao-testing/index-test/output" // Reference/index directory (generated by index workflow)
 
     // Files
-    sample_sheet = "${launchDir}/single-end-samplesheet.csv" // Path to library TSV
+    sample_sheet = "${launchDir}/test-data/single-end-samplesheet.csv" // Path to library TSV
     adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
 
     // Whether the underlying data is paired-end or single-end
     single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
@@ -26,11 +26,10 @@ params {
     quality_encoding = "phred33" // FASTQ quality encoding (probably phred33, maybe phred64)
     fuzzy_match_alignment_duplicates = 0 // Fuzzy matching the start coordinate of reads for identification of duplicates through alignment (0 = exact matching; options are 0, 1, or 2)
     host_taxon = "vertebrate"
+
+    blast_db_prefix = "nt_others"
 }
 
-includeConfig "${projectDir}/configs/logging.config"
 includeConfig "${projectDir}/configs/containers.config"
-includeConfig "${projectDir}/configs/resources.config"
 includeConfig "${projectDir}/configs/profiles.config"
-includeConfig "${projectDir}/configs/output.config"
-process.queue = "simon-batch-queue" // AWS Batch job queue
\ No newline at end of file
+includeConfig "${projectDir}/configs/output.config"
\ No newline at end of file

From b0d67372a758817c1e163f8aa80e6fed8ca6889b Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Tue, 3 Dec 2024 13:42:15 +0000
Subject: [PATCH 27/47] Removed unneeded whitespace

---
 subworkflows/local/profile/main.nf | 1 -
 1 file changed, 1 deletion(-)

diff --git a/subworkflows/local/profile/main.nf b/subworkflows/local/profile/main.nf
index 13bd2a1b..e0bc1db3 100644
--- a/subworkflows/local/profile/main.nf
+++ b/subworkflows/local/profile/main.nf
@@ -86,7 +86,6 @@ workflow PROFILE {
         br_ribo = tax_ribo_ch.bracken.collectFile(name: "bracken_reports_ribo.tsv.gz")
         br_noribo = tax_noribo_ch.bracken.collectFile(name: "bracken_reports_noribo.tsv.gz")
         merge_ch = MERGE_TAXONOMY_RIBO(kr_ribo, kr_noribo, br_ribo, br_noribo)
-
     emit:
         bracken = merge_ch.bracken
         kraken = merge_ch.kraken

From 58270449b9fea8da184843132318718a9bebb8a9 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Tue, 3 Dec 2024 13:44:24 +0000
Subject: [PATCH 28/47] Reset test-data/nextflow.config to have settings set
 to Will's defaults.

---
 test-data/nextflow.config | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test-data/nextflow.config b/test-data/nextflow.config
index 635a9601..8a84c703 100644
--- a/test-data/nextflow.config
+++ b/test-data/nextflow.config
@@ -6,7 +6,7 @@ params {
     mode = "run"
 
     // Directories
-    base_dir = "s3://nao-mgs-simon/test" // Parent for working and output directories (can be S3)
+    base_dir = "s3://nao-mgs-wb/test-batch" // Parent for working and output directories (can be S3)
     ref_dir = "s3://nao-mgs-wb/index-20241113/output" // Reference/index directory (generated by index workflow)
 
     // Files
@@ -34,4 +34,4 @@ includeConfig "${projectDir}/configs/containers.config"
 includeConfig "${projectDir}/configs/resources.config"
 includeConfig "${projectDir}/configs/profiles.config"
 includeConfig "${projectDir}/configs/output.config"
-process.queue = "will-batch-queue" // AWS Batch job queue
+process.queue = "will-batch-queue" // AWS Batch job queue
\ No newline at end of file

From 8302f251134dd9090d6fffc7769224a085c5fc66 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Tue, 3 Dec 2024 13:45:48 +0000
Subject: [PATCH 29/47] Removed test-se config file.

---
 test-se/nextflow.config | 36 ------------------------------------
 1 file changed, 36 deletions(-)
 delete mode 100644 test-se/nextflow.config

diff --git a/test-se/nextflow.config b/test-se/nextflow.config
deleted file mode 100644
index 345229bc..00000000
--- a/test-se/nextflow.config
+++ /dev/null
@@ -1,36 +0,0 @@
-/************************************************
-| CONFIGURATION FILE FOR NAO VIRAL MGS WORKFLOW |
-************************************************/
-
-params {
-    mode = "run_dev_se"
-
-    // Directories
-    base_dir = "s3://nao-mgs-simon/test_single_read" // Parent for working and output directories (can be S3)
-    ref_dir = "s3://nao-testing/index-test/output" // Reference/index directory (generated by index workflow)
-
-    // Files
-    sample_sheet = "${launchDir}/single-end-samplesheet.csv" // Path to library TSV
-    adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
-
-    // Whether the underlying data is paired-end or single-end
-    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
-
-    // Numerical
-    grouping = false // Whether to group samples by 'group' column in samplesheet
-    n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads)
-    n_reads_profile = 1000000 // Number of reads per sample to run through taxonomic profiling
-    bt2_score_threshold = 20 // Normalized score threshold for HV calling (typically 15 or 20)
-    blast_hv_fraction = 0 // Fraction of putative HV reads to BLAST vs nt (0 = don't run BLAST)
-    kraken_memory = "128 GB" // Memory needed to safely load Kraken DB
-    quality_encoding = "phred33" // FASTQ quality encoding (probably phred33, maybe phred64)
-    fuzzy_match_alignment_duplicates = 0 // Fuzzy matching the start coordinate of reads for identification of duplicates through alignment (0 = exact matching; options are 0, 1, or 2)
-    host_taxon = "vertebrate"
-}
-
-includeConfig "${projectDir}/configs/logging.config"
-includeConfig "${projectDir}/configs/containers.config"
-includeConfig "${projectDir}/configs/resources.config"
-includeConfig "${projectDir}/configs/profiles.config"
-includeConfig "${projectDir}/configs/output.config"
-process.queue = "simon-batch-queue" // AWS Batch job queue
\ No newline at end of file

From ac101244b7241a76954f3e65a8097c96b132c844 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Tue, 3 Dec 2024 13:47:37 +0000
Subject: [PATCH 30/47] Placeholder commit

---
 tests/run.config | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/run.config b/tests/run.config
index 2de10f16..320ef045 100644
--- a/tests/run.config
+++ b/tests/run.config
@@ -27,7 +27,6 @@ params {
     quality_encoding = "phred33" // FASTQ quality encoding (probably phred33, maybe phred64)
     fuzzy_match_alignment_duplicates = 0 // Fuzzy matching the start coordinate of reads for identification of duplicates through alignment (0 = exact matching; options are 0, 1, or 2)
     host_taxon = "vertebrate"
-
     blast_db_prefix = "nt_others"
 }

From 30a2a6e910e30a15dd6fa41f59cf14dc85d633c5 Mon Sep 17 00:00:00 2001
From: Will Bradshaw <wjbradshaw1@gmail.com>
Date: Wed, 27 Nov 2024 16:21:48 -0500
Subject: [PATCH 31/47] Merge pull request #116 from
 naobservatory/harmon_fix_copying_file_bug

Fixing bug that's causing a fatal error in master pipeline.
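The channels below previously copied index metadata into the implicit workDir;
the fix stages the copies under an explicit ${params.base_dir}/work path
instead, so the destination no longer depends on how Nextflow resolves workDir
at that point (our reading of the change; the PR itself only states the bug
was fatal). The staging pattern, for reference (a sketch; only the destination
path changes in this patch):

    // Stage a reference file to a fixed, configurable location before
    // publishing; copyTo returns the destination path, so the channel
    // emits the staged copy rather than the original.
    index_params_ch = Channel.fromPath("${params.ref_dir}/input/index-params.json")
        .map { file -> file.copyTo("${params.base_dir}/work/params-index.json") }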
---
 workflows/run.nf            | 4 ++--
 workflows/run_validation.nf | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/workflows/run.nf b/workflows/run.nf
index c0aceceb..6793ea99 100644
--- a/workflows/run.nf
+++ b/workflows/run.nf
@@ -84,9 +84,9 @@ workflow RUN {
     time_ch = Channel.of(start_time_str + "\n").collectFile(name: "time.txt")
     version_ch = Channel.fromPath("${projectDir}/pipeline-version.txt")
     index_params_ch = Channel.fromPath("${params.ref_dir}/input/index-params.json")
-        .map { file -> file.copyTo("${workDir}/params-index.json") }
+        .map { file -> file.copyTo("${params.base_dir}/work/params-index.json") }
     index_pipeline_version_ch = Channel.fromPath("${params.ref_dir}/logging/pipeline-version.txt")
-        .map { file -> file.copyTo("${workDir}/pipeline-version-index.txt") }
+        .map { file -> file.copyTo("${params.base_dir}/work/pipeline-version-index.txt") }
 publish:
     // Saved inputs
     index_params_ch >> "input"
diff --git a/workflows/run_validation.nf b/workflows/run_validation.nf
index 66e55cad..c0cdf326 100644
--- a/workflows/run_validation.nf
+++ b/workflows/run_validation.nf
@@ -43,9 +43,9 @@ workflow RUN_VALIDATION {
     time_ch = Channel.of(start_time_str + "\n").collectFile(name: "time.txt")
     version_ch = Channel.fromPath("${projectDir}/pipeline-version.txt")
     index_params_ch = Channel.fromPath("${params.ref_dir}/input/index-params.json")
-        .map { file -> file.copyTo("${workDir}/params-index.json") }
+        .map { file -> file.copyTo("${params.base_dir}/work/params-index.json") }
     index_pipeline_version_ch = Channel.fromPath("${params.ref_dir}/logging/pipeline-version.txt")
-        .map { file -> file.copyTo("${workDir}/pipeline-version-index.txt") }
+        .map { file -> file.copyTo("${params.base_dir}/work/pipeline-version-index.txt") }
 publish:
     // Saved inputs
     index_params_ch >> "input"

From cc5fd19061cf58ddcc8d714f094a8eb17cb2863f Mon Sep 17 00:00:00 2001
From: EC2 Default User <harmonprograms@protonmail.com>
Date: Tue, 3 Dec 2024 14:15:52 +0000
Subject: [PATCH 32/47] Changed version of nextflow, hopefully this works

---
 .github/workflows/end-to-end.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/end-to-end.yml b/.github/workflows/end-to-end.yml
index 800abb3a..fe85aa93 100644
--- a/.github/workflows/end-to-end.yml
+++ b/.github/workflows/end-to-end.yml
@@ -19,7 +19,7 @@ jobs:
       - name: Setup Nextflow latest-edge
         uses: nf-core/setup-nextflow@v1
         with:
-          version: "latest-edge"
+          version: "latest"
 
       - name: Install nf-test
         run: |

From ab782570bff107a690d037c911a040d9e961e774 Mon Sep 17 00:00:00 2001
From: EC2 Default User <harmonprograms@protonmail.com>
Date: Tue, 3 Dec 2024 14:18:59 +0000
Subject: [PATCH 33/47] Updated label to be correct

---
 .github/workflows/end-to-end.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/end-to-end.yml b/.github/workflows/end-to-end.yml
index fe85aa93..1f9d3d9b 100644
--- a/.github/workflows/end-to-end.yml
+++ b/.github/workflows/end-to-end.yml
@@ -16,7 +16,7 @@ jobs:
           java-version: '11'
           distribution: 'adopt'
 
-      - name: Setup Nextflow latest-edge
+      - name: Setup Nextflow latest (stable)
        uses: nf-core/setup-nextflow@v1
         with:
           version: "latest"

From 1db7ba8c83f4f11d3ea841b2984c68fb3d1412e6 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Tue, 3 Dec 2024 20:35:45 +0000
Subject: [PATCH 34/47] Updated end-to-end.yml and run_dev_se config.
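The new CI step relies on nf-test collecting a test tagged "run_dev_se". A
minimal sketch of what such a spec could look like (hypothetical file name and
test name; the actual test file is not part of this patch):

    nextflow_workflow {
        name "Test single-end run_dev_se workflow"
        script "workflows/run_dev_se.nf"
        workflow "RUN_DEV_SE"
        tag "run_dev_se"

        test("Single-end test dataset completes") {
            then {
                // nf-test exposes the finished run as `workflow`
                assert workflow.success
            }
        }
    }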
---
 .github/workflows/end-to-end.yml | 3 +++
 tests/run_dev_se.config          | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/end-to-end.yml b/.github/workflows/end-to-end.yml
index 1f9d3d9b..02e0eebb 100644
--- a/.github/workflows/end-to-end.yml
+++ b/.github/workflows/end-to-end.yml
@@ -44,3 +44,6 @@ jobs:
 
       - name: Run run_validation workflow
         run: nf-test test --tag validation --verbose
+
+      - name: Run run_dev_se workflow
+        run: nf-test test --tag run_dev_se --verbose
diff --git a/tests/run_dev_se.config b/tests/run_dev_se.config
index 523c2f82..48f7a392 100644
--- a/tests/run_dev_se.config
+++ b/tests/run_dev_se.config
@@ -10,7 +10,7 @@ params {
     ref_dir = "s3://nao-testing/index-test/output" // Reference/index directory (generated by index workflow)
 
     // Files
-    sample_sheet = "${launchDir}/test-data/single-end-samplesheet.csv" // Path to library TSV
+    sample_sheet = "${projectDir}/test-data/single-end-samplesheet.csv" // Path to library TSV
     adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming

From 1751288612ec355977feea9634623ac73fdabba1 Mon Sep 17 00:00:00 2001
From: simonleandergrimm <simonleandergrimm@gmail.com>
Date: Tue, 3 Dec 2024 21:35:10 +0000
Subject: [PATCH 35/47] Test commit

---
 tests/run_dev_se.config | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/run_dev_se.config b/tests/run_dev_se.config
index 48f7a392..d3db01c4 100644
--- a/tests/run_dev_se.config
+++ b/tests/run_dev_se.config
@@ -13,6 +13,7 @@ params {
     sample_sheet = "${projectDir}/test-data/single-end-samplesheet.csv" // Path to library TSV
     adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
 
+
     // Whether the underlying data is paired-end or single-end
     single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true

From 5567568b1f03e03911aeff7273a766a3b3a3d3aa Mon Sep 17 00:00:00 2001
From: simonleandergrimm <simonleandergrimm@gmail.com>
Date: Wed, 4 Dec 2024 20:35:05 +0000
Subject: [PATCH 36/47] Dropped read type info from config files where it's
 not needed.

---
 configs/run.config      | 3 ---
 tests/run.config        | 3 ---
 tests/run_dev_se.config | 4 ----
 3 files changed, 10 deletions(-)

diff --git a/configs/run.config b/configs/run.config
index 6da753a8..3669ba39 100644
--- a/configs/run.config
+++ b/configs/run.config
@@ -13,9 +13,6 @@ params {
     sample_sheet = "${launchDir}/samplesheet.csv" // Path to library TSV
     adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
 
-    // Whether the underlying data is paired-end or single-end
-    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
-
     // Numerical
     grouping = false // Whether to group samples by 'group' column in samplesheet
     n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads)
diff --git a/tests/run.config b/tests/run.config
index f4a45119..cd8c62a6 100644
--- a/tests/run.config
+++ b/tests/run.config
@@ -15,9 +15,6 @@ params {
     sample_sheet = "${projectDir}/test-data/samplesheet.csv" // Path to library TSV
     adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
 
-    // Whether the underlying data is paired-end or single-end
-    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
-
     // Numerical
     grouping = false // Whether to group samples by 'group' column in samplesheet
     n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads)
diff --git a/tests/run_dev_se.config b/tests/run_dev_se.config
index dc0709fe..8fd5e597 100644
--- a/tests/run_dev_se.config
+++ b/tests/run_dev_se.config
@@ -13,10 +13,6 @@ params {
     sample_sheet = "${projectDir}/test-data/single-end-samplesheet.csv" // Path to library TSV
     adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
 
-
-    // Whether the underlying data is paired-end or single-end
-    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
-
     // Numerical
     grouping = false // Whether to group samples by 'group' column in samplesheet
     n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads)

From a3af7db8750fccefdd6e07b24926ace3e879d195 Mon Sep 17 00:00:00 2001
From: simonleandergrimm <simonleandergrimm@gmail.com>
Date: Wed, 4 Dec 2024 20:38:52 +0000
Subject: [PATCH 37/47] Made run_dev_se index and outputs look the same as
 run.nf

---
 workflows/run_dev_se.nf | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/workflows/run_dev_se.nf b/workflows/run_dev_se.nf
index 503edb6b..dad3d2cc 100644
--- a/workflows/run_dev_se.nf
+++ b/workflows/run_dev_se.nf
@@ -79,11 +79,14 @@ workflow RUN_DEV_SE {
     params_ch = Channel.of(params_str).collectFile(name: "run-params.json")
     time_ch = Channel.of(start_time_str + "\n").collectFile(name: "time.txt")
     version_ch = Channel.fromPath("${projectDir}/pipeline-version.txt")
-
+    index_params_ch = Channel.fromPath("${params.ref_dir}/input/index-params.json")
+        .map { file -> file.copyTo("${params.base_dir}/work/params-index.json") }
+    index_pipeline_version_ch = Channel.fromPath("${params.ref_dir}/logging/pipeline-version.txt")
+        .map { file -> file.copyTo("${params.base_dir}/work/pipeline-version-index.txt") }
 publish:
     // Saved inputs
-    Channel.fromPath("${params.ref_dir}/input/index-params.json") >> "input"
-    Channel.fromPath("${params.ref_dir}/logging/pipeline-version.txt").collectFile(name: "pipeline-version-index.txt") >> "logging"
+    index_params_ch >> "input"
+    index_pipeline_version_ch >> "logging"
     Channel.fromPath(params.sample_sheet) >> "input"
     Channel.fromPath(params.adapters) >> "input"
     params_ch >> "input"

From d465db75b80cf289dd11737dbfb9b8b89dca Mon Sep 17 00:00:00 2001
From: simonleandergrimm <simonleandergrimm@gmail.com>
Date: Thu, 5 Dec 2024 18:26:47 +0000
Subject: [PATCH 38/47] Fixed memory issue in bbduk

---
 modules/local/bbduk/main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/local/bbduk/main.nf b/modules/local/bbduk/main.nf
index 2d72dc08..a6c3b210 100644
--- a/modules/local/bbduk/main.nf
+++ b/modules/local/bbduk/main.nf
@@ -54,7 +54,7 @@ process BBDUK_SINGLE {
     ref=!{contaminant_ref}
     io="in=${in} ref=${ref} out=${op} outm=${of} stats=${stats}"
     # Define parameters
-    par="minkmerfraction=!{min_kmer_fraction} k=!{k} t=!{task.cpus} -Xmx30g"
+    par="minkmerfraction=!{min_kmer_fraction} k=!{k} t=!{task.cpus} -Xmx!{task.memory.toGiga()}g"
     # Execute
     bbduk.sh ${io} ${par}

From 4e74d7c12b1d99f925510ef9df3e2c1f328ab40d Mon Sep 17 00:00:00 2001
From: simonleandergrimm <simonleandergrimm@gmail.com>
Date: Thu, 5 Dec 2024 18:31:20 +0000
Subject: [PATCH 39/47] Fixing setup of run_dev_se test config.
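The fix below moves the read_type.config include ahead of output.config:
includeConfig statements are processed in order, so any config that reads
params.single_end has to come after the parameter is defined. Judging from the
detection logic removed from the individual configs earlier in this series,
configs/read_type.config presumably centralizes that check; a sketch of its
assumed contents (the file itself is not shown in this patch):

    // Infer read type from the samplesheet header: a 'fastq_2' column
    // implies paired-end data (assumed contents of configs/read_type.config)
    params.single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true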
---
 tests/run_dev_se.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/run_dev_se.config b/tests/run_dev_se.config
index 8fd5e597..34b72954 100644
--- a/tests/run_dev_se.config
+++ b/tests/run_dev_se.config
@@ -29,5 +29,5 @@ params {
 includeConfig "${projectDir}/configs/containers.config"
 includeConfig "${projectDir}/configs/profiles.config"
+includeConfig "${projectDir}/configs/read_type.config"
 includeConfig "${projectDir}/configs/output.config"
-includeConfig "${projectDir}/configs/read_type.config"
\ No newline at end of file

From d061c1d1d5bdf599d830f477b80fab034a39a305 Mon Sep 17 00:00:00 2001
From: simonleandergrimm <simonleandergrimm@gmail.com>
Date: Mon, 9 Dec 2024 16:27:39 +0000
Subject: [PATCH 40/47] Fixed imports in taxonomy/main.nf

---
 subworkflows/local/taxonomy/main.nf | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/subworkflows/local/taxonomy/main.nf b/subworkflows/local/taxonomy/main.nf
index ee4416af..278c1873 100644
--- a/subworkflows/local/taxonomy/main.nf
+++ b/subworkflows/local/taxonomy/main.nf
@@ -6,14 +6,10 @@
 | MODULES AND SUBWORKFLOWS |
 ***************************/
 
-if (params.single_end) {
-    include { SUBSET_READS_SINGLE as SUBSET_READS } from "../../../modules/local/subsetReads"
-} else {
-    include { SUBSET_READS_PAIRED as SUBSET_READS } from "../../../modules/local/subsetReads" //addParams(suffix: "fastq")
+if (!params.single_end) {
     include { JOIN_FASTQ } from "../../../modules/local/joinFastq"
     include { BBMERGE } from "../../../modules/local/bbmerge"
     include { SUMMARIZE_BBMERGE } from "../../../modules/local/summarizeBBMerge"
-
 }
 include { SUMMARIZE_DEDUP } from "../../../modules/local/summarizeDedup"
 include { CLUMPIFY_PAIRED } from "../../../modules/local/clumpify"

From db187e633a3ba4c8baeafb43979984349b514674 Mon Sep 17 00:00:00 2001
From: simonleandergrimm <58591538+simonleandergrimm@users.noreply.github.com>
Date: Mon, 9 Dec 2024 17:39:53 -0500
Subject: [PATCH 41/47] Removed redundancy

---
 subworkflows/local/taxonomy/main.nf | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/subworkflows/local/taxonomy/main.nf b/subworkflows/local/taxonomy/main.nf
index 278c1873..d685b292 100644
--- a/subworkflows/local/taxonomy/main.nf
+++ b/subworkflows/local/taxonomy/main.nf
@@ -1,4 +1,4 @@
-/***********************************************************
+`/***********************************************************
 | SUBWORKFLOW: TAXONOMIC PROFILING WITH KRAKEN AND BRACKEN |
 ***********************************************************/
 
@@ -36,12 +36,7 @@ workflow TAXONOMY {
         if (single_end) {
             // No merging in single read version
             summarize_bbmerge_ch = Channel.empty()
-            // Deduplicate reads (if applicable)
-            if (dedup_rc) {
-                dedup_ch = CLUMPIFY_SINGLE(reads_ch)
-            } else {
-                dedup_ch = reads_ch
-            }
+            single_read_ch = reads_ch
         } else {
             // Deduplicate reads (if applicable)
             if ( dedup_rc ){
@@ -53,14 +48,16 @@ workflow TAXONOMY {
             merged_ch = BBMERGE(paired_dedup_ch)
             // Only want to summarize the merged elements
             summarize_bbmerge_ch = SUMMARIZE_BBMERGE(merged_ch.reads.map{sample, files -> [sample, files[0]]})
-            joined_ch = JOIN_FASTQ(merged_ch.reads)
-            // Deduplicate reads (if applicable)
-            if ( dedup_rc ){
-                dedup_ch = CLUMPIFY_SINGLE(joined_ch)
+            single_read_ch = JOIN_FASTQ(merged_ch.reads)
+        }
+
+        // Deduplicate reads (if applicable)
+        if (dedup_rc) {
+            dedup_ch = CLUMPIFY_SINGLE(single_read_ch)
         } else {
-            dedup_ch = joined_ch
-        }
+                dedup_ch = single_read_ch
         }
+
         // Summarize last of the output
         summarize_dedup_ch = SUMMARIZE_DEDUP(dedup_ch)

From 8a09b0882dd1dc26c95408b874f2165d6bd2d81c Mon Sep 17 00:00:00 2001
From: simonleandergrimm <simonleandergrimm@gmail.com>
Date: Tue, 10 Dec 2024 20:28:16 +0000
Subject: [PATCH 42/47] Delete unexpected character

---
 subworkflows/local/taxonomy/main.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/subworkflows/local/taxonomy/main.nf b/subworkflows/local/taxonomy/main.nf
index d685b292..b6ab693f 100644
--- a/subworkflows/local/taxonomy/main.nf
+++ b/subworkflows/local/taxonomy/main.nf
@@ -1,4 +1,4 @@
-`/***********************************************************
+/***********************************************************
 | SUBWORKFLOW: TAXONOMIC PROFILING WITH KRAKEN AND BRACKEN |
 ***********************************************************/
 
@@ -55,7 +55,7 @@ workflow TAXONOMY {
         if (dedup_rc) {
             dedup_ch = CLUMPIFY_SINGLE(single_read_ch)
         } else {
-                dedup_ch = single_read_ch
+            dedup_ch = single_read_ch
         }
 
         // Summarize last of the output
         summarize_dedup_ch = SUMMARIZE_DEDUP(dedup_ch)

From 55f75f84c8cc0b2839cee214022a0d9655cef247 Mon Sep 17 00:00:00 2001
From: simonleandergrimm <simonleandergrimm@gmail.com>
Date: Wed, 11 Dec 2024 16:58:09 +0000
Subject: [PATCH 43/47] Moved kraken db location

---
 workflows/run_dev_se.nf | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/workflows/run_dev_se.nf b/workflows/run_dev_se.nf
index ea78be67..2e350c17 100644
--- a/workflows/run_dev_se.nf
+++ b/workflows/run_dev_se.nf
@@ -25,6 +25,7 @@ workflow RUN_DEV_SE {
     // Start time
     start_time = new Date()
     start_time_str = start_time.format("YYYY-MM-dd HH:mm:ss z (Z)")
+    kraken_db_path = "${params.ref_dir}/results/kraken_db"
 
     // Check if grouping column exists in samplesheet
     check_grouping = new File(params.sample_sheet).text.readLines()[0].contains('group') ? true : false
@@ -34,8 +35,6 @@ workflow RUN_DEV_SE {
         } else if (!params.grouping && check_grouping) {
             throw new Exception("Grouping is not enabled in config file, but group column is present in the samplesheet.")
         }
-        // Prepare Kraken DB
-        kraken_db_path = "${params.ref_dir}/results/kraken_db"
     }
 
     // Load samplesheet

From 1210a3c849dbc77f44fcf5676f99f41b0f95e521 Mon Sep 17 00:00:00 2001
From: simonleandergrimm <58591538+simonleandergrimm@users.noreply.github.com>
Date: Thu, 12 Dec 2024 09:49:59 -0500
Subject: [PATCH 44/47] Update main.nf

---
 subworkflows/local/taxonomy/main.nf | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/subworkflows/local/taxonomy/main.nf b/subworkflows/local/taxonomy/main.nf
index b6ab693f..2e7c70ea 100644
--- a/subworkflows/local/taxonomy/main.nf
+++ b/subworkflows/local/taxonomy/main.nf
@@ -6,11 +6,9 @@
 | MODULES AND SUBWORKFLOWS |
 ***************************/
 
-if (!params.single_end) {
-    include { JOIN_FASTQ } from "../../../modules/local/joinFastq"
-    include { BBMERGE } from "../../../modules/local/bbmerge"
-    include { SUMMARIZE_BBMERGE } from "../../../modules/local/summarizeBBMerge"
-}
+include { JOIN_FASTQ } from "../../../modules/local/joinFastq"
+include { BBMERGE } from "../../../modules/local/bbmerge"
+include { SUMMARIZE_BBMERGE } from "../../../modules/local/summarizeBBMerge"
 include { SUMMARIZE_DEDUP } from "../../../modules/local/summarizeDedup"
 include { CLUMPIFY_PAIRED } from "../../../modules/local/clumpify"
 include { CLUMPIFY_SINGLE } from "../../../modules/local/clumpify"

From 9113fd4cdae21f1bf0c8345ab3a46e5459e3f22c Mon Sep 17 00:00:00 2001
From: simonleandergrimm <58591538+simonleandergrimm@users.noreply.github.com>
Date: Fri, 20 Dec 2024 10:39:47 -0500
Subject: [PATCH 45/47] Update end-to-end.yml

---
 .github/workflows/end-to-end.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/end-to-end.yml b/.github/workflows/end-to-end.yml
index 6b765e5a..1fd0383e 100644
--- a/.github/workflows/end-to-end.yml
+++ b/.github/workflows/end-to-end.yml
@@ -83,4 +83,3 @@ jobs:
 
       - name: Run run_validation workflow
         run: nf-test test --tag validation --verbose
-

From 4bfbc462c861cacc0c0305ad54f468f297fe7b89 Mon Sep 17 00:00:00 2001
From: simonleandergrimm <simonleandergrimm@gmail.com>
Date: Fri, 20 Dec 2024 15:45:07 +0000
Subject: [PATCH 46/47] Updated changelog; updated run_dev_se to have correct
 LOAD_SAMPLESHEET process naming; removed hv_screen main.nf

---
 CHANGELOG.md                         | 6 +++---
 subworkflows/local/hv_screen/main.nf | 0
 workflows/run_dev_se.nf              | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)
 delete mode 100644 subworkflows/local/hv_screen/main.nf

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 97560797..8635adc8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,8 @@
 # v2.5.3 (in progress)
-- Added new LOAD_SAMPLESHEET subworkflow to centralize samplesheet processing 
+- Added new LOAD_SAMPLESHEET subworkflow to centralize samplesheet processing
 - Began development of single-end read processing (still in progress)
-  - Restructured RAW, CLEAN, and QC workflows to handle both single-end and paired-end reads
-  - Added new FASTP_SINGLE and TRUNCATE_CONCAT_SINGLE processes to handle single-end reads
+  - Restructured RAW, CLEAN, QC, TAXONOMY, and PROFILE workflows to handle both single-end and paired-end reads
+  - Added new FASTP_SINGLE, TRUNCATE_CONCAT_SINGLE, BBDUK_SINGLE, CONCAT_GROUP_SINGLE, SUBSET_READS_SINGLE and SUBSET_READS_SINGLE_TARGET processes to handle single-end reads
   - Created separate end-to-end test workflow for single-end processing (which will be removed once single-end processing is fully integrated)
   - Modified samplesheet handling to support both single-end and paired-end data
   - Updated generate_samplesheet.sh to handle single-end data with --single_end flag
diff --git a/subworkflows/local/hv_screen/main.nf b/subworkflows/local/hv_screen/main.nf
deleted file mode 100644
index e69de29b..00000000
diff --git a/workflows/run_dev_se.nf b/workflows/run_dev_se.nf
index 6f9b982f..3df5f0e4 100644
--- a/workflows/run_dev_se.nf
+++ b/workflows/run_dev_se.nf
@@ -13,7 +13,7 @@ include { RAW } from "../subworkflows/local/raw"
 include { CLEAN } from "../subworkflows/local/clean"
 include { PROCESS_OUTPUT } from "../subworkflows/local/processOutput"
 include { PROFILE } from "../subworkflows/local/profile"
-include { LOAD_SAMPLESHET } from "../subworkflows/local/loadSampleSheet"
+include { LOAD_SAMPLESHEET } from "../subworkflows/local/loadSampleSheet"
 nextflow.preview.output = true

From d2826d2f63b88ab7fd6c81ab7ab87da98c84be3b Mon Sep 17 00:00:00 2001
From: simonleandergrimm <simonleandergrimm@gmail.com>
Date: Fri, 20 Dec 2024 15:58:44 +0000
Subject: [PATCH 47/47] Removed whitespace

---
 .github/workflows/end-to-end.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/end-to-end.yml b/.github/workflows/end-to-end.yml
index 6b765e5a..3e4c8d60 100644
--- a/.github/workflows/end-to-end.yml
+++ b/.github/workflows/end-to-end.yml
@@ -37,7 +37,6 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v4
 
-
       - name: Set up JDK 11
         uses: actions/setup-java@v4
         with:
@@ -83,4 +82,3 @@ jobs:
 
       - name: Run run_validation workflow
         run: nf-test test --tag validation --verbose
-