From c4b1f62444a04415423e1a051072cc1003e2d005 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Mon, 28 Oct 2024 15:00:00 +0000
Subject: [PATCH 01/47] Updated subsetReads to optionally take in single reads.
 Added PROFILE back to run_dev_se.nf

---
 modules/local/subsetReads/main.nf  | 35 +++++++++++++++++++++++++++++-
 subworkflows/local/profile/main.nf |  7 ++++--
 workflows/run_dev_se.nf            |  6 +++++
 3 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/modules/local/subsetReads/main.nf b/modules/local/subsetReads/main.nf
index 6a16ef8d..6ef64936 100644
--- a/modules/local/subsetReads/main.nf
+++ b/modules/local/subsetReads/main.nf
@@ -52,7 +52,7 @@ process SUBSET_READS_PAIRED_MERGED {
         '''
 }
 
-// Subsample reads with seqtk with an autocomputed read fraction
+// Subsample reads with seqtk with an autocomputed read fraction (paired-end)
 process SUBSET_READS_PAIRED_TARGET {
     label "seqtk"
     label "single"
@@ -88,3 +88,36 @@ process SUBSET_READS_PAIRED_TARGET {
         echo "Output reads: $(zcat ${out1} | wc -l | awk '{ print $1/4 }')"
         '''
 }
+
+// Subsample reads with seqtk with an autocomputed read fraction (single-end)
+process SUBSET_READS_SINGLE_TARGET {
+    label "seqtk"
+    label "single"
+    input:
+        tuple val(sample), path(reads)
+        val readTarget
+    output:
+        tuple val(sample), path("${sample}_subset.${params.suffix}.gz")
+    shell:
+        '''
+        # Define input/output
+        in=!{reads}
+        out=!{sample}_subset.!{params.suffix}.gz
+        # Count reads and compute target fraction
+        n_reads=$(zcat ${in} | wc -l | awk '{ print $1/4 }')
+        echo "Input reads: ${n_reads}"
+        echo "Target reads: !{readTarget}"
+        if (( ${n_reads} <= !{readTarget} )); then
+            echo "Target larger than input; returning all reads."
+            cp ${in} ${out}
+        else
+            frac=$(awk -v a=${n_reads} -v b=!{readTarget} 'BEGIN {result = b/a; print (result > 1) ? 1.0 : result}')
+            echo "Read fraction for subsetting: ${frac}"
+            # Carry out subsetting
+            seed=${RANDOM}
+            seqtk sample -s ${seed} ${in} ${frac} | gzip -c > ${out}
+        fi
+        # Count reads for validation
+        echo "Output reads: $(zcat ${out} | wc -l | awk '{ print $1/4 }')"
+        '''
+}
diff --git a/subworkflows/local/profile/main.nf b/subworkflows/local/profile/main.nf
index a27c7bc5..e9e8d878 100644
--- a/subworkflows/local/profile/main.nf
+++ b/subworkflows/local/profile/main.nf
@@ -6,7 +6,10 @@
 | MODULES AND SUBWORKFLOWS |
 ***************************/
 
-include { SUBSET_READS_PAIRED_TARGET } from "../../../modules/local/subsetReads" addParams(suffix: "fastq")
+include {
+    SUBSET_READS_${params.read_type == 'paired_end' ? 'PAIRED' : 'SINGLE'}_TARGET as SUBSET_READS_TARGET
+} from "../../../modules/local/subsetReads" addParams(suffix: "fastq")
+
 include { BBDUK } from "../../../modules/local/bbduk" addParams(suffix: params.bbduk_suffix)
 include { TAXONOMY as TAXONOMY_RIBO } from "../../../subworkflows/local/taxonomy" addParams(dedup_rc: false, classification_level: "D", read_fraction: 1, kraken_memory: "${params.kraken_memory}")
 include { TAXONOMY as TAXONOMY_NORIBO } from "../../../subworkflows/local/taxonomy" addParams(dedup_rc: false, classification_level: "D", read_fraction: 1, kraken_memory: "${params.kraken_memory}")
@@ -24,7 +27,7 @@ workflow PROFILE {
         ref_dir
     main:
         // Randomly subset reads to target number
-        subset_ch = SUBSET_READS_PAIRED_TARGET(reads_ch, n_reads)
+        subset_ch = SUBSET_READS_TARGET(reads_ch, n_reads)
         // Separate ribosomal reads
         ribo_path = "${ref_dir}/results/ribo-ref-concat.fasta.gz"
         ribo_ch = BBDUK(subset_ch, ribo_path, params.min_kmer_fraction, params.k)
diff --git a/workflows/run_dev_se.nf b/workflows/run_dev_se.nf
index e368cc06..52ba8eca 100644
--- a/workflows/run_dev_se.nf
+++ b/workflows/run_dev_se.nf
@@ -41,6 +41,9 @@ workflow RUN_DEV_SE {
     RAW(samplesheet, params.n_reads_trunc)
     CLEAN(RAW.out.reads, params.adapters)
 
+    // Taxonomic profiling
+    PROFILE(CLEAN.out.reads, kraken_db_path, params.n_reads_profile, params.ref_dir)
+
     // Process output
     qc_ch = RAW.out.qc.concat(CLEAN.out.qc)
     PROCESS_OUTPUT(qc_ch)
@@ -65,4 +68,7 @@ workflow RUN_DEV_SE {
         PROCESS_OUTPUT.out.adapt >> "results/qc"
         PROCESS_OUTPUT.out.qbase >> "results/qc"
         PROCESS_OUTPUT.out.qseqs >> "results/qc"
+
+        PROFILE.out.bracken >> "results/taxonomy"
+        PROFILE.out.kraken >> "results/taxonomy"
 }
\ No newline at end of file
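
A worked example of the target-fraction computation in SUBSET_READS_SINGLE_TARGET above, with illustrative values rather than numbers from a real run: 2,000,000 input reads against a target of 100,000 gives a sampling fraction of 0.05, and seqtk keeps roughly that share of reads; when the target meets or exceeds the input count, the branch above copies the reads through unchanged instead.

    # Illustrative values only; mirrors the awk call in the process above
    n_reads=2000000
    readTarget=100000
    awk -v a=${n_reads} -v b=${readTarget} 'BEGIN {result = b/a; print (result > 1) ? 1.0 : result}'
    # prints 0.05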

From 193dd22f896c51f0cf6bf1b6725a5195f8bbf88f Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Mon, 28 Oct 2024 15:59:00 +0000
Subject: [PATCH 02/47] Added BBDUK single read script.

---
 modules/local/bbduk/main.nf        | 30 +++++++++++++++++++++++++++++-
 subworkflows/local/profile/main.nf | 14 ++++++++++----
 2 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/modules/local/bbduk/main.nf b/modules/local/bbduk/main.nf
index 413a2953..c0d78130 100644
--- a/modules/local/bbduk/main.nf
+++ b/modules/local/bbduk/main.nf
@@ -1,5 +1,5 @@
 // Detection and removal of contaminant reads
-process BBDUK {
+process BBDUK_PAIRED {
     label "large"
     label "BBTools"
     input:
@@ -30,6 +30,34 @@ process BBDUK {
         '''
 }
 
+process BBDUK_SINGLE {
+    label "large"
+    label "BBTools"
+    input:
+        tuple val(sample), path(reads)
+        path(contaminant_ref)
+        val(min_kmer_fraction)
+        val(k)
+    output:
+        tuple val(sample), path("${sample}_${params.suffix}_bbduk_pass.fastq.gz"), emit: reads
+        tuple val(sample), path("${sample}_${params.suffix}_bbduk_fail.fastq.gz"), emit: fail
+        tuple val(sample), path("${sample}_${params.suffix}_bbduk.stats.txt"), emit: log
+    shell:
+        '''
+        # Define input/output
+        in=!{reads}
+        op=!{sample}_!{params.suffix}_bbduk_pass.fastq.gz
+        of=!{sample}_!{params.suffix}_bbduk_fail.fastq.gz
+        stats=!{sample}_!{params.suffix}_bbduk.stats.txt
+        ref=!{contaminant_ref}
+        io="in=${in} ref=${ref} out=${op} outm=${of} stats=${stats}"
+        # Define parameters
+        par="minkmerfraction=!{min_kmer_fraction} k=!{k} t=!{task.cpus} -Xmx30g"
+        # Execute
+        bbduk.sh ${io} ${par}
+        '''
+}
+
 // Detection and removal of contaminant reads (use minkmerhits instead of minkmerfraction)
 process BBDUK_HITS {
     label "large"
diff --git a/subworkflows/local/profile/main.nf b/subworkflows/local/profile/main.nf
index e9e8d878..2483e79f 100644
--- a/subworkflows/local/profile/main.nf
+++ b/subworkflows/local/profile/main.nf
@@ -6,11 +6,17 @@
 | MODULES AND SUBWORKFLOWS |
 ***************************/
 
-include {
-    SUBSET_READS_${params.read_type == 'paired_end' ? 'PAIRED' : 'SINGLE'}_TARGET as SUBSET_READS_TARGET
-} from "../../../modules/local/subsetReads" addParams(suffix: "fastq")
 
-include { BBDUK } from "../../../modules/local/bbduk" addParams(suffix: params.bbduk_suffix)
+if (params.read_type == "single_end") {
+    include { SUBSET_READS_SINGLE_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads" addParams(suffix: "fastq")
+} else if (params.read_type == "paired_end") {
+    include { SUBSET_READS_PAIRED_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads" addParams(suffix: "fastq")
+}
+if (params.read_type == "single_end") {
+    include { BBDUK_SINGLE as BBDUK } from "../../../modules/local/bbduk" addParams(suffix: params.bbduk_suffix)
+} else if (params.read_type == "paired_end") {
+    include { BBDUK_PAIRED as BBDUK } from "../../../modules/local/bbduk" addParams(suffix: params.bbduk_suffix)
+}
 include { TAXONOMY as TAXONOMY_RIBO } from "../../../subworkflows/local/taxonomy" addParams(dedup_rc: false, classification_level: "D", read_fraction: 1, kraken_memory: "${params.kraken_memory}")
 include { TAXONOMY as TAXONOMY_NORIBO } from "../../../subworkflows/local/taxonomy" addParams(dedup_rc: false, classification_level: "D", read_fraction: 1, kraken_memory: "${params.kraken_memory}")
 include { MERGE_TAXONOMY_RIBO } from "../../../modules/local/mergeTaxonomyRibo"
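
The if/else include pattern introduced above is the mechanism this series uses to swap process implementations by read type: DSL2 include names cannot be interpolated (which is why the SUBSET_READS_${...} construct from the previous patch had to go), but include statements may sit inside a conditional, and aliasing both variants to the same name keeps every call site unchanged. A minimal sketch of the idiom, with a hypothetical module FOO:

    if (params.read_type == "single_end") {
        include { FOO_SINGLE as FOO } from "../modules/local/foo"
    } else {
        include { FOO_PAIRED as FOO } from "../modules/local/foo"
    }
    // downstream code calls FOO(...) regardless of read type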

From 0e081f2ff1370e8288daaecbddbcc15446b06e1d Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Mon, 28 Oct 2024 18:16:13 +0000
Subject: [PATCH 03/47] Added subsetting for single reads.

---
 modules/local/subsetReads/main.nf | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/modules/local/subsetReads/main.nf b/modules/local/subsetReads/main.nf
index 6ef64936..4b766768 100644
--- a/modules/local/subsetReads/main.nf
+++ b/modules/local/subsetReads/main.nf
@@ -25,6 +25,30 @@ process SUBSET_READS_PAIRED {
         '''
 }
 
+// Subsample reads with seqtk (single-end)
+process SUBSET_READS_SINGLE {
+    label "seqtk"
+    label "single"
+    input:
+        tuple val(sample), path(reads)
+        val readFraction
+    output:
+        tuple val(sample), path("${sample}_subset.${params.suffix}.gz")
+    shell:
+        '''
+        # Define input/output
+        in=!{reads}
+        out=!{sample}_subset.!{params.suffix}.gz
+        # Count reads for validation
+        echo "Input reads: $(zcat ${in} | wc -l | awk '{ print $1/4 }')"
+        # Carry out subsetting
+        seed=${RANDOM}
+        seqtk sample -s ${seed} ${in} !{readFraction} | gzip -c > ${out}
+        # Count reads for validation
+        echo "Output reads: $(zcat ${out} | wc -l | awk '{ print $1/4 }')"
+        '''
+}
+
 // Subsample reads with seqtk (no sample name)
 process SUBSET_READS_PAIRED_MERGED {
     label "seqtk"

From 4e140aa65b7053cb85573aa7c50f06ccae2b29b5 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Mon, 28 Oct 2024 18:16:27 +0000
Subject: [PATCH 04/47] Added selection of single or paired end reads.

---
 subworkflows/local/hv_screen/main.nf | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/subworkflows/local/hv_screen/main.nf b/subworkflows/local/hv_screen/main.nf
index 0edb7489..a4b77a43 100644
--- a/subworkflows/local/hv_screen/main.nf
+++ b/subworkflows/local/hv_screen/main.nf
@@ -11,7 +11,12 @@ if (params.read_type == "single_end") {
 } else if (params.read_type == "paired_end") {
     include { FASTP_PAIRED as FASTP } from "../../../modules/local/fastp"
 }
-include { BBDUK } from "../../../modules/local/bbduk" addParams(suffix: params.bbduk_suffix)
+
+if (params.read_type == "single_end") {
+    include { BBDUK_SINGLE as BBDUK } from "../../../modules/local/bbduk" addParams(suffix: params.bbduk_suffix)
+} else if (params.read_type == "paired_end") {
+    include { BBDUK_PAIRED as BBDUK } from "../../../modules/local/bbduk" addParams(suffix: params.bbduk_suffix)
+}
 include { PROCESS_BOWTIE2_SAM_PAIRED } from "../../../modules/local/processBowtie2Sam"
 
 /***********

From d270d3f091890cb56adadf431d0380f99c8ff1a1 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Mon, 28 Oct 2024 18:17:04 +0000
Subject: [PATCH 05/47] WIP edits to taxonomy main scripts (commenting,
 subselecting processes, if clauses in the workflow)

---
 subworkflows/local/taxonomy/main.nf | 89 +++++++++++++++++++----------
 1 file changed, 58 insertions(+), 31 deletions(-)

diff --git a/subworkflows/local/taxonomy/main.nf b/subworkflows/local/taxonomy/main.nf
index 6a8a7a7d..47022585 100644
--- a/subworkflows/local/taxonomy/main.nf
+++ b/subworkflows/local/taxonomy/main.nf
@@ -6,14 +6,18 @@
 | MODULES AND SUBWORKFLOWS |
 ***************************/
 
-include { SUBSET_READS_PAIRED } from "../../../modules/local/subsetReads" addParams(suffix: "fastq")
-include { BBMERGE } from "../../../modules/local/bbmerge"
-include { SUMMARIZE_BBMERGE } from "../../../modules/local/summarizeBBMerge"
-include { SUMMARIZE_DEDUP } from "../../../modules/local/summarizeDedup"
-include { CLUMPIFY_PAIRED } from "../../../modules/local/clumpify"
-include { JOIN_FASTQ } from "../../../modules/local/joinFastq"
-include { CLUMPIFY_SINGLE } from "../../../modules/local/clumpify"
-include { KRAKEN } from "../../../modules/local/kraken" addParams(mem: "${params.kraken_memory}")
+if (params.read_type == "paired_end") {
+    include { SUBSET_READS_PAIRED as SUBSET_READS } from "../../../modules/local/subsetReads" addParams(suffix: "fastq")
+} else if (params.read_type == "single_end") {
+    include { SUBSET_READS_SINGLE as SUBSET_READS } from "../../../modules/local/subsetReads" addParams(suffix: "fastq")
+}
+include { BBMERGE } from "../../../modules/local/bbmerge" // probably skippable in single read version
+include { SUMMARIZE_BBMERGE } from "../../../modules/local/summarizeBBMerge" // probably skippable in single read version
+include { SUMMARIZE_DEDUP } from "../../../modules/local/summarizeDedup" // probably no change needed in single read version
+include { CLUMPIFY_PAIRED } from "../../../modules/local/clumpify" // already has a single read version.
+include { JOIN_FASTQ } from "../../../modules/local/joinFastq" // probably not needed in single read version
+include { CLUMPIFY_SINGLE } from "../../../modules/local/clumpify" // already has a single read version.
+include { KRAKEN } from "../../../modules/local/kraken" addParams(mem: "${params.kraken_memory}") // probably no changes needed
 include { LABEL_KRAKEN_REPORTS } from "../../../modules/local/labelKrakenReports"
 include { MERGE_TSVS as MERGE_KRAKEN_REPORTS } from "../../../modules/local/mergeTsvs" addParams(name: "kraken_reports")
 include { MERGE_TSVS as MERGE_BRACKEN } from "../../../modules/local/mergeTsvs" addParams(name: "bracken_reports")
@@ -33,26 +37,38 @@ workflow TAXONOMY {
         if ( params.read_fraction == 1 ){
             subset_ch = reads_ch
         } else {
-            subset_ch = SUBSET_READS_PAIRED(reads_ch, params.read_fraction)
+            subset_ch = SUBSET_READS(reads_ch, params.read_fraction)
         }
 
-         // Deduplicate reads (if applicable)
-        if ( params.dedup_rc ){
-            paired_dedup_ch = CLUMPIFY_PAIRED(subset_ch)
-        } else {
-            paired_dedup_ch = subset_ch
-        }
-        // Prepare reads
-        merged_ch = BBMERGE(paired_dedup_ch)
-        // Only want to summarize the merged elements
-        summarize_bbmerge_ch = SUMMARIZE_BBMERGE(merged_ch.reads.map{sample, files -> [sample, files[0]]})
-        joined_ch = JOIN_FASTQ(merged_ch.reads)
-        // Deduplicate reads (if applicable)
-        if ( params.dedup_rc ){
-            dedup_ch = CLUMPIFY_SINGLE(joined_ch)
-        } else {
-            dedup_ch = joined_ch
+        if (params.read_type == "paired_end") {
+            // Deduplicate reads (if applicable)
+            if ( params.dedup_rc ){
+                paired_dedup_ch = CLUMPIFY_PAIRED(subset_ch)
+            } else {
+                paired_dedup_ch = subset_ch
+            }
+            // Prepare reads
+            merged_ch = BBMERGE(paired_dedup_ch)
+            // Only want to summarize the merged elements
+            summarize_bbmerge_ch = SUMMARIZE_BBMERGE(merged_ch.reads.map{sample, files -> [sample, files[0]]})
+            joined_ch = JOIN_FASTQ(merged_ch.reads)
+            // Deduplicate reads (if applicable)
+            if ( params.dedup_rc ){
+                dedup_ch = CLUMPIFY_SINGLE(joined_ch)
+            } else {
+                dedup_ch = joined_ch
+            }
+
+        } else if (params.read_type == "single_end") {
+            // Deduplicate reads (if applicable)
+
+            if (params.dedup_rc) {
+                dedup_ch = CLUMPIFY_SINGLE(subset_ch)
+            } else {
+                dedup_ch = subset_ch
+            }
         }
+
         // Summarize last of the output
         summarize_dedup_ch = SUMMARIZE_DEDUP(dedup_ch)
 
@@ -64,10 +80,21 @@ workflow TAXONOMY {
         bracken_ch = BRACKEN(kraken_ch.report, kraken_db_ch, params.classification_level)
         bracken_label_ch = LABEL_BRACKEN_REPORTS(bracken_ch)
         bracken_merge_ch = MERGE_BRACKEN(bracken_label_ch.collect().ifEmpty([]))
-    emit:
-        kraken_output = kraken_ch.output
-        kraken_reports = kraken_merge_ch
-        bracken = bracken_merge_ch
-        bbmerge_summary = summarize_bbmerge_ch
-        dedup_summary = summarize_dedup_ch
+
+        if (params.read_type == "paired_end") {
+            emit:
+                kraken_output = kraken_ch.output
+                kraken_reports = kraken_merge_ch
+                bracken = bracken_merge_ch
+                bbmerge_summary = summarize_bbmerge_ch
+                dedup_summary = summarize_dedup_ch
+        }
+
+        else if (params.read_type == "single_end") {
+            emit:
+                kraken_output = kraken_ch.output
+                kraken_reports = kraken_merge_ch
+                bracken = bracken_merge_ch
+                dedup_summary = summarize_dedup_ch
+        }
 }

From bad278746f103966c972b7b8c55556026e586dc9 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Mon, 28 Oct 2024 18:17:28 +0000
Subject: [PATCH 06/47] Adding Kraken DB path to run_dev_se.nf

---
 workflows/run_dev_se.nf | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/workflows/run_dev_se.nf b/workflows/run_dev_se.nf
index 52ba8eca..7dd99e88 100644
--- a/workflows/run_dev_se.nf
+++ b/workflows/run_dev_se.nf
@@ -38,6 +38,9 @@ workflow RUN_DEV_SE {
             .splitCsv(header: true)
             .map{row -> tuple(row.sample, file(row.fastq_1), file(row.fastq_2))}
     }
+    // Prepare Kraken DB
+    kraken_db_path = "${params.ref_dir}/results/kraken_db"
+// Preprocessing
     RAW(samplesheet, params.n_reads_trunc)
     CLEAN(RAW.out.reads, params.adapters)
 

From 79f922219b44b2a1a7d1719601af69465a55b84b Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Mon, 28 Oct 2024 18:18:03 +0000
Subject: [PATCH 07/47] Adding WIP changes to test/nextflow.config to run on
 Simon's S3.

---
 test/nextflow.config | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/nextflow.config b/test/nextflow.config
index 39150408..a63b3600 100644
--- a/test/nextflow.config
+++ b/test/nextflow.config
@@ -6,7 +6,7 @@ params {
     mode = "run"
 
     // Directories
-    base_dir = "s3://nao-mgs-wb/test-batch" // Parent for working and output directories (can be S3)
+    base_dir = "s3://nao-mgs-simon/test" // Parent for working and output directories (can be S3)
     ref_dir = "s3://nao-mgs-wb/index-20240714/output" // Reference/index directory (generated by index workflow)
 
     // Files
@@ -28,4 +28,4 @@ includeConfig "${projectDir}/configs/logging.config"
 includeConfig "${projectDir}/configs/containers.config"
 includeConfig "${projectDir}/configs/resources.config"
 includeConfig "${projectDir}/configs/profiles.config"
-process.queue = "will-batch-queue" // AWS Batch job queue
+process.queue = "simon-batch-queue" // AWS Batch job queue

From 50adde14a7c517eec541891ec8bc1c224c668283 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Thu, 7 Nov 2024 19:14:11 +0000
Subject: [PATCH 08/47] fixed wrong indent in run_dev_se.nf

---
 workflows/run_dev_se.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflows/run_dev_se.nf b/workflows/run_dev_se.nf
index 7dd99e88..c2690db7 100644
--- a/workflows/run_dev_se.nf
+++ b/workflows/run_dev_se.nf
@@ -40,7 +40,7 @@ workflow RUN_DEV_SE {
     }
     // Prepare Kraken DB
     kraken_db_path = "${params.ref_dir}/results/kraken_db"
-// Preprocessing
+    // Preprocessing
     RAW(samplesheet, params.n_reads_trunc)
     CLEAN(RAW.out.reads, params.adapters)
 

From daa6c3ebb0ed62e1b073151e897bf2be5788f45a Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Thu, 7 Nov 2024 19:16:33 +0000
Subject: [PATCH 09/47] Slight edit to inline comments in taxonomy/main.nf.
 Reverted emit statement to original version.

---
 subworkflows/local/taxonomy/main.nf | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/subworkflows/local/taxonomy/main.nf b/subworkflows/local/taxonomy/main.nf
index 47022585..897c2a6b 100644
--- a/subworkflows/local/taxonomy/main.nf
+++ b/subworkflows/local/taxonomy/main.nf
@@ -60,8 +60,9 @@ workflow TAXONOMY {
             }
 
         } else if (params.read_type == "single_end") {
+            // No merging in single read version
+            summarize_bbmerge_ch = Channel.empty()
             // Deduplicate reads (if applicable)
-
             if (params.dedup_rc) {
                 dedup_ch = CLUMPIFY_SINGLE(subset_ch)
             } else {
@@ -81,20 +82,13 @@ workflow TAXONOMY {
         bracken_label_ch = LABEL_BRACKEN_REPORTS(bracken_ch)
         bracken_merge_ch = MERGE_BRACKEN(bracken_label_ch.collect().ifEmpty([]))
 
-        if (params.read_type == "paired_end") {
-            emit:
-                kraken_output = kraken_ch.output
-                kraken_reports = kraken_merge_ch
-                bracken = bracken_merge_ch
-                bbmerge_summary = summarize_bbmerge_ch
-                dedup_summary = summarize_dedup_ch
-        }
 
-        else if (params.read_type == "single_end") {
-            emit:
-                kraken_output = kraken_ch.output
-                kraken_reports = kraken_merge_ch
-                bracken = bracken_merge_ch
-                dedup_summary = summarize_dedup_ch
-        }
+        emit:
+            kraken_output = kraken_ch.output
+            kraken_reports = kraken_merge_ch
+            bracken = bracken_merge_ch
+            bbmerge_summary = summarize_bbmerge_ch
+            dedup_summary = summarize_dedup_ch
+
+
 }
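
The revert above restores the one emit block that Nextflow DSL2 requires: emit: cannot appear conditionally inside a workflow body, so an output that only exists for one read type is filled with Channel.empty() in the other branch, exactly as done with summarize_bbmerge_ch. A minimal sketch of the pattern, with a hypothetical process SUMMARIZE_MERGE:

    workflow EXAMPLE {
        take:
            reads_ch
        main:
            if (params.single_end) {
                merge_summary_ch = Channel.empty() // no merge step for single-end reads
            } else {
                merge_summary_ch = SUMMARIZE_MERGE(reads_ch)
            }
        emit:
            merge_summary = merge_summary_ch
    }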

From 39c797c9ce0c9d2644ec48a01c4802528799efbb Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Thu, 7 Nov 2024 19:28:07 +0000
Subject: [PATCH 10/47] Revert "Adding WIP changes to test/nextflow.config to
 run on Simon's S3."

This reverts commit 79f922219b44b2a1a7d1719601af69465a55b84b.
---
 test/nextflow.config | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/nextflow.config b/test/nextflow.config
index a63b3600..39150408 100644
--- a/test/nextflow.config
+++ b/test/nextflow.config
@@ -6,7 +6,7 @@ params {
     mode = "run"
 
     // Directories
-    base_dir = "s3://nao-mgs-simon/test" // Parent for working and output directories (can be S3)
+    base_dir = "s3://nao-mgs-wb/test-batch" // Parent for working and output directories (can be S3)
     ref_dir = "s3://nao-mgs-wb/index-20240714/output" // Reference/index directory (generated by index workflow)
 
     // Files
@@ -28,4 +28,4 @@ includeConfig "${projectDir}/configs/logging.config"
 includeConfig "${projectDir}/configs/containers.config"
 includeConfig "${projectDir}/configs/resources.config"
 includeConfig "${projectDir}/configs/profiles.config"
-process.queue = "simon-batch-queue" // AWS Batch job queue
+process.queue = "will-batch-queue" // AWS Batch job queue

From 2da99f1a21a7347deb934fa7b251ac769c998886 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Thu, 7 Nov 2024 19:29:58 +0000
Subject: [PATCH 11/47] Removed whitespace in two files

---
 subworkflows/local/profile/main.nf  | 1 -
 subworkflows/local/taxonomy/main.nf | 1 -
 2 files changed, 2 deletions(-)

diff --git a/subworkflows/local/profile/main.nf b/subworkflows/local/profile/main.nf
index 2483e79f..24906dbd 100644
--- a/subworkflows/local/profile/main.nf
+++ b/subworkflows/local/profile/main.nf
@@ -6,7 +6,6 @@
 | MODULES AND SUBWORKFLOWS |
 ***************************/
 
-
 if (params.read_type == "single_end") {
     include { SUBSET_READS_SINGLE_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads" addParams(suffix: "fastq")
 } else if (params.read_type == "paired_end") {
diff --git a/subworkflows/local/taxonomy/main.nf b/subworkflows/local/taxonomy/main.nf
index 897c2a6b..c330e494 100644
--- a/subworkflows/local/taxonomy/main.nf
+++ b/subworkflows/local/taxonomy/main.nf
@@ -82,7 +82,6 @@ workflow TAXONOMY {
         bracken_label_ch = LABEL_BRACKEN_REPORTS(bracken_ch)
         bracken_merge_ch = MERGE_BRACKEN(bracken_label_ch.collect().ifEmpty([]))
 
-
         emit:
             kraken_output = kraken_ch.output
             kraken_reports = kraken_merge_ch

From 1d301159811013bca33bd2e64cc938f7257477b2 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Thu, 7 Nov 2024 19:39:51 +0000
Subject: [PATCH 12/47] Removed unneeded comments in taxonomy/main.nf

---
 subworkflows/local/taxonomy/main.nf | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/subworkflows/local/taxonomy/main.nf b/subworkflows/local/taxonomy/main.nf
index c330e494..2e5dcaeb 100644
--- a/subworkflows/local/taxonomy/main.nf
+++ b/subworkflows/local/taxonomy/main.nf
@@ -11,13 +11,13 @@ if (params.read_type == "paired_end") {
 } else if (params.read_type == "single_end") {
     include { SUBSET_READS_SINGLE as SUBSET_READS } from "../../../modules/local/subsetReads" addParams(suffix: "fastq")
 }
-include { BBMERGE } from "../../../modules/local/bbmerge" // probably skippable in single read version
-include { SUMMARIZE_BBMERGE } from "../../../modules/local/summarizeBBMerge" // probably skippable in single read version
-include { SUMMARIZE_DEDUP } from "../../../modules/local/summarizeDedup" // probably no change needed in single read version
-include { CLUMPIFY_PAIRED } from "../../../modules/local/clumpify" // already has a single read version.
-include { JOIN_FASTQ } from "../../../modules/local/joinFastq" // probably not needed in single read version
-include { CLUMPIFY_SINGLE } from "../../../modules/local/clumpify" // already has a single read version.
-include { KRAKEN } from "../../../modules/local/kraken" addParams(mem: "${params.kraken_memory}") // probably no changes needed
+include { BBMERGE } from "../../../modules/local/bbmerge"
+include { SUMMARIZE_BBMERGE } from "../../../modules/local/summarizeBBMerge"
+include { SUMMARIZE_DEDUP } from "../../../modules/local/summarizeDedup"
+include { CLUMPIFY_PAIRED } from "../../../modules/local/clumpify"
+include { JOIN_FASTQ } from "../../../modules/local/joinFastq"
+include { CLUMPIFY_SINGLE } from "../../../modules/local/clumpify"
+include { KRAKEN } from "../../../modules/local/kraken" addParams(mem: "${params.kraken_memory}")
 include { LABEL_KRAKEN_REPORTS } from "../../../modules/local/labelKrakenReports"
 include { MERGE_TSVS as MERGE_KRAKEN_REPORTS } from "../../../modules/local/mergeTsvs" addParams(name: "kraken_reports")
 include { MERGE_TSVS as MERGE_BRACKEN } from "../../../modules/local/mergeTsvs" addParams(name: "bracken_reports")

From d4b7239ad858e1e5ed848aecc50193ab883add31 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Wed, 20 Nov 2024 19:43:06 +0000
Subject: [PATCH 13/47] Dropped params.* in favor of explicit process inputs.

---
 modules/local/bbduk/main.nf       | 13 +++++++------
 modules/local/subsetReads/main.nf | 10 ++++++----
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/modules/local/bbduk/main.nf b/modules/local/bbduk/main.nf
index de2c012b..804e4423 100644
--- a/modules/local/bbduk/main.nf
+++ b/modules/local/bbduk/main.nf
@@ -39,17 +39,18 @@ process BBDUK_SINGLE {
         path(contaminant_ref)
         val(min_kmer_fraction)
         val(k)
+        val(suffix)
     output:
-        tuple val(sample), path("${sample}_${params.suffix}_bbduk_pass.fastq.gz"), emit: reads
-        tuple val(sample), path("${sample}_${params.suffix}_bbduk_fail.fastq.gz"), emit: fail
-        tuple val(sample), path("${sample}_${params.suffix}_bbduk.stats.txt"), emit: log
+        tuple val(sample), path("${sample}_${suffix}_bbduk_pass.fastq.gz"), emit: reads
+        tuple val(sample), path("${sample}_${suffix}_bbduk_fail.fastq.gz"), emit: fail
+        tuple val(sample), path("${sample}_${suffix}_bbduk.stats.txt"), emit: log
     shell:
         '''
         # Define input/output
         in=!{reads}
-        op=!{sample}_!{params.suffix}_bbduk_pass.fastq.gz
-        of=!{sample}_!{params.suffix}_bbduk_fail.fastq.gz
-        stats=!{sample}_!{params.suffix}_bbduk.stats.txt
+        op=!{sample}_!{suffix}_bbduk_pass.fastq.gz
+        of=!{sample}_!{suffix}_bbduk_fail.fastq.gz
+        stats=!{sample}_!{suffix}_bbduk.stats.txt
         ref=!{contaminant_ref}
         io="in=${in} ref=${ref} out=${op} outm=${of} stats=${stats}"
         # Define parameters
diff --git a/modules/local/subsetReads/main.nf b/modules/local/subsetReads/main.nf
index 5fd33d28..91b8f53d 100644
--- a/modules/local/subsetReads/main.nf
+++ b/modules/local/subsetReads/main.nf
@@ -33,13 +33,14 @@ process SUBSET_READS_SINGLE {
     input:
         tuple val(sample), path(reads)
         val readFraction
+        val suffix
     output:
-        tuple val(sample), path("${sample}_subset.${params.suffix}.gz")
+        tuple val(sample), path("${sample}_subset.${suffix}.gz")
     shell:
         '''
         # Define input/output
         in=!{reads}
-        out=!{sample}_subset.!{params.suffix}.gz
+        out=!{sample}_subset.!{suffix}.gz
         # Count reads for validation
         echo "Input reads: $(zcat ${in} | wc -l | awk '{ print $1/4 }')"
         # Carry out subsetting
@@ -123,13 +124,14 @@ process SUBSET_READS_SINGLE_TARGET {
     input:
         tuple val(sample), path(reads)
         val readTarget
+        val suffix
     output:
-        tuple val(sample), path("${sample}_subset.${params.suffix}.gz")
+        tuple val(sample), path("${sample}_subset.${suffix}.gz")
     shell:
         '''
         # Define input/output
         in=!{reads}
-        out=!{sample}_subset.!{params.suffix}.gz
+        out=!{sample}_subset.!{suffix}.gz
         # Count reads and compute target fraction
         n_reads=$(zcat ${in} | wc -l | awk '{ print $1/4 }')
         echo "Input reads: ${n_reads}"

From 487c6c50585b6eb274bdd14c08edf5d7754dda01 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Wed, 20 Nov 2024 19:43:24 +0000
Subject: [PATCH 14/47] added selection of correct concat_group process in
 extractViralReads

---
 subworkflows/local/extractViralReads/main.nf | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/subworkflows/local/extractViralReads/main.nf b/subworkflows/local/extractViralReads/main.nf
index 0b4f5b6a..81997075 100644
--- a/subworkflows/local/extractViralReads/main.nf
+++ b/subworkflows/local/extractViralReads/main.nf
@@ -27,7 +27,11 @@ include { COLLAPSE_VIRUS_READS } from "../../../modules/local/collapseVirusReads
 include { ADD_FRAG_DUP_TO_VIRUS_READS } from "../../../modules/local/addFragDupToVirusReads"
 include { MAKE_VIRUS_READS_FASTA } from "../../../modules/local/makeVirusReadsFasta"
 include { COUNT_VIRUS_CLADES } from "../../../modules/local/countVirusClades"
-include { CONCAT_GROUP } from "../../../modules/local/concatGroup"
+if (params.single_end) {
+    include { CONCAT_GROUP_SINGLE as CONCAT_GROUP } from "../../../modules/local/concatGroup"
+} else {
+    include { CONCAT_GROUP_PAIRED as CONCAT_GROUP } from "../../../modules/local/concatGroup"
+}
 
 /***********
 | WORKFLOW |

From dd71ea80c75b448d1ac99e222c2f90c096225ad3 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Wed, 20 Nov 2024 19:43:42 +0000
Subject: [PATCH 15/47] Added paired and single versions of concat_group

---
 modules/local/concatGroup/main.nf | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/modules/local/concatGroup/main.nf b/modules/local/concatGroup/main.nf
index 8544327a..cb67d5ef 100644
--- a/modules/local/concatGroup/main.nf
+++ b/modules/local/concatGroup/main.nf
@@ -1,5 +1,5 @@
 // Copy a file to a new location with a custom path
-process CONCAT_GROUP {
+process CONCAT_GROUP_PAIRED {
     label "base"
     label "single"
     input:
@@ -14,3 +14,19 @@ process CONCAT_GROUP {
         cat ${fastq_2_list.join(' ')} > ${group}_R2.fastq.gz
         """
 }
+
+
+process CONCAT_GROUP_SINGLE {
+    label "base"
+    label "single"
+    input:
+        tuple val(samples), path(fastq_list), val(group)
+
+    output:
+        tuple val(group), path("${group}.fastq.gz")
+
+    script:
+        """
+        cat ${fastq_list.join(' ')} > ${group}.fastq.gz
+        """
+}
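
For clarity, the single-end variant consumes one flattened tuple per group and emits a single concatenated file; an illustrative input/output pair with hypothetical sample and group names:

    // input:  ( ["s1", "s2"], [s1.fastq.gz, s2.fastq.gz], "groupA" )
    // output: ( "groupA", "groupA.fastq.gz" )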

From 329acb1112c4d881ca4a3bf3fd98a3cd1da57864 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Wed, 20 Nov 2024 19:44:07 +0000
Subject: [PATCH 16/47] Adapted profile to be cleaner and incorporate changes
 introduced by Will.

---
 subworkflows/local/profile/main.nf | 56 +++++++++++++++++-------------
 1 file changed, 31 insertions(+), 25 deletions(-)

diff --git a/subworkflows/local/profile/main.nf b/subworkflows/local/profile/main.nf
index 1126a58a..b81799e3 100644
--- a/subworkflows/local/profile/main.nf
+++ b/subworkflows/local/profile/main.nf
@@ -6,27 +6,22 @@
 | MODULES AND SUBWORKFLOWS |
 ***************************/
 
-if (params.read_type == "single_end") {
-    include { SUBSET_READS_SINGLE_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads" addParams(suffix: "fastq")
-} else if (params.read_type == "paired_end") {
-    include { SUBSET_READS_PAIRED_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads" addParams(suffix: "fastq")
+if (params.single_end) {
+    include { SUBSET_READS_SINGLE_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads"
+    include { BBDUK_SINGLE as BBDUK } from "../../../modules/local/bbduk"
+    include { CONCAT_GROUP_SINGLE as CONCAT_GROUP } from "../../../modules/local/concatGroup"
+    include { SUBSET_READS_SINGLE_TARGET; SUBSET_READS_SINGLE_TARGET as SUBSET_READS_TARGET_GROUP } from "../../../modules/local/subsetReads"
+} else {
+    include { SUBSET_READS_PAIRED_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads"
+    include { SUBSET_READS_PAIRED_TARGET; SUBSET_READS_PAIRED_TARGET as SUBSET_READS_TARGET_GROUP } from "../../../modules/local/subsetReads"
+    include { BBDUK_PAIRED as BBDUK } from "../../../modules/local/bbduk"
+    include { CONCAT_GROUP_PAIRED as CONCAT_GROUP } from "../../../modules/local/concatGroup"
 }
-if (params.read_type == "single_end") {
-    include { BBDUK_SINGLE as BBDUK } from "../../../modules/local/bbduk" addParams(suffix: params.bbduk_suffix)
-} else if (params.read_type == "paired_end") {
-    include { BBDUK_PAIRED as BBDUK } from "../../../modules/local/bbduk" addParams(suffix: params.bbduk_suffix)
-}
-include { TAXONOMY as TAXONOMY_RIBO } from "../../../subworkflows/local/taxonomy" addParams(dedup_rc: false, classification_level: "D", read_fraction: 1, kraken_memory: "${params.kraken_memory}")
-include { TAXONOMY as TAXONOMY_NORIBO } from "../../../subworkflows/local/taxonomy" addParams(dedup_rc: false, classification_level: "D", read_fraction: 1, kraken_memory: "${params.kraken_memory}")
-
-// FIX ABOVE
 
-include { SUBSET_READS_PAIRED_TARGET; SUBSET_READS_PAIRED_TARGET as SUBSET_READS_PAIRED_TARGET_GROUP } from "../../../modules/local/subsetReads"
-include { BBDUK } from "../../../modules/local/bbduk"
+include { BBDUK_HITS } from "../../../modules/local/bbduk"
 include { TAXONOMY as TAXONOMY_RIBO } from "../../../subworkflows/local/taxonomy"
 include { TAXONOMY as TAXONOMY_NORIBO } from "../../../subworkflows/local/taxonomy"
 include { MERGE_TAXONOMY_RIBO } from "../../../modules/local/mergeTaxonomyRibo"
-include { CONCAT_GROUP } from "../../../modules/local/concatGroup"
 
 /****************
 | MAIN WORKFLOW |
@@ -44,24 +39,34 @@ workflow PROFILE {
         bbduk_suffix
         kraken_memory
         grouping
+        single_end
     main:
         // Randomly subset reads to target number
-        subset_ch = SUBSET_READS_TARGET(reads_ch, n_reads) // DROP
-        subset_ch = SUBSET_READS_PAIRED_TARGET(reads_ch, n_reads, "fastq")
+        subset_ch = SUBSET_READS_TARGET(reads_ch, n_reads, "fastq")
+
         if (grouping){
             // Join samplesheet with trimmed_reads and update fastq files
-            subset_group_ch = group_ch.join(subset_ch, by: 0)
-            .map { sample, group, reads -> tuple(sample, reads[0], reads[1], group) }
-            .groupTuple(by: 3)
+            if (single_end) {
+                subset_group_ch = group_ch.join(subset_ch, by: 0)
+                .map { sample, group, reads -> tuple(sample, reads, group) }
+                .groupTuple(by: 2)
+                // Single-sample groups are already subsetted to target number
+                single_sample_groups = subset_group_ch.filter { it[0].size() == 1 }
+                    .map { samples, read_list, group -> tuple(group, [read_list[0]]) }
+
+            } else {
+                subset_group_ch = group_ch.join(subset_ch, by: 0)
+                .map { sample, group, reads -> tuple(sample, reads[0], reads[1], group) }
+                .groupTuple(by: 3)
+                single_sample_groups = subset_group_ch.filter { it[0].size() == 1 }
+                    .map { samples, fwd_list, rev_list, group -> tuple(group, [fwd_list[0], rev_list[0]]) }
+            }
             // Split into multi-sample groups, these need to be subsetted to target number
             multi_sample_groups = subset_group_ch.filter { it[0].size() > 1 }
-            // These are already subsetted to target number
-            single_sample_groups = subset_group_ch.filter { it[0].size() == 1 }
-                .map { samples, fwd_list, rev_list, group -> tuple(group, [fwd_list[0], rev_list[0]]) }
             // Concatenate multi-sample groups
             grouped_samples = CONCAT_GROUP(multi_sample_groups)
             // Randomly subset multi-sample groups to target number
-            subset_grouped_ch = SUBSET_READS_PAIRED_TARGET_GROUP(grouped_samples, n_reads, "fastq")
+            subset_grouped_ch = SUBSET_READS_TARGET_GROUP(grouped_samples, n_reads, "fastq")
             // Mix with subsetted multi-sample group with already subsetted single-sample groups
             grouped_ch = subset_grouped_ch.mix(single_sample_groups)
         } else {
@@ -79,6 +84,7 @@ workflow PROFILE {
         br_ribo = tax_ribo_ch.bracken.collectFile(name: "bracken_reports_ribo.tsv.gz")
         br_noribo = tax_noribo_ch.bracken.collectFile(name: "bracken_reports_noribo.tsv.gz")
         merge_ch = MERGE_TAXONOMY_RIBO(kr_ribo, kr_noribo, br_ribo, br_noribo)
+
     emit:
         bracken = merge_ch.bracken
         kraken = merge_ch.kraken
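
To make the grouping logic above concrete, a sketch of the single-end channel contents through the join and groupTuple steps, with hypothetical sample and group names:

    // group_ch:               ( "s1", "gA" ), ( "s2", "gA" ), ( "s3", "gB" )
    // subset_ch:              ( "s1", s1.fastq.gz ), ( "s2", s2.fastq.gz ), ( "s3", s3.fastq.gz )
    // after join + map:       ( "s1", s1.fastq.gz, "gA" ), ...
    // after groupTuple(by: 2):
    //   ( ["s1", "s2"], [s1.fastq.gz, s2.fastq.gz], "gA" )   <- multi-sample group, re-subsetted
    //   ( ["s3"], [s3.fastq.gz], "gB" )                      <- single-sample group, passed through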

From ea74362aa9fea5ec147ff6c4fda79b4f445914da Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Wed, 20 Nov 2024 19:44:34 +0000
Subject: [PATCH 17/47] Cleaned up taxonomy and incorporated changes from
 v2.5.0

---
 subworkflows/local/taxonomy/main.nf | 69 +++++++++--------------------
 1 file changed, 22 insertions(+), 47 deletions(-)

diff --git a/subworkflows/local/taxonomy/main.nf b/subworkflows/local/taxonomy/main.nf
index a1da46c8..acfe39c2 100644
--- a/subworkflows/local/taxonomy/main.nf
+++ b/subworkflows/local/taxonomy/main.nf
@@ -6,20 +6,17 @@
 | MODULES AND SUBWORKFLOWS |
 ***************************/
 
-if (params.read_type == "paired_end") {
-    include { SUBSET_READS_PAIRED as SUBSET_READS } from "../../../modules/local/subsetReads" addParams(suffix: "fastq")
-} else if (params.read_type == "single_end") {
-    include { SUBSET_READS_SINGLE as SUBSET_READS } from "../../../modules/local/subsetReads" addParams(suffix: "fastq")
-}
-
-// fix above
+if (params.single_end) {
+    include { SUBSET_READS_SINGLE as SUBSET_READS } from "../../../modules/local/subsetReads"
+} else {
+    include { SUBSET_READS_PAIRED as SUBSET_READS } from "../../../modules/local/subsetReads" //addParams(suffix: "fastq")
+    include { JOIN_FASTQ } from "../../../modules/local/joinFastq"
+    include { BBMERGE } from "../../../modules/local/bbmerge"
+    include { SUMMARIZE_BBMERGE } from "../../../modules/local/summarizeBBMerge"
 
-include { SUBSET_READS_PAIRED } from "../../../modules/local/subsetReads"
-include { BBMERGE } from "../../../modules/local/bbmerge"
-include { SUMMARIZE_BBMERGE } from "../../../modules/local/summarizeBBMerge"
+}
 include { SUMMARIZE_DEDUP } from "../../../modules/local/summarizeDedup"
 include { CLUMPIFY_PAIRED } from "../../../modules/local/clumpify"
-include { JOIN_FASTQ } from "../../../modules/local/joinFastq"
 include { CLUMPIFY_SINGLE } from "../../../modules/local/clumpify"
 include { KRAKEN } from "../../../modules/local/kraken"
 include { LABEL_KRAKEN_REPORTS } from "../../../modules/local/labelKrakenReports"
@@ -45,13 +42,23 @@ workflow TAXONOMY {
         if ( read_fraction == 1 ){
             subset_ch = reads_ch
         } else {
-    // OLD VERSION below
             subset_ch = SUBSET_READS(reads_ch, params.read_fraction)
         }
 
-        if (params.read_type == "paired_end") {
+        if (params.single_end) {
+            // No merging in single read version
+            summarize_bbmerge_ch = Channel.empty()
             // Deduplicate reads (if applicable)
-            if ( params.dedup_rc ){
+            if (params.dedup_rc) {
+                dedup_ch = CLUMPIFY_SINGLE(subset_ch)
+            } else {
+                dedup_ch = subset_ch
+            }
+            // No merging in single read version
+            summarize_bbmerge_ch = Channel.empty()
+        } else {
+            // Deduplicate reads (if applicable)
+            if ( dedup_rc ){
                 paired_dedup_ch = CLUMPIFY_PAIRED(subset_ch)
             } else {
                 paired_dedup_ch = subset_ch
@@ -62,44 +69,12 @@ workflow TAXONOMY {
             summarize_bbmerge_ch = SUMMARIZE_BBMERGE(merged_ch.reads.map{sample, files -> [sample, files[0]]})
             joined_ch = JOIN_FASTQ(merged_ch.reads)
             // Deduplicate reads (if applicable)
-            if ( params.dedup_rc ){
+            if ( dedup_rc ){
                 dedup_ch = CLUMPIFY_SINGLE(joined_ch)
             } else {
                 dedup_ch = joined_ch
             }
-
-        } else if (params.read_type == "single_end") {
-            // No merging in single read version
-            summarize_bbmerge_ch = Channel.empty()
-            // Deduplicate reads (if applicable)
-            if (params.dedup_rc) {
-                dedup_ch = CLUMPIFY_SINGLE(subset_ch)
-            } else {
-                dedup_ch = subset_ch
-            }
-// NEW VERSION below
-
-            subset_ch = SUBSET_READS_PAIRED(reads_ch, read_fraction, "fastq")
-        }
-
-         // Deduplicate reads (if applicable)
-        if ( dedup_rc ){
-            paired_dedup_ch = CLUMPIFY_PAIRED(subset_ch)
-        } else {
-            paired_dedup_ch = subset_ch
-        }
-        // Prepare reads
-        merged_ch = BBMERGE(paired_dedup_ch)
-        // Only want to summarize the merged elements
-        summarize_bbmerge_ch = SUMMARIZE_BBMERGE(merged_ch.reads.map{sample, files -> [sample, files[0]]})
-        joined_ch = JOIN_FASTQ(merged_ch.reads)
-        // Deduplicate reads (if applicable)
-        if ( dedup_rc ){
-            dedup_ch = CLUMPIFY_SINGLE(joined_ch)
-        } else {
-            dedup_ch = joined_ch
         }
-// NEW VERSION
         // Summarize last of the output
         summarize_dedup_ch = SUMMARIZE_DEDUP(dedup_ch)
 

From bbced47051a92bbff7fa1e3cc92da46ddcca55e0 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Wed, 20 Nov 2024 19:45:37 +0000
Subject: [PATCH 18/47] Fixed single-end flags in run.nf

---
 workflows/run.nf | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/workflows/run.nf b/workflows/run.nf
index 8c347ec3..a2c7b108 100644
--- a/workflows/run.nf
+++ b/workflows/run.nf
@@ -28,9 +28,6 @@ workflow RUN {
     start_time = new Date()
     start_time_str = start_time.format("YYYY-MM-dd HH:mm:ss z (Z)")
 
-    single_end = file(params.sample_sheet).readLines()[0].split(',').contains('fastq_2') ? false : true
-
-
     // Prepare samplesheet
     if ( params.grouping ) {
         samplesheet = Channel
@@ -50,8 +47,8 @@ workflow RUN {
     // Prepare Kraken DB
     kraken_db_path = "${params.ref_dir}/results/kraken_db"
     // Preprocessing
-    RAW(samplesheet_ch, params.n_reads_trunc, "2", "4 GB", "raw_concat", single_end)
-    CLEAN(RAW.out.reads, params.adapters, "2", "4 GB", "cleaned", single_end)
+    RAW(samplesheet_ch, params.n_reads_trunc, "2", "4 GB", "raw_concat", params.single_end)
+    CLEAN(RAW.out.reads, params.adapters, "2", "4 GB", "cleaned", params.single_end)
     // Extract and count human-viral reads
     EXTRACT_VIRAL_READS(CLEAN.out.reads, group_ch, params.ref_dir, kraken_db_path, params.bt2_score_threshold, params.adapters, params.host_taxon, "3", "21", "viral", "${params.quality_encoding}", "${params.fuzzy_match_alignment_duplicates}", "${params.kraken_memory}", params.grouping)
     // Process intermediate output for chimera detection
@@ -64,7 +61,7 @@ workflow RUN {
         BLAST_VIRAL(EXTRACT_VIRAL_READS.out.fasta, blast_db_path, blast_db_prefix, params.blast_viral_fraction, "32", "256 GB", "32 GB")
     }
     // Taxonomic profiling
-    PROFILE(CLEAN.out.reads, group_ch, kraken_db_path, params.n_reads_profile, params.ref_dir, "0.4", "27", "ribo", "${params.kraken_memory}", params.grouping)
+    PROFILE(CLEAN.out.reads, group_ch, kraken_db_path, params.n_reads_profile, params.ref_dir, "0.4", "27", "ribo", "${params.kraken_memory}", params.grouping, params.single_end)
     // Process output
     qc_ch = RAW.out.qc.concat(CLEAN.out.qc)
     PROCESS_OUTPUT(qc_ch)

From 4a1edda332e4b27ab134d632e85b228ce510e2a5 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Wed, 20 Nov 2024 19:46:32 +0000
Subject: [PATCH 19/47] Added "params." to the single_end flags in run_dev_se.nf

---
 workflows/run_dev_se.nf | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/workflows/run_dev_se.nf b/workflows/run_dev_se.nf
index 54983633..4ff59e8c 100644
--- a/workflows/run_dev_se.nf
+++ b/workflows/run_dev_se.nf
@@ -12,6 +12,7 @@ import java.time.LocalDateTime
 include { RAW } from "../subworkflows/local/raw"
 include { CLEAN } from "../subworkflows/local/clean"
 include { PROCESS_OUTPUT } from "../subworkflows/local/processOutput"
+include { PROFILE } from "../subworkflows/local/profile"
 nextflow.preview.output = true
 
 /*****************
@@ -24,8 +25,6 @@ workflow RUN_DEV_SE {
     start_time = new Date()
     start_time_str = start_time.format("YYYY-MM-dd HH:mm:ss z (Z)")
 
-    single_end = file(params.sample_sheet).readLines()[0].split(',').contains('fastq_2') ? false : true
-
     // Prepare samplesheet
     if (single_end) {
         if (params.grouping) {
@@ -60,14 +59,16 @@ workflow RUN_DEV_SE {
             group_ch = Channel.empty()
             }
         }
+    // Prepare Kraken DB
+    kraken_db_path = "${params.ref_dir}/results/kraken_db"
 
 
     // Preprocessing
-    RAW(samplesheet_ch, params.n_reads_trunc, "2", "4 GB", "raw_concat", single_end)
-    CLEAN(RAW.out.reads, params.adapters, "2", "4 GB", "cleaned", single_end)
+    RAW(samplesheet_ch, params.n_reads_trunc, "2", "4 GB", "raw_concat", params.single_end)
+    CLEAN(RAW.out.reads, params.adapters, "2", "4 GB", "cleaned", params.single_end)
 
     // Taxonomic profiling
-    PROFILE(CLEAN.out.reads, kraken_db_path, params.n_reads_profile, params.ref_dir)
+    PROFILE(CLEAN.out.reads, group_ch, kraken_db_path, params.n_reads_profile, params.ref_dir, "0.4", "27", "ribo", "${params.kraken_memory}", params.grouping, params.single_end)
 
     // Process output
     qc_ch = RAW.out.qc.concat(CLEAN.out.qc)
@@ -95,4 +96,7 @@ workflow RUN_DEV_SE {
         PROCESS_OUTPUT.out.adapt >> "results"
         PROCESS_OUTPUT.out.qbase >> "results"
         PROCESS_OUTPUT.out.qseqs >> "results"
+        // Final results
+        PROFILE.out.bracken >> "results"
+        PROFILE.out.kraken >> "results"
 }

From 753982d642c9e6f0ac55a10ceaca9d22721bbf12 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Wed, 20 Nov 2024 19:47:29 +0000
Subject: [PATCH 20/47] Fixing run_dev_se.config file

---
 configs/run_dev_se.config | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/configs/run_dev_se.config b/configs/run_dev_se.config
index d271d93f..9c90bb37 100644
--- a/configs/run_dev_se.config
+++ b/configs/run_dev_se.config
@@ -5,17 +5,22 @@
 params {
     mode = "run_dev_se"
 
-    read_type = "single_end"
 
     // Directories
     base_dir = "s3://nao-mgs-simon/test_single_read" // Parent for working and output directories (can be S3)
-    ref_dir = "s3://nao-mgs-wb/index-20240714/output" // Reference/index directory (generated by index workflow)
+    ref_dir = "s3://nao-mgs-wb/index-20241113/output" // Reference/index directory (generated by index workflow)
 
     // Files
     sample_sheet = "${launchDir}/samplesheet.csv" // Path to library TSV
     adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
 
+    // Whether the underlying data is paired-end or single-end
+    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
+
+    // println "Single end mode: ${single_end}"
+
     // Numerical
+    grouping = false // Whether to group samples by 'group' column in samplesheet
     n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads)
     n_reads_profile = 1000000 // Number of reads per sample to run through taxonomic profiling
     bt2_score_threshold = 20 // Normalized score threshold for HV calling (typically 15 or 20)
@@ -23,10 +28,12 @@ params {
     kraken_memory = "128 GB" // Memory needed to safely load Kraken DB
     quality_encoding = "phred33" // FASTQ quality encoding (probably phred33, maybe phred64)
     fuzzy_match_alignment_duplicates = 0 // Fuzzy matching the start coordinate of reads for identification of duplicates through alignment (0 = exact matching; options are 0, 1, or 2)
+    host_taxon = "vertebrate"
 }
 
 includeConfig "${projectDir}/configs/logging.config"
 includeConfig "${projectDir}/configs/containers.config"
 includeConfig "${projectDir}/configs/resources.config"
 includeConfig "${projectDir}/configs/profiles.config"
-process.queue = "will-batch-queue" // AWS Batch job queue
+includeConfig "${projectDir}/configs/output.config"
+process.queue = "simon-batch-queue" // AWS Batch job queue
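
The single_end flag introduced above is derived from the samplesheet header alone: the expression reads the first line of the CSV and checks for a fastq_2 column. Illustrative headers and the resulting flag:

    // "sample,fastq_1,fastq_2" -> single_end = false (paired-end)
    // "sample,fastq_1"         -> single_end = true  (single-end)
    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true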

From 5c3cb7333db5c937e558c9a970fc48ee0d43175c Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Wed, 20 Nov 2024 19:48:33 +0000
Subject: [PATCH 21/47] Updated config in paired-end test dataset

---
 test-paired-end/nextflow.config  | 7 +++++--
 test-single-read/nextflow.config | 6 +++---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/test-paired-end/nextflow.config b/test-paired-end/nextflow.config
index f29ccc30..3556d11b 100644
--- a/test-paired-end/nextflow.config
+++ b/test-paired-end/nextflow.config
@@ -5,7 +5,6 @@
 params {
     mode = "run_dev_se"
 
-
     // Directories
     base_dir = "s3://nao-mgs-simon/test_paired_end" // Parent for working and output directories (can be S3)
     ref_dir = "s3://nao-mgs-wb/index-20241113/output" // Reference/index directory (generated by index workflow)
@@ -14,8 +13,12 @@ params {
     sample_sheet = "${launchDir}/samplesheet.csv" // Path to library TSV
     adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
 
+
+    // Whether the underlying data is paired-end or single-end
+    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
+
     // Numerical
-    grouping = true // Whether to group samples by 'group' column in samplesheet
+    grouping = false // Whether to group samples by 'group' column in samplesheet
     // Numerical
     n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads)
     n_reads_profile = 1000000 // Number of reads per sample to run through taxonomic profiling
diff --git a/test-single-read/nextflow.config b/test-single-read/nextflow.config
index 3efa0181..a8be84d5 100644
--- a/test-single-read/nextflow.config
+++ b/test-single-read/nextflow.config
@@ -15,10 +15,10 @@ params {
     adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
 
     // Whether the underlying data is paired-end or single-end
-    single_end = params.sample_sheet.toString().readLines()[0].contains('fastq_2') ? false : true
+    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
 
     // Numerical
-    grouping = true // Whether to group samples by 'group' column in samplesheet
+    grouping = false // Whether to group samples by 'group' column in samplesheet
     n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads)
     n_reads_profile = 1000000 // Number of reads per sample to run through taxonomic profiling
     bt2_score_threshold = 20 // Normalized score threshold for HV calling (typically 15 or 20)
@@ -34,4 +34,4 @@ includeConfig "${projectDir}/configs/containers.config"
 includeConfig "${projectDir}/configs/resources.config"
 includeConfig "${projectDir}/configs/profiles.config"
 includeConfig "${projectDir}/configs/output.config"
-process.queue = "simon-batch-queue" // AWS Batch job queue
\ No newline at end of file
+process.queue = "simon-batch-queue" // AWS Batch job queue

From 5c29398b021ca1cf05c0ef1c0e09802c834a921e Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Fri, 22 Nov 2024 19:52:54 +0000
Subject: [PATCH 22/47] Added "params." to single_end.

---
 workflows/run_dev_se.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflows/run_dev_se.nf b/workflows/run_dev_se.nf
index 4ff59e8c..d628814e 100644
--- a/workflows/run_dev_se.nf
+++ b/workflows/run_dev_se.nf
@@ -26,7 +26,7 @@ workflow RUN_DEV_SE {
     start_time_str = start_time.format("YYYY-MM-dd HH:mm:ss z (Z)")
 
     // Prepare samplesheet
-    if (single_end) {
+    if (params.single_end) {
         if (params.grouping) {
             samplesheet = Channel
                 .fromPath(params.sample_sheet)

From 8fa88faa5be2e18d246f9d8396f4d6eaf2cf9c07 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Sat, 30 Nov 2024 17:49:32 +0000
Subject: [PATCH 23/47] Dropped params.* for single_end.

---
 subworkflows/local/profile/main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/subworkflows/local/profile/main.nf b/subworkflows/local/profile/main.nf
index 8636bca0..d97d6eee 100644
--- a/subworkflows/local/profile/main.nf
+++ b/subworkflows/local/profile/main.nf
@@ -6,7 +6,7 @@
 | MODULES AND SUBWORKFLOWS |
 ***************************/
 
-if (params.single_end) {
+if (single_end) {
     include { SUBSET_READS_SINGLE_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads"
     include { BBDUK_SINGLE as BBDUK } from "../../../modules/local/bbduk"
     include { CONCAT_GROUP_SINGLE as CONCAT_GROUP } from "../../../modules/local/concatGroup"

From abe95bf5bc6a5d928a671d8583f474938eaf8269 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Mon, 2 Dec 2024 15:50:36 +0000
Subject: [PATCH 24/47] moved single_end parameter definition to .config files.

---
 configs/run.config                           |  4 +++
 configs/run_dev_se.config                    |  1 -
 modules/local/concatGroup/main.nf            |  2 +-
 subworkflows/local/extractViralReads/main.nf |  3 +-
 subworkflows/local/profile/main.nf           |  9 +++--
 subworkflows/local/taxonomy/main.nf          | 13 +++----
 test-data/single-end-samplesheet.csv         |  2 +-
 test-se/nextflow.config                      | 36 ++++++++++++++++++++
 tests/run.config                             |  3 ++
 tests/run_dev_se.config                      |  1 -
 workflows/run.nf                             | 11 +++---
 workflows/run_dev_se.nf                      | 11 +++---
 12 files changed, 68 insertions(+), 28 deletions(-)
 create mode 100644 test-se/nextflow.config

diff --git a/configs/run.config b/configs/run.config
index 98199092..7657476c 100644
--- a/configs/run.config
+++ b/configs/run.config
@@ -13,6 +13,10 @@ params {
     sample_sheet = "${launchDir}/samplesheet.csv" // Path to library TSV
     adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
 
+    // Whether the underlying data is paired-end or single-end
+    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
+
+
     // Numerical
     grouping = false // Whether to group samples by 'group' column in samplesheet
     n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads)
diff --git a/configs/run_dev_se.config b/configs/run_dev_se.config
index a8be84d5..5e8f11cc 100644
--- a/configs/run_dev_se.config
+++ b/configs/run_dev_se.config
@@ -5,7 +5,6 @@
 params {
     mode = "run_dev_se"
 
-
     // Directories
     base_dir = "s3://nao-mgs-simon/test_single_read" // Parent for working and output directories (can be S3)
     ref_dir = "s3://nao-mgs-wb/index-20241113/output" // Reference/index directory (generated by index workflow)
diff --git a/modules/local/concatGroup/main.nf b/modules/local/concatGroup/main.nf
index 54913c46..5bb687f3 100644
--- a/modules/local/concatGroup/main.nf
+++ b/modules/local/concatGroup/main.nf
@@ -1,5 +1,5 @@
 // Copy a file to a new location with a custom path
-process CONCAT_GROUP {
+process CONCAT_GROUP_PAIRED {
     label "coreutils"
     label "single"
     input:
diff --git a/subworkflows/local/extractViralReads/main.nf b/subworkflows/local/extractViralReads/main.nf
index facb4868..6520211e 100644
--- a/subworkflows/local/extractViralReads/main.nf
+++ b/subworkflows/local/extractViralReads/main.nf
@@ -52,6 +52,7 @@ workflow EXTRACT_VIRAL_READS {
         encoding
         fuzzy_match
         grouping
+        single_end
     main:
         // Get reference paths
         viral_genome_path = "${ref_dir}/results/virus-genomes-filtered.fasta.gz"
@@ -94,7 +95,7 @@ workflow EXTRACT_VIRAL_READS {
         human_bbm_ch = BBMAP_HUMAN(other_bt2_ch.reads_unconc, bbm_human_index_path, "human")
         other_bbm_ch = BBMAP_OTHER(human_bbm_ch.reads_unmapped, bbm_other_index_path, "other")
         // Run Kraken on filtered viral candidates
-        tax_ch = TAXONOMY(other_bbm_ch.reads_unmapped, kraken_db_ch, true, "F")
+        tax_ch = TAXONOMY(other_bbm_ch.reads_unmapped, kraken_db_ch, true, "F", single_end)
         // Process Kraken output and merge with Bowtie2 output across samples
         kraken_output_ch = PROCESS_KRAKEN_VIRAL(tax_ch.kraken_output, virus_db_path, host_taxon)
         bowtie2_kraken_merged_ch = MERGE_SAM_KRAKEN(kraken_output_ch.combine(bowtie2_sam_ch, by: 0))
diff --git a/subworkflows/local/profile/main.nf b/subworkflows/local/profile/main.nf
index d97d6eee..13bd2a1b 100644
--- a/subworkflows/local/profile/main.nf
+++ b/subworkflows/local/profile/main.nf
@@ -6,7 +6,8 @@
 | MODULES AND SUBWORKFLOWS |
 ***************************/
 
-if (single_end) {
+
+if (params.single_end) {
     include { SUBSET_READS_SINGLE_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads"
     include { BBDUK_SINGLE as BBDUK } from "../../../modules/local/bbduk"
     include { CONCAT_GROUP_SINGLE as CONCAT_GROUP } from "../../../modules/local/concatGroup"
@@ -40,6 +41,8 @@ workflow PROFILE {
         grouping
         single_end
     main:
+
+
         // Randomly subset reads to target number
         subset_ch = SUBSET_READS_TARGET(reads_ch, n_reads, "fastq")
 
@@ -75,8 +78,8 @@ workflow PROFILE {
         ribo_path = "${ref_dir}/results/ribo-ref-concat.fasta.gz"
         ribo_ch = BBDUK(grouped_ch, ribo_path, min_kmer_fraction, k, bbduk_suffix)
         // Run taxonomic profiling separately on ribo and non-ribo reads
-        tax_ribo_ch = TAXONOMY_RIBO(ribo_ch.fail, kraken_db_ch, false, "D")
-        tax_noribo_ch = TAXONOMY_NORIBO(ribo_ch.reads, kraken_db_ch, false, "D")
+        tax_ribo_ch = TAXONOMY_RIBO(ribo_ch.fail, kraken_db_ch, false, "D", single_end)
+        tax_noribo_ch = TAXONOMY_NORIBO(ribo_ch.reads, kraken_db_ch, false, "D", single_end)
         // Merge ribo and non-ribo outputs
         kr_ribo = tax_ribo_ch.kraken_reports.collectFile(name: "kraken_reports_ribo.tsv.gz")
         kr_noribo = tax_noribo_ch.kraken_reports.collectFile(name: "kraken_reports_noribo.tsv.gz")
diff --git a/subworkflows/local/taxonomy/main.nf b/subworkflows/local/taxonomy/main.nf
index 3b9937c2..ee4416af 100644
--- a/subworkflows/local/taxonomy/main.nf
+++ b/subworkflows/local/taxonomy/main.nf
@@ -35,22 +35,23 @@ workflow TAXONOMY {
         kraken_db_ch
         dedup_rc
         classification_level
+        single_end
     main:
-        if (params.single_end) {
+        if (single_end) {
             // No merging in single read version
             summarize_bbmerge_ch = Channel.empty()
             // Deduplicate reads (if applicable)
-            if (params.dedup_rc) {
-                dedup_ch = CLUMPIFY_SINGLE(subset_ch)
+            if (dedup_rc) {
+                dedup_ch = CLUMPIFY_SINGLE(reads_ch)
             } else {
-                dedup_ch = subset_ch
+                dedup_ch = reads_ch
             }
         } else {
             // Deduplicate reads (if applicable)
             if ( dedup_rc ){
-                paired_dedup_ch = CLUMPIFY_PAIRED(subset_ch)
+                paired_dedup_ch = CLUMPIFY_PAIRED(reads_ch)
             } else {
-                paired_dedup_ch = subset_ch
+                paired_dedup_ch = reads_ch
             }
             // Prepare reads
             merged_ch = BBMERGE(paired_dedup_ch)
diff --git a/test-data/single-end-samplesheet.csv b/test-data/single-end-samplesheet.csv
index 5b6a53f0..f59a9a86 100644
--- a/test-data/single-end-samplesheet.csv
+++ b/test-data/single-end-samplesheet.csv
@@ -1,2 +1,2 @@
 sample,fastq
-230926Esv_D23-14904-1,s3://nao-testing/gold-standard-test/raw/gold_standard_R1.fastq.gz
\ No newline at end of file
+gold_standard,s3://nao-testing/gold-standard-test/raw/gold_standard_R1.fastq.gz
\ No newline at end of file
diff --git a/test-se/nextflow.config b/test-se/nextflow.config
new file mode 100644
index 00000000..345229bc
--- /dev/null
+++ b/test-se/nextflow.config
@@ -0,0 +1,36 @@
+/************************************************
+| CONFIGURATION FILE FOR NAO VIRAL MGS WORKFLOW |
+************************************************/
+
+params {
+    mode = "run_dev_se"
+
+    // Directories
+    base_dir = "s3://nao-mgs-simon/test_single_read" // Parent for working and output directories (can be S3)
+    ref_dir = "s3://nao-testing/index-test/output" // Reference/index directory (generated by index workflow)
+
+    // Files
+    sample_sheet = "${launchDir}/single-end-samplesheet.csv" // Path to library TSV
+    adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
+
+    // Whether the underlying data is paired-end or single-end
+    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
+
+    // Numerical
+    grouping = false // Whether to group samples by 'group' column in samplesheet
+    n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads)
+    n_reads_profile = 1000000 // Number of reads per sample to run through taxonomic profiling
+    bt2_score_threshold = 20 // Normalized score threshold for HV calling (typically 15 or 20)
+    blast_hv_fraction = 0 // Fraction of putative HV reads to BLAST vs nt (0 = don't run BLAST)
+    kraken_memory = "128 GB" // Memory needed to safely load Kraken DB
+    quality_encoding = "phred33" // FASTQ quality encoding (probably phred33, maybe phred64)
+    fuzzy_match_alignment_duplicates = 0 // Fuzzy matching the start coordinate of reads for identification of duplicates through alignment (0 = exact matching; options are 0, 1, or 2)
+    host_taxon = "vertebrate"
+}
+
+includeConfig "${projectDir}/configs/logging.config"
+includeConfig "${projectDir}/configs/containers.config"
+includeConfig "${projectDir}/configs/resources.config"
+includeConfig "${projectDir}/configs/profiles.config"
+includeConfig "${projectDir}/configs/output.config"
+process.queue = "simon-batch-queue" // AWS Batch job queue
\ No newline at end of file
diff --git a/tests/run.config b/tests/run.config
index fc157874..b81f7342 100644
--- a/tests/run.config
+++ b/tests/run.config
@@ -11,6 +11,9 @@ params {
     base_dir = "./" // Parent for working and output directories (can be S3)
     ref_dir = "s3://nao-testing/index-test/output" // Reference/index directory (generated by index workflow)
 
+    // Whether the underlying data is paired-end or single-end
+    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
+
     // Files
     sample_sheet = "${projectDir}/test-data/samplesheet.csv" // Path to library TSV
     adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
diff --git a/tests/run_dev_se.config b/tests/run_dev_se.config
index 0c7133cd..95562d62 100644
--- a/tests/run_dev_se.config
+++ b/tests/run_dev_se.config
@@ -13,7 +13,6 @@ params {
     sample_sheet = "${launchDir}/single-end-samplesheet.csv" // Path to library TSV
     adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
 
-
     // Whether the underlying data is paired-end or single-end
     single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
 
diff --git a/workflows/run.nf b/workflows/run.nf
index ccba712c..c0aceceb 100644
--- a/workflows/run.nf
+++ b/workflows/run.nf
@@ -40,9 +40,6 @@ workflow RUN {
         }
     }
 
-    // Whether the underlying data is paired-end or single-end
-    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
-
     // Prepare samplesheet
     if ( params.grouping ) {
         samplesheet = Channel
@@ -60,10 +57,10 @@ workflow RUN {
         group_ch = Channel.empty()
     }
     // Preprocessing
-    RAW(samplesheet_ch, params.n_reads_trunc, "2", "4 GB", "raw_concat", single_end)
-    CLEAN(RAW.out.reads, params.adapters, "2", "4 GB", "cleaned", single_end)
+    RAW(samplesheet_ch, params.n_reads_trunc, "2", "4 GB", "raw_concat", params.single_end)
+    CLEAN(RAW.out.reads, params.adapters, "2", "4 GB", "cleaned", params.single_end)
     // Extract and count human-viral reads
-    EXTRACT_VIRAL_READS(CLEAN.out.reads, group_ch, params.ref_dir, kraken_db_path, params.bt2_score_threshold, params.adapters, params.host_taxon, "1", "24", "viral", "${params.quality_encoding}", "${params.fuzzy_match_alignment_duplicates}", params.grouping)
+    EXTRACT_VIRAL_READS(CLEAN.out.reads, group_ch, params.ref_dir, kraken_db_path, params.bt2_score_threshold, params.adapters, params.host_taxon, "1", "24", "viral", "${params.quality_encoding}", "${params.fuzzy_match_alignment_duplicates}", params.grouping, params.single_end)
     // Process intermediate output for chimera detection
     raw_processed_ch = EXTRACT_VIRAL_READS.out.bbduk_match.join(RAW.out.reads, by: 0)
     EXTRACT_RAW_READS_FROM_PROCESSED(raw_processed_ch, "raw_viral_subset")
@@ -77,7 +74,7 @@ workflow RUN {
         blast_paired_ch = Channel.empty()
     }
     // Taxonomic profiling
-    PROFILE(CLEAN.out.reads, group_ch, kraken_db_path, params.n_reads_profile, params.ref_dir, "0.4", "27", "ribo", params.grouping, single_end)
+    PROFILE(CLEAN.out.reads, group_ch, kraken_db_path, params.n_reads_profile, params.ref_dir, "0.4", "27", "ribo", params.grouping, params.single_end)
     // Process output
     qc_ch = RAW.out.qc.concat(CLEAN.out.qc)
     PROCESS_OUTPUT(qc_ch)
diff --git a/workflows/run_dev_se.nf b/workflows/run_dev_se.nf
index c74fbd15..503edb6b 100644
--- a/workflows/run_dev_se.nf
+++ b/workflows/run_dev_se.nf
@@ -25,11 +25,8 @@ workflow RUN_DEV_SE {
     start_time = new Date()
     start_time_str = start_time.format("YYYY-MM-dd HH:mm:ss z (Z)")
 
-    // Whether the underlying data is paired-end or single-end
-    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
-
     // Prepare samplesheet
-    if (single_end) {
+    if (params.single_end) {
         if (params.grouping) {
             samplesheet = Channel
                 .fromPath(params.sample_sheet)
@@ -67,11 +64,11 @@ workflow RUN_DEV_SE {
 
 
     // Preprocessing
-    RAW(samplesheet_ch, params.n_reads_trunc, "2", "4 GB", "raw_concat", single_end)
-    CLEAN(RAW.out.reads, params.adapters, "2", "4 GB", "cleaned", single_end)
+    RAW(samplesheet_ch, params.n_reads_trunc, "2", "4 GB", "raw_concat", params.single_end)
+    CLEAN(RAW.out.reads, params.adapters, "2", "4 GB", "cleaned", params.single_end)
 
     // Taxonomic profiling
-    PROFILE(CLEAN.out.reads, group_ch, kraken_db_path, params.n_reads_profile, params.ref_dir, "0.4", "27", "ribo", "${params.kraken_memory}", params.grouping, single_end)
+    PROFILE(CLEAN.out.reads, group_ch, kraken_db_path, params.n_reads_profile, params.ref_dir, "0.4", "27", "ribo", params.grouping, params.single_end)
 
     // Process output
     qc_ch = RAW.out.qc.concat(CLEAN.out.qc)
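
The detection rule these configs standardize on: read only the samplesheet
header and infer the read layout from the presence of a fastq_2 column. A
minimal sketch (path illustrative); note that the `? false : true` ternary
used in the configs is equivalent to a plain negation:

    def header = new File("samplesheet.csv").text.readLines()[0]
    def single_end = !header.contains('fastq_2')
    println single_end ? "single-end data" : "paired-end data"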

From 15facd15c7fbf7506847f5e01430aaa5501001a7 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Mon, 2 Dec 2024 16:05:45 +0000
Subject: [PATCH 25/47] Moved single_end definition below sample_sheet.

---
 tests/run.config | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/run.config b/tests/run.config
index b81f7342..2de10f16 100644
--- a/tests/run.config
+++ b/tests/run.config
@@ -11,13 +11,13 @@ params {
     base_dir = "./" // Parent for working and output directories (can be S3)
     ref_dir = "s3://nao-testing/index-test/output" // Reference/index directory (generated by index workflow)
 
-    // Whether the underlying data is paired-end or single-end
-    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
-
     // Files
     sample_sheet = "${projectDir}/test-data/samplesheet.csv" // Path to library TSV
     adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
 
+    // Whether the underlying data is paired-end or single-end
+    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
+
     // Numerical
     grouping = false // Whether to group samples by 'group' column in samplesheet
     n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads)
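
The reordering matters because a params block is evaluated top to bottom: the
derived single_end entry reads params.sample_sheet, so it must appear after
that entry is defined. A minimal sketch (paths illustrative):

    params {
        sample_sheet = "${projectDir}/test-data/samplesheet.csv"  // defined first
        // Derived second; referencing params.sample_sheet above its
        // definition would see an undefined value.
        single_end = !new File(params.sample_sheet).text.readLines()[0].contains('fastq_2')
    }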

From 8f5afe86261dfee49796357f4c181fe293a5d353 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Tue, 3 Dec 2024 13:40:59 +0000
Subject: [PATCH 26/47] Reformatted tests/run_dev_se.config to be the same as
 tests/run.config

---
 configs/run.config      | 1 -
 tests/run_dev_se.config | 9 ++++-----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/configs/run.config b/configs/run.config
index 7657476c..b405bb17 100644
--- a/configs/run.config
+++ b/configs/run.config
@@ -16,7 +16,6 @@ params {
     // Whether the underlying data is paired-end or single-end
     single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
 
-
     // Numerical
     grouping = false // Whether to group samples by 'group' column in samplesheet
     n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads)
diff --git a/tests/run_dev_se.config b/tests/run_dev_se.config
index 95562d62..523c2f82 100644
--- a/tests/run_dev_se.config
+++ b/tests/run_dev_se.config
@@ -10,7 +10,7 @@ params {
     ref_dir = "s3://nao-testing/index-test/output" // Reference/index directory (generated by index workflow)
 
     // Files
-    sample_sheet = "${launchDir}/single-end-samplesheet.csv" // Path to library TSV
+    sample_sheet = "${launchDir}/test-data/single-end-samplesheet.csv" // Path to library TSV
     adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
 
     // Whether the underlying data is paired-end or single-end
@@ -26,11 +26,10 @@ params {
     quality_encoding = "phred33" // FASTQ quality encoding (probably phred33, maybe phred64)
     fuzzy_match_alignment_duplicates = 0 // Fuzzy matching the start coordinate of reads for identification of duplicates through alignment (0 = exact matching; options are 0, 1, or 2)
     host_taxon = "vertebrate"
+
+    blast_db_prefix = "nt_others"
 }
 
-includeConfig "${projectDir}/configs/logging.config"
 includeConfig "${projectDir}/configs/containers.config"
-includeConfig "${projectDir}/configs/resources.config"
 includeConfig "${projectDir}/configs/profiles.config"
-includeConfig "${projectDir}/configs/output.config"
-process.queue = "simon-batch-queue" // AWS Batch job queue
\ No newline at end of file
+includeConfig "${projectDir}/configs/output.config"
\ No newline at end of file

From b0d67372a758817c1e163f8aa80e6fed8ca6889b Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Tue, 3 Dec 2024 13:42:15 +0000
Subject: [PATCH 27/47] removed unneeded whitespace

---
 subworkflows/local/profile/main.nf | 1 -
 1 file changed, 1 deletion(-)

diff --git a/subworkflows/local/profile/main.nf b/subworkflows/local/profile/main.nf
index 13bd2a1b..e0bc1db3 100644
--- a/subworkflows/local/profile/main.nf
+++ b/subworkflows/local/profile/main.nf
@@ -86,7 +86,6 @@ workflow PROFILE {
         br_ribo = tax_ribo_ch.bracken.collectFile(name: "bracken_reports_ribo.tsv.gz")
         br_noribo = tax_noribo_ch.bracken.collectFile(name: "bracken_reports_noribo.tsv.gz")
         merge_ch = MERGE_TAXONOMY_RIBO(kr_ribo, kr_noribo, br_ribo, br_noribo)
-
     emit:
         bracken = merge_ch.bracken
         kraken = merge_ch.kraken

From 58270449b9fea8da184843132318718a9bebb8a9 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Tue, 3 Dec 2024 13:44:24 +0000
Subject: [PATCH 28/47] reset test-data/nextflow.config to have settings set to
 Will's defaults.

---
 test-data/nextflow.config | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test-data/nextflow.config b/test-data/nextflow.config
index 635a9601..8a84c703 100644
--- a/test-data/nextflow.config
+++ b/test-data/nextflow.config
@@ -6,7 +6,7 @@ params {
     mode = "run"
 
     // Directories
-    base_dir = "s3://nao-mgs-simon/test" // Parent for working and output directories (can be S3)
+    base_dir = "s3://nao-mgs-wb/test-batch" // Parent for working and output directories (can be S3)
     ref_dir = "s3://nao-mgs-wb/index-20241113/output" // Reference/index directory (generated by index workflow)
 
     // Files
@@ -34,4 +34,4 @@ includeConfig "${projectDir}/configs/containers.config"
 includeConfig "${projectDir}/configs/resources.config"
 includeConfig "${projectDir}/configs/profiles.config"
 includeConfig "${projectDir}/configs/output.config"
-process.queue = "will-batch-queue" // AWS Batch job queue
+process.queue = "will-batch-queue" // AWS Batch job queue
\ No newline at end of file

From 8302f251134dd9090d6fffc7769224a085c5fc66 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Tue, 3 Dec 2024 13:45:48 +0000
Subject: [PATCH 29/47] removed test-se config file.

---
 test-se/nextflow.config | 36 ------------------------------------
 1 file changed, 36 deletions(-)
 delete mode 100644 test-se/nextflow.config

diff --git a/test-se/nextflow.config b/test-se/nextflow.config
deleted file mode 100644
index 345229bc..00000000
--- a/test-se/nextflow.config
+++ /dev/null
@@ -1,36 +0,0 @@
-/************************************************
-| CONFIGURATION FILE FOR NAO VIRAL MGS WORKFLOW |
-************************************************/
-
-params {
-    mode = "run_dev_se"
-
-    // Directories
-    base_dir = "s3://nao-mgs-simon/test_single_read" // Parent for working and output directories (can be S3)
-    ref_dir = "s3://nao-testing/index-test/output" // Reference/index directory (generated by index workflow)
-
-    // Files
-    sample_sheet = "${launchDir}/single-end-samplesheet.csv" // Path to library TSV
-    adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
-
-    // Whether the underlying data is paired-end or single-end
-    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
-
-    // Numerical
-    grouping = false // Whether to group samples by 'group' column in samplesheet
-    n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads)
-    n_reads_profile = 1000000 // Number of reads per sample to run through taxonomic profiling
-    bt2_score_threshold = 20 // Normalized score threshold for HV calling (typically 15 or 20)
-    blast_hv_fraction = 0 // Fraction of putative HV reads to BLAST vs nt (0 = don't run BLAST)
-    kraken_memory = "128 GB" // Memory needed to safely load Kraken DB
-    quality_encoding = "phred33" // FASTQ quality encoding (probably phred33, maybe phred64)
-    fuzzy_match_alignment_duplicates = 0 // Fuzzy matching the start coordinate of reads for identification of duplicates through alignment (0 = exact matching; options are 0, 1, or 2)
-    host_taxon = "vertebrate"
-}
-
-includeConfig "${projectDir}/configs/logging.config"
-includeConfig "${projectDir}/configs/containers.config"
-includeConfig "${projectDir}/configs/resources.config"
-includeConfig "${projectDir}/configs/profiles.config"
-includeConfig "${projectDir}/configs/output.config"
-process.queue = "simon-batch-queue" // AWS Batch job queue
\ No newline at end of file

From ac101244b7241a76954f3e65a8097c96b132c844 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Tue, 3 Dec 2024 13:47:37 +0000
Subject: [PATCH 30/47] placeholder commit

---
 tests/run.config | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/run.config b/tests/run.config
index 2de10f16..320ef045 100644
--- a/tests/run.config
+++ b/tests/run.config
@@ -27,7 +27,6 @@ params {
     quality_encoding = "phred33" // FASTQ quality encoding (probably phred33, maybe phred64)
     fuzzy_match_alignment_duplicates = 0 // Fuzzy matching the start coordinate of reads for identification of duplicates through alignment (0 = exact matching; options are 0, 1, or 2)
     host_taxon = "vertebrate"
-
     blast_db_prefix = "nt_others"
 }
 

From 30a2a6e910e30a15dd6fa41f59cf14dc85d633c5 Mon Sep 17 00:00:00 2001
From: Will Bradshaw <wjbradshaw1@gmail.com>
Date: Wed, 27 Nov 2024 16:21:48 -0500
Subject: [PATCH 31/47] Merge pull request #116 from
 naobservatory/harmon_fix_copying_file_bug

Fixing bug that's causing a fatal error in master pipeline.
---
 workflows/run.nf            | 4 ++--
 workflows/run_validation.nf | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/workflows/run.nf b/workflows/run.nf
index c0aceceb..6793ea99 100644
--- a/workflows/run.nf
+++ b/workflows/run.nf
@@ -84,9 +84,9 @@ workflow RUN {
     time_ch = Channel.of(start_time_str + "\n").collectFile(name: "time.txt")
     version_ch = Channel.fromPath("${projectDir}/pipeline-version.txt")
     index_params_ch = Channel.fromPath("${params.ref_dir}/input/index-params.json")
-    .map { file -> file.copyTo("${workDir}/params-index.json") }
+    .map { file -> file.copyTo("${params.base_dir}/work/params-index.json") }
     index_pipeline_version_ch = Channel.fromPath("${params.ref_dir}/logging/pipeline-version.txt")
-    .map { file -> file.copyTo("${workDir}/pipeline-version-index.txt") }
+    .map { file -> file.copyTo("${params.base_dir}/work/pipeline-version-index.txt") }
     publish:
         // Saved inputs
         index_params_ch >> "input"
diff --git a/workflows/run_validation.nf b/workflows/run_validation.nf
index 66e55cad..c0cdf326 100644
--- a/workflows/run_validation.nf
+++ b/workflows/run_validation.nf
@@ -43,9 +43,9 @@ workflow RUN_VALIDATION {
     time_ch = Channel.of(start_time_str + "\n").collectFile(name: "time.txt")
     version_ch = Channel.fromPath("${projectDir}/pipeline-version.txt")
     index_params_ch = Channel.fromPath("${params.ref_dir}/input/index-params.json")
-    .map { file -> file.copyTo("${workDir}/params-index.json") }
+    .map { file -> file.copyTo("${params.base_dir}/work/params-index.json") }
     index_pipeline_version_ch = Channel.fromPath("${params.ref_dir}/logging/pipeline-version.txt")
-    .map { file -> file.copyTo("${workDir}/pipeline-version-index.txt") }
+    .map { file -> file.copyTo("${params.base_dir}/work/pipeline-version-index.txt") }
     publish:
         // Saved inputs
         index_params_ch >> "input"
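
For reference, the staging pattern this fix settles on, as a minimal sketch:
copy each index file to a deterministic location under params.base_dir and
emit the destination as the channel element, so publishing no longer depends
on how workDir resolves. Nextflow's copyTo returns the destination path, which
is what makes the map step work:

    index_params_ch = Channel
        .fromPath("${params.ref_dir}/input/index-params.json")
        .map { file -> file.copyTo("${params.base_dir}/work/params-index.json") }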

From cc5fd19061cf58ddcc8d714f094a8eb17cb2863f Mon Sep 17 00:00:00 2001
From: EC2 Default User <harmonprograms@protonmail.com>
Date: Tue, 3 Dec 2024 14:15:52 +0000
Subject: [PATCH 32/47] Changed Nextflow version from latest-edge to latest stable

---
 .github/workflows/end-to-end.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/end-to-end.yml b/.github/workflows/end-to-end.yml
index 800abb3a..fe85aa93 100644
--- a/.github/workflows/end-to-end.yml
+++ b/.github/workflows/end-to-end.yml
@@ -19,7 +19,7 @@ jobs:
       - name: Setup Nextflow latest-edge
         uses: nf-core/setup-nextflow@v1
         with:
-          version: "latest-edge"
+          version: "latest"
 
       - name: Install nf-test
         run: |

From ab782570bff107a690d037c911a040d9e961e774 Mon Sep 17 00:00:00 2001
From: EC2 Default User <harmonprograms@protonmail.com>
Date: Tue, 3 Dec 2024 14:18:59 +0000
Subject: [PATCH 33/47] Updated label to be correct

---
 .github/workflows/end-to-end.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/end-to-end.yml b/.github/workflows/end-to-end.yml
index fe85aa93..1f9d3d9b 100644
--- a/.github/workflows/end-to-end.yml
+++ b/.github/workflows/end-to-end.yml
@@ -16,7 +16,7 @@ jobs:
           java-version: '11'
           distribution: 'adopt'
 
-      - name: Setup Nextflow latest-edge
+      - name: Setup Nextflow latest (stable)
         uses: nf-core/setup-nextflow@v1
         with:
           version: "latest"

From 1db7ba8c83f4f11d3ea841b2984c68fb3d1412e6 Mon Sep 17 00:00:00 2001
From: Simon Grimm <simonleandergrimm@gmail.com>
Date: Tue, 3 Dec 2024 20:35:45 +0000
Subject: [PATCH 34/47] updated end-to-end.yml and run_dev_se config.

---
 .github/workflows/end-to-end.yml | 3 +++
 tests/run_dev_se.config          | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/end-to-end.yml b/.github/workflows/end-to-end.yml
index 1f9d3d9b..02e0eebb 100644
--- a/.github/workflows/end-to-end.yml
+++ b/.github/workflows/end-to-end.yml
@@ -44,3 +44,6 @@ jobs:
 
       - name: Run run_validation workflow
         run: nf-test test --tag validation --verbose
+
+      - name: Run run_dev_se workflow
+        run: nf-test test --tag run_dev_se --verbose
diff --git a/tests/run_dev_se.config b/tests/run_dev_se.config
index 523c2f82..48f7a392 100644
--- a/tests/run_dev_se.config
+++ b/tests/run_dev_se.config
@@ -10,7 +10,7 @@ params {
     ref_dir = "s3://nao-testing/index-test/output" // Reference/index directory (generated by index workflow)
 
     // Files
-    sample_sheet = "${launchDir}/test-data/single-end-samplesheet.csv" // Path to library TSV
+    sample_sheet = "${projectDir}/test-data/single-end-samplesheet.csv" // Path to library TSV
     adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
 
     // Whether the underlying data is paired-end or single-end

From 1751288612ec355977feea9634623ac73fdabba1 Mon Sep 17 00:00:00 2001
From: simonleandergrimm <simonleandergrimm@gmail.com>
Date: Tue, 3 Dec 2024 21:35:10 +0000
Subject: [PATCH 35/47] test commit

---
 tests/run_dev_se.config | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/run_dev_se.config b/tests/run_dev_se.config
index 48f7a392..d3db01c4 100644
--- a/tests/run_dev_se.config
+++ b/tests/run_dev_se.config
@@ -13,6 +13,7 @@ params {
     sample_sheet = "${projectDir}/test-data/single-end-samplesheet.csv" // Path to library TSV
     adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
 
+
     // Whether the underlying data is paired-end or single-end
     single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
 

From 5567568b1f03e03911aeff7273a766a3b3a3d3aa Mon Sep 17 00:00:00 2001
From: simonleandergrimm <simonleandergrimm@gmail.com>
Date: Wed, 4 Dec 2024 20:35:05 +0000
Subject: [PATCH 36/47] dropped read type info from config files where it's not
 needed.

---
 configs/run.config      | 3 ---
 tests/run.config        | 3 ---
 tests/run_dev_se.config | 4 ----
 3 files changed, 10 deletions(-)

diff --git a/configs/run.config b/configs/run.config
index 6da753a8..3669ba39 100644
--- a/configs/run.config
+++ b/configs/run.config
@@ -13,9 +13,6 @@ params {
     sample_sheet = "${launchDir}/samplesheet.csv" // Path to library TSV
     adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
 
-    // Whether the underlying data is paired-end or single-end
-    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
-
     // Numerical
     grouping = false // Whether to group samples by 'group' column in samplesheet
     n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads)
diff --git a/tests/run.config b/tests/run.config
index f4a45119..cd8c62a6 100644
--- a/tests/run.config
+++ b/tests/run.config
@@ -15,9 +15,6 @@ params {
     sample_sheet = "${projectDir}/test-data/samplesheet.csv" // Path to library TSV
     adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
 
-    // Whether the underlying data is paired-end or single-end
-    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
-
     // Numerical
     grouping = false // Whether to group samples by 'group' column in samplesheet
     n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads)
diff --git a/tests/run_dev_se.config b/tests/run_dev_se.config
index dc0709fe..8fd5e597 100644
--- a/tests/run_dev_se.config
+++ b/tests/run_dev_se.config
@@ -13,10 +13,6 @@ params {
     sample_sheet = "${projectDir}/test-data/single-end-samplesheet.csv" // Path to library TSV
     adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
 
-
-    // Whether the underlying data is paired-end or single-end
-    single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
-
     // Numerical
     grouping = false // Whether to group samples by 'group' column in samplesheet
     n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads)

From a3af7db8750fccefdd6e07b24926ace3e879d195 Mon Sep 17 00:00:00 2001
From: simonleandergrimm <simonleandergrimm@gmail.com>
Date: Wed, 4 Dec 2024 20:38:52 +0000
Subject: [PATCH 37/47] Made run_dev_se index and outputs look the same as run.nf

---
 workflows/run_dev_se.nf | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/workflows/run_dev_se.nf b/workflows/run_dev_se.nf
index 503edb6b..dad3d2cc 100644
--- a/workflows/run_dev_se.nf
+++ b/workflows/run_dev_se.nf
@@ -79,11 +79,14 @@ workflow RUN_DEV_SE {
     params_ch = Channel.of(params_str).collectFile(name: "run-params.json")
     time_ch = Channel.of(start_time_str + "\n").collectFile(name: "time.txt")
     version_ch = Channel.fromPath("${projectDir}/pipeline-version.txt")
-
+    index_params_ch = Channel.fromPath("${params.ref_dir}/input/index-params.json")
+    .map { file -> file.copyTo("${params.base_dir}/work/params-index.json") }
+    index_pipeline_version_ch = Channel.fromPath("${params.ref_dir}/logging/pipeline-version.txt")
+    .map { file -> file.copyTo("${params.base_dir}/work/pipeline-version-index.txt") }
     publish:
         // Saved inputs
-        Channel.fromPath("${params.ref_dir}/input/index-params.json") >> "input"
-        Channel.fromPath("${params.ref_dir}/logging/pipeline-version.txt").collectFile(name: "pipeline-version-index.txt") >> "logging"
+        index_params_ch >> "input"
+        index_pipeline_version_ch >> "logging"
         Channel.fromPath(params.sample_sheet) >> "input"
         Channel.fromPath(params.adapters) >> "input"
         params_ch >> "input"

From d465db75b80cf289dd11737dbfb9b8b89daf6d74 Mon Sep 17 00:00:00 2001
From: simonleandergrimm <simonleandergrimm@gmail.com>
Date: Thu, 5 Dec 2024 18:26:47 +0000
Subject: [PATCH 38/47] fixed memory issue in bbduk

---
 modules/local/bbduk/main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/local/bbduk/main.nf b/modules/local/bbduk/main.nf
index 2d72dc08..a6c3b210 100644
--- a/modules/local/bbduk/main.nf
+++ b/modules/local/bbduk/main.nf
@@ -54,7 +54,7 @@ process BBDUK_SINGLE {
         ref=!{contaminant_ref}
         io="in=${in} ref=${ref} out=${op} outm=${of} stats=${stats}"
         # Define parameters
-        par="minkmerfraction=!{min_kmer_fraction} k=!{k} t=!{task.cpus} -Xmx30g"
+        par="minkmerfraction=!{min_kmer_fraction} k=!{k} t=!{task.cpus} -Xmx!{task.memory.toGiga()}g"
         # Execute
         bbduk.sh ${io} ${par}
         '''
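
The fix above replaces a hard-coded 30 GiB heap, which could disagree with the
memory actually allocated to the task, with a heap derived from the task's own
memory directive. A minimal sketch of the pattern (process name and directive
values illustrative):

    process EXAMPLE_JVM_TOOL {
        memory "16 GB"
        shell:
            '''
            # task.memory.toGiga() renders the directive as whole GiB,
            # so the JVM heap always matches the scheduler's allocation
            par="-Xmx!{task.memory.toGiga()}g"
            echo "JVM flag: ${par}"
            '''
    }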

From 4e74d7c12b1d99f925510ef9df3e2c1f328ab40d Mon Sep 17 00:00:00 2001
From: simonleandergrimm <simonleandergrimm@gmail.com>
Date: Thu, 5 Dec 2024 18:31:20 +0000
Subject: [PATCH 39/47] Fixing setup of run_dev_se test config.

---
 tests/run_dev_se.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/run_dev_se.config b/tests/run_dev_se.config
index 8fd5e597..34b72954 100644
--- a/tests/run_dev_se.config
+++ b/tests/run_dev_se.config
@@ -29,5 +29,5 @@ params {
 
 includeConfig "${projectDir}/configs/containers.config"
 includeConfig "${projectDir}/configs/profiles.config"
+includeConfig "${projectDir}/configs/read_type.config"
 includeConfig "${projectDir}/configs/output.config"
-includeConfig "${projectDir}/configs/read_type.config"
\ No newline at end of file

From d061c1d1d5bdf599d830f477b80fab034a39a305 Mon Sep 17 00:00:00 2001
From: simonleandergrimm <simonleandergrimm@gmail.com>
Date: Mon, 9 Dec 2024 16:27:39 +0000
Subject: [PATCH 40/47] fixed imports in taxonomy/main.nf

---
 subworkflows/local/taxonomy/main.nf | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/subworkflows/local/taxonomy/main.nf b/subworkflows/local/taxonomy/main.nf
index ee4416af..278c1873 100644
--- a/subworkflows/local/taxonomy/main.nf
+++ b/subworkflows/local/taxonomy/main.nf
@@ -6,14 +6,10 @@
 | MODULES AND SUBWORKFLOWS |
 ***************************/
 
-if (params.single_end) {
-    include { SUBSET_READS_SINGLE as SUBSET_READS } from "../../../modules/local/subsetReads"
-} else {
-    include { SUBSET_READS_PAIRED as SUBSET_READS } from "../../../modules/local/subsetReads" //addParams(suffix: "fastq")
+if (!params.single_end) {
     include { JOIN_FASTQ } from "../../../modules/local/joinFastq"
     include { BBMERGE } from "../../../modules/local/bbmerge"
     include { SUMMARIZE_BBMERGE } from "../../../modules/local/summarizeBBMerge"
-
 }
 include { SUMMARIZE_DEDUP } from "../../../modules/local/summarizeDedup"
 include { CLUMPIFY_PAIRED } from "../../../modules/local/clumpify"

From db187e633a3ba4c8baeafb43979984349b514674 Mon Sep 17 00:00:00 2001
From: simonleandergrimm <58591538+simonleandergrimm@users.noreply.github.com>
Date: Mon, 9 Dec 2024 17:39:53 -0500
Subject: [PATCH 41/47] Removed redundancy

---
 subworkflows/local/taxonomy/main.nf | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/subworkflows/local/taxonomy/main.nf b/subworkflows/local/taxonomy/main.nf
index 278c1873..d685b292 100644
--- a/subworkflows/local/taxonomy/main.nf
+++ b/subworkflows/local/taxonomy/main.nf
@@ -1,4 +1,4 @@
-/***********************************************************
+`/***********************************************************
 | SUBWORKFLOW: TAXONOMIC PROFILING WITH KRAKEN AND BRACKEN |
 ***********************************************************/
 
@@ -36,12 +36,7 @@ workflow TAXONOMY {
         if (single_end) {
             // No merging in single read version
             summarize_bbmerge_ch = Channel.empty()
-            // Deduplicate reads (if applicable)
-            if (dedup_rc) {
-                dedup_ch = CLUMPIFY_SINGLE(reads_ch)
-            } else {
-                dedup_ch = reads_ch
-            }
+            single_read_ch = reads_ch
         } else {
             // Deduplicate reads (if applicable)
             if ( dedup_rc ){
@@ -53,14 +48,16 @@ workflow TAXONOMY {
             merged_ch = BBMERGE(paired_dedup_ch)
             // Only want to summarize the merged elements
             summarize_bbmerge_ch = SUMMARIZE_BBMERGE(merged_ch.reads.map{sample, files -> [sample, files[0]]})
-            joined_ch = JOIN_FASTQ(merged_ch.reads)
-            // Deduplicate reads (if applicable)
-            if ( dedup_rc ){
-                dedup_ch = CLUMPIFY_SINGLE(joined_ch)
+            single_read_ch = JOIN_FASTQ(merged_ch.reads)
+        }
+
+        // Deduplicate reads (if applicable)
+        if (dedup_rc) {
+                dedup_ch = CLUMPIFY_SINGLE(single_read_ch)
             } else {
-                dedup_ch = joined_ch
-            }
+                dedup_ch = single_read_ch 
         }
+
         // Summarize last of the output
         summarize_dedup_ch = SUMMARIZE_DEDUP(dedup_ch)
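
A simplified sketch of the flow this refactor converges on (the paired-end
pre-merge dedup step is kept in the real code but omitted here for brevity):
both layouts are reduced to a single-read channel before one shared dedup step.

    if (single_end) {
        single_read_ch = reads_ch
    } else {
        merged_ch = BBMERGE(reads_ch)
        single_read_ch = JOIN_FASTQ(merged_ch.reads)
    }
    if (dedup_rc) {
        dedup_ch = CLUMPIFY_SINGLE(single_read_ch)
    } else {
        dedup_ch = single_read_ch
    }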
 

From 8a09b0882dd1dc26c95408b874f2165d6bd2d81c Mon Sep 17 00:00:00 2001
From: simonleandergrimm <simonleandergrimm@gmail.com>
Date: Tue, 10 Dec 2024 20:28:16 +0000
Subject: [PATCH 42/47] deleted unexpected character

---
 subworkflows/local/taxonomy/main.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/subworkflows/local/taxonomy/main.nf b/subworkflows/local/taxonomy/main.nf
index d685b292..b6ab693f 100644
--- a/subworkflows/local/taxonomy/main.nf
+++ b/subworkflows/local/taxonomy/main.nf
@@ -1,4 +1,4 @@
-`/***********************************************************
+/***********************************************************
 | SUBWORKFLOW: TAXONOMIC PROFILING WITH KRAKEN AND BRACKEN |
 ***********************************************************/
 
@@ -55,7 +55,7 @@ workflow TAXONOMY {
         if (dedup_rc) {
                 dedup_ch = CLUMPIFY_SINGLE(single_read_ch)
             } else {
-                dedup_ch = single_read_ch 
+                dedup_ch = single_read_ch
         }
 
         // Summarize last of the output

From 55f75f84c8cc0b2839cee214022a0d9655cef247 Mon Sep 17 00:00:00 2001
From: simonleandergrimm <simonleandergrimm@gmail.com>
Date: Wed, 11 Dec 2024 16:58:09 +0000
Subject: [PATCH 43/47] moved kraken_db_path definition out of the grouping check

---
 workflows/run_dev_se.nf | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/workflows/run_dev_se.nf b/workflows/run_dev_se.nf
index ea78be67..2e350c17 100644
--- a/workflows/run_dev_se.nf
+++ b/workflows/run_dev_se.nf
@@ -25,6 +25,7 @@ workflow RUN_DEV_SE {
     // Start time
     start_time = new Date()
     start_time_str = start_time.format("YYYY-MM-dd HH:mm:ss z (Z)")
+    kraken_db_path = "${params.ref_dir}/results/kraken_db"
 
     // Check if grouping column exists in samplesheet
     check_grouping = new File(params.sample_sheet).text.readLines()[0].contains('group') ? true : false
@@ -34,8 +35,6 @@ workflow RUN_DEV_SE {
         } else if (!params.grouping && check_grouping) {
             throw new Exception("Grouping is not enabled in config file, but group column is present in the samplesheet.")
         }
-    // Prepare Kraken DB
-    kraken_db_path = "${params.ref_dir}/results/kraken_db"
     }
 
     // Load samplesheet

From 1210a3c849dbc77f44fcf5676f99f41b0f95e521 Mon Sep 17 00:00:00 2001
From: simonleandergrimm <58591538+simonleandergrimm@users.noreply.github.com>
Date: Thu, 12 Dec 2024 09:49:59 -0500
Subject: [PATCH 44/47] Update main.nf

---
 subworkflows/local/taxonomy/main.nf | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/subworkflows/local/taxonomy/main.nf b/subworkflows/local/taxonomy/main.nf
index b6ab693f..2e7c70ea 100644
--- a/subworkflows/local/taxonomy/main.nf
+++ b/subworkflows/local/taxonomy/main.nf
@@ -6,11 +6,9 @@
 | MODULES AND SUBWORKFLOWS |
 ***************************/
 
-if (!params.single_end) {
-    include { JOIN_FASTQ } from "../../../modules/local/joinFastq"
-    include { BBMERGE } from "../../../modules/local/bbmerge"
-    include { SUMMARIZE_BBMERGE } from "../../../modules/local/summarizeBBMerge"
-}
+include { JOIN_FASTQ } from "../../../modules/local/joinFastq"
+include { BBMERGE } from "../../../modules/local/bbmerge"
+include { SUMMARIZE_BBMERGE } from "../../../modules/local/summarizeBBMerge"
 include { SUMMARIZE_DEDUP } from "../../../modules/local/summarizeDedup"
 include { CLUMPIFY_PAIRED } from "../../../modules/local/clumpify"
 include { CLUMPIFY_SINGLE } from "../../../modules/local/clumpify"

From 9113fd4cdae21f1bf0c8345ab3a46e5459e3f22c Mon Sep 17 00:00:00 2001
From: simonleandergrimm <58591538+simonleandergrimm@users.noreply.github.com>
Date: Fri, 20 Dec 2024 10:39:47 -0500
Subject: [PATCH 45/47] Update end-to-end.yml

---
 .github/workflows/end-to-end.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/end-to-end.yml b/.github/workflows/end-to-end.yml
index 6b765e5a..1fd0383e 100644
--- a/.github/workflows/end-to-end.yml
+++ b/.github/workflows/end-to-end.yml
@@ -83,4 +83,3 @@ jobs:
 
       - name: Run run_validation workflow
         run: nf-test test --tag validation --verbose
-

From 4bfbc462c861cacc0c0305ad54f468f297fe7b89 Mon Sep 17 00:00:00 2001
From: simonleandergrimm <simonleandergrimm@gmail.com>
Date: Fri, 20 Dec 2024 15:45:07 +0000
Subject: [PATCH 46/47] updated changelog. Updated run_dev_se to have correct
 LOAD_SAMPLESHEET process naming. Removed hv_screen main.nf

---
 CHANGELOG.md                         | 6 +++---
 subworkflows/local/hv_screen/main.nf | 0
 workflows/run_dev_se.nf              | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)
 delete mode 100644 subworkflows/local/hv_screen/main.nf

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 97560797..8635adc8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,8 @@
 # v2.5.3 (in progress)
-- Added new LOAD_SAMPLESHEET subworkflow to centralize samplesheet processing 
+- Added new LOAD_SAMPLESHEET subworkflow to centralize samplesheet processing
 - Began development of single-end read processing (still in progress)
-    - Restructured RAW, CLEAN, and QC workflows to handle both single-end and paired-end reads
-    - Added new FASTP_SINGLE and TRUNCATE_CONCAT_SINGLE processes to handle single-end reads
+    - Restructured RAW, CLEAN, QC, TAXONOMY, and PROFILE workflows to handle both single-end and paired-end reads
+    - Added new FASTP_SINGLE, TRUNCATE_CONCAT_SINGLE, BBDUK_SINGLE, CONCAT_GROUP_SINGLE, SUBSET_READS_SINGLE and SUBSET_READS_SINGLE_TARGET processes to handle single-end reads
     - Created separate end-to-end test workflow for single-end processing (which will be removed once single-end processing is fully integrated)
     - Modified samplesheet handling to support both single-end and paired-end data
     - Updated generate_samplesheet.sh to handle single-end data with --single_end flag
diff --git a/subworkflows/local/hv_screen/main.nf b/subworkflows/local/hv_screen/main.nf
deleted file mode 100644
index e69de29b..00000000
diff --git a/workflows/run_dev_se.nf b/workflows/run_dev_se.nf
index 6f9b982f..3df5f0e4 100644
--- a/workflows/run_dev_se.nf
+++ b/workflows/run_dev_se.nf
@@ -13,7 +13,7 @@ include { RAW } from "../subworkflows/local/raw"
 include { CLEAN } from "../subworkflows/local/clean"
 include { PROCESS_OUTPUT } from "../subworkflows/local/processOutput"
 include { PROFILE } from "../subworkflows/local/profile"
-include { LOAD_SAMPLESHET } from "../subworkflows/local/loadSampleSheet"
+include { LOAD_SAMPLESHEET } from "../subworkflows/local/loadSampleSheet"
 nextflow.preview.output = true
 
 /*****************

From d2826d2f63b88ab7fd6c81ab7ab87da98c84be3b Mon Sep 17 00:00:00 2001
From: simonleandergrimm <simonleandergrimm@gmail.com>
Date: Fri, 20 Dec 2024 15:58:44 +0000
Subject: [PATCH 47/47] removed whitespace

---
 .github/workflows/end-to-end.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/end-to-end.yml b/.github/workflows/end-to-end.yml
index 6b765e5a..3e4c8d60 100644
--- a/.github/workflows/end-to-end.yml
+++ b/.github/workflows/end-to-end.yml
@@ -37,7 +37,6 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v4
 
-
       - name: Set up JDK 11
         uses: actions/setup-java@v4
         with:
@@ -83,4 +82,3 @@ jobs:
 
       - name: Run run_validation workflow
         run: nf-test test --tag validation --verbose
-