Merge branch 'single-read-raw-clean' into single-read-profile
simonleandergrimm committed Dec 11, 2024
2 parents 8a09b08 + 6ad3ce2 commit 11f2145
Showing 6 changed files with 94 additions and 74 deletions.
3 changes: 0 additions & 3 deletions .github/workflows/end-to-end.yml
@@ -57,9 +57,6 @@ jobs:
- name: Run run workflow
run: nf-test test --tag run --verbose

- name: Run run_dev_se workflow
run: nf-test test --tag run_dev_se --verbose

test-validation:
runs-on: ubuntu-latest
timeout-minutes: 5
44 changes: 24 additions & 20 deletions modules/local/truncateConcat/main.nf
@@ -1,34 +1,38 @@
// Truncate concatenated read files for trial run
process TRUNCATE_CONCAT {
process TRUNCATE_CONCAT_PAIRED {
label "single"
label "BBTools"
input:
tuple val(sample), path(reads)
val n_reads
val single_end
output:

tuple val(sample), path({
single_end ?
"${sample}_trunc.fastq.gz" :
"${sample}_trunc_{1,2}.fastq.gz"
}), emit: reads
tuple val(sample), path("${sample}_trunc_{1,2}.fastq.gz"), emit: reads
shell:
'''
echo "Number of output reads: !{n_reads}"
n_lines=$(expr !{n_reads} \\* 4)
echo "Number of output lines: ${n_lines}"
if [ $(echo "!{reads}" | wc -w) -eq 2 ]; then
echo "Processing paired-end reads"
o1=!{sample}_trunc_1.fastq.gz
o2=!{sample}_trunc_2.fastq.gz
zcat !{reads[0]} | head -n ${n_lines} | gzip -c > ${o1}
zcat !{reads[1]} | head -n ${n_lines} | gzip -c > ${o2}
else
echo "Processing single-end reads"
o=!{sample}_trunc.fastq.gz
zcat !{reads[0]} | head -n ${n_lines} | gzip -c > ${o}
fi
o1=!{sample}_trunc_1.fastq.gz
o2=!{sample}_trunc_2.fastq.gz
zcat !{reads[0]} | head -n ${n_lines} | gzip -c > ${o1}
zcat !{reads[1]} | head -n ${n_lines} | gzip -c > ${o2}
'''
}

process TRUNCATE_CONCAT_SINGLE {
label "single"
label "BBTools"
input:
tuple val(sample), path(reads)
val n_reads
output:
tuple val(sample), path("${sample}_trunc.fastq.gz"), emit: reads
shell:
'''
echo "Number of output reads: !{n_reads}"
n_lines=$(expr !{n_reads} \\* 4)
echo "Number of output lines: ${n_lines}"
o=!{sample}_trunc.fastq.gz
zcat !{reads[0]} | head -n ${n_lines} | gzip -c > ${o}
'''
}
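
With the runtime single-end branch removed, each process now takes just a (sample, reads) tuple and a read count. A minimal invocation sketch, assuming illustrative sample names, file names, a placeholder include path, and a read count of 1000 (none of which come from this commit):

include { TRUNCATE_CONCAT_SINGLE } from "./modules/local/truncateConcat"   // path is a placeholder
include { TRUNCATE_CONCAT_PAIRED } from "./modules/local/truncateConcat"

workflow TRUNC_DEMO {
    main:
    // Single-end: one FASTQ per sample; emits <sample>_trunc.fastq.gz with the first 1000 reads
    se_ch = Channel.of(tuple("sampleA", [file("sampleA.fastq.gz")]))
    TRUNCATE_CONCAT_SINGLE(se_ch, 1000)

    // Paired-end: two FASTQs per sample; emits <sample>_trunc_{1,2}.fastq.gz, 1000 reads each
    pe_ch = Channel.of(tuple("sampleB", [file("sampleB_1.fastq.gz"), file("sampleB_2.fastq.gz")]))
    TRUNCATE_CONCAT_PAIRED(pe_ch, 1000)
}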
45 changes: 45 additions & 0 deletions subworkflows/local/loadSampleSheet/main.nf
@@ -0,0 +1,45 @@
/***********
| WORKFLOW |
***********/

workflow LOAD_SAMPLESHET {
take:
sample_sheet
main:
if (params.single_end) {
if (params.grouping) {
samplesheet = Channel
.fromPath(params.sample_sheet)
.splitCsv(header: true)
.map { row -> tuple(row.sample, file(row.fastq), row.group) }
samplesheet_ch = samplesheet.map { sample, read, group -> tuple(sample, [read]) }
group_ch = samplesheet.map { sample, read, group -> tuple(sample, group) }
} else {
samplesheet = Channel
.fromPath(params.sample_sheet)
.splitCsv(header: true)
.map { row -> tuple(row.sample, file(row.fastq)) }
samplesheet_ch = samplesheet.map { sample, read -> tuple(sample, [read]) }
group_ch = Channel.empty()
}
} else {
if (params.grouping) {
samplesheet = Channel
.fromPath(params.sample_sheet)
.splitCsv(header: true)
.map { row -> tuple(row.sample, file(row.fastq_1), file(row.fastq_2), row.group) }
samplesheet_ch = samplesheet.map { sample, read1, read2, group -> tuple(sample, [read1, read2]) }
group_ch = samplesheet.map { sample, read1, read2, group -> tuple(sample, group) }
} else {
samplesheet = Channel
.fromPath(params.sample_sheet)
.splitCsv(header: true)
.map { row -> tuple(row.sample, file(row.fastq_1), file(row.fastq_2)) }
samplesheet_ch = samplesheet.map { sample, read1, read2 -> tuple(sample, [read1, read2]) }
group_ch = Channel.empty()
}
}
emit:
samplesheet = samplesheet_ch
group = group_ch
}
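
For reference, the row fields accessed above (row.sample, row.fastq or row.fastq_1/row.fastq_2, and optionally row.group) imply samplesheet headers like the following; the file names and group label are invented for illustration.

Paired-end with grouping enabled:

sample,fastq_1,fastq_2,group
sampleA,sampleA_R1.fastq.gz,sampleA_R2.fastq.gz,groupX

Single-end with grouping disabled:

sample,fastq
sampleB,sampleB.fastq.gz

In every branch the samplesheet output emits (sample, [reads]) tuples, while the group output emits (sample, group) tuples, or an empty channel when params.grouping is false.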
8 changes: 6 additions & 2 deletions subworkflows/local/raw/main.nf
@@ -7,7 +7,11 @@
***************************/

include { QC } from "../../../subworkflows/local/qc"
include { TRUNCATE_CONCAT } from "../../../modules/local/truncateConcat"
if (params.single_end) {
include { TRUNCATE_CONCAT_SINGLE as TRUNCATE_CONCAT } from "../../../modules/local/truncateConcat"
} else {
include { TRUNCATE_CONCAT_PAIRED as TRUNCATE_CONCAT } from "../../../modules/local/truncateConcat"
}

/***********
| WORKFLOW |
@@ -23,7 +27,7 @@ workflow RAW {
single_end
main:
if ( n_reads_trunc > 0 ) {
out_ch = TRUNCATE_CONCAT(samplesheet_ch, n_reads_trunc, single_end)
out_ch = TRUNCATE_CONCAT(samplesheet_ch, n_reads_trunc)
} else {
out_ch = samplesheet_ch
}
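
Selecting the module at include time, rather than branching inside the workflow body, keeps the TRUNCATE_CONCAT call site above identical in both modes. The same pattern in isolation, with hypothetical process and path names:

// Hypothetical modules EXAMPLE_SINGLE and EXAMPLE_PAIRED, aliased to one name
if (params.single_end) {
    include { EXAMPLE_SINGLE as EXAMPLE } from "./modules/example"
} else {
    include { EXAMPLE_PAIRED as EXAMPLE } from "./modules/example"
}

workflow DEMO {
    main:
    EXAMPLE(Channel.of(tuple("s1", [file("s1.fastq.gz")])), 100)   // same call either way
}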
22 changes: 6 additions & 16 deletions workflows/run.nf
@@ -16,6 +16,7 @@ include { BLAST_VIRAL } from "../subworkflows/local/blastViral"
include { PROFILE } from "../subworkflows/local/profile"
include { PROCESS_OUTPUT } from "../subworkflows/local/processOutput"
include { EXTRACT_RAW_READS_FROM_PROCESSED } from "../modules/local/extractRawReadsFromProcessed"
include { LOAD_SAMPLESHET } from "../subworkflows/local/loadSampleSheet"
nextflow.preview.output = true

/*****************
@@ -40,22 +41,11 @@ workflow RUN {
}
}

// Prepare samplesheet
if ( params.grouping ) {
samplesheet = Channel
.fromPath(params.sample_sheet)
.splitCsv(header: true)
.map { row -> tuple(row.sample, file(row.fastq_1), file(row.fastq_2), row.group) }
samplesheet_ch = samplesheet.map { sample, read1, read2, group -> tuple(sample, [read1, read2]) }
group_ch = samplesheet.map { sample, read1, read2, group -> tuple(sample, group) }
} else {
samplesheet = Channel
.fromPath(params.sample_sheet)
.splitCsv(header: true)
.map { row -> tuple(row.sample, file(row.fastq_1), file(row.fastq_2)) }
samplesheet_ch = samplesheet.map { sample, read1, read2 -> tuple(sample, [read1, read2]) }
group_ch = Channel.empty()
}
// Load samplesheet
LOAD_SAMPLESHET(params.sample_sheet)
samplesheet_ch = LOAD_SAMPLESHET.out.samplesheet
group_ch = LOAD_SAMPLESHET.out.group

// Preprocessing
RAW(samplesheet_ch, params.n_reads_trunc, "2", "4 GB", "raw_concat", params.single_end)
CLEAN(RAW.out.reads, params.adapters, "2", "4 GB", "cleaned", params.single_end)
46 changes: 13 additions & 33 deletions workflows/run_dev_se.nf
@@ -13,6 +13,7 @@ include { RAW } from "../subworkflows/local/raw"
include { CLEAN } from "../subworkflows/local/clean"
include { PROCESS_OUTPUT } from "../subworkflows/local/processOutput"
include { PROFILE } from "../subworkflows/local/profile"
include { LOAD_SAMPLESHET } from "../subworkflows/local/loadSampleSheet"
nextflow.preview.output = true

/*****************
@@ -25,43 +26,22 @@ workflow RUN_DEV_SE {
start_time = new Date()
start_time_str = start_time.format("YYYY-MM-dd HH:mm:ss z (Z)")

// Prepare samplesheet
if (params.single_end) {
if (params.grouping) {
samplesheet = Channel
.fromPath(params.sample_sheet)
.splitCsv(header: true)
.map { row -> tuple(row.sample, file(row.fastq), row.group) }
samplesheet_ch = samplesheet.map { sample, read, group -> tuple(sample, [read]) }
group_ch = samplesheet.map { sample, read, group -> tuple(sample, group) }
} else {
samplesheet = Channel
.fromPath(params.sample_sheet)
.splitCsv(header: true)
.map { row -> tuple(row.sample, file(row.fastq)) }
samplesheet_ch = samplesheet.map { sample, read -> tuple(sample, [read]) }
group_ch = Channel.empty()
}
} else {
if (params.grouping) {
samplesheet = Channel
.fromPath(params.sample_sheet)
.splitCsv(header: true)
.map { row -> tuple(row.sample, file(row.fastq_1), file(row.fastq_2), row.group) }
samplesheet_ch = samplesheet.map { sample, read1, read2, group -> tuple(sample, [read1, read2]) }
group_ch = samplesheet.map { sample, read1, read2, group -> tuple(sample, group) }
} else {
samplesheet = Channel
.fromPath(params.sample_sheet)
.splitCsv(header: true)
.map { row -> tuple(row.sample, file(row.fastq_1), file(row.fastq_2)) }
samplesheet_ch = samplesheet.map { sample, read1, read2 -> tuple(sample, [read1, read2]) }
group_ch = Channel.empty()
}
// Check if grouping column exists in samplesheet
check_grouping = new File(params.sample_sheet).text.readLines()[0].contains('group') ? true : false
if (params.grouping != check_grouping) {
if (params.grouping && !check_grouping) {
throw new Exception("Grouping enabled in config file, but group column absent from samplesheet.")
} else if (!params.grouping && check_grouping) {
throw new Exception("Grouping is not enabled in config file, but group column is present in the samplesheet.")
}
// Prepare Kraken DB
kraken_db_path = "${params.ref_dir}/results/kraken_db"
}

// Load samplesheet
LOAD_SAMPLESHET(params.sample_sheet)
samplesheet_ch = LOAD_SAMPLESHET.out.samplesheet
group_ch = LOAD_SAMPLESHET.out.group

// Preprocessing
RAW(samplesheet_ch, params.n_reads_trunc, "2", "4 GB", "raw_concat", params.single_end)
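
The grouping consistency check added above makes config/samplesheet mismatches fail fast rather than surfacing later as missing columns. A hypothetical illustration of the failure mode (config value and header invented for this example):

// nextflow.config sets:   params.grouping = true
// samplesheet header is:  sample,fastq_1,fastq_2   (no group column)
// -> the workflow throws: "Grouping enabled in config file, but group column absent from samplesheet."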
