From eb055a2f114588899e8aafc687c316f5a857f57a Mon Sep 17 00:00:00 2001
From: Lucio Di Filippo <difilippolucio@gmail.com>
Date: Sun, 29 Oct 2023 15:59:44 +0100
Subject: [PATCH 01/24] add merge Lane on paired end samples

---
 workflows/sammyseq.nf | 70 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 60 insertions(+), 10 deletions(-)

diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf
index 5f7b3bf..027d095 100644
--- a/workflows/sammyseq.nf
+++ b/workflows/sammyseq.nf
@@ -113,15 +113,52 @@ workflow SAMMYSEQ {
     //INPUT_CHECK.out.reads.view()
 
     // merge fastq with same lane
-    ch_merge_lane = INPUT_CHECK.out.reads
+    // ch_merge_lane = INPUT_CHECK.out.reads
+    //                  .groupTuple()
+    //                  .map{ meta , path -> //if paired end ?
+    //                     meta = meta
+    //                     path = path.flatten()
+    //                     [meta , path]
+    //                  }
+
+    //TODO th
+
+    ch_notmerge_lane = INPUT_CHECK.out.reads
+                     .map{ meta, path -> 
+                        id=meta.subMap('id')
+                        meta=meta
+                        path=path
+                        [id.id, meta, path]
+                      }
                      .groupTuple()
-                     .map{ meta , path -> //if paired end ?
-                        meta = meta
-                        path = path.flatten()
-                        [meta , path]
+                     .filter{ it[1].size() == 1 }
+                     .map{ id, meta, path -> 
+                        meta_notmerge=meta[0]
+                        path_notmerge=path[0]
+                        [meta_notmerge, path_notmerge]
                      }
 
-    ch_merge_lane.view{"ch_merge_lane ${it}"}
+    //INPUT_CHECK.out.reads.view{"INPUT_CHECK.out.reads : ${it}"}
+    //ch_notmerge_lane.view{"ch_notmerge_lane: ${it}"}
+
+    ch_merge_lane = INPUT_CHECK.out.reads
+                     .map{ meta, path -> 
+                        id=meta.subMap('id')
+                        meta=meta
+                        path=path
+                        [id.id, meta, path]
+                      }
+                     .groupTuple()
+                     .filter{ it[1].size() == 2 } //filtra per numero di meta presenti dopo il tupla se hai due meta vuol dire che devi unire due campioni 
+                     .map{ id, meta, path -> 
+                          meta = meta[0]
+                        def readsSample1 = [path[0][0], path[0][1]]
+                        def readsSample2 = [path[1][0], path[1][1]]
+                        [meta, readsSample1, readsSample2]
+                        
+                      }
+
+    //ch_merge_lane.view{"ch_merge_lane ${it}"}
 
     ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)
 
@@ -129,11 +166,20 @@ workflow SAMMYSEQ {
         return
     }
 
+    ch_merge_lane.view{"ch_merge_lane : ${it}"}
+
     CAT_FASTQ (
            ch_merge_lane
     ).reads.set { cat_lane_output }
 
     //cat_lane_output.view()     
+    ch_starter = cat_lane_output.mix(ch_notmerge_lane)
+
+    //ch_starter.view()
+
+    if (params.stopAt == 'CAT_FASTQ_line') {
+        return
+    }
 
 
     // TODO: OPTIONAL, you can use nf-validation plugin to create an input channel from the samplesheet with Channel.fromSamplesheet("input")
@@ -168,9 +214,11 @@ workflow SAMMYSEQ {
 
     } else {
         //merged_reads = INPUT_CHECK.out.reads
-        merged_reads = cat_lane_output
+        merged_reads = ch_starter
     }
 
+    //merged_reads.view()
+
     if (params.stopAt == 'CAT_FASTQ') {
         return
     }
@@ -178,7 +226,7 @@ workflow SAMMYSEQ {
     //
     // MODULE: Run FastQC
     //
-    //merged_reads.view()
+
 
 
     FASTQC (
@@ -187,15 +235,17 @@ workflow SAMMYSEQ {
     ch_versions = ch_versions.mix(FASTQC.out.versions.first())
 
     TRIMMOMATIC (
+        //INPUT_CHECK.out.reads
+        //ch_notmerge_lane
        merged_reads 
     )
 
+    //TRIMMOMATIC.out.trimmed_reads.view()
+
     if (params.stopAt == 'TRIMMOMATIC') {
         return
     }
 
-
-    //TRIMMOMATIC.out.trimmed_reads.view()
     
     FASTQ_ALIGN_BWAALN (
         TRIMMOMATIC.out.trimmed_reads,

From 93fd55ea29e1809bd396d9e9b58c4a1e12208da5 Mon Sep 17 00:00:00 2001
From: Lucio Di Filippo <difilippolucio@gmail.com>
Date: Sun, 29 Oct 2023 17:04:35 +0100
Subject: [PATCH 02/24] bugfix single reads lane merge

---
 workflows/sammyseq.nf | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf
index 027d095..89f74cb 100644
--- a/workflows/sammyseq.nf
+++ b/workflows/sammyseq.nf
@@ -112,16 +112,7 @@ workflow SAMMYSEQ {
 
     //INPUT_CHECK.out.reads.view()
 
-    // merge fastq with same lane
-    // ch_merge_lane = INPUT_CHECK.out.reads
-    //                  .groupTuple()
-    //                  .map{ meta , path -> //if paired end ?
-    //                     meta = meta
-    //                     path = path.flatten()
-    //                     [meta , path]
-    //                  }
-
-    //TODO th
+    //TODO use branch
 
     ch_notmerge_lane = INPUT_CHECK.out.reads
                      .map{ meta, path -> 
@@ -151,11 +142,14 @@ workflow SAMMYSEQ {
                      .groupTuple()
                      .filter{ it[1].size() == 2 } //filtra per numero di meta presenti dopo il tupla se hai due meta vuol dire che devi unire due campioni 
                      .map{ id, meta, path -> 
-                          meta = meta[0]
-                        def readsSample1 = [path[0][0], path[0][1]]
-                        def readsSample2 = [path[1][0], path[1][1]]
-                        [meta, readsSample1, readsSample2]
-                        
+                        meta = meta[0]
+                        def readsSample1 = [path[0][0], path[0][1]].findAll()
+                        def readsSample2 = [path[1][0], path[1][1]].findAll()
+                        if(readsSample1.size() == 1 && readsSample2.size() == 1) {
+                            [meta, readsSample1 + readsSample2] 
+                        } else {
+                            [meta, readsSample1, readsSample2]
+                        }
                       }
 
     //ch_merge_lane.view{"ch_merge_lane ${it}"}
@@ -166,7 +160,7 @@ workflow SAMMYSEQ {
         return
     }
 
-    ch_merge_lane.view{"ch_merge_lane : ${it}"}
+    //ch_merge_lane.view{"ch_merge_lane : ${it}"}
 
     CAT_FASTQ (
            ch_merge_lane

From 8493bf6942f32d738abb3270f11bf632130ec55a Mon Sep 17 00:00:00 2001
From: Lucio Di Filippo <difilippolucio@gmail.com>
Date: Tue, 31 Oct 2023 09:58:48 +0100
Subject: [PATCH 03/24] Markduplicates should mark and not remove duplicate
 reads : SOLVED

---
 conf/modules.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conf/modules.config b/conf/modules.config
index ef99c6d..551d67b 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -71,7 +71,7 @@ process {
 
 
     withName: '.*BAM_MARKDUPLICATES_PICARD:PICARD_MARKDUPLICATES' {
-        ext.args   = '--ASSUME_SORTED true --REMOVE_DUPLICATES true --VALIDATION_STRINGENCY LENIENT --TMP_DIR tmp'
+        ext.args   = '--ASSUME_SORTED true --REMOVE_DUPLICATES false --VALIDATION_STRINGENCY LENIENT --TMP_DIR tmp'
         ext.prefix = { "${meta.id}.md" }
         publishDir = [
             [

From 376520bfbe5ce4afe40241f0154136b18430894f Mon Sep 17 00:00:00 2001
From: Lucio Di Filippo <difilippolucio@gmail.com>
Date: Tue, 31 Oct 2023 12:15:22 +0100
Subject: [PATCH 04/24] Add fastqc after trimming #16

---
 workflows/sammyseq.nf | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf
index 89f74cb..4587d35 100644
--- a/workflows/sammyseq.nf
+++ b/workflows/sammyseq.nf
@@ -221,20 +221,34 @@ workflow SAMMYSEQ {
     // MODULE: Run FastQC
     //
 
+    TRIMMOMATIC (
+       merged_reads 
+    )
+
+    //TRIMMOMATIC.out.trimmed_reads.view()
+
 
+    //a channel is created for the trimmed files and the id is renamed to meta, so that when passed to fastqc it does not overwrite the output files with non-trimmed ones
+    ch_fastqc_trim=TRIMMOMATIC.out.trimmed_reads
+                   .map{ meta, path ->
+                        def id=meta.subMap('id')
+                        newid=id.id + "_trim"
+                        sng=meta.subMap('single_end').single_end
+                        newmeta=[id: newid, single_end: sng]
+                        [ newmeta ,path]
+                    } 
 
+    //trimmed and untrimmed fastq channels are merged and the resulting channel is passed to FASTQC
+    ch_fastqc_in = ch_fastqc_trim.mix(merged_reads)
+    //ch_fastqc_in.view()
     FASTQC (
-        merged_reads
+        ch_fastqc_in
+        //merged_reads
     )
+
     ch_versions = ch_versions.mix(FASTQC.out.versions.first())
 
-    TRIMMOMATIC (
-        //INPUT_CHECK.out.reads
-        //ch_notmerge_lane
-       merged_reads 
-    )
 
-    //TRIMMOMATIC.out.trimmed_reads.view()
 
     if (params.stopAt == 'TRIMMOMATIC') {
         return

From 658cf663896953b40a0de6a33ada57dd81971a6f Mon Sep 17 00:00:00 2001
From: Lucio Di Filippo <difilippolucio@gmail.com>
Date: Sat, 4 Nov 2023 14:37:38 +0100
Subject: [PATCH 05/24] multi lane CAT

---
 conf/modules.config                  | 38 +++++++++++++++++++++++-----
 subworkflows/local/prepare_genome.nf |  9 ++++---
 workflows/sammyseq.nf                | 17 +++++--------
 3 files changed, 43 insertions(+), 21 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 551d67b..75dbc7e 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -26,6 +26,14 @@ process {
         ]
     }
 
+    withName : ".*PREPARE_GENOME:.*" {
+        publishDir = [
+            path: { "${params.outdir}/genome" },
+            mode: params.publish_dir_mode,
+            enabled: params.save_reference
+        ]
+    }
+
     withName: FASTQC {
         ext.args = '--quiet'
     }
@@ -58,19 +66,19 @@ process {
     // Alignment, Picard MarkDuplicates and Filtering options
     //
 
-    withName: '.*FASTQ_ALIGN_BWAALN:BWA_.*' {
+    withName: '.*FASTQ_ALIGN_BWAALN:.*' {
         publishDir = [
             [
-                path: { "${params.outdir}/BWA" },
+                path: { "${params.outdir}/alignment/bwa" },
                 mode: params.publish_dir_mode,
-                pattern: '*.bam*',
+                pattern: '*.{bam,bai}',
                 saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
             ]
         ]
     }
 
 
-    withName: '.*BAM_MARKDUPLICATES_PICARD:PICARD_MARKDUPLICATES' {
+    withName: '.*BAM_MARKDUPLICATES_PICARD:PICARD_MARKDUPLICATES.*' {
         ext.args   = '--ASSUME_SORTED true --REMOVE_DUPLICATES false --VALIDATION_STRINGENCY LENIENT --TMP_DIR tmp'
         ext.prefix = { "${meta.id}.md" }
         publishDir = [
@@ -81,7 +89,7 @@ process {
                 saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
             ],
             [
-                path: { "${params.outdir}/markduplicates/bam" },
+                path: { "${params.outdir}/alignment/markduplicates" },
                 mode: params.publish_dir_mode,
                 pattern: '*.md.{bam,bai}',
                 saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
@@ -90,10 +98,10 @@ process {
         ]
     }
 
-    withName: '.*BAM_MARKDUPLICATES_PICARD:SAMTOOLS_INDEX' {
+    withName: '.*BAM_MARKDUPLICATES_PICARD:SAMTOOLS_INDEX.*' {
         ext.prefix  = { "${meta.id}.markdup.sorted" }
         publishDir  = [
-            path: { "${params.outdir}/markduplicates/bam" },
+            path: { "${params.outdir}/alignment/markduplicates" },
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
             pattern: '*.{bai,csi}'
@@ -110,4 +118,20 @@ process {
         ]
     }
 
+    withName : ".*DEEPTOOLS_BAMCOVERAGE.*" {
+        publishDir = [
+            path: { "${params.outdir}/single_tracks/deeptools" },
+            mode: params.publish_dir_mode,
+            pattern: '*.bigWig'
+        ]
+    }
+
+    withName : ".*RTWOSAMPLESMLE.*" {
+        publishDir = [
+            path: { "${params.outdir}/comparisons/spp_mle" },
+            mode: params.publish_dir_mode,
+        ]
+    }
+
+
 }
diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf
index b39e726..eca6b1c 100644
--- a/subworkflows/local/prepare_genome.nf
+++ b/subworkflows/local/prepare_genome.nf
@@ -42,10 +42,11 @@ workflow PREPARE_GENOME {
     //println(ch_fasta)
     // Make fasta file available if reference saved or IGV is run
     //if (params.save_reference || !params.skip_igv) {
-    if (params.save_reference) {    
-        file("${params.outdir}/genome/").mkdirs()
-        ch_fasta.copyTo("${params.outdir}/genome/")
-    }
+
+    // if (params.save_reference) {    
+    //     file("${params.outdir}/genome/").mkdirs()
+    //     ch_fasta.copyTo("${params.outdir}/genome/")
+    // }
 
 
     //
diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf
index 4587d35..af75c05 100644
--- a/workflows/sammyseq.nf
+++ b/workflows/sammyseq.nf
@@ -140,19 +140,15 @@ workflow SAMMYSEQ {
                         [id.id, meta, path]
                       }
                      .groupTuple()
-                     .filter{ it[1].size() == 2 } //filtra per numero di meta presenti dopo il tupla se hai due meta vuol dire che devi unire due campioni 
+                     .filter{ it[1].size() >= 2 } //filtra per numero di meta presenti dopo il tupla se hai due meta vuol dire che devi unire due campioni 
                      .map{ id, meta, path -> 
+                        single = meta[0].subMap('single_end')
                         meta = meta[0]
-                        def readsSample1 = [path[0][0], path[0][1]].findAll()
-                        def readsSample2 = [path[1][0], path[1][1]].findAll()
-                        if(readsSample1.size() == 1 && readsSample2.size() == 1) {
-                            [meta, readsSample1 + readsSample2] 
-                        } else {
-                            [meta, readsSample1, readsSample2]
-                        }
+                        def flatPath = path.flatten()
+                        [ meta , flatPath ]
                       }
 
-    //ch_merge_lane.view{"ch_merge_lane ${it}"}
+    ch_merge_lane.view{"ch_merge_lane ${it}"}
 
     ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)
 
@@ -171,7 +167,7 @@ workflow SAMMYSEQ {
 
     //ch_starter.view()
 
-    if (params.stopAt == 'CAT_FASTQ_line') {
+    if (params.stopAt == 'CAT_FASTQ_lane') {
         return
     }
 
@@ -430,6 +426,7 @@ workflow SAMMYSEQ {
     ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([]))
 
     ch_multiqc_files = ch_multiqc_files.mix(BAM_MARKDUPLICATES_PICARD.out.stats.collect{it[1]}.ifEmpty([]))
+    ch_multiqc_files = ch_multiqc_files.mix(BAM_MARKDUPLICATES_PICARD.out.metrics.collect{it[1]}.ifEmpty([]))
     ch_multiqc_files = ch_multiqc_files.mix(BAM_MARKDUPLICATES_PICARD.out.flagstat.collect{it[1]}.ifEmpty([]))
     ch_multiqc_files = ch_multiqc_files.mix(BAM_MARKDUPLICATES_PICARD.out.idxstats.collect{it[1]}.ifEmpty([]))
 

From 6368b290fee23e244b9bcd2e92ba5218054fee09 Mon Sep 17 00:00:00 2001
From: Lucio Di Filippo <difilippolucio@gmail.com>
Date: Sat, 4 Nov 2023 21:19:43 +0100
Subject: [PATCH 06/24] merge fraction and lane bugfix

---
 subworkflows/local/cat_fractions.nf | 50 +++++++++++++++++++++++++++++
 workflows/sammyseq.nf               | 34 ++++++--------------
 2 files changed, 60 insertions(+), 24 deletions(-)
 create mode 100644 subworkflows/local/cat_fractions.nf

diff --git a/subworkflows/local/cat_fractions.nf b/subworkflows/local/cat_fractions.nf
new file mode 100644
index 0000000..c752644
--- /dev/null
+++ b/subworkflows/local/cat_fractions.nf
@@ -0,0 +1,50 @@
+// Import SAMTOOLS_FAIDX module
+include { CAT_FASTQ                   } from '../../modules/nf-core/cat/fastq/main'
+
+// Definition of the subworkflow
+workflow CAT_FRACTIONS {
+    take:
+    //reads_to_merge
+    //all_reads
+    starting_files
+
+    main:
+
+        all_files = starting_files.map { meta, fastq -> 
+            def expID = meta.expID  // Prendi l'expID da meta
+            meta.remove('fraction') // Rimuovi il campo 'fraction'
+            [expID, meta, fastq]
+        }
+
+        all_files.groupTuple( by:0 )
+            .map{ expID, meta , fastq ->
+                meta2 = meta[0].clone()  // Prendi solo la prima mappa e crea una copia
+                meta2.id = expID  // Sostituisci il valore nel campo id con expID 
+                [ [meta2], fastq.flatten().toList() ]
+            }
+            .set { reads_to_merge }
+
+            reads_to_merge
+            .filter { meta, fastq -> fastq.size() > 1 }    
+            .set { ch_reads_to_process_in_CAT_FASTQ }
+
+            ch_reads_to_process_in_CAT_FASTQ    
+            .flatMap { meta, fastq  -> 
+                    meta.collect { m -> [m,fastq]}
+                }
+            .set { ch_to_CAT }
+
+            //ch_to_CAT.view{"ch_to_CAT : ${it}"}
+
+            CAT_FASTQ (
+                    ch_to_CAT
+            ).reads.set { cat_fastq_output }        
+
+            ch_cat_adjusted = CAT_FASTQ.out.reads.map { meta, fastq ->
+                    return [meta, fastq]
+            } // make fastq of CAT_FASTQ a list of paths
+            merged_reads = starting_files.mix(ch_cat_adjusted)
+
+    emit:
+    merged_reads = merged_reads
+}
\ No newline at end of file
diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf
index af75c05..f02b197 100644
--- a/workflows/sammyseq.nf
+++ b/workflows/sammyseq.nf
@@ -49,10 +49,12 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil
 include { INPUT_CHECK } from '../subworkflows/local/input_check'
 include { PREPARE_GENOME      } from '../subworkflows/local/prepare_genome'
 include { FASTQ_ALIGN_BWAALN  } from '../subworkflows/nf-core/fastq_align_bwaaln/main.nf'
+include { CAT_FRACTIONS } from '../subworkflows/local/cat_fractions'
 include { BAM_MARKDUPLICATES_PICARD } from '../subworkflows/nf-core/bam_markduplicates_picard'
 include { CUT_SIZES_GENOME } from "../modules/local/chromosomes_size"
 include { RTWOSAMPLESMLE } from '../modules/local/rtwosamplesmle'
 
+
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     IMPORT NF-CORE MODULES/SUBWORKFLOWS
@@ -148,7 +150,7 @@ workflow SAMMYSEQ {
                         [ meta , flatPath ]
                       }
 
-    ch_merge_lane.view{"ch_merge_lane ${it}"}
+    //ch_merge_lane.view{"ch_merge_lane ${it}"}
 
     ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)
 
@@ -165,7 +167,7 @@ workflow SAMMYSEQ {
     //cat_lane_output.view()     
     ch_starter = cat_lane_output.mix(ch_notmerge_lane)
 
-    //ch_starter.view()
+    //ch_starter.view{"ch_starter : ${it}"}
 
     if (params.stopAt == 'CAT_FASTQ_lane') {
         return
@@ -181,35 +183,19 @@ workflow SAMMYSEQ {
 
     if(params.combine_fractions){
 
-            INPUT_CHECK.out.reads_to_merge
-            .filter { meta, fastq -> fastq.size() > 1 }    
-            .set { ch_reads_to_process_in_CAT_FASTQ }
-
-            ch_reads_to_process_in_CAT_FASTQ    
-            .flatMap { meta, fastq  -> 
-                    meta.collect { m -> [m,fastq]}
-                }
-            .set { ch_to_CAT }
-
-            //ch_to_CAT.view{"ch_to_CAT : ${it}"}
-
-            CAT_FASTQ (
-                    ch_to_CAT
-            ).reads.set { cat_fastq_output }        
-
-            ch_cat_adjusted = CAT_FASTQ.out.reads.map { meta, fastq ->
-                    return [meta, [fastq]]
-            } // make fastq of CAT_FASTQ a list of paths
-            merged_reads = INPUT_CHECK.out.reads.mix(ch_cat_adjusted)
+        merged_reads = CAT_FRACTIONS(//INPUT_CHECK.out.reads_to_merge,
+                                    //INPUT_CHECK.out.reads
+                                    ch_starter
+                                    )//.out.merged_reads
 
     } else {
         //merged_reads = INPUT_CHECK.out.reads
         merged_reads = ch_starter
     }
 
-    //merged_reads.view()
+    //merged_reads.view{"merged_reads: ${it}"}
 
-    if (params.stopAt == 'CAT_FASTQ') {
+    if (params.stopAt == 'CAT_FRACTIONS') {
         return
     }
 

From 7dd7f3e2e260aaebdc2deb613e6b239a22d0fee8 Mon Sep 17 00:00:00 2001
From: Lucio Di Filippo <difilippolucio@gmail.com>
Date: Sat, 4 Nov 2023 22:52:44 +0100
Subject: [PATCH 07/24] add two_samples_mle output to results folder

---
 conf/modules.config                  | 7 +++++++
 modules/local/rtwosamplesmle/main.nf | 3 ++-
 workflows/sammyseq.nf                | 4 ++++
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/conf/modules.config b/conf/modules.config
index 75dbc7e..7954838 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -118,6 +118,13 @@ process {
         ]
     }
 
+    withName : ".*SAMTOOLS_FAIDX.*" {
+        publishDir = [
+            path: { "${params.outdir}/genome" },
+            mode: params.publish_dir_mode,
+        ]
+    }
+
     withName : ".*DEEPTOOLS_BAMCOVERAGE.*" {
         publishDir = [
             path: { "${params.outdir}/single_tracks/deeptools" },
diff --git a/modules/local/rtwosamplesmle/main.nf b/modules/local/rtwosamplesmle/main.nf
index 2161cc5..646794a 100644
--- a/modules/local/rtwosamplesmle/main.nf
+++ b/modules/local/rtwosamplesmle/main.nf
@@ -48,11 +48,12 @@ process RTWOSAMPLESMLE {
     path chromsizes_file
 
 
-    //output:
+    output:
     // TODO nf-core: Named file extensions MUST be emitted for ALL output channels
     //tuple val(meta), path("*.bam"), emit: bam
     // TODO nf-core: List additional required output channels/values here
     //path "versions.yml"           , emit: versions
+    tuple val(meta), path("*_mle.txt")              , emit: results
 
     when:
     task.ext.when == null || task.ext.when
diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf
index f02b197..deb5e15 100644
--- a/workflows/sammyseq.nf
+++ b/workflows/sammyseq.nf
@@ -256,6 +256,10 @@ workflow SAMMYSEQ {
 
     ch_fai_for_cut = SAMTOOLS_FAIDX.out.fai.collect()
 
+    if (params.stopAt == 'SAMTOOLS_FAIDX') {
+        return
+    }
+
     CUT_SIZES_GENOME(ch_fai_for_cut)
     //CUT_SIZES_GENOME.out.ch_sizes_genome.view()
 

From fcc592b406459cb40e5d68b81348910bd7862fdf Mon Sep 17 00:00:00 2001
From: Lucio Di Filippo <difilippolucio@gmail.com>
Date: Thu, 16 Nov 2023 09:30:47 +0100
Subject: [PATCH 08/24] add blacklist parameter

---
 conf/modules.config  | 12 ++++++++++++
 nextflow.config      |  3 +++
 nextflow_schema.json | 26 +++++++++++++++++++++++++-
 3 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/conf/modules.config b/conf/modules.config
index 7954838..77c68e6 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -126,6 +126,7 @@ process {
     }
 
     withName : ".*DEEPTOOLS_BAMCOVERAGE.*" {
+         
         publishDir = [
             path: { "${params.outdir}/single_tracks/deeptools" },
             mode: params.publish_dir_mode,
@@ -142,3 +143,14 @@ process {
 
 
 }
+
+if (params.blackListFile != null) {
+        process {
+            withName: '.*DEEPTOOLS_BAMCOVERAGE.*' {
+                ext.args    = "–blackListFileName ${params.blackListFile}"
+            }
+            }
+}
+
+
+
diff --git a/nextflow.config b/nextflow.config
index 5140b09..b63519c 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -34,6 +34,9 @@ params {
     max_multiqc_email_size     = '25.MB'
     multiqc_methods_description = null
 
+    // bam generation
+    blackListFile              = null
+
     // Boilerplate options
     outdir                     = null
     publish_dir_mode           = 'copy'
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 86cb4cd..7e1cd4c 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -15,7 +15,20 @@
                 "stopAt": {
                     "type": "string",
                     "description": "Specify after which step the pipeline should stop.",
-                    "fa_icon": "fas fa-stop-circle"
+                    "fa_icon": "fas fa-stop-circle",
+                    "enum": [
+                        "BEGIN",
+                        "PREPARE_GENOME",
+                        "INPUT_CHECK",
+                        "CAT_FASTQ_lane",
+                        "CAT_FRACTIONS",
+                        "TRIMMOMATIC",
+                        "FASTQ_ALIGN_BWAALN",
+                        "SAMTOOLS_FAIDX",
+                        "BAM_MARKDUPLICATES_PICARD",
+                        "DEEPTOOLS_BAMCOVERAGE",
+                        "RTWOSAMPLESMLE"
+                    ]
                 },
                 "input": {
                     "type": "string",
@@ -103,6 +116,17 @@
                 }
             }
         },
+        "bam options":{
+            "title": "option for the generation of bam",
+            "properties": {
+                "blackListFile" : {
+                    "type": "string",
+                    "format": "file-path",
+                    "description": "A BED or GTF file containing regions that should be excluded from all analyses",
+                    "help_text": "Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant"
+                }
+            }
+        },
         "institutional_config_options": {
             "title": "Institutional config options",
             "type": "object",

From 2ccc17c81348c572a3cf1b3b681a14def355996a Mon Sep 17 00:00:00 2001
From: Lucio Di Filippo <difilippolucio@gmail.com>
Date: Thu, 16 Nov 2023 09:47:15 +0100
Subject: [PATCH 09/24] bugfix nextflow_schema

---
 nextflow_schema.json | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index 7e1cd4c..9683793 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -116,17 +116,6 @@
                 }
             }
         },
-        "bam options":{
-            "title": "option for the generation of bam",
-            "properties": {
-                "blackListFile" : {
-                    "type": "string",
-                    "format": "file-path",
-                    "description": "A BED or GTF file containing regions that should be excluded from all analyses",
-                    "help_text": "Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant"
-                }
-            }
-        },
         "institutional_config_options": {
             "title": "Institutional config options",
             "type": "object",
@@ -338,5 +327,13 @@
         {
             "$ref": "#/definitions/generic_options"
         }
-    ]
+    ],
+    "properties": {
+        "blackListFile" : {
+            "type": "string",
+            "format": "file-path",
+            "description": "A BED or GTF file containing regions that should be excluded from all analyses",
+            "help_text": "Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant"
+        }
+    }
 }

From 2887f65d3253cfc5f5889a1a652970f387601771 Mon Sep 17 00:00:00 2001
From: daisymut <margherita.mutarelli@cnr.it>
Date: Thu, 16 Nov 2023 11:00:24 +0100
Subject: [PATCH 10/24] fix prettier and EditorConfig tests

---
 conf/modules.config  | 2 +-
 nextflow_schema.json | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 77c68e6..715dc71 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -126,7 +126,7 @@ process {
     }
 
     withName : ".*DEEPTOOLS_BAMCOVERAGE.*" {
-         
+
         publishDir = [
             path: { "${params.outdir}/single_tracks/deeptools" },
             mode: params.publish_dir_mode,
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 9683793..74324cc 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -329,7 +329,7 @@
         }
     ],
     "properties": {
-        "blackListFile" : {
+        "blackListFile": {
             "type": "string",
             "format": "file-path",
             "description": "A BED or GTF file containing regions that should be excluded from all analyses",

From 6ae1f176b09c2868535e078cd4629e4e867cb393 Mon Sep 17 00:00:00 2001
From: daisymut <margherita.mutarelli@cnr.it>
Date: Thu, 16 Nov 2023 11:09:25 +0100
Subject: [PATCH 11/24] changed test profile reference to align reads

---
 conf/test.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conf/test.config b/conf/test.config
index ed15af0..74c4282 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -23,5 +23,5 @@ params {
     input  = 'https://genome.isasi.cnr.it/biocomp/test-datasets/sammyseq/testdata/chr22/samplesheet_test_github_chr22_tinier.csv'
 
     // Genome references
-    fasta  = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
+    fasta  = 'https://genome.isasi.cnr.it/biocomp/test-datasets/sammyseq/testdata/chr22/chr22.fa'
 }

From 51eb68e8b117d089fbd2c836ec363810a371bf0d Mon Sep 17 00:00:00 2001
From: daisymut <margherita.mutarelli@cnr.it>
Date: Thu, 16 Nov 2023 12:58:38 +0100
Subject: [PATCH 12/24] modified example samplesheetmodified example
 samplesheet

---
 assets/samplesheet.csv | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
index 5f653ab..3bc0416 100644
--- a/assets/samplesheet.csv
+++ b/assets/samplesheet.csv
@@ -1,3 +1,3 @@
-sample,fastq_1,fastq_2
-SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz
-SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz,
+sample,fastq_1,fastq_2,experimentalID,fraction
+SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz,SAMPLE_PAIRED_END_EXPID,S2
+SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz,,SAMPLE_SINGLE_END_EXPID,S2

From c5709a902d3412ee896df29c0812c06b09005fe1 Mon Sep 17 00:00:00 2001
From: daisymut <margherita.mutarelli@cnr.it>
Date: Thu, 16 Nov 2023 13:21:55 +0100
Subject: [PATCH 13/24] bumped 0.01dev version

---
 assets/multiqc_config.yml | 4 ++--
 nextflow.config           | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
index 541c831..590b51c 100644
--- a/assets/multiqc_config.yml
+++ b/assets/multiqc_config.yml
@@ -1,8 +1,8 @@
 report_comment: >
 
-  This report has been generated by the <a href="https://github.com/nf-core/sammyseq/releases/tag/0.01" target="_blank">nf-core/sammyseq</a>
+  This report has been generated by the <a href="https://github.com/nf-core/sammyseq/releases/tag/dev" target="_blank">nf-core/sammyseq</a>
   analysis pipeline. For information about how to interpret these results, please see the
-  <a href="https://nf-co.re/sammyseq/0.01/docs/output" target="_blank">documentation</a>.
+  <a href="https://nf-co.re/sammyseq/dev/docs/output" target="_blank">documentation</a>.
 
 report_section_order:
   "nf-core-sammyseq-methods-description":
diff --git a/nextflow.config b/nextflow.config
index b63519c..082e371 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -9,7 +9,7 @@
 // Global default params, used in configs
 params {
 
-    
+
 
     // TODO nf-core: Specify your pipeline's command line flags
     // Input options
@@ -22,7 +22,7 @@ params {
     igenomes_ignore            = false
     bwa_index                  = null
     save_reference             = false
-    
+
     //overall analysis errorStrategy
     stopAt                     = ''
     combine_fractions          = false
@@ -55,7 +55,7 @@ params {
     custom_config_base         = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}"
     config_profile_contact     = null
     config_profile_url         = null
-    
+
 
     // Max resource options
     // Defaults only, expecting to be overwritten
@@ -241,7 +241,7 @@ manifest {
     description     = """analysis pipeline for SAMMY-seq"""
     mainScript      = 'main.nf'
     nextflowVersion = '!>=23.04.0'
-    version         = '0.01'
+    version = '0.01dev'
     doi             = ''
 }
 

From 969f3f8cf4309b71f5db6a3da5da9d77a9743168 Mon Sep 17 00:00:00 2001
From: daisymut <margherita.mutarelli@cnr.it>
Date: Thu, 16 Nov 2023 15:41:36 +0100
Subject: [PATCH 14/24] fixed link in multiqc report to dev version &
 temporarily disabled lint failing test

---
 .nf-core.yml              | 3 +++
 assets/multiqc_config.yml | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/.nf-core.yml b/.nf-core.yml
index 3805dc8..27becf0 100644
--- a/.nf-core.yml
+++ b/.nf-core.yml
@@ -1 +1,4 @@
 repository_type: pipeline
+lint:
+    multiqc_config:
+        report_comment: False
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
index 590b51c..7dfa5ed 100644
--- a/assets/multiqc_config.yml
+++ b/assets/multiqc_config.yml
@@ -1,6 +1,6 @@
 report_comment: >
 
-  This report has been generated by the <a href="https://github.com/nf-core/sammyseq/releases/tag/dev" target="_blank">nf-core/sammyseq</a>
+  This report has been generated by the <a href="https://github.com/nf-core/sammyseq/tree/dev" target="_blank">nf-core/sammyseq</a>
   analysis pipeline. For information about how to interpret these results, please see the
   <a href="https://nf-co.re/sammyseq/dev/docs/output" target="_blank">documentation</a>.
 

From 03f577bc5cc431253bf05435e7e95bc1405fd55e Mon Sep 17 00:00:00 2001
From: daisymut <margherita.mutarelli@cnr.it>
Date: Thu, 16 Nov 2023 15:51:46 +0100
Subject: [PATCH 15/24] fixed prettier

---
 .nf-core.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.nf-core.yml b/.nf-core.yml
index 27becf0..5085ace 100644
--- a/.nf-core.yml
+++ b/.nf-core.yml
@@ -1,4 +1,4 @@
 repository_type: pipeline
 lint:
-    multiqc_config:
-        report_comment: False
+  multiqc_config:
+    report_comment: False

From f8fd5df5dcf7b742bd75e256c22be95281474497 Mon Sep 17 00:00:00 2001
From: daisymut <margherita.mutarelli@cnr.it>
Date: Thu, 16 Nov 2023 16:43:40 +0100
Subject: [PATCH 16/24] updated CITATIONS.md

---
 CITATIONS.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/CITATIONS.md b/CITATIONS.md
index 6411d58..fc47df4 100644
--- a/CITATIONS.md
+++ b/CITATIONS.md
@@ -12,7 +12,7 @@
 
 - [BWA](https://www.ncbi.nlm.nih.gov/pubmed/19451168/)
 
-> Li H, Durbin R. Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics. 2009 Jul 15;25(14):1754-60. doi: 10.1093/bioinformatics/btp324. Epub 2009 May 18. PubMed PMID: 19451168; PubMed Central PMCID: PMC2705234.
+  > Li H, Durbin R. Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics. 2009 Jul 15;25(14):1754-60. doi: 10.1093/bioinformatics/btp324. Epub 2009 May 18. PubMed PMID: 19451168; PubMed Central PMCID: PMC2705234.
 
 - [deepTools](https://www.ncbi.nlm.nih.gov/pubmed/27079975/)
 
@@ -32,7 +32,9 @@
 
   > Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup. The Sequence Alignment/Map format and SAMtools. Bioinformatics. 2009 Aug 15;25(16):2078-9. doi: 10.1093/bioinformatics/btp352. Epub 2009 Jun 8. PubMed PMID: 19505943; PubMed Central PMCID: PMC2723002.
 
-- [Trim Galore!](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/)
+- [Trimmomatic](https://pubmed.ncbi.nlm.nih.gov/24695404/)
+
+  > Bolger AM, Lohse M, Usadel B. Trimmomatic: a flexible trimmer for Illumina sequence data. Bioinformatics. 2014 Aug 1;30(15):2114-20. doi: 10.1093/bioinformatics/btu170. Epub 2014 Apr 1. PMID: 24695404; PMCID: PMC4103590.
 
 ## R packages
 

From 3b7a6c8fdd703d7a7340243b93395441fd445501 Mon Sep 17 00:00:00 2001
From: daisymut <margherita.mutarelli@cnr.it>
Date: Thu, 16 Nov 2023 18:11:02 +0100
Subject: [PATCH 17/24] updated parameters

---
 main.nf              |  1 -
 nextflow_schema.json | 11 +++++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/main.nf b/main.nf
index 615706d..f1fd011 100644
--- a/main.nf
+++ b/main.nf
@@ -17,7 +17,6 @@ nextflow.enable.dsl = 2
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
 
-// TODO nf-core: Remove this line if you don't need a FASTA file
 //   This is an example of how to use getGenomeAttribute() to fetch parameters
 //   from igenomes.config using `--genome`
 params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta')
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 74324cc..124f29c 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -16,6 +16,7 @@
                     "type": "string",
                     "description": "Specify after which step the pipeline should stop.",
                     "fa_icon": "fas fa-stop-circle",
+                    "hidden": true,
                     "enum": [
                         "BEGIN",
                         "PREPARE_GENOME",
@@ -37,7 +38,7 @@
                     "mimetype": "text/csv",
                     "pattern": "^\\S+\\.csv$",
                     "description": "Path to comma-separated file containing information about the samples in the experiment.",
-                    "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/sammyseq/usage#samplesheet-input).",
+                    "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 5 columns, and a header row. See [usage docs](https://nf-co.re/sammyseq/usage#samplesheet-input).",
                     "fa_icon": "fas fa-file-csv"
                 },
                 "outdir": {
@@ -61,14 +62,16 @@
                 "comparisonFile": {
                     "type": "string",
                     "format": "file-path",
-                    "description": "Path to comma-separated file containing information about samples comparisons",
+                    "description": "Path to comma-separated file containing the sample names for the desired paired comparisons",
                     "mimetype": "text/csv",
-                    "help_text": "You will need to create a comparisons file with information about the comparisons in your experiment before running the pipeline. Use this parameter to specify the necessary comparisons. It has to be a comma-separated file with 2 columns, one for the first sample to compair and the second one to sample against which to do it."
+                    "help_text": "You will need to create a comparisons file with information about the desired comparisons in your experiment before running the pipeline. It has to be a comma-separated file with 2 columns: one for the first sample to compare and the second one to sample against which to do it. The sample names must correspond to sample names present in the first column of the samplesheet (--input parameter)",
+                    "fa_icon": "fas fa-file-csv"
                 },
                 "combine_fractions": {
                     "type": "boolean",
                     "description": "when combined, the fastqs of individual fractions from the same samples will be merged into a new fastq.",
-                    "help_text": "the combined fastq file can be used in order to obtain information on the whole sample or to perform additional analyses not included"
+                    "help_text": "the combined fastq file can be used in order to obtain information on the whole sample or to perform additional analyses not included",
+                    "fa_icon": "fas fa-merge"
                 }
             }
         },

From bf80c9b2a75b492022266318735c6dfc1e6067e3 Mon Sep 17 00:00:00 2001
From: daisymut <margherita.mutarelli@cnr.it>
Date: Thu, 16 Nov 2023 20:07:58 +0100
Subject: [PATCH 18/24] R module bugfixing

---
 modules/local/rtwosamplesmle/main.nf          |  2 +-
 .../templates/two_samples_mle.R               | 46 +++++++++++--------
 2 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/modules/local/rtwosamplesmle/main.nf b/modules/local/rtwosamplesmle/main.nf
index 646794a..a5d1e55 100644
--- a/modules/local/rtwosamplesmle/main.nf
+++ b/modules/local/rtwosamplesmle/main.nf
@@ -53,7 +53,7 @@ process RTWOSAMPLESMLE {
     //tuple val(meta), path("*.bam"), emit: bam
     // TODO nf-core: List additional required output channels/values here
     //path "versions.yml"           , emit: versions
-    tuple val(meta), path("*_mle.txt")              , emit: results
+    tuple val(meta), path("*_mle.bw")              , emit: results
 
     when:
     task.ext.when == null || task.ext.when
diff --git a/modules/local/rtwosamplesmle/templates/two_samples_mle.R b/modules/local/rtwosamplesmle/templates/two_samples_mle.R
index 5eb2b3e..805e72a 100644
--- a/modules/local/rtwosamplesmle/templates/two_samples_mle.R
+++ b/modules/local/rtwosamplesmle/templates/two_samples_mle.R
@@ -14,15 +14,15 @@ require(rtracklayer)
 # FUNCTIONS
 ################################################
 
-sortbychr <- function(x, chrcol="chr", stcol="start", endcol=NULL, chrorder=paste("chr", c(seq(22), "X", "Y"), sep="")) {
-        if (!(is.null(endcol))) {
-                x <- x[order(x[,endcol]),]
-        }
-        x <- x[order(x[,stcol]),]
-        chrs <- ordered(x[,chrcol], levels=chrorder)
-        # chrs <- ordered(tmp, levels=paste("chr", c(seq(22), "X", "Y"), sep=""))
-        x <- x[order(chrs),]
-}
+# sortbychr <- function(x, chrcol="chr", stcol="start", endcol=NULL, chrorder=paste("chr", c(seq(22), "X", "Y"), sep="")) {
+#         if (!(is.null(endcol))) {
+#                 x <- x[order(x[,endcol]),]
+#         }
+#         x <- x[order(x[,stcol]),]
+#         chrs <- ordered(x[,chrcol], levels=chrorder)
+#         # chrs <- ordered(tmp, levels=paste("chr", c(seq(22), "X", "Y"), sep=""))
+#         x <- x[order(chrs),]
+# }
 
 ################################################
 ## PARAMS
@@ -36,8 +36,7 @@ sortbychr <- function(x, chrcol="chr", stcol="start", endcol=NULL, chrorder=past
 ip_file <- "${bam1}"
 input_file <- "${bam2}"
 chromsizes_file <- "${chromsizes_file}"
-mle_output_file <- "${bam1.baseName}VS${bam2.baseName}_mle.txt"
-remove_anomalies <- FALSE
+mle_output_file <- "${bam1.baseName}_VS_${bam2.baseName}_mle.bw"
 
 ################################################
 # DEFAULT PARAMETERS
@@ -45,6 +44,7 @@ remove_anomalies <- FALSE
 
 remove_anomalies <- FALSE
 debug_mode <- TRUE
+stebp = 50
 
 ################################################
 # CORE PROCESSES
@@ -66,6 +66,7 @@ chromsizes <- fread(chromsizes_file, data.table = FALSE)
 #chromsizes <- sortbychr(chromsizes, chrcol="V1", stcol="V2", chrorder=paste("chr", c(seq(19), "X", "Y"), sep=""))
 #chrs <- as.character(sub('chr', '', chromsizes[, 1]))
 chrs <- chromsizes[, 1]
+rownames(chromsizes) <- chrs
 
 ### Import the data
 
@@ -75,31 +76,40 @@ input <- read.bam.tags(input_file)
 ### Select the informative tags
 ip_tags <- ip[['tags']]
 input_tags <- input[['tags']]
+
 print("second part")
 ### Remove the tags with anomalies
 if(remove_anomalies==TRUE){
     ip_rm <- remove.local.tag.anomalies(ip_tags[chrs])
     input_rm <- remove.local.tag.anomalies(input_tags[chrs])
 }else{
-    ip_rm <- ip_tags
-    input_rm <- input_tags
+    ip_rm <- ip_tags[chrs]
+    input_rm <- input_tags[chrs]
 }
 
 print("third part")
 ### Make the smoothing with the loglikelihood function
-print("lane 90")
+print("line 90")
 #print(paste0("ip_rm=",head(ip_rm)))
 #print(paste0("input_rm=",head(ip_rm)))
-chip_smoothed_mle <- get.smoothed.enrichment.mle(ip_rm, input_rm, tag.shift = 0, background.density.scaling = FALSE)
-print("lane 91")
+# chip_smoothed_mle <- get.smoothed.enrichment.mle(ip_rm, input_rm, tag.shift = 0, background.density.scaling = FALSE)
+ip_chrs <- names(ip_rm)[sapply(ip_rm, length) > 10]
+input_chrs <- names(input_rm)[sapply(input_rm, length) > 10]
+chr2keep <- intersect(ip_chrs, input_chrs)
+# length(chr2keep)
+
+chip_smoothed_mle <- get.smoothed.enrichment.mle(ip_rm[chr2keep], input_rm[chr2keep], tag.shift = 0, background.density.scaling = FALSE, step = stebp)
+
+print("line 91")
 #chip_smoothed_mle <- get.smoothed.enrichment.mle(ip_tags, input_tags, tag.shift = 0, background.density.scaling = FALSE)
 
 ccl <- sapply(chip_smoothed_mle, nrow)
 cpos <- unlist(sapply(chip_smoothed_mle, function(csm) csm\$x))
+cend <- cpos + stebp - 1
 ccov <- unlist(sapply(chip_smoothed_mle, function(csm) csm\$y))
 print("line 97")
-gr.mle <- GRanges(seqnames = rep(names(chip_smoothed_mle), ccl), ranges = IRanges(start = cpos, end = cpos), strand = Rle('*', sum(ccl)), score = ccov)
-seqlengths(gr.mle) <- chromsizes[, 2]
+gr.mle <- GRanges(seqnames = rep(names(chip_smoothed_mle), ccl), ranges = IRanges(start = cpos, end = cend), strand = Rle('*', sum(ccl)), score = ccov)
+seqlengths(gr.mle) <- chromsizes[chr2keep, 2]
 
 #rename score col
 col_names <- names(mcols(gr.mle))

From 3be8ef117aa41ba4f096fb4fc61b6fd01f9c37f1 Mon Sep 17 00:00:00 2001
From: daisymut <margherita.mutarelli@cnr.it>
Date: Fri, 17 Nov 2023 04:22:51 +0100
Subject: [PATCH 19/24] updated usage and configuration

---
 README.md            |  15 +-----
 conf/base.config     |   4 +-
 docs/usage.md        | 114 +++++++++++++++++++++++++++----------------
 nextflow.config      |   7 +--
 nextflow_schema.json |  21 ++++----
 5 files changed, 89 insertions(+), 72 deletions(-)

diff --git a/README.md b/README.md
index 1f2b3fa..340504e 100644
--- a/README.md
+++ b/README.md
@@ -46,9 +46,6 @@ to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/i
 with `-profile test` before running the workflow on actual data.
 :::
 
-<!-- TODO nf-core: Describe the minimum required steps to execute the pipeline, e.g. how to prepare samplesheets.
-     Explain what rows and columns represent. For instance (please edit as appropriate -->
-
 First, prepare a samplesheet with your input data that looks as follows:
 
 `samplesheet.csv`:
@@ -64,11 +61,10 @@ Each row represents a fastq file (single-end) or a pair of fastq files (paired e
 
 Now, you can run the pipeline using:
 
-<!-- TODO nf-core: update the following command to include all required parameters for a minimal example -->
-
 ```bash
 nextflow run nf-core/sammyseq \
    -profile <docker/singularity/.../institute> \
+   --fasta reference_genome.fa \
    --input samplesheet.csv \
    --outdir <OUTDIR>
 ```
@@ -78,19 +74,12 @@ or
 ```bash
 nextflow run nf-core/sammyseq \
    -profile <docker/singularity/.../institute> \
+   --fasta reference_genome.fa \
    --input samplesheet.csv \
    --outdir <OUTDIR> \
    --comparisonFile comparisons.csv
 ```
 
-`comparisons.csv`:
-
-```csv
-sample1,sample2
-CTRL004_S2,CTRL004_S3
-CTRL004_S2,CTRL004_S4
-```
-
 :::warning
 Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those
 provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_;
diff --git a/conf/base.config b/conf/base.config
index f0a8691..05cea2b 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -10,7 +10,7 @@
 
 process {
 
-    // TODO nf-core: Check the defaults for all processes
+    // nf-core: Check the defaults for all processes
     cpus   = { check_max( 1    * task.attempt, 'cpus'   ) }
     memory = { check_max( 6.GB * task.attempt, 'memory' ) }
     time   = { check_max( 4.h  * task.attempt, 'time'   ) }
@@ -24,7 +24,7 @@ process {
     //        These labels are used and recognised by default in DSL2 files hosted on nf-core/modules.
     //        If possible, it would be nice to keep the same label naming convention when
     //        adding in your local modules too.
-    // TODO nf-core: Customise requirements for specific processes.
+    // nf-core: Customise requirements for specific processes.
     // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
     withLabel:process_single {
         cpus   = { check_max( 1                  , 'cpus'    ) }
diff --git a/docs/usage.md b/docs/usage.md
index 30ecc2c..6233631 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -6,31 +6,79 @@
 
 ## Introduction
 
-<!-- TODO nf-core: Add documentation about anything specific to running your pipeline. For general topics, please point to (and add to) the main nf-core website. -->
+sammyseqis a workflow designed for the analysis of Sequential Analysis of MacroMolecules accessibilitY sequencing (SAMMY-seq) data, a cheap and effective methodology to analyze chromatin state in cells. SAMMY-seq is an innovative technique based on the separation of chromatin in fractions, each progressively based on their solubility and accessibility, and extraction and sequencing of the DNA present in each of them.
+
+## Running the pipeline
+
+The typical command for running the pipeline is as follows:
+
+```bash
+nextflow run nf-core/sammyseq -r dev \
+    -profile docker \
+    --fasta ./reference_genome.fa\
+    --input ./samplesheet.csv \
+    --outdir ./results
+```
+
+This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.
+
+Note that the pipeline will create the following files in your working directory:
+
+```bash
+work                # Directory containing the nextflow working files
+<OUTDIR>            # Finished results in specified location (defined with --outdir)
+.nextflow_log       # Log file from Nextflow
+# Other nextflow hidden files, eg. history of pipeline runs and old logs.
+```
+
+If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file.
+
+Pipeline settings can be provided in a `yaml` or `json` file via `-params-file <file>`.
+
+:::warning
+Do not use `-c <file>` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args).
+:::
+
+The above pipeline run specified with a params file in yaml format:
+
+```bash
+nextflow run nf-core/sammyseq -profile docker -params-file params.yaml
+```
+
+with `params.yaml` containing:
+
+```yaml
+input: './samplesheet.csv'
+outdir: './results/'
+fasta: './genome.fa'
+<...>
+```
+
+You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch).
 
 ## Samplesheet input
 
-You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below.
+Before running the pipeline, you will need to create a samplesheet with information about the samples you would like to analyze. Use this parameter to specify its location:
 
 ```bash
---input '[path to samplesheet file]'
+--input '[full path to samplesheet file]'
 ```
 
+It has to be a comma-separated file with 5 columns, and a header row as shown in the examples below.
+
 ### Multiple runs of the same sample
 
-The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 2 lanes:
+The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample fraction sequenced across 2 lanes:
 
 ```console
 sample,fastq_1,fastq_2,experimentalID,fraction
-CONTROL_REP1_S2,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz,CONTROL,S2
-CONTROL_REP1_S2,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz,CONTROL,S2
+CONTROL_REP1_S2,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz,CONTROL_REP1,S2
+CONTROL_REP1_S2,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz,CONTROL_REP1,S2
 ```
 
 ### Full samplesheet
 
-The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 5 columns to match those defined in the table below.
-
-<!-- A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. -->
+The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can contain a mixture of single- and paired-end but in case of multiple runs of the same `sample` they have to be of the same type to be correctly merged. There can be additional columns but the first 5 have to match those defined in the table below.
 
 ```console
 ssample,fastq_1,fastq_2,experimentalID,fraction
@@ -45,53 +93,35 @@ CTRL004_S4,/home/sammy/test_data/CTRL004_S4_chr22only.fq.gz,,CTRL004,S4
 | `fastq_1`        | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz".                                                             |
 | `fastq_2`        | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz".                                                             |
 | `experimentalID` | Experimental sample identifier. This represents the biological specimen of interest and will be the same for all fractions exctracted.                                                 |
-| `fraction`       | Fraction derived from SAMMY protocol, e.g. depending on the protocol it can be S2, S2L, S2S, S3, S4 .                                                                                  |
+| `fraction`       | Fraction derived from SAMMY protocol, e.g. depending on the protocol it can be S2, S2L, S2S, S3, S4.                                                                                   |
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
 
-## Running the pipeline
+### Pairwise comparisons
 
-The typical command for running the pipeline is as follows:
+It is possible to generate pairwise comparisons between two samples by providing a list with the parameter `--comparisonFile` to indicate the full path to a comma-separated file with 2 columns:
 
-```bash
-nextflow run nf-core/sammyseq --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker
-```
-
-This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.
-
-Note that the pipeline will create the following files in your working directory:
+`comparisons.csv`:
 
-```bash
-work                # Directory containing the nextflow working files
-<OUTDIR>            # Finished results in specified location (defined with --outdir)
-.nextflow_log       # Log file from Nextflow
-# Other nextflow hidden files, eg. history of pipeline runs and old logs.
+```csv
+sample1,sample2
+CTRL004_S2,CTRL004_S3
+CTRL004_S2,CTRL004_S4
 ```
 
-If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file.
+It can contain any combination of sample identifiers, they have to correspond to identifiers present in the `sample` column in the input file. When `--comparisonFile` is set, the difference between sample1 and sample2 read density profile, smoothed by the Gaussian kernel, is calculated and saved in bigwig format, as described in Kharchenko PK, Tolstorukov MY, Park PJ "Design and analysis of ChIP-seq experiments for DNA-binding proteins" Nat. Biotech. doi:10.1038/nbt.1508
 
-Pipeline settings can be provided in a `yaml` or `json` file via `-params-file <file>`.
+### Combine fractions
 
-:::warning
-Do not use `-c <file>` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args).
-:::
-
-The above pipeline run specified with a params file in yaml format:
+Optionally, the fractions extracted from the same `experimentalID` can be combined together for later use by setting the parameter `--combine_fractions`.
 
-```bash
-nextflow run nf-core/sammyseq -profile docker -params-file params.yaml
-```
+## Reference genome files
 
-with `params.yaml` containing:
+The minimum reference genome requirements is the FASTA file, provided with the mandatory parameter `--fasta`, the bwa index will be generated by the pipeline and can be saved for later reuse if the `--save_reference` parameter is passed. The index building step can be quite a time-consuming process and it permits their reuse for future runs of the pipeline to save disk space, if already present it can be passed using the `--bwa_index '/path/to/bwa/index/'` parameter.
 
-```yaml
-input: './samplesheet.csv'
-outdir: './results/'
-genome: 'GRCh37'
-<...>
-```
+## Blacklist bed files
 
-You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch).
+A blacklist of regions that will be excluded by signal tracks can be provided using the optional parameter `--blackListFile` with full path to a coordinate file in bed format. Blacklist files for several genome builds can be found in the [ENCODE Blacklist Project](https://github.com/Boyle-Lab/Blacklist).
 
 ### Updating the pipeline
 
diff --git a/nextflow.config b/nextflow.config
index 082e371..ded694b 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -9,9 +9,6 @@
 // Global default params, used in configs
 params {
 
-
-
-    // TODO nf-core: Specify your pipeline's command line flags
     // Input options
     input                      = null
     comparisonFile             = null
@@ -19,11 +16,11 @@ params {
     // References
     genome                     = null
     igenomes_base              = 's3://ngi-igenomes/igenomes'
-    igenomes_ignore            = false
+    igenomes_ignore            = true
     bwa_index                  = null
     save_reference             = false
 
-    //overall analysis errorStrategy
+    //overall analysis options
     stopAt                     = ''
     combine_fractions          = false
 
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 124f29c..dbc2628 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -80,6 +80,7 @@
             "type": "object",
             "fa_icon": "fas fa-dna",
             "description": "Reference genome related files and options required for the workflow.",
+            "required": ["fasta"],
             "properties": {
                 "genome": {
                     "type": "string",
@@ -115,7 +116,15 @@
                     "description": "Do not load the iGenomes reference config.",
                     "fa_icon": "fas fa-ban",
                     "hidden": true,
-                    "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`."
+                    "default": true,
+                    "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`. You can run `sammyseq` by specifying at least a FASTA genome file."
+                },
+                "blackListFile": {
+                    "type": "string",
+                    "format": "file-path",
+                    "fa_icon": "fas fa-ban",
+                    "description": "A BED or GTF file containing regions that should be excluded from all analyses",
+                    "help_text": "Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant"
                 }
             }
         },
@@ -330,13 +339,5 @@
         {
             "$ref": "#/definitions/generic_options"
         }
-    ],
-    "properties": {
-        "blackListFile": {
-            "type": "string",
-            "format": "file-path",
-            "description": "A BED or GTF file containing regions that should be excluded from all analyses",
-            "help_text": "Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant"
-        }
-    }
+    ]
 }

From 4fce10d27e278baba0733948dca163298e86a085 Mon Sep 17 00:00:00 2001
From: daisymut <margherita.mutarelli@cnr.it>
Date: Fri, 17 Nov 2023 11:27:12 +0100
Subject: [PATCH 20/24] initiated outpput documentation

---
 docs/output.md | 98 ++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 88 insertions(+), 10 deletions(-)

diff --git a/docs/output.md b/docs/output.md
index 0a7fdf3..afe0f30 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -6,18 +6,23 @@ This document describes the output produced by the pipeline. Most of the plots a
 
 The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory.
 
-<!-- TODO nf-core: Write this documentation describing your workflow's output -->
-
 ## Pipeline overview
 
 The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
 
-- [FastQC](#fastqc) - Raw read QC
+- [FastQC](#fastqc)
+- [Trim adapters](#trim-adapters)
+- [Align to Reference](#align-to-reference)
+- [Mark Duplicates](#mark-duplicates)
+- [`Signal track generation`](#signal-track-generation)
+- [`Comparisons`](#comparisons)
 - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
 - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
 
 ### FastQC
 
+[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about the sequenced reads. It provides information about the quality score distribution across reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).
+
 <details markdown="1">
 <summary>Output files</summary>
 
@@ -27,20 +32,83 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 
 </details>
 
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).
+#### Trim adapters
 
-![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png)
+[`Trimmomatic`](http://www.usadellab.org/cms/?page=trimmomatic) is a software for read trimming, which means it trims all reads in the front or the tail. This function is useful since sometimes you want to drop some cycles of a sequencing run. In the current implementation in Sarek
+`--detect_adapter_for_pe` is set by default which enables auto-detection of adapter sequences. For more information on how to fine-tune adapter trimming, take a look into the parameter docs.
 
-![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png)
+The resulting files are intermediate and by default not kept in the final files delivered to users. Set `--save_trimmed` to enable publishing of the files in:
+
+<details markdown="1">
+<summary>Output files for all samples</summary>
 
-![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png)
+**Output directory: `{outdir}/preprocessing/fastp/<sample>`**
+
+- `<sample>_<lane>_{1,2}.fastp.fastq.gz>`
+  - Bgzipped FastQ file
+
+</details>
 
 :::note
-The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality.
+The FastQC plots displayed in the MultiQC report shows both _untrimmed_ and _trimmed_ reads. They can be compared to check removal of adapter sequence and potentially regions with low quality after the trimming step.
 :::
 
+### Align to Reference
+
+[BWA](https://github.com/lh3/bwa) is a software package for mapping low-divergent sequences against a large reference genome. The aligned reads are then sorted with [samtools](https://www.htslib.org/doc/samtools.html).
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `alignment/bwa/`
+  - `<sample>.bam` and `<sample>.bam.bai`
+
+</details>
+
+### Mark Duplicates
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `alignment/markduplicates/`
+  - `<sample>.md.bam` and `<sample>.md.bam.bai`
+- `reports/markduplicates/`
+  - `<sample>.md.MarkDuplicates.metrics.txt`
+
+</details>
+
+Read pairs that are likely to have originated from duplicates of the same original DNA fragments through some artificial processes are identified. These are considered to be non-independent observations, so all but a single read pair within each set of duplicates are marked, not removed from the bam file.
+
+### Signal track generation
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `single_tracks/deeptools/`
+  - `<sample>.bigWig`
+
+</details>
+
+[deepTools](https://deeptools.readthedocs.io/en/develop/content/list_of_tools.html) is used to generate single fraction signals in [bigWig](https://genome.ucsc.edu/goldenpath/help/bigWig.html) format, an indexed binary format useful for displaying dense, continuous data in Genome Browsers such as the [UCSC](https://genome.ucsc.edu/cgi-bin/hgTracks) and [IGV](http://software.broadinstitute.org/software/igv/). The bigWig format is also supported by various bioinformatics software for downstream processing such as meta-profile plotting.
+
+### Comparisons
+
+When `--comparisonFile` is set, the difference between sample1 and sample2 read density profile, smoothed by the Gaussian kernel, is calculated and saved in bigwig format, as described in Kharchenko PK, Tolstorukov MY, Park PJ "Design and analysis of ChIP-seq experiments for DNA-binding proteins" Nat. Biotech. doi:10.1038/nbt.1508
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `comparisons/spp_mle/`
+  - `<sample1>.md_VS_<sample2>.md.bw`
+
+</details>
+
 ### MultiQC
 
+[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory.
+
+Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>.
+
 <details markdown="1">
 <summary>Output files</summary>
 
@@ -51,9 +119,19 @@ The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They m
 
 </details>
 
-[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory.
+### Reference genome files
 
-Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>.
+<details markdown="1">
+<summary>Output files</summary>
+
+- `genome/`
+
+  - A number of genome-specific files are generated by the pipeline in order to aid in the filtering of the data, and because they are required by standard tools such as BEDTools. These can be found in this directory.
+  - `bwa/`: Directory containing BWA indices.
+
+  - If the `--save_reference` parameter is provided then the alignment indices generated by the pipeline will be saved in this directory. This can be quite a time-consuming process so it permits their reuse for future runs of the pipeline or for other purposes.
+
+</details>
 
 ### Pipeline information
 

From e5d034d75cf3c088547843b288760047b9d6e9c4 Mon Sep 17 00:00:00 2001
From: daisymut <margherita.mutarelli@cnr.it>
Date: Fri, 17 Nov 2023 11:55:20 +0100
Subject: [PATCH 21/24] updatedd output.md

---
 docs/output.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/output.md b/docs/output.md
index afe0f30..adcc02a 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -11,11 +11,11 @@ The directories listed below will be created in the results directory after the
 The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
 
 - [FastQC](#fastqc)
-- [Trim adapters](#trim-adapters)
-- [Align to Reference](#align-to-reference)
-- [Mark Duplicates](#mark-duplicates)
-- [`Signal track generation`](#signal-track-generation)
-- [`Comparisons`](#comparisons)
+- [Trim reads](#trim-adapters)
+- [Alignment on Reference](#align-to-reference)
+- [Mark Duplicate reads](#mark-duplicates)
+- [Signal track generation](#signal-track-generation)
+- [Comparisons](#comparisons)
 - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
 - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
 

From f947060203b413efc9db3058f85ac72c7e42b047 Mon Sep 17 00:00:00 2001
From: daisymut <margherita.mutarelli@cnr.it>
Date: Fri, 17 Nov 2023 12:03:39 +0100
Subject: [PATCH 22/24] updated output.md

---
 docs/output.md | 29 ++++++++---------------------
 1 file changed, 8 insertions(+), 21 deletions(-)

diff --git a/docs/output.md b/docs/output.md
index adcc02a..47be2f8 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -11,9 +11,9 @@ The directories listed below will be created in the results directory after the
 The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
 
 - [FastQC](#fastqc)
-- [Trim reads](#trim-adapters)
-- [Alignment on Reference](#align-to-reference)
-- [Mark Duplicate reads](#mark-duplicates)
+- [Trim reads](#trim-reads)
+- [Alignment on Reference](#alignment-on-reference)
+- [Mark Duplicate reads](#mark-duplicate-reads)
 - [Signal track generation](#signal-track-generation)
 - [Comparisons](#comparisons)
 - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
@@ -32,30 +32,17 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 
 </details>
 
-#### Trim adapters
+#### Trim reads
 
-[`Trimmomatic`](http://www.usadellab.org/cms/?page=trimmomatic) is a software for read trimming, which means it trims all reads in the front or the tail. This function is useful since sometimes you want to drop some cycles of a sequencing run. In the current implementation in Sarek
-`--detect_adapter_for_pe` is set by default which enables auto-detection of adapter sequences. For more information on how to fine-tune adapter trimming, take a look into the parameter docs.
-
-The resulting files are intermediate and by default not kept in the final files delivered to users. Set `--save_trimmed` to enable publishing of the files in:
-
-<details markdown="1">
-<summary>Output files for all samples</summary>
-
-**Output directory: `{outdir}/preprocessing/fastp/<sample>`**
-
-- `<sample>_<lane>_{1,2}.fastp.fastq.gz>`
-  - Bgzipped FastQ file
-
-</details>
+[`Trimmomatic`](http://www.usadellab.org/cms/?page=trimmomatic) is a software used to trim adapter sequences and low quality bases from the end of reads and quality check after this step is performed again with Fastqc.
 
 :::note
 The FastQC plots displayed in the MultiQC report shows both _untrimmed_ and _trimmed_ reads. They can be compared to check removal of adapter sequence and potentially regions with low quality after the trimming step.
 :::
 
-### Align to Reference
+### Alignment on Reference
 
-[BWA](https://github.com/lh3/bwa) is a software package for mapping low-divergent sequences against a large reference genome. The aligned reads are then sorted with [samtools](https://www.htslib.org/doc/samtools.html).
+The alignment is performed using [BWA](https://github.com/lh3/bwa) and the aligned reads are then sorted by chromosome coordinates with [samtools](https://www.htslib.org/doc/samtools.html).
 
 <details markdown="1">
 <summary>Output files</summary>
@@ -65,7 +52,7 @@ The FastQC plots displayed in the MultiQC report shows both _untrimmed_ and _tri
 
 </details>
 
-### Mark Duplicates
+### Mark Duplicate reads
 
 <details markdown="1">
 <summary>Output files</summary>

From 67e55899d8e796044141013a9ab8f2626be3c5ec Mon Sep 17 00:00:00 2001
From: daisymut <margherita.mutarelli@cnr.it>
Date: Fri, 17 Nov 2023 12:22:32 +0100
Subject: [PATCH 23/24] updated output.md

---
 docs/output.md | 40 +++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/docs/output.md b/docs/output.md
index 47be2f8..4d92099 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -16,28 +16,32 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 - [Mark Duplicate reads](#mark-duplicate-reads)
 - [Signal track generation](#signal-track-generation)
 - [Comparisons](#comparisons)
-- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
-- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
+- [MultiQC](#multiqc)
+- [Pipeline information](#pipeline-information)
 
-### FastQC
+### Read quality check
+
+#### FastQC
 
 [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about the sequenced reads. It provides information about the quality score distribution across reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).
 
+#### Trim reads
+
+[`Trimmomatic`](http://www.usadellab.org/cms/?page=trimmomatic) is a software used to trim adapter sequences and low quality bases from the end of reads and quality check after this step is performed again with Fastqc.
+
 <details markdown="1">
 <summary>Output files</summary>
 
 - `fastqc/`
   - `*_fastqc.html`: FastQC report containing quality metrics.
   - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images.
+  - `*_trim_fastqc.html`: FastQC report containing quality metrics for trimmed reads.
+  - `*_trim_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images for trimmed reads.
 
 </details>
 
-#### Trim reads
-
-[`Trimmomatic`](http://www.usadellab.org/cms/?page=trimmomatic) is a software used to trim adapter sequences and low quality bases from the end of reads and quality check after this step is performed again with Fastqc.
-
 :::note
-The FastQC plots displayed in the MultiQC report shows both _untrimmed_ and _trimmed_ reads. They can be compared to check removal of adapter sequence and potentially regions with low quality after the trimming step.
+The FastQC plots displayed in the MultiQC report shows both _untrimmed_ and _trimmed_ reads so they can be directly compared.
 :::
 
 ### Alignment on Reference
@@ -54,6 +58,8 @@ The alignment is performed using [BWA](https://github.com/lh3/bwa) and the align
 
 ### Mark Duplicate reads
 
+Read pairs that are likely to have originated from duplicates of the same original DNA fragments through some artificial processes are identified. These are considered to be non-independent observations, so all but a single read pair within each set of duplicates are marked, not removed from the bam file.
+
 <details markdown="1">
 <summary>Output files</summary>
 
@@ -64,10 +70,10 @@ The alignment is performed using [BWA](https://github.com/lh3/bwa) and the align
 
 </details>
 
-Read pairs that are likely to have originated from duplicates of the same original DNA fragments through some artificial processes are identified. These are considered to be non-independent observations, so all but a single read pair within each set of duplicates are marked, not removed from the bam file.
-
 ### Signal track generation
 
+[deepTools](https://deeptools.readthedocs.io/en/develop/content/list_of_tools.html) is used to generate single fraction signals in [bigWig](https://genome.ucsc.edu/goldenpath/help/bigWig.html) format, an indexed binary format useful for displaying dense, continuous data in Genome Browsers such as the [UCSC](https://genome.ucsc.edu/cgi-bin/hgTracks) and [IGV](http://software.broadinstitute.org/software/igv/). The bigWig format is also supported by various bioinformatics software for downstream processing such as meta-profile plotting.
+
 <details markdown="1">
 <summary>Output files</summary>
 
@@ -76,11 +82,9 @@ Read pairs that are likely to have originated from duplicates of the same origin
 
 </details>
 
-[deepTools](https://deeptools.readthedocs.io/en/develop/content/list_of_tools.html) is used to generate single fraction signals in [bigWig](https://genome.ucsc.edu/goldenpath/help/bigWig.html) format, an indexed binary format useful for displaying dense, continuous data in Genome Browsers such as the [UCSC](https://genome.ucsc.edu/cgi-bin/hgTracks) and [IGV](http://software.broadinstitute.org/software/igv/). The bigWig format is also supported by various bioinformatics software for downstream processing such as meta-profile plotting.
-
 ### Comparisons
 
-When `--comparisonFile` is set, the difference between sample1 and sample2 read density profile, smoothed by the Gaussian kernel, is calculated and saved in bigwig format, as described in Kharchenko PK, Tolstorukov MY, Park PJ "Design and analysis of ChIP-seq experiments for DNA-binding proteins" Nat. Biotech. doi:10.1038/nbt.1508
+When `--comparisonFile` is set, the difference between sample1 and sample2 read density profile smoothed by the Gaussian kernel is calculated and saved in bigwig format, as described in Kharchenko PK, Tolstorukov MY, Park PJ "Design and analysis of ChIP-seq experiments for DNA-binding proteins" Nat. Biotech. doi:10.1038/nbt.1508
 
 <details markdown="1">
 <summary>Output files</summary>
@@ -108,20 +112,20 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ
 
 ### Reference genome files
 
+A number of genome-specific files if required by some of the analysis steps. If the `--save_reference` parameter is provided then the alignment indices generated by the pipeline will be saved in this directory.
+
 <details markdown="1">
 <summary>Output files</summary>
 
 - `genome/`
-
-  - A number of genome-specific files are generated by the pipeline in order to aid in the filtering of the data, and because they are required by standard tools such as BEDTools. These can be found in this directory.
   - `bwa/`: Directory containing BWA indices.
 
-  - If the `--save_reference` parameter is provided then the alignment indices generated by the pipeline will be saved in this directory. This can be quite a time-consuming process so it permits their reuse for future runs of the pipeline or for other purposes.
-
 </details>
 
 ### Pipeline information
 
+[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage.
+
 <details markdown="1">
 <summary>Output files</summary>
 
@@ -132,5 +136,3 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ
   - Parameters used by the pipeline run: `params.json`.
 
 </details>
-
-[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage.

From c207edb6d4bd1bfdef6d28077119f2302eb7da52 Mon Sep 17 00:00:00 2001
From: daisymut <margherita.mutarelli@cnr.it>
Date: Fri, 17 Nov 2023 12:51:18 +0100
Subject: [PATCH 24/24] updated usage.md and renamed example samplesheet

---
 docs/usage.md                                          | 10 +++++-----
 ...ome_isasi_chr22_tinier.csv => samplesheet_test.csv} |  0
 2 files changed, 5 insertions(+), 5 deletions(-)
 rename tests/{samplesheet_test_genome_isasi_chr22_tinier.csv => samplesheet_test.csv} (100%)

diff --git a/docs/usage.md b/docs/usage.md
index 6233631..5cca560 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -20,7 +20,7 @@ nextflow run nf-core/sammyseq -r dev \
     --outdir ./results
 ```
 
-This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.
+This will launch the pipeline with the `docker` configuration profile. See [below](#profile) for more information about profiles.
 
 Note that the pipeline will create the following files in your working directory:
 
@@ -95,7 +95,7 @@ CTRL004_S4,/home/sammy/test_data/CTRL004_S4_chr22only.fq.gz,,CTRL004,S4
 | `experimentalID` | Experimental sample identifier. This represents the biological specimen of interest and will be the same for all fractions exctracted.                                                 |
 | `fraction`       | Fraction derived from SAMMY protocol, e.g. depending on the protocol it can be S2, S2L, S2S, S3, S4.                                                                                   |
 
-An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
+An [example samplesheet](../tests/samplesheet_test.csv) has been provided with the pipeline.
 
 ### Pairwise comparisons
 
@@ -119,11 +119,11 @@ Optionally, the fractions extracted from the same `experimentalID` can be combin
 
 The minimum reference genome requirements is the FASTA file, provided with the mandatory parameter `--fasta`, the bwa index will be generated by the pipeline and can be saved for later reuse if the `--save_reference` parameter is passed. The index building step can be quite a time-consuming process and it permits their reuse for future runs of the pipeline to save disk space, if already present it can be passed using the `--bwa_index '/path/to/bwa/index/'` parameter.
 
-## Blacklist bed files
+### Blacklist bed files
 
 A blacklist of regions that will be excluded by signal tracks can be provided using the optional parameter `--blackListFile` with full path to a coordinate file in bed format. Blacklist files for several genome builds can be found in the [ENCODE Blacklist Project](https://github.com/Boyle-Lab/Blacklist).
 
-### Updating the pipeline
+## Updating the pipeline
 
 When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline:
 
@@ -131,7 +131,7 @@ When you run the above command, Nextflow automatically pulls the pipeline code f
 nextflow pull nf-core/sammyseq
 ```
 
-### Reproducibility
+## Reproducibility
 
 It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since.
 
diff --git a/tests/samplesheet_test_genome_isasi_chr22_tinier.csv b/tests/samplesheet_test.csv
similarity index 100%
rename from tests/samplesheet_test_genome_isasi_chr22_tinier.csv
rename to tests/samplesheet_test.csv