From eb055a2f114588899e8aafc687c316f5a857f57a Mon Sep 17 00:00:00 2001 From: Lucio Di Filippo Date: Sun, 29 Oct 2023 15:59:44 +0100 Subject: [PATCH 01/24] add merge Lane on paired end samples --- workflows/sammyseq.nf | 70 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 10 deletions(-) diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf index 5f7b3bf..027d095 100644 --- a/workflows/sammyseq.nf +++ b/workflows/sammyseq.nf @@ -113,15 +113,52 @@ workflow SAMMYSEQ { //INPUT_CHECK.out.reads.view() // merge fastq with same lane - ch_merge_lane = INPUT_CHECK.out.reads + // ch_merge_lane = INPUT_CHECK.out.reads + // .groupTuple() + // .map{ meta , path -> //if paired end ? + // meta = meta + // path = path.flatten() + // [meta , path] + // } + + //TODO th + + ch_notmerge_lane = INPUT_CHECK.out.reads + .map{ meta, path -> + id=meta.subMap('id') + meta=meta + path=path + [id.id, meta, path] + } .groupTuple() - .map{ meta , path -> //if paired end ? - meta = meta - path = path.flatten() - [meta , path] + .filter{ it[1].size() == 1 } + .map{ id, meta, path -> + meta_notmerge=meta[0] + path_notmerge=path[0] + [meta_notmerge, path_notmerge] } - ch_merge_lane.view{"ch_merge_lane ${it}"} + //INPUT_CHECK.out.reads.view{"INPUT_CHECK.out.reads : ${it}"} + //ch_notmerge_lane.view{"ch_notmerge_lane: ${it}"} + + ch_merge_lane = INPUT_CHECK.out.reads + .map{ meta, path -> + id=meta.subMap('id') + meta=meta + path=path + [id.id, meta, path] + } + .groupTuple() + .filter{ it[1].size() == 2 } //filtra per numero di meta presenti dopo il tupla se hai due meta vuol dire che devi unire due campioni + .map{ id, meta, path -> + meta = meta[0] + def readsSample1 = [path[0][0], path[0][1]] + def readsSample2 = [path[1][0], path[1][1]] + [meta, readsSample1, readsSample2] + + } + + //ch_merge_lane.view{"ch_merge_lane ${it}"} ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) @@ -129,11 +166,20 @@ workflow SAMMYSEQ { return } + ch_merge_lane.view{"ch_merge_lane : ${it}"} + CAT_FASTQ ( ch_merge_lane ).reads.set { cat_lane_output } //cat_lane_output.view() + ch_starter = cat_lane_output.mix(ch_notmerge_lane) + + //ch_starter.view() + + if (params.stopAt == 'CAT_FASTQ_line') { + return + } // TODO: OPTIONAL, you can use nf-validation plugin to create an input channel from the samplesheet with Channel.fromSamplesheet("input") @@ -168,9 +214,11 @@ workflow SAMMYSEQ { } else { //merged_reads = INPUT_CHECK.out.reads - merged_reads = cat_lane_output + merged_reads = ch_starter } + //merged_reads.view() + if (params.stopAt == 'CAT_FASTQ') { return } @@ -178,7 +226,7 @@ workflow SAMMYSEQ { // // MODULE: Run FastQC // - //merged_reads.view() + FASTQC ( @@ -187,15 +235,17 @@ workflow SAMMYSEQ { ch_versions = ch_versions.mix(FASTQC.out.versions.first()) TRIMMOMATIC ( + //INPUT_CHECK.out.reads + //ch_notmerge_lane merged_reads ) + //TRIMMOMATIC.out.trimmed_reads.view() + if (params.stopAt == 'TRIMMOMATIC') { return } - - //TRIMMOMATIC.out.trimmed_reads.view() FASTQ_ALIGN_BWAALN ( TRIMMOMATIC.out.trimmed_reads, From 93fd55ea29e1809bd396d9e9b58c4a1e12208da5 Mon Sep 17 00:00:00 2001 From: Lucio Di Filippo Date: Sun, 29 Oct 2023 17:04:35 +0100 Subject: [PATCH 02/24] bugfix single reads lane merge --- workflows/sammyseq.nf | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf index 027d095..89f74cb 100644 --- a/workflows/sammyseq.nf +++ b/workflows/sammyseq.nf @@ -112,16 +112,7 @@ workflow SAMMYSEQ { //INPUT_CHECK.out.reads.view() - // merge fastq with same lane - // ch_merge_lane = INPUT_CHECK.out.reads - // .groupTuple() - // .map{ meta , path -> //if paired end ? - // meta = meta - // path = path.flatten() - // [meta , path] - // } - - //TODO th + //TODO use branch ch_notmerge_lane = INPUT_CHECK.out.reads .map{ meta, path -> @@ -151,11 +142,14 @@ workflow SAMMYSEQ { .groupTuple() .filter{ it[1].size() == 2 } //filtra per numero di meta presenti dopo il tupla se hai due meta vuol dire che devi unire due campioni .map{ id, meta, path -> - meta = meta[0] - def readsSample1 = [path[0][0], path[0][1]] - def readsSample2 = [path[1][0], path[1][1]] - [meta, readsSample1, readsSample2] - + meta = meta[0] + def readsSample1 = [path[0][0], path[0][1]].findAll() + def readsSample2 = [path[1][0], path[1][1]].findAll() + if(readsSample1.size() == 1 && readsSample2.size() == 1) { + [meta, readsSample1 + readsSample2] + } else { + [meta, readsSample1, readsSample2] + } } //ch_merge_lane.view{"ch_merge_lane ${it}"} @@ -166,7 +160,7 @@ workflow SAMMYSEQ { return } - ch_merge_lane.view{"ch_merge_lane : ${it}"} + //ch_merge_lane.view{"ch_merge_lane : ${it}"} CAT_FASTQ ( ch_merge_lane From 8493bf6942f32d738abb3270f11bf632130ec55a Mon Sep 17 00:00:00 2001 From: Lucio Di Filippo Date: Tue, 31 Oct 2023 09:58:48 +0100 Subject: [PATCH 03/24] Markduplicates should mark and not remove duplicate reads : SOLVED --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index ef99c6d..551d67b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -71,7 +71,7 @@ process { withName: '.*BAM_MARKDUPLICATES_PICARD:PICARD_MARKDUPLICATES' { - ext.args = '--ASSUME_SORTED true --REMOVE_DUPLICATES true --VALIDATION_STRINGENCY LENIENT --TMP_DIR tmp' + ext.args = '--ASSUME_SORTED true --REMOVE_DUPLICATES false --VALIDATION_STRINGENCY LENIENT --TMP_DIR tmp' ext.prefix = { "${meta.id}.md" } publishDir = [ [ From 376520bfbe5ce4afe40241f0154136b18430894f Mon Sep 17 00:00:00 2001 From: Lucio Di Filippo Date: Tue, 31 Oct 2023 12:15:22 +0100 Subject: [PATCH 04/24] Add fastqc after trimming #16 --- workflows/sammyseq.nf | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf index 89f74cb..4587d35 100644 --- a/workflows/sammyseq.nf +++ b/workflows/sammyseq.nf @@ -221,20 +221,34 @@ workflow SAMMYSEQ { // MODULE: Run FastQC // + TRIMMOMATIC ( + merged_reads + ) + + //TRIMMOMATIC.out.trimmed_reads.view() + + //a channel is created for the trimmed files and the id is renamed to meta, so that when passed to fastqc it does not overwrite the output files with non-trimmed ones + ch_fastqc_trim=TRIMMOMATIC.out.trimmed_reads + .map{ meta, path -> + def id=meta.subMap('id') + newid=id.id + "_trim" + sng=meta.subMap('single_end').single_end + newmeta=[id: newid, single_end: sng] + [ newmeta ,path] + } + //trimmed and untrimmed fastq channels are merged and the resulting channel is passed to FASTQC + ch_fastqc_in = ch_fastqc_trim.mix(merged_reads) + //ch_fastqc_in.view() FASTQC ( - merged_reads + ch_fastqc_in + //merged_reads ) + ch_versions = ch_versions.mix(FASTQC.out.versions.first()) - TRIMMOMATIC ( - //INPUT_CHECK.out.reads - //ch_notmerge_lane - merged_reads - ) - //TRIMMOMATIC.out.trimmed_reads.view() if (params.stopAt == 'TRIMMOMATIC') { return From 658cf663896953b40a0de6a33ada57dd81971a6f Mon Sep 17 00:00:00 2001 From: Lucio Di Filippo Date: Sat, 4 Nov 2023 14:37:38 +0100 Subject: [PATCH 05/24] multi lane CAT --- conf/modules.config | 38 +++++++++++++++++++++++----- subworkflows/local/prepare_genome.nf | 9 ++++--- workflows/sammyseq.nf | 17 +++++-------- 3 files changed, 43 insertions(+), 21 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 551d67b..75dbc7e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -26,6 +26,14 @@ process { ] } + withName : ".*PREPARE_GENOME:.*" { + publishDir = [ + path: { "${params.outdir}/genome" }, + mode: params.publish_dir_mode, + enabled: params.save_reference + ] + } + withName: FASTQC { ext.args = '--quiet' } @@ -58,19 +66,19 @@ process { // Alignment, Picard MarkDuplicates and Filtering options // - withName: '.*FASTQ_ALIGN_BWAALN:BWA_.*' { + withName: '.*FASTQ_ALIGN_BWAALN:.*' { publishDir = [ [ - path: { "${params.outdir}/BWA" }, + path: { "${params.outdir}/alignment/bwa" }, mode: params.publish_dir_mode, - pattern: '*.bam*', + pattern: '*.{bam,bai}', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] ] } - withName: '.*BAM_MARKDUPLICATES_PICARD:PICARD_MARKDUPLICATES' { + withName: '.*BAM_MARKDUPLICATES_PICARD:PICARD_MARKDUPLICATES.*' { ext.args = '--ASSUME_SORTED true --REMOVE_DUPLICATES false --VALIDATION_STRINGENCY LENIENT --TMP_DIR tmp' ext.prefix = { "${meta.id}.md" } publishDir = [ @@ -81,7 +89,7 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ], [ - path: { "${params.outdir}/markduplicates/bam" }, + path: { "${params.outdir}/alignment/markduplicates" }, mode: params.publish_dir_mode, pattern: '*.md.{bam,bai}', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, @@ -90,10 +98,10 @@ process { ] } - withName: '.*BAM_MARKDUPLICATES_PICARD:SAMTOOLS_INDEX' { + withName: '.*BAM_MARKDUPLICATES_PICARD:SAMTOOLS_INDEX.*' { ext.prefix = { "${meta.id}.markdup.sorted" } publishDir = [ - path: { "${params.outdir}/markduplicates/bam" }, + path: { "${params.outdir}/alignment/markduplicates" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, pattern: '*.{bai,csi}' @@ -110,4 +118,20 @@ process { ] } + withName : ".*DEEPTOOLS_BAMCOVERAGE.*" { + publishDir = [ + path: { "${params.outdir}/single_tracks/deeptools" }, + mode: params.publish_dir_mode, + pattern: '*.bigWig' + ] + } + + withName : ".*RTWOSAMPLESMLE.*" { + publishDir = [ + path: { "${params.outdir}/comparisons/spp_mle" }, + mode: params.publish_dir_mode, + ] + } + + } diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf index b39e726..eca6b1c 100644 --- a/subworkflows/local/prepare_genome.nf +++ b/subworkflows/local/prepare_genome.nf @@ -42,10 +42,11 @@ workflow PREPARE_GENOME { //println(ch_fasta) // Make fasta file available if reference saved or IGV is run //if (params.save_reference || !params.skip_igv) { - if (params.save_reference) { - file("${params.outdir}/genome/").mkdirs() - ch_fasta.copyTo("${params.outdir}/genome/") - } + + // if (params.save_reference) { + // file("${params.outdir}/genome/").mkdirs() + // ch_fasta.copyTo("${params.outdir}/genome/") + // } // diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf index 4587d35..af75c05 100644 --- a/workflows/sammyseq.nf +++ b/workflows/sammyseq.nf @@ -140,19 +140,15 @@ workflow SAMMYSEQ { [id.id, meta, path] } .groupTuple() - .filter{ it[1].size() == 2 } //filtra per numero di meta presenti dopo il tupla se hai due meta vuol dire che devi unire due campioni + .filter{ it[1].size() >= 2 } //filtra per numero di meta presenti dopo il tupla se hai due meta vuol dire che devi unire due campioni .map{ id, meta, path -> + single = meta[0].subMap('single_end') meta = meta[0] - def readsSample1 = [path[0][0], path[0][1]].findAll() - def readsSample2 = [path[1][0], path[1][1]].findAll() - if(readsSample1.size() == 1 && readsSample2.size() == 1) { - [meta, readsSample1 + readsSample2] - } else { - [meta, readsSample1, readsSample2] - } + def flatPath = path.flatten() + [ meta , flatPath ] } - //ch_merge_lane.view{"ch_merge_lane ${it}"} + ch_merge_lane.view{"ch_merge_lane ${it}"} ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) @@ -171,7 +167,7 @@ workflow SAMMYSEQ { //ch_starter.view() - if (params.stopAt == 'CAT_FASTQ_line') { + if (params.stopAt == 'CAT_FASTQ_lane') { return } @@ -430,6 +426,7 @@ workflow SAMMYSEQ { ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(BAM_MARKDUPLICATES_PICARD.out.stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(BAM_MARKDUPLICATES_PICARD.out.metrics.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(BAM_MARKDUPLICATES_PICARD.out.flagstat.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(BAM_MARKDUPLICATES_PICARD.out.idxstats.collect{it[1]}.ifEmpty([])) From 6368b290fee23e244b9bcd2e92ba5218054fee09 Mon Sep 17 00:00:00 2001 From: Lucio Di Filippo Date: Sat, 4 Nov 2023 21:19:43 +0100 Subject: [PATCH 06/24] merge fraction and lane bugfix --- subworkflows/local/cat_fractions.nf | 50 +++++++++++++++++++++++++++++ workflows/sammyseq.nf | 34 ++++++-------------- 2 files changed, 60 insertions(+), 24 deletions(-) create mode 100644 subworkflows/local/cat_fractions.nf diff --git a/subworkflows/local/cat_fractions.nf b/subworkflows/local/cat_fractions.nf new file mode 100644 index 0000000..c752644 --- /dev/null +++ b/subworkflows/local/cat_fractions.nf @@ -0,0 +1,50 @@ +// Import SAMTOOLS_FAIDX module +include { CAT_FASTQ } from '../../modules/nf-core/cat/fastq/main' + +// Definition of the subworkflow +workflow CAT_FRACTIONS { + take: + //reads_to_merge + //all_reads + starting_files + + main: + + all_files = starting_files.map { meta, fastq -> + def expID = meta.expID // Prendi l'expID da meta + meta.remove('fraction') // Rimuovi il campo 'fraction' + [expID, meta, fastq] + } + + all_files.groupTuple( by:0 ) + .map{ expID, meta , fastq -> + meta2 = meta[0].clone() // Prendi solo la prima mappa e crea una copia + meta2.id = expID // Sostituisci il valore nel campo id con expID + [ [meta2], fastq.flatten().toList() ] + } + .set { reads_to_merge } + + reads_to_merge + .filter { meta, fastq -> fastq.size() > 1 } + .set { ch_reads_to_process_in_CAT_FASTQ } + + ch_reads_to_process_in_CAT_FASTQ + .flatMap { meta, fastq -> + meta.collect { m -> [m,fastq]} + } + .set { ch_to_CAT } + + //ch_to_CAT.view{"ch_to_CAT : ${it}"} + + CAT_FASTQ ( + ch_to_CAT + ).reads.set { cat_fastq_output } + + ch_cat_adjusted = CAT_FASTQ.out.reads.map { meta, fastq -> + return [meta, fastq] + } // make fastq of CAT_FASTQ a list of paths + merged_reads = starting_files.mix(ch_cat_adjusted) + + emit: + merged_reads = merged_reads +} \ No newline at end of file diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf index af75c05..f02b197 100644 --- a/workflows/sammyseq.nf +++ b/workflows/sammyseq.nf @@ -49,10 +49,12 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil include { INPUT_CHECK } from '../subworkflows/local/input_check' include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome' include { FASTQ_ALIGN_BWAALN } from '../subworkflows/nf-core/fastq_align_bwaaln/main.nf' +include { CAT_FRACTIONS } from '../subworkflows/local/cat_fractions' include { BAM_MARKDUPLICATES_PICARD } from '../subworkflows/nf-core/bam_markduplicates_picard' include { CUT_SIZES_GENOME } from "../modules/local/chromosomes_size" include { RTWOSAMPLESMLE } from '../modules/local/rtwosamplesmle' + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT NF-CORE MODULES/SUBWORKFLOWS @@ -148,7 +150,7 @@ workflow SAMMYSEQ { [ meta , flatPath ] } - ch_merge_lane.view{"ch_merge_lane ${it}"} + //ch_merge_lane.view{"ch_merge_lane ${it}"} ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) @@ -165,7 +167,7 @@ workflow SAMMYSEQ { //cat_lane_output.view() ch_starter = cat_lane_output.mix(ch_notmerge_lane) - //ch_starter.view() + //ch_starter.view{"ch_starter : ${it}"} if (params.stopAt == 'CAT_FASTQ_lane') { return @@ -181,35 +183,19 @@ workflow SAMMYSEQ { if(params.combine_fractions){ - INPUT_CHECK.out.reads_to_merge - .filter { meta, fastq -> fastq.size() > 1 } - .set { ch_reads_to_process_in_CAT_FASTQ } - - ch_reads_to_process_in_CAT_FASTQ - .flatMap { meta, fastq -> - meta.collect { m -> [m,fastq]} - } - .set { ch_to_CAT } - - //ch_to_CAT.view{"ch_to_CAT : ${it}"} - - CAT_FASTQ ( - ch_to_CAT - ).reads.set { cat_fastq_output } - - ch_cat_adjusted = CAT_FASTQ.out.reads.map { meta, fastq -> - return [meta, [fastq]] - } // make fastq of CAT_FASTQ a list of paths - merged_reads = INPUT_CHECK.out.reads.mix(ch_cat_adjusted) + merged_reads = CAT_FRACTIONS(//INPUT_CHECK.out.reads_to_merge, + //INPUT_CHECK.out.reads + ch_starter + )//.out.merged_reads } else { //merged_reads = INPUT_CHECK.out.reads merged_reads = ch_starter } - //merged_reads.view() + //merged_reads.view{"merged_reads: ${it}"} - if (params.stopAt == 'CAT_FASTQ') { + if (params.stopAt == 'CAT_FRACTIONS') { return } From 7dd7f3e2e260aaebdc2deb613e6b239a22d0fee8 Mon Sep 17 00:00:00 2001 From: Lucio Di Filippo Date: Sat, 4 Nov 2023 22:52:44 +0100 Subject: [PATCH 07/24] add two_samples_mle output to results folder --- conf/modules.config | 7 +++++++ modules/local/rtwosamplesmle/main.nf | 3 ++- workflows/sammyseq.nf | 4 ++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 75dbc7e..7954838 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -118,6 +118,13 @@ process { ] } + withName : ".*SAMTOOLS_FAIDX.*" { + publishDir = [ + path: { "${params.outdir}/genome" }, + mode: params.publish_dir_mode, + ] + } + withName : ".*DEEPTOOLS_BAMCOVERAGE.*" { publishDir = [ path: { "${params.outdir}/single_tracks/deeptools" }, diff --git a/modules/local/rtwosamplesmle/main.nf b/modules/local/rtwosamplesmle/main.nf index 2161cc5..646794a 100644 --- a/modules/local/rtwosamplesmle/main.nf +++ b/modules/local/rtwosamplesmle/main.nf @@ -48,11 +48,12 @@ process RTWOSAMPLESMLE { path chromsizes_file - //output: + output: // TODO nf-core: Named file extensions MUST be emitted for ALL output channels //tuple val(meta), path("*.bam"), emit: bam // TODO nf-core: List additional required output channels/values here //path "versions.yml" , emit: versions + tuple val(meta), path("*_mle.txt") , emit: results when: task.ext.when == null || task.ext.when diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf index f02b197..deb5e15 100644 --- a/workflows/sammyseq.nf +++ b/workflows/sammyseq.nf @@ -256,6 +256,10 @@ workflow SAMMYSEQ { ch_fai_for_cut = SAMTOOLS_FAIDX.out.fai.collect() + if (params.stopAt == 'SAMTOOLS_FAIDX') { + return + } + CUT_SIZES_GENOME(ch_fai_for_cut) //CUT_SIZES_GENOME.out.ch_sizes_genome.view() From fcc592b406459cb40e5d68b81348910bd7862fdf Mon Sep 17 00:00:00 2001 From: Lucio Di Filippo Date: Thu, 16 Nov 2023 09:30:47 +0100 Subject: [PATCH 08/24] add blacklist parameter --- conf/modules.config | 12 ++++++++++++ nextflow.config | 3 +++ nextflow_schema.json | 26 +++++++++++++++++++++++++- 3 files changed, 40 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 7954838..77c68e6 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -126,6 +126,7 @@ process { } withName : ".*DEEPTOOLS_BAMCOVERAGE.*" { + publishDir = [ path: { "${params.outdir}/single_tracks/deeptools" }, mode: params.publish_dir_mode, @@ -142,3 +143,14 @@ process { } + +if (params.blackListFile != null) { + process { + withName: '.*DEEPTOOLS_BAMCOVERAGE.*' { + ext.args = "–blackListFileName ${params.blackListFile}" + } + } +} + + + diff --git a/nextflow.config b/nextflow.config index 5140b09..b63519c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -34,6 +34,9 @@ params { max_multiqc_email_size = '25.MB' multiqc_methods_description = null + // bam generation + blackListFile = null + // Boilerplate options outdir = null publish_dir_mode = 'copy' diff --git a/nextflow_schema.json b/nextflow_schema.json index 86cb4cd..7e1cd4c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -15,7 +15,20 @@ "stopAt": { "type": "string", "description": "Specify after which step the pipeline should stop.", - "fa_icon": "fas fa-stop-circle" + "fa_icon": "fas fa-stop-circle", + "enum": [ + "BEGIN", + "PREPARE_GENOME", + "INPUT_CHECK", + "CAT_FASTQ_lane", + "CAT_FRACTIONS", + "TRIMMOMATIC", + "FASTQ_ALIGN_BWAALN", + "SAMTOOLS_FAIDX", + "BAM_MARKDUPLICATES_PICARD", + "DEEPTOOLS_BAMCOVERAGE", + "RTWOSAMPLESMLE" + ] }, "input": { "type": "string", @@ -103,6 +116,17 @@ } } }, + "bam options":{ + "title": "option for the generation of bam", + "properties": { + "blackListFile" : { + "type": "string", + "format": "file-path", + "description": "A BED or GTF file containing regions that should be excluded from all analyses", + "help_text": "Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant" + } + } + }, "institutional_config_options": { "title": "Institutional config options", "type": "object", From 2ccc17c81348c572a3cf1b3b681a14def355996a Mon Sep 17 00:00:00 2001 From: Lucio Di Filippo Date: Thu, 16 Nov 2023 09:47:15 +0100 Subject: [PATCH 09/24] bugfix nextflow_schema --- nextflow_schema.json | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 7e1cd4c..9683793 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -116,17 +116,6 @@ } } }, - "bam options":{ - "title": "option for the generation of bam", - "properties": { - "blackListFile" : { - "type": "string", - "format": "file-path", - "description": "A BED or GTF file containing regions that should be excluded from all analyses", - "help_text": "Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant" - } - } - }, "institutional_config_options": { "title": "Institutional config options", "type": "object", @@ -338,5 +327,13 @@ { "$ref": "#/definitions/generic_options" } - ] + ], + "properties": { + "blackListFile" : { + "type": "string", + "format": "file-path", + "description": "A BED or GTF file containing regions that should be excluded from all analyses", + "help_text": "Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant" + } + } } From 2887f65d3253cfc5f5889a1a652970f387601771 Mon Sep 17 00:00:00 2001 From: daisymut Date: Thu, 16 Nov 2023 11:00:24 +0100 Subject: [PATCH 10/24] fix prettier and EditorConfig tests --- conf/modules.config | 2 +- nextflow_schema.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 77c68e6..715dc71 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -126,7 +126,7 @@ process { } withName : ".*DEEPTOOLS_BAMCOVERAGE.*" { - + publishDir = [ path: { "${params.outdir}/single_tracks/deeptools" }, mode: params.publish_dir_mode, diff --git a/nextflow_schema.json b/nextflow_schema.json index 9683793..74324cc 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -329,7 +329,7 @@ } ], "properties": { - "blackListFile" : { + "blackListFile": { "type": "string", "format": "file-path", "description": "A BED or GTF file containing regions that should be excluded from all analyses", From 6ae1f176b09c2868535e078cd4629e4e867cb393 Mon Sep 17 00:00:00 2001 From: daisymut Date: Thu, 16 Nov 2023 11:09:25 +0100 Subject: [PATCH 11/24] changed test profile reference to align reads --- conf/test.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test.config b/conf/test.config index ed15af0..74c4282 100644 --- a/conf/test.config +++ b/conf/test.config @@ -23,5 +23,5 @@ params { input = 'https://genome.isasi.cnr.it/biocomp/test-datasets/sammyseq/testdata/chr22/samplesheet_test_github_chr22_tinier.csv' // Genome references - fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + fasta = 'https://genome.isasi.cnr.it/biocomp/test-datasets/sammyseq/testdata/chr22/chr22.fa' } From 51eb68e8b117d089fbd2c836ec363810a371bf0d Mon Sep 17 00:00:00 2001 From: daisymut Date: Thu, 16 Nov 2023 12:58:38 +0100 Subject: [PATCH 12/24] modified example samplesheetmodified example samplesheet --- assets/samplesheet.csv | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 5f653ab..3bc0416 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,3 @@ -sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, +sample,fastq_1,fastq_2,experimentalID,fraction +SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz,SAMPLE_PAIRED_END_EXPID,S2 +SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz,,SAMPLE_SINGLE_END_EXPID,S2 From c5709a902d3412ee896df29c0812c06b09005fe1 Mon Sep 17 00:00:00 2001 From: daisymut Date: Thu, 16 Nov 2023 13:21:55 +0100 Subject: [PATCH 13/24] bumped 0.01dev version --- assets/multiqc_config.yml | 4 ++-- nextflow.config | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 541c831..590b51c 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,8 +1,8 @@ report_comment: > - This report has been generated by the nf-core/sammyseq + This report has been generated by the nf-core/sammyseq analysis pipeline. For information about how to interpret these results, please see the - documentation. + documentation. report_section_order: "nf-core-sammyseq-methods-description": diff --git a/nextflow.config b/nextflow.config index b63519c..082e371 100644 --- a/nextflow.config +++ b/nextflow.config @@ -9,7 +9,7 @@ // Global default params, used in configs params { - + // TODO nf-core: Specify your pipeline's command line flags // Input options @@ -22,7 +22,7 @@ params { igenomes_ignore = false bwa_index = null save_reference = false - + //overall analysis errorStrategy stopAt = '' combine_fractions = false @@ -55,7 +55,7 @@ params { custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" config_profile_contact = null config_profile_url = null - + // Max resource options // Defaults only, expecting to be overwritten @@ -241,7 +241,7 @@ manifest { description = """analysis pipeline for SAMMY-seq""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '0.01' + version = '0.01dev' doi = '' } From 969f3f8cf4309b71f5db6a3da5da9d77a9743168 Mon Sep 17 00:00:00 2001 From: daisymut Date: Thu, 16 Nov 2023 15:41:36 +0100 Subject: [PATCH 14/24] fixed link in multiqc report to dev version & temporarily disabled lint failing test --- .nf-core.yml | 3 +++ assets/multiqc_config.yml | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.nf-core.yml b/.nf-core.yml index 3805dc8..27becf0 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1 +1,4 @@ repository_type: pipeline +lint: + multiqc_config: + report_comment: False diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 590b51c..7dfa5ed 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,6 +1,6 @@ report_comment: > - This report has been generated by the nf-core/sammyseq + This report has been generated by the nf-core/sammyseq analysis pipeline. For information about how to interpret these results, please see the documentation. From 03f577bc5cc431253bf05435e7e95bc1405fd55e Mon Sep 17 00:00:00 2001 From: daisymut Date: Thu, 16 Nov 2023 15:51:46 +0100 Subject: [PATCH 15/24] fixed prettier --- .nf-core.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.nf-core.yml b/.nf-core.yml index 27becf0..5085ace 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,4 +1,4 @@ repository_type: pipeline lint: - multiqc_config: - report_comment: False + multiqc_config: + report_comment: False From f8fd5df5dcf7b742bd75e256c22be95281474497 Mon Sep 17 00:00:00 2001 From: daisymut Date: Thu, 16 Nov 2023 16:43:40 +0100 Subject: [PATCH 16/24] updated CITATIONS.md --- CITATIONS.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CITATIONS.md b/CITATIONS.md index 6411d58..fc47df4 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -12,7 +12,7 @@ - [BWA](https://www.ncbi.nlm.nih.gov/pubmed/19451168/) -> Li H, Durbin R. Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics. 2009 Jul 15;25(14):1754-60. doi: 10.1093/bioinformatics/btp324. Epub 2009 May 18. PubMed PMID: 19451168; PubMed Central PMCID: PMC2705234. + > Li H, Durbin R. Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics. 2009 Jul 15;25(14):1754-60. doi: 10.1093/bioinformatics/btp324. Epub 2009 May 18. PubMed PMID: 19451168; PubMed Central PMCID: PMC2705234. - [deepTools](https://www.ncbi.nlm.nih.gov/pubmed/27079975/) @@ -32,7 +32,9 @@ > Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup. The Sequence Alignment/Map format and SAMtools. Bioinformatics. 2009 Aug 15;25(16):2078-9. doi: 10.1093/bioinformatics/btp352. Epub 2009 Jun 8. PubMed PMID: 19505943; PubMed Central PMCID: PMC2723002. -- [Trim Galore!](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/) +- [Trimmomatic](https://pubmed.ncbi.nlm.nih.gov/24695404/) + + > Bolger AM, Lohse M, Usadel B. Trimmomatic: a flexible trimmer for Illumina sequence data. Bioinformatics. 2014 Aug 1;30(15):2114-20. doi: 10.1093/bioinformatics/btu170. Epub 2014 Apr 1. PMID: 24695404; PMCID: PMC4103590. ## R packages From 3b7a6c8fdd703d7a7340243b93395441fd445501 Mon Sep 17 00:00:00 2001 From: daisymut Date: Thu, 16 Nov 2023 18:11:02 +0100 Subject: [PATCH 17/24] updated parameters --- main.nf | 1 - nextflow_schema.json | 11 +++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/main.nf b/main.nf index 615706d..f1fd011 100644 --- a/main.nf +++ b/main.nf @@ -17,7 +17,6 @@ nextflow.enable.dsl = 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// TODO nf-core: Remove this line if you don't need a FASTA file // This is an example of how to use getGenomeAttribute() to fetch parameters // from igenomes.config using `--genome` params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta') diff --git a/nextflow_schema.json b/nextflow_schema.json index 74324cc..124f29c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -16,6 +16,7 @@ "type": "string", "description": "Specify after which step the pipeline should stop.", "fa_icon": "fas fa-stop-circle", + "hidden": true, "enum": [ "BEGIN", "PREPARE_GENOME", @@ -37,7 +38,7 @@ "mimetype": "text/csv", "pattern": "^\\S+\\.csv$", "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/sammyseq/usage#samplesheet-input).", + "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 5 columns, and a header row. See [usage docs](https://nf-co.re/sammyseq/usage#samplesheet-input).", "fa_icon": "fas fa-file-csv" }, "outdir": { @@ -61,14 +62,16 @@ "comparisonFile": { "type": "string", "format": "file-path", - "description": "Path to comma-separated file containing information about samples comparisons", + "description": "Path to comma-separated file containing the sample names for the desired paired comparisons", "mimetype": "text/csv", - "help_text": "You will need to create a comparisons file with information about the comparisons in your experiment before running the pipeline. Use this parameter to specify the necessary comparisons. It has to be a comma-separated file with 2 columns, one for the first sample to compair and the second one to sample against which to do it." + "help_text": "You will need to create a comparisons file with information about the desired comparisons in your experiment before running the pipeline. It has to be a comma-separated file with 2 columns: one for the first sample to compare and the second one to sample against which to do it. The sample names must correspond to sample names present in the first column of the samplesheet (--input parameter)", + "fa_icon": "fas fa-file-csv" }, "combine_fractions": { "type": "boolean", "description": "when combined, the fastqs of individual fractions from the same samples will be merged into a new fastq.", - "help_text": "the combined fastq file can be used in order to obtain information on the whole sample or to perform additional analyses not included" + "help_text": "the combined fastq file can be used in order to obtain information on the whole sample or to perform additional analyses not included", + "fa_icon": "fas fa-merge" } } }, From bf80c9b2a75b492022266318735c6dfc1e6067e3 Mon Sep 17 00:00:00 2001 From: daisymut Date: Thu, 16 Nov 2023 20:07:58 +0100 Subject: [PATCH 18/24] R module bugfixing --- modules/local/rtwosamplesmle/main.nf | 2 +- .../templates/two_samples_mle.R | 46 +++++++++++-------- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/modules/local/rtwosamplesmle/main.nf b/modules/local/rtwosamplesmle/main.nf index 646794a..a5d1e55 100644 --- a/modules/local/rtwosamplesmle/main.nf +++ b/modules/local/rtwosamplesmle/main.nf @@ -53,7 +53,7 @@ process RTWOSAMPLESMLE { //tuple val(meta), path("*.bam"), emit: bam // TODO nf-core: List additional required output channels/values here //path "versions.yml" , emit: versions - tuple val(meta), path("*_mle.txt") , emit: results + tuple val(meta), path("*_mle.bw") , emit: results when: task.ext.when == null || task.ext.when diff --git a/modules/local/rtwosamplesmle/templates/two_samples_mle.R b/modules/local/rtwosamplesmle/templates/two_samples_mle.R index 5eb2b3e..805e72a 100644 --- a/modules/local/rtwosamplesmle/templates/two_samples_mle.R +++ b/modules/local/rtwosamplesmle/templates/two_samples_mle.R @@ -14,15 +14,15 @@ require(rtracklayer) # FUNCTIONS ################################################ -sortbychr <- function(x, chrcol="chr", stcol="start", endcol=NULL, chrorder=paste("chr", c(seq(22), "X", "Y"), sep="")) { - if (!(is.null(endcol))) { - x <- x[order(x[,endcol]),] - } - x <- x[order(x[,stcol]),] - chrs <- ordered(x[,chrcol], levels=chrorder) - # chrs <- ordered(tmp, levels=paste("chr", c(seq(22), "X", "Y"), sep="")) - x <- x[order(chrs),] -} +# sortbychr <- function(x, chrcol="chr", stcol="start", endcol=NULL, chrorder=paste("chr", c(seq(22), "X", "Y"), sep="")) { +# if (!(is.null(endcol))) { +# x <- x[order(x[,endcol]),] +# } +# x <- x[order(x[,stcol]),] +# chrs <- ordered(x[,chrcol], levels=chrorder) +# # chrs <- ordered(tmp, levels=paste("chr", c(seq(22), "X", "Y"), sep="")) +# x <- x[order(chrs),] +# } ################################################ ## PARAMS @@ -36,8 +36,7 @@ sortbychr <- function(x, chrcol="chr", stcol="start", endcol=NULL, chrorder=past ip_file <- "${bam1}" input_file <- "${bam2}" chromsizes_file <- "${chromsizes_file}" -mle_output_file <- "${bam1.baseName}VS${bam2.baseName}_mle.txt" -remove_anomalies <- FALSE +mle_output_file <- "${bam1.baseName}_VS_${bam2.baseName}_mle.bw" ################################################ # DEFAULT PARAMETERS @@ -45,6 +44,7 @@ remove_anomalies <- FALSE remove_anomalies <- FALSE debug_mode <- TRUE +stebp = 50 ################################################ # CORE PROCESSES @@ -66,6 +66,7 @@ chromsizes <- fread(chromsizes_file, data.table = FALSE) #chromsizes <- sortbychr(chromsizes, chrcol="V1", stcol="V2", chrorder=paste("chr", c(seq(19), "X", "Y"), sep="")) #chrs <- as.character(sub('chr', '', chromsizes[, 1])) chrs <- chromsizes[, 1] +rownames(chromsizes) <- chrs ### Import the data @@ -75,31 +76,40 @@ input <- read.bam.tags(input_file) ### Select the informative tags ip_tags <- ip[['tags']] input_tags <- input[['tags']] + print("second part") ### Remove the tags with anomalies if(remove_anomalies==TRUE){ ip_rm <- remove.local.tag.anomalies(ip_tags[chrs]) input_rm <- remove.local.tag.anomalies(input_tags[chrs]) }else{ - ip_rm <- ip_tags - input_rm <- input_tags + ip_rm <- ip_tags[chrs] + input_rm <- input_tags[chrs] } print("third part") ### Make the smoothing with the loglikelihood function -print("lane 90") +print("line 90") #print(paste0("ip_rm=",head(ip_rm))) #print(paste0("input_rm=",head(ip_rm))) -chip_smoothed_mle <- get.smoothed.enrichment.mle(ip_rm, input_rm, tag.shift = 0, background.density.scaling = FALSE) -print("lane 91") +# chip_smoothed_mle <- get.smoothed.enrichment.mle(ip_rm, input_rm, tag.shift = 0, background.density.scaling = FALSE) +ip_chrs <- names(ip_rm)[sapply(ip_rm, length) > 10] +input_chrs <- names(input_rm)[sapply(input_rm, length) > 10] +chr2keep <- intersect(ip_chrs, input_chrs) +# length(chr2keep) + +chip_smoothed_mle <- get.smoothed.enrichment.mle(ip_rm[chr2keep], input_rm[chr2keep], tag.shift = 0, background.density.scaling = FALSE, step = stebp) + +print("line 91") #chip_smoothed_mle <- get.smoothed.enrichment.mle(ip_tags, input_tags, tag.shift = 0, background.density.scaling = FALSE) ccl <- sapply(chip_smoothed_mle, nrow) cpos <- unlist(sapply(chip_smoothed_mle, function(csm) csm\$x)) +cend <- cpos + stebp - 1 ccov <- unlist(sapply(chip_smoothed_mle, function(csm) csm\$y)) print("line 97") -gr.mle <- GRanges(seqnames = rep(names(chip_smoothed_mle), ccl), ranges = IRanges(start = cpos, end = cpos), strand = Rle('*', sum(ccl)), score = ccov) -seqlengths(gr.mle) <- chromsizes[, 2] +gr.mle <- GRanges(seqnames = rep(names(chip_smoothed_mle), ccl), ranges = IRanges(start = cpos, end = cend), strand = Rle('*', sum(ccl)), score = ccov) +seqlengths(gr.mle) <- chromsizes[chr2keep, 2] #rename score col col_names <- names(mcols(gr.mle)) From 3be8ef117aa41ba4f096fb4fc61b6fd01f9c37f1 Mon Sep 17 00:00:00 2001 From: daisymut Date: Fri, 17 Nov 2023 04:22:51 +0100 Subject: [PATCH 19/24] updated usage and configuration --- README.md | 15 +----- conf/base.config | 4 +- docs/usage.md | 114 +++++++++++++++++++++++++++---------------- nextflow.config | 7 +-- nextflow_schema.json | 21 ++++---- 5 files changed, 89 insertions(+), 72 deletions(-) diff --git a/README.md b/README.md index 1f2b3fa..340504e 100644 --- a/README.md +++ b/README.md @@ -46,9 +46,6 @@ to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/i with `-profile test` before running the workflow on actual data. ::: - - First, prepare a samplesheet with your input data that looks as follows: `samplesheet.csv`: @@ -64,11 +61,10 @@ Each row represents a fastq file (single-end) or a pair of fastq files (paired e Now, you can run the pipeline using: - - ```bash nextflow run nf-core/sammyseq \ -profile \ + --fasta reference_genome.fa \ --input samplesheet.csv \ --outdir ``` @@ -78,19 +74,12 @@ or ```bash nextflow run nf-core/sammyseq \ -profile \ + --fasta reference_genome.fa \ --input samplesheet.csv \ --outdir \ --comparisonFile comparisons.csv ``` -`comparisons.csv`: - -```csv -sample1,sample2 -CTRL004_S2,CTRL004_S3 -CTRL004_S2,CTRL004_S4 -``` - :::warning Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; diff --git a/conf/base.config b/conf/base.config index f0a8691..05cea2b 100644 --- a/conf/base.config +++ b/conf/base.config @@ -10,7 +10,7 @@ process { - // TODO nf-core: Check the defaults for all processes + // nf-core: Check the defaults for all processes cpus = { check_max( 1 * task.attempt, 'cpus' ) } memory = { check_max( 6.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } @@ -24,7 +24,7 @@ process { // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. // If possible, it would be nice to keep the same label naming convention when // adding in your local modules too. - // TODO nf-core: Customise requirements for specific processes. + // nf-core: Customise requirements for specific processes. // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_single { cpus = { check_max( 1 , 'cpus' ) } diff --git a/docs/usage.md b/docs/usage.md index 30ecc2c..6233631 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -6,31 +6,79 @@ ## Introduction - +sammyseqis a workflow designed for the analysis of Sequential Analysis of MacroMolecules accessibilitY sequencing (SAMMY-seq) data, a cheap and effective methodology to analyze chromatin state in cells. SAMMY-seq is an innovative technique based on the separation of chromatin in fractions, each progressively based on their solubility and accessibility, and extraction and sequencing of the DNA present in each of them. + +## Running the pipeline + +The typical command for running the pipeline is as follows: + +```bash +nextflow run nf-core/sammyseq -r dev \ + -profile docker \ + --fasta ./reference_genome.fa\ + --input ./samplesheet.csv \ + --outdir ./results +``` + +This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. + +Note that the pipeline will create the following files in your working directory: + +```bash +work # Directory containing the nextflow working files + # Finished results in specified location (defined with --outdir) +.nextflow_log # Log file from Nextflow +# Other nextflow hidden files, eg. history of pipeline runs and old logs. +``` + +If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file. + +Pipeline settings can be provided in a `yaml` or `json` file via `-params-file `. + +:::warning +Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args). +::: + +The above pipeline run specified with a params file in yaml format: + +```bash +nextflow run nf-core/sammyseq -profile docker -params-file params.yaml +``` + +with `params.yaml` containing: + +```yaml +input: './samplesheet.csv' +outdir: './results/' +fasta: './genome.fa' +<...> +``` + +You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). ## Samplesheet input -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. +Before running the pipeline, you will need to create a samplesheet with information about the samples you would like to analyze. Use this parameter to specify its location: ```bash ---input '[path to samplesheet file]' +--input '[full path to samplesheet file]' ``` +It has to be a comma-separated file with 5 columns, and a header row as shown in the examples below. + ### Multiple runs of the same sample -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 2 lanes: +The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample fraction sequenced across 2 lanes: ```console sample,fastq_1,fastq_2,experimentalID,fraction -CONTROL_REP1_S2,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz,CONTROL,S2 -CONTROL_REP1_S2,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz,CONTROL,S2 +CONTROL_REP1_S2,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz,CONTROL_REP1,S2 +CONTROL_REP1_S2,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz,CONTROL_REP1,S2 ``` ### Full samplesheet -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 5 columns to match those defined in the table below. - - +The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can contain a mixture of single- and paired-end but in case of multiple runs of the same `sample` they have to be of the same type to be correctly merged. There can be additional columns but the first 5 have to match those defined in the table below. ```console ssample,fastq_1,fastq_2,experimentalID,fraction @@ -45,53 +93,35 @@ CTRL004_S4,/home/sammy/test_data/CTRL004_S4_chr22only.fq.gz,,CTRL004,S4 | `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | | `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | | `experimentalID` | Experimental sample identifier. This represents the biological specimen of interest and will be the same for all fractions exctracted. | -| `fraction` | Fraction derived from SAMMY protocol, e.g. depending on the protocol it can be S2, S2L, S2S, S3, S4 . | +| `fraction` | Fraction derived from SAMMY protocol, e.g. depending on the protocol it can be S2, S2L, S2S, S3, S4. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. -## Running the pipeline +### Pairwise comparisons -The typical command for running the pipeline is as follows: +It is possible to generate pairwise comparisons between two samples by providing a list with the parameter `--comparisonFile` to indicate the full path to a comma-separated file with 2 columns: -```bash -nextflow run nf-core/sammyseq --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker -``` - -This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. - -Note that the pipeline will create the following files in your working directory: +`comparisons.csv`: -```bash -work # Directory containing the nextflow working files - # Finished results in specified location (defined with --outdir) -.nextflow_log # Log file from Nextflow -# Other nextflow hidden files, eg. history of pipeline runs and old logs. +```csv +sample1,sample2 +CTRL004_S2,CTRL004_S3 +CTRL004_S2,CTRL004_S4 ``` -If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file. +It can contain any combination of sample identifiers, they have to correspond to identifiers present in the `sample` column in the input file. When `--comparisonFile` is set, the difference between sample1 and sample2 read density profile, smoothed by the Gaussian kernel, is calculated and saved in bigwig format, as described in Kharchenko PK, Tolstorukov MY, Park PJ "Design and analysis of ChIP-seq experiments for DNA-binding proteins" Nat. Biotech. doi:10.1038/nbt.1508 -Pipeline settings can be provided in a `yaml` or `json` file via `-params-file `. +### Combine fractions -:::warning -Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args). -::: - -The above pipeline run specified with a params file in yaml format: +Optionally, the fractions extracted from the same `experimentalID` can be combined together for later use by setting the parameter `--combine_fractions`. -```bash -nextflow run nf-core/sammyseq -profile docker -params-file params.yaml -``` +## Reference genome files -with `params.yaml` containing: +The minimum reference genome requirements is the FASTA file, provided with the mandatory parameter `--fasta`, the bwa index will be generated by the pipeline and can be saved for later reuse if the `--save_reference` parameter is passed. The index building step can be quite a time-consuming process and it permits their reuse for future runs of the pipeline to save disk space, if already present it can be passed using the `--bwa_index '/path/to/bwa/index/'` parameter. -```yaml -input: './samplesheet.csv' -outdir: './results/' -genome: 'GRCh37' -<...> -``` +## Blacklist bed files -You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). +A blacklist of regions that will be excluded by signal tracks can be provided using the optional parameter `--blackListFile` with full path to a coordinate file in bed format. Blacklist files for several genome builds can be found in the [ENCODE Blacklist Project](https://github.com/Boyle-Lab/Blacklist). ### Updating the pipeline diff --git a/nextflow.config b/nextflow.config index 082e371..ded694b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -9,9 +9,6 @@ // Global default params, used in configs params { - - - // TODO nf-core: Specify your pipeline's command line flags // Input options input = null comparisonFile = null @@ -19,11 +16,11 @@ params { // References genome = null igenomes_base = 's3://ngi-igenomes/igenomes' - igenomes_ignore = false + igenomes_ignore = true bwa_index = null save_reference = false - //overall analysis errorStrategy + //overall analysis options stopAt = '' combine_fractions = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 124f29c..dbc2628 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -80,6 +80,7 @@ "type": "object", "fa_icon": "fas fa-dna", "description": "Reference genome related files and options required for the workflow.", + "required": ["fasta"], "properties": { "genome": { "type": "string", @@ -115,7 +116,15 @@ "description": "Do not load the iGenomes reference config.", "fa_icon": "fas fa-ban", "hidden": true, - "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." + "default": true, + "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`. You can run `sammyseq` by specifying at least a FASTA genome file." + }, + "blackListFile": { + "type": "string", + "format": "file-path", + "fa_icon": "fas fa-ban", + "description": "A BED or GTF file containing regions that should be excluded from all analyses", + "help_text": "Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant" } } }, @@ -330,13 +339,5 @@ { "$ref": "#/definitions/generic_options" } - ], - "properties": { - "blackListFile": { - "type": "string", - "format": "file-path", - "description": "A BED or GTF file containing regions that should be excluded from all analyses", - "help_text": "Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant" - } - } + ] } From 4fce10d27e278baba0733948dca163298e86a085 Mon Sep 17 00:00:00 2001 From: daisymut Date: Fri, 17 Nov 2023 11:27:12 +0100 Subject: [PATCH 20/24] initiated outpput documentation --- docs/output.md | 98 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 88 insertions(+), 10 deletions(-) diff --git a/docs/output.md b/docs/output.md index 0a7fdf3..afe0f30 100644 --- a/docs/output.md +++ b/docs/output.md @@ -6,18 +6,23 @@ This document describes the output produced by the pipeline. Most of the plots a The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. - - ## Pipeline overview The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [FastQC](#fastqc) - Raw read QC +- [FastQC](#fastqc) +- [Trim adapters](#trim-adapters) +- [Align to Reference](#align-to-reference) +- [Mark Duplicates](#mark-duplicates) +- [`Signal track generation`](#signal-track-generation) +- [`Comparisons`](#comparisons) - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution ### FastQC +[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about the sequenced reads. It provides information about the quality score distribution across reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +
Output files @@ -27,20 +32,83 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +#### Trim adapters -![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png) +[`Trimmomatic`](http://www.usadellab.org/cms/?page=trimmomatic) is a software for read trimming, which means it trims all reads in the front or the tail. This function is useful since sometimes you want to drop some cycles of a sequencing run. In the current implementation in Sarek +`--detect_adapter_for_pe` is set by default which enables auto-detection of adapter sequences. For more information on how to fine-tune adapter trimming, take a look into the parameter docs. -![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png) +The resulting files are intermediate and by default not kept in the final files delivered to users. Set `--save_trimmed` to enable publishing of the files in: + +
+Output files for all samples -![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png) +**Output directory: `{outdir}/preprocessing/fastp/`** + +- `__{1,2}.fastp.fastq.gz>` + - Bgzipped FastQ file + +
:::note -The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. +The FastQC plots displayed in the MultiQC report shows both _untrimmed_ and _trimmed_ reads. They can be compared to check removal of adapter sequence and potentially regions with low quality after the trimming step. ::: +### Align to Reference + +[BWA](https://github.com/lh3/bwa) is a software package for mapping low-divergent sequences against a large reference genome. The aligned reads are then sorted with [samtools](https://www.htslib.org/doc/samtools.html). + +
+Output files + +- `alignment/bwa/` + - `.bam` and `.bam.bai` + +
+ +### Mark Duplicates + +
+Output files + +- `alignment/markduplicates/` + - `.md.bam` and `.md.bam.bai` +- `reports/markduplicates/` + - `.md.MarkDuplicates.metrics.txt` + +
+ +Read pairs that are likely to have originated from duplicates of the same original DNA fragments through some artificial processes are identified. These are considered to be non-independent observations, so all but a single read pair within each set of duplicates are marked, not removed from the bam file. + +### Signal track generation + +
+Output files + +- `single_tracks/deeptools/` + - `.bigWig` + +
+ +[deepTools](https://deeptools.readthedocs.io/en/develop/content/list_of_tools.html) is used to generate single fraction signals in [bigWig](https://genome.ucsc.edu/goldenpath/help/bigWig.html) format, an indexed binary format useful for displaying dense, continuous data in Genome Browsers such as the [UCSC](https://genome.ucsc.edu/cgi-bin/hgTracks) and [IGV](http://software.broadinstitute.org/software/igv/). The bigWig format is also supported by various bioinformatics software for downstream processing such as meta-profile plotting. + +### Comparisons + +When `--comparisonFile` is set, the difference between sample1 and sample2 read density profile, smoothed by the Gaussian kernel, is calculated and saved in bigwig format, as described in Kharchenko PK, Tolstorukov MY, Park PJ "Design and analysis of ChIP-seq experiments for DNA-binding proteins" Nat. Biotech. doi:10.1038/nbt.1508 + +
+Output files + +- `comparisons/spp_mle/` + - `.md_VS_.md.bw` + +
+ ### MultiQC +[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. + +Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see . +
Output files @@ -51,9 +119,19 @@ The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They m
-[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. +### Reference genome files -Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see . +
+Output files + +- `genome/` + + - A number of genome-specific files are generated by the pipeline in order to aid in the filtering of the data, and because they are required by standard tools such as BEDTools. These can be found in this directory. + - `bwa/`: Directory containing BWA indices. + + - If the `--save_reference` parameter is provided then the alignment indices generated by the pipeline will be saved in this directory. This can be quite a time-consuming process so it permits their reuse for future runs of the pipeline or for other purposes. + +
### Pipeline information From e5d034d75cf3c088547843b288760047b9d6e9c4 Mon Sep 17 00:00:00 2001 From: daisymut Date: Fri, 17 Nov 2023 11:55:20 +0100 Subject: [PATCH 21/24] updatedd output.md --- docs/output.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/output.md b/docs/output.md index afe0f30..adcc02a 100644 --- a/docs/output.md +++ b/docs/output.md @@ -11,11 +11,11 @@ The directories listed below will be created in the results directory after the The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: - [FastQC](#fastqc) -- [Trim adapters](#trim-adapters) -- [Align to Reference](#align-to-reference) -- [Mark Duplicates](#mark-duplicates) -- [`Signal track generation`](#signal-track-generation) -- [`Comparisons`](#comparisons) +- [Trim reads](#trim-adapters) +- [Alignment on Reference](#align-to-reference) +- [Mark Duplicate reads](#mark-duplicates) +- [Signal track generation](#signal-track-generation) +- [Comparisons](#comparisons) - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution From f947060203b413efc9db3058f85ac72c7e42b047 Mon Sep 17 00:00:00 2001 From: daisymut Date: Fri, 17 Nov 2023 12:03:39 +0100 Subject: [PATCH 22/24] updated output.md --- docs/output.md | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/docs/output.md b/docs/output.md index adcc02a..47be2f8 100644 --- a/docs/output.md +++ b/docs/output.md @@ -11,9 +11,9 @@ The directories listed below will be created in the results directory after the The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: - [FastQC](#fastqc) -- [Trim reads](#trim-adapters) -- [Alignment on Reference](#align-to-reference) -- [Mark Duplicate reads](#mark-duplicates) +- [Trim reads](#trim-reads) +- [Alignment on Reference](#alignment-on-reference) +- [Mark Duplicate reads](#mark-duplicate-reads) - [Signal track generation](#signal-track-generation) - [Comparisons](#comparisons) - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline @@ -32,30 +32,17 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d -#### Trim adapters +#### Trim reads -[`Trimmomatic`](http://www.usadellab.org/cms/?page=trimmomatic) is a software for read trimming, which means it trims all reads in the front or the tail. This function is useful since sometimes you want to drop some cycles of a sequencing run. In the current implementation in Sarek -`--detect_adapter_for_pe` is set by default which enables auto-detection of adapter sequences. For more information on how to fine-tune adapter trimming, take a look into the parameter docs. - -The resulting files are intermediate and by default not kept in the final files delivered to users. Set `--save_trimmed` to enable publishing of the files in: - -
-Output files for all samples - -**Output directory: `{outdir}/preprocessing/fastp/`** - -- `__{1,2}.fastp.fastq.gz>` - - Bgzipped FastQ file - -
+[`Trimmomatic`](http://www.usadellab.org/cms/?page=trimmomatic) is a software used to trim adapter sequences and low quality bases from the end of reads and quality check after this step is performed again with Fastqc. :::note The FastQC plots displayed in the MultiQC report shows both _untrimmed_ and _trimmed_ reads. They can be compared to check removal of adapter sequence and potentially regions with low quality after the trimming step. ::: -### Align to Reference +### Alignment on Reference -[BWA](https://github.com/lh3/bwa) is a software package for mapping low-divergent sequences against a large reference genome. The aligned reads are then sorted with [samtools](https://www.htslib.org/doc/samtools.html). +The alignment is performed using [BWA](https://github.com/lh3/bwa) and the aligned reads are then sorted by chromosome coordinates with [samtools](https://www.htslib.org/doc/samtools.html).
Output files @@ -65,7 +52,7 @@ The FastQC plots displayed in the MultiQC report shows both _untrimmed_ and _tri
-### Mark Duplicates +### Mark Duplicate reads
Output files From 67e55899d8e796044141013a9ab8f2626be3c5ec Mon Sep 17 00:00:00 2001 From: daisymut Date: Fri, 17 Nov 2023 12:22:32 +0100 Subject: [PATCH 23/24] updated output.md --- docs/output.md | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/docs/output.md b/docs/output.md index 47be2f8..4d92099 100644 --- a/docs/output.md +++ b/docs/output.md @@ -16,28 +16,32 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [Mark Duplicate reads](#mark-duplicate-reads) - [Signal track generation](#signal-track-generation) - [Comparisons](#comparisons) -- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline -- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +- [MultiQC](#multiqc) +- [Pipeline information](#pipeline-information) -### FastQC +### Read quality check + +#### FastQC [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about the sequenced reads. It provides information about the quality score distribution across reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +#### Trim reads + +[`Trimmomatic`](http://www.usadellab.org/cms/?page=trimmomatic) is a software used to trim adapter sequences and low quality bases from the end of reads and quality check after this step is performed again with Fastqc. +
Output files - `fastqc/` - `*_fastqc.html`: FastQC report containing quality metrics. - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. + - `*_trim_fastqc.html`: FastQC report containing quality metrics for trimmed reads. + - `*_trim_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images for trimmed reads.
-#### Trim reads - -[`Trimmomatic`](http://www.usadellab.org/cms/?page=trimmomatic) is a software used to trim adapter sequences and low quality bases from the end of reads and quality check after this step is performed again with Fastqc. - :::note -The FastQC plots displayed in the MultiQC report shows both _untrimmed_ and _trimmed_ reads. They can be compared to check removal of adapter sequence and potentially regions with low quality after the trimming step. +The FastQC plots displayed in the MultiQC report shows both _untrimmed_ and _trimmed_ reads so they can be directly compared. ::: ### Alignment on Reference @@ -54,6 +58,8 @@ The alignment is performed using [BWA](https://github.com/lh3/bwa) and the align ### Mark Duplicate reads +Read pairs that are likely to have originated from duplicates of the same original DNA fragments through some artificial processes are identified. These are considered to be non-independent observations, so all but a single read pair within each set of duplicates are marked, not removed from the bam file. +
Output files @@ -64,10 +70,10 @@ The alignment is performed using [BWA](https://github.com/lh3/bwa) and the align
-Read pairs that are likely to have originated from duplicates of the same original DNA fragments through some artificial processes are identified. These are considered to be non-independent observations, so all but a single read pair within each set of duplicates are marked, not removed from the bam file. - ### Signal track generation +[deepTools](https://deeptools.readthedocs.io/en/develop/content/list_of_tools.html) is used to generate single fraction signals in [bigWig](https://genome.ucsc.edu/goldenpath/help/bigWig.html) format, an indexed binary format useful for displaying dense, continuous data in Genome Browsers such as the [UCSC](https://genome.ucsc.edu/cgi-bin/hgTracks) and [IGV](http://software.broadinstitute.org/software/igv/). The bigWig format is also supported by various bioinformatics software for downstream processing such as meta-profile plotting. +
Output files @@ -76,11 +82,9 @@ Read pairs that are likely to have originated from duplicates of the same origin
-[deepTools](https://deeptools.readthedocs.io/en/develop/content/list_of_tools.html) is used to generate single fraction signals in [bigWig](https://genome.ucsc.edu/goldenpath/help/bigWig.html) format, an indexed binary format useful for displaying dense, continuous data in Genome Browsers such as the [UCSC](https://genome.ucsc.edu/cgi-bin/hgTracks) and [IGV](http://software.broadinstitute.org/software/igv/). The bigWig format is also supported by various bioinformatics software for downstream processing such as meta-profile plotting. - ### Comparisons -When `--comparisonFile` is set, the difference between sample1 and sample2 read density profile, smoothed by the Gaussian kernel, is calculated and saved in bigwig format, as described in Kharchenko PK, Tolstorukov MY, Park PJ "Design and analysis of ChIP-seq experiments for DNA-binding proteins" Nat. Biotech. doi:10.1038/nbt.1508 +When `--comparisonFile` is set, the difference between sample1 and sample2 read density profile smoothed by the Gaussian kernel is calculated and saved in bigwig format, as described in Kharchenko PK, Tolstorukov MY, Park PJ "Design and analysis of ChIP-seq experiments for DNA-binding proteins" Nat. Biotech. doi:10.1038/nbt.1508
Output files @@ -108,20 +112,20 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ ### Reference genome files +A number of genome-specific files if required by some of the analysis steps. If the `--save_reference` parameter is provided then the alignment indices generated by the pipeline will be saved in this directory. +
Output files - `genome/` - - - A number of genome-specific files are generated by the pipeline in order to aid in the filtering of the data, and because they are required by standard tools such as BEDTools. These can be found in this directory. - `bwa/`: Directory containing BWA indices. - - If the `--save_reference` parameter is provided then the alignment indices generated by the pipeline will be saved in this directory. This can be quite a time-consuming process so it permits their reuse for future runs of the pipeline or for other purposes. -
### Pipeline information +[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. +
Output files @@ -132,5 +136,3 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ - Parameters used by the pipeline run: `params.json`.
- -[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. From c207edb6d4bd1bfdef6d28077119f2302eb7da52 Mon Sep 17 00:00:00 2001 From: daisymut Date: Fri, 17 Nov 2023 12:51:18 +0100 Subject: [PATCH 24/24] updated usage.md and renamed example samplesheet --- docs/usage.md | 10 +++++----- ...ome_isasi_chr22_tinier.csv => samplesheet_test.csv} | 0 2 files changed, 5 insertions(+), 5 deletions(-) rename tests/{samplesheet_test_genome_isasi_chr22_tinier.csv => samplesheet_test.csv} (100%) diff --git a/docs/usage.md b/docs/usage.md index 6233631..5cca560 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -20,7 +20,7 @@ nextflow run nf-core/sammyseq -r dev \ --outdir ./results ``` -This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. +This will launch the pipeline with the `docker` configuration profile. See [below](#profile) for more information about profiles. Note that the pipeline will create the following files in your working directory: @@ -95,7 +95,7 @@ CTRL004_S4,/home/sammy/test_data/CTRL004_S4_chr22only.fq.gz,,CTRL004,S4 | `experimentalID` | Experimental sample identifier. This represents the biological specimen of interest and will be the same for all fractions exctracted. | | `fraction` | Fraction derived from SAMMY protocol, e.g. depending on the protocol it can be S2, S2L, S2S, S3, S4. | -An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. +An [example samplesheet](../tests/samplesheet_test.csv) has been provided with the pipeline. ### Pairwise comparisons @@ -119,11 +119,11 @@ Optionally, the fractions extracted from the same `experimentalID` can be combin The minimum reference genome requirements is the FASTA file, provided with the mandatory parameter `--fasta`, the bwa index will be generated by the pipeline and can be saved for later reuse if the `--save_reference` parameter is passed. The index building step can be quite a time-consuming process and it permits their reuse for future runs of the pipeline to save disk space, if already present it can be passed using the `--bwa_index '/path/to/bwa/index/'` parameter. -## Blacklist bed files +### Blacklist bed files A blacklist of regions that will be excluded by signal tracks can be provided using the optional parameter `--blackListFile` with full path to a coordinate file in bed format. Blacklist files for several genome builds can be found in the [ENCODE Blacklist Project](https://github.com/Boyle-Lab/Blacklist). -### Updating the pipeline +## Updating the pipeline When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: @@ -131,7 +131,7 @@ When you run the above command, Nextflow automatically pulls the pipeline code f nextflow pull nf-core/sammyseq ``` -### Reproducibility +## Reproducibility It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. diff --git a/tests/samplesheet_test_genome_isasi_chr22_tinier.csv b/tests/samplesheet_test.csv similarity index 100% rename from tests/samplesheet_test_genome_isasi_chr22_tinier.csv rename to tests/samplesheet_test.csv