From 6859ebf73933b458b835f94c63e78809be6b25ba Mon Sep 17 00:00:00 2001 From: Lucio Di Filippo Date: Tue, 5 Sep 2023 19:33:41 +0200 Subject: [PATCH 01/14] added subworkflow to align with bwa --- modules.json | 26 +++- modules/nf-core/bwa/sampe/main.nf | 42 +++++++ modules/nf-core/bwa/sampe/meta.yml | 63 ++++++++++ modules/nf-core/bwa/samse/main.nf | 42 +++++++ modules/nf-core/bwa/samse/meta.yml | 64 ++++++++++ modules/nf-core/samtools/index/main.nf | 48 ++++++++ modules/nf-core/samtools/index/meta.yml | 53 +++++++++ .../nf-core/fastq_align_bwaaln/main.nf | 112 ++++++++++++++++++ .../nf-core/fastq_align_bwaaln/meta.yml | 51 ++++++++ .../fastq_align_bwaaln/nextflow.config | 26 ++++ workflows/sammyseq.nf | 10 +- 11 files changed, 534 insertions(+), 3 deletions(-) create mode 100644 modules/nf-core/bwa/sampe/main.nf create mode 100644 modules/nf-core/bwa/sampe/meta.yml create mode 100644 modules/nf-core/bwa/samse/main.nf create mode 100644 modules/nf-core/bwa/samse/meta.yml create mode 100644 modules/nf-core/samtools/index/main.nf create mode 100644 modules/nf-core/samtools/index/meta.yml create mode 100644 subworkflows/nf-core/fastq_align_bwaaln/main.nf create mode 100644 subworkflows/nf-core/fastq_align_bwaaln/meta.yml create mode 100644 subworkflows/nf-core/fastq_align_bwaaln/nextflow.config diff --git a/modules.json b/modules.json index 865e37c..64491b8 100644 --- a/modules.json +++ b/modules.json @@ -8,13 +8,23 @@ "bwa/aln": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] + "installed_by": ["fastq_align_bwaaln", "modules"] }, "bwa/index": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, + "bwa/sampe": { + "branch": "master", + "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", + "installed_by": ["fastq_align_bwaaln"] + }, + "bwa/samse": { + "branch": "master", + "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", + "installed_by": ["fastq_align_bwaaln"] + }, "cat/fastq": { "branch": "master", "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", @@ -40,6 +50,11 @@ "git_sha": "f2d63bd5b68925f98f572eed70993d205cc694b7", "installed_by": ["modules"] }, + "samtools/index": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["fastq_align_bwaaln"] + }, "trimmomatic": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", @@ -51,6 +66,15 @@ "installed_by": ["modules"] } } + }, + "subworkflows": { + "nf-core": { + "fastq_align_bwaaln": { + "branch": "master", + "git_sha": "dedc0e31087f3306101c38835d051bf49789445a", + "installed_by": ["subworkflows"] + } + } } } } diff --git a/modules/nf-core/bwa/sampe/main.nf b/modules/nf-core/bwa/sampe/main.nf new file mode 100644 index 0000000..45bfffb --- /dev/null +++ b/modules/nf-core/bwa/sampe/main.nf @@ -0,0 +1,42 @@ +process BWA_SAMPE { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::bwa=0.7.17 bioconda::samtools=1.16.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:219b6c272b25e7e642ae3ff0bf0c5c81a5135ab4-0' : + 'biocontainers/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:219b6c272b25e7e642ae3ff0bf0c5c81a5135ab4-0' }" + + input: + tuple val(meta), path(reads), path(sai) + tuple val(meta2), path(index) + + output: + tuple val(meta), path("*.bam"), emit: bam + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def read_group = meta.read_group ? "-r ${meta.read_group}" : "" + + """ + INDEX=`find -L ./ -name "*.amb" | sed 's/\\.amb\$//'` + + bwa sampe \\ + $args \\ + $read_group \\ + \$INDEX \\ + $sai \\ + $reads | samtools sort -@ ${task.cpus} -O bam - > ${prefix}.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwa: \$(echo \$(bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bwa/sampe/meta.yml b/modules/nf-core/bwa/sampe/meta.yml new file mode 100644 index 0000000..0cefb96 --- /dev/null +++ b/modules/nf-core/bwa/sampe/meta.yml @@ -0,0 +1,63 @@ +name: bwa_sampe +description: Convert paired-end bwa SA coordinate files to SAM format +keywords: + - bwa + - aln + - short-read + - align + - reference + - fasta + - map + - sam + - bam +tools: + - bwa: + description: | + BWA is a software package for mapping DNA sequences against + a large reference genome, such as the human genome. + homepage: http://bio-bwa.sourceforge.net/ + documentation: http://bio-bwa.sourceforge.net/ + doi: "10.1093/bioinformatics/btp324" + licence: ["GPL-3.0-or-later"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information. + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: FASTQ files specified alongside meta in input channel. + pattern: "*.{fastq,fq}.gz" + - sai: + type: file + description: SAI file specified alongside meta and reads in input channel. + pattern: "*.sai" + - meta2: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] + - index: + type: directory + description: Directory containing BWA index files (amb,ann,bwt,pac,sa) from BWA_INDEX + pattern: "bwa/" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - bam: + type: file + description: BAM file + pattern: "*.bam" + +authors: + - "@jfy133" diff --git a/modules/nf-core/bwa/samse/main.nf b/modules/nf-core/bwa/samse/main.nf new file mode 100644 index 0000000..38752ed --- /dev/null +++ b/modules/nf-core/bwa/samse/main.nf @@ -0,0 +1,42 @@ +process BWA_SAMSE { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::bwa=0.7.17 bioconda::samtools=1.16.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:219b6c272b25e7e642ae3ff0bf0c5c81a5135ab4-0' : + 'biocontainers/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:219b6c272b25e7e642ae3ff0bf0c5c81a5135ab4-0' }" + + input: + tuple val(meta), path(reads), path(sai) + tuple val(meta2), path(index) + + output: + tuple val(meta), path("*.bam"), emit: bam + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def read_group = meta.read_group ? "-r ${meta.read_group}" : "" + + """ + INDEX=`find -L ./ -name "*.amb" | sed 's/\\.amb\$//'` + + bwa samse \\ + $args \\ + $read_group \\ + \$INDEX \\ + $sai \\ + $reads | samtools sort -@ ${task.cpus - 1} -O bam - > ${prefix}.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwa: \$(echo \$(bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bwa/samse/meta.yml b/modules/nf-core/bwa/samse/meta.yml new file mode 100644 index 0000000..1deab21 --- /dev/null +++ b/modules/nf-core/bwa/samse/meta.yml @@ -0,0 +1,64 @@ +name: bwa_samse +description: Convert bwa SA coordinate file to SAM format +keywords: + - bwa + - aln + - short-read + - align + - reference + - fasta + - map + - sam + - bam + +tools: + - bwa: + description: | + BWA is a software package for mapping DNA sequences against + a large reference genome, such as the human genome. + homepage: http://bio-bwa.sourceforge.net/ + documentation: http://bio-bwa.sourceforge.net/ + doi: "10.1093/bioinformatics/btp324" + licence: ["GPL-3.0-or-later"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information. + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: FASTQ files specified alongside meta in input channel. + pattern: "*.{fastq,fq}.gz" + - sai: + type: file + description: SAI file specified alongside meta and reads in input channel. + pattern: "*.sai" + - meta2: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] + - index: + type: directory + description: Directory containing BWA index files (amb,ann,bwt,pac,sa) from BWA_INDEX + pattern: "bwa/" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - bam: + type: file + description: BAM file + pattern: "*.bam" + +authors: + - "@jfy133" diff --git a/modules/nf-core/samtools/index/main.nf b/modules/nf-core/samtools/index/main.nf new file mode 100644 index 0000000..0b20aa4 --- /dev/null +++ b/modules/nf-core/samtools/index/main.nf @@ -0,0 +1,48 @@ +process SAMTOOLS_INDEX { + tag "$meta.id" + label 'process_low' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("*.bai") , optional:true, emit: bai + tuple val(meta), path("*.csi") , optional:true, emit: csi + tuple val(meta), path("*.crai"), optional:true, emit: crai + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools \\ + index \\ + -@ ${task.cpus-1} \\ + $args \\ + $input + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + """ + touch ${input}.bai + touch ${input}.crai + touch ${input}.csi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/index/meta.yml b/modules/nf-core/samtools/index/meta.yml new file mode 100644 index 0000000..8bd2fa6 --- /dev/null +++ b/modules/nf-core/samtools/index/meta.yml @@ -0,0 +1,53 @@ +name: samtools_index +description: Index SAM/BAM/CRAM file +keywords: + - index + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - crai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - csi: + type: file + description: CSI index file + pattern: "*.{csi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@maxulysse" diff --git a/subworkflows/nf-core/fastq_align_bwaaln/main.nf b/subworkflows/nf-core/fastq_align_bwaaln/main.nf new file mode 100644 index 0000000..647e83a --- /dev/null +++ b/subworkflows/nf-core/fastq_align_bwaaln/main.nf @@ -0,0 +1,112 @@ +// +// Alignment with bwa aln and sort +// + +include { BWA_ALN } from '../../../modules/nf-core/bwa/aln/main' +include { BWA_SAMSE } from '../../../modules/nf-core/bwa/samse/main' +include { BWA_SAMPE } from '../../../modules/nf-core/bwa/sampe/main' +include { SAMTOOLS_INDEX } from '../../../modules/nf-core/samtools/index/main' + +workflow FASTQ_ALIGN_BWAALN { + + take: + ch_reads // channel (mandatory): [ val(meta), path(reads) ]. subworkImportant: meta REQUIRES single_end` entry! + ch_index // channel (mandatory): [ val(meta), path(index) ] + + main: + + ch_versions = Channel.empty() + + // WARNING: You must specify in your prefix `meta.id_index` in your `modules.conf` + // to ensure that you do not overwrite multiple BAM files from one sample mapped + // against multiple references. This meta map is added by the subworkflow but can be removed + // after if necessary. + + // Ensure when multiple references are provide, that reference/read combinations + // are correctly associated throughout the subworkflow by copying the sample + // specific metadata to the index on each combination + + ch_prepped_input = ch_reads + .combine(ch_index) + .map{ + meta, reads, meta_index, index -> + + // Create a combined id that includes the ids of the reads and the index used. + // Also keep the id of the index with a new name to avoid name collisions. + def key_read_ref = meta.id + "_" + meta_index.id + def id_index = meta_index.id + + [ meta + [key_read_ref: key_read_ref] + [id_index: id_index], reads, meta_index + [key_read_ref: key_read_ref] + [id_index: id_index], index ] + } + + // Drop the index_meta, as the id of the index is now kept within the read meta. + ch_preppedinput_for_bwaaln = ch_prepped_input + .multiMap { + meta, reads, meta_index, index -> + reads: [ meta, reads ] + index: [ meta, index ] + } + + + // Set as independent channel to allow repeated joining but _with_ sample specific metadata + // to ensure right reference goes with right sample + ch_reads_newid = ch_prepped_input.map{ meta, reads, meta_index, index -> [ meta, reads ] } + ch_index_newid = ch_prepped_input.map{ meta, reads, meta_index, index -> [ meta, index ] } + + // Alignment and conversion to bam + BWA_ALN ( ch_preppedinput_for_bwaaln.reads, ch_preppedinput_for_bwaaln.index ) + ch_versions = ch_versions.mix( BWA_ALN.out.versions.first() ) + + ch_sai_for_bam = ch_reads_newid + .join ( BWA_ALN.out.sai ) + .branch { + meta, reads, sai -> + pe: !meta.single_end + se: meta.single_end + } + + // Split as PE/SE have different SAI -> BAM commands + ch_sai_for_bam_pe = ch_sai_for_bam.pe + .join ( ch_index_newid ) + .multiMap { + meta, reads, sai, index -> + reads: [ meta, reads, sai ] + index: [ meta, index ] + } + + ch_sai_for_bam_se = ch_sai_for_bam.se + .join ( ch_index_newid ) + .multiMap { + meta, reads, sai, index -> + reads: [ meta, reads, sai ] + index: [ meta, index ] + } + + + BWA_SAMPE ( ch_sai_for_bam_pe.reads, ch_sai_for_bam_pe.index ) + ch_versions = ch_versions.mix( BWA_SAMPE.out.versions.first() ) + + BWA_SAMSE ( ch_sai_for_bam_se.reads, ch_sai_for_bam_se.index ) + ch_versions = ch_versions.mix( BWA_SAMSE.out.versions.first() ) + + ch_bam_for_index = BWA_SAMPE.out.bam.mix( BWA_SAMSE.out.bam ) + + // Index all + SAMTOOLS_INDEX ( ch_bam_for_index ) + ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions.first()) + + // Remove superfluous internal maps to minimise clutter as much as possible + ch_bam_for_emit = ch_bam_for_index.map{ meta, bam -> [meta - meta.subMap('key_read_ref'), bam] } + ch_bai_for_emit = SAMTOOLS_INDEX.out.bai.map{ meta, bai -> [meta - meta.subMap('key_read_ref'), bai] } + ch_csi_for_emit = SAMTOOLS_INDEX.out.csi.map{ meta, csi -> [meta - meta.subMap('key_read_ref'), csi] } + + emit: + // Note: output channels will contain meta with additional 'id_index' meta + // value to allow association of BAM file with the meta.id of input indicies + bam = ch_bam_for_emit // channel: [ val(meta), path(bam) ] + bai = ch_bai_for_emit // channel: [ val(meta), path(bai) ] + csi = ch_csi_for_emit // channel: [ val(meta), path(csi) ] + + versions = ch_versions // channel: [ path(versions.yml) ] +} + diff --git a/subworkflows/nf-core/fastq_align_bwaaln/meta.yml b/subworkflows/nf-core/fastq_align_bwaaln/meta.yml new file mode 100644 index 0000000..ad43398 --- /dev/null +++ b/subworkflows/nf-core/fastq_align_bwaaln/meta.yml @@ -0,0 +1,51 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "fastq_align_bwaaln" +description: Align FASTQ files against reference genome with the bwa aln short-read aligner producing a sorted and indexed BAM files +keywords: + - bwa + - aln + - fasta + - bwa + - align + - map +components: + - bwa/aln + - bwa/sampe + - bwa/samse + - samtools/index +input: + - ch_reads: + description: | + Structure: [ val(meta), path(fastq) ] + One or two FASTQ files for single or paired end data respectively. + Note: the subworkflow adds a new meta ID `meta.id_index` that _must_ + be used in `prefix` to ensure unique file names. See the associated + nextflow.config file. + - ch_index: + description: | + Structure: [ val(meta), path(index) ] + A directory containing bwa aln reference indices as from `bwa index` + Contains files ending in extensions such as .amb, .ann, .bwt etc. + +output: + - bam: + description: | + Structure: [ val(meta), path(bam) ] + Sorted BAM/CRAM/SAM file + Note: the subworkflow adds a new meta ID `meta.id_index` that _must_ + be used in `prefix` to ensure unique file names. See the associated + nextflow.config file. + - bai: + description: | + Structure: [ val(meta), path(bai) ] + BAM/CRAM/SAM samtools index file + - csi: + description: | + Structure: [ val(meta), path(csi) ] + BAM/CRAM/SAM samtools index file for larger references + - versions: + description: | + Files containing software versions + Structure: [ path(versions.yml) ] +authors: + - "@jfy133" diff --git a/subworkflows/nf-core/fastq_align_bwaaln/nextflow.config b/subworkflows/nf-core/fastq_align_bwaaln/nextflow.config new file mode 100644 index 0000000..2ff1655 --- /dev/null +++ b/subworkflows/nf-core/fastq_align_bwaaln/nextflow.config @@ -0,0 +1,26 @@ +// IMPORTANT: Add this configuration to your modules.config +// These settings are to ensure you include the reference information within the output file names, to prevent overwritting! +// `meta.id_index` is created and used within the workflow. You must reuse this meta field if you wish to customise the file naming scheme. + +process { + + publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } + + withName: '.*FASTQ_ALIGN_BWAALN:BWA_ALN' { + ext.prefix = { "${meta.id}_${meta.id_index}" } + } + + withName: '.*FASTQ_ALIGN_BWAALN:BWA_SAMSE' { + ext.prefix = { "${meta.id}_${meta.id_index}" } + } + + withName: '.*FASTQ_ALIGN_BWAALN:BWA_SAMPE' { + ext.prefix = { "${meta.id}_${meta.id_index}" } + } + + withName: '.*FASTQ_ALIGN_BWAALN:SAMTOOLS_INDEX' { + ext.prefix = { "${meta.id}_${meta.id_index}" } + } + + +} diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf index 6d69919..f9e0fe3 100644 --- a/workflows/sammyseq.nf +++ b/workflows/sammyseq.nf @@ -37,6 +37,7 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil // include { INPUT_CHECK } from '../subworkflows/local/input_check' include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome' +include { FASTQ_ALIGN_BWAALN } from '../subworkflows/nf-core/fastq_align_bwaaln/main.nf' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -53,7 +54,7 @@ include { FASTQC } from '../modules/nf-core/fastqc/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' include { TRIMMOMATIC } from '../modules/nf-core/trimmomatic' -include { BWA_ALN } from '../modules/nf-core/bwa/aln/' + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -114,7 +115,12 @@ workflow SAMMYSEQ { merged_reads ) - BWA_ALN ( + //BWA_ALN ( + // TRIMMOMATIC.out.trimmed_reads, + // PREPARE_GENOME.out.bwa_index + //) + + FASTQ_ALIGN_BWAALN ( TRIMMOMATIC.out.trimmed_reads, PREPARE_GENOME.out.bwa_index ) From 09d0dd7ed6fa1d2f73e9d80f03ef552d0dc8df09 Mon Sep 17 00:00:00 2001 From: Lucio Di Filippo Date: Thu, 7 Sep 2023 20:22:23 +0200 Subject: [PATCH 02/14] markduplicates and filtering --- conf/modules.config | 122 ++++++++++++++++++ modules.json | 42 +++++- modules/nf-core/deeptools/bamcoverage/main.nf | 67 ++++++++++ .../nf-core/deeptools/bamcoverage/meta.yml | 60 +++++++++ modules/nf-core/picard/markduplicates/main.nf | 61 +++++++++ .../nf-core/picard/markduplicates/meta.yml | 71 ++++++++++ modules/nf-core/samtools/faidx/main.nf | 50 +++++++ modules/nf-core/samtools/faidx/meta.yml | 57 ++++++++ modules/nf-core/samtools/flagstat/main.nf | 46 +++++++ modules/nf-core/samtools/flagstat/meta.yml | 49 +++++++ modules/nf-core/samtools/idxstats/main.nf | 48 +++++++ modules/nf-core/samtools/idxstats/meta.yml | 50 +++++++ modules/nf-core/samtools/stats/main.nf | 49 +++++++ modules/nf-core/samtools/stats/meta.yml | 59 +++++++++ .../nf-core/bam_markduplicates_picard/main.nf | 52 ++++++++ .../bam_markduplicates_picard/meta.yml | 62 +++++++++ .../nf-core/bam_stats_samtools/main.nf | 32 +++++ .../nf-core/bam_stats_samtools/meta.yml | 41 ++++++ workflows/sammyseq.nf | 41 ++++++ 19 files changed, 1058 insertions(+), 1 deletion(-) create mode 100644 modules/nf-core/deeptools/bamcoverage/main.nf create mode 100644 modules/nf-core/deeptools/bamcoverage/meta.yml create mode 100644 modules/nf-core/picard/markduplicates/main.nf create mode 100644 modules/nf-core/picard/markduplicates/meta.yml create mode 100644 modules/nf-core/samtools/faidx/main.nf create mode 100644 modules/nf-core/samtools/faidx/meta.yml create mode 100644 modules/nf-core/samtools/flagstat/main.nf create mode 100644 modules/nf-core/samtools/flagstat/meta.yml create mode 100644 modules/nf-core/samtools/idxstats/main.nf create mode 100644 modules/nf-core/samtools/idxstats/meta.yml create mode 100644 modules/nf-core/samtools/stats/main.nf create mode 100644 modules/nf-core/samtools/stats/meta.yml create mode 100644 subworkflows/nf-core/bam_markduplicates_picard/main.nf create mode 100644 subworkflows/nf-core/bam_markduplicates_picard/meta.yml create mode 100644 subworkflows/nf-core/bam_stats_samtools/main.nf create mode 100644 subworkflows/nf-core/bam_stats_samtools/meta.yml diff --git a/conf/modules.config b/conf/modules.config index 56cec34..0b65a9c 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -12,6 +12,8 @@ process { + + publishDir = [ path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, mode: params.publish_dir_mode, @@ -45,4 +47,124 @@ process { ext.args2 = 'ILLUMINACLIP:TruSeq3-PE.fa:2:30:10:2:True LEADING:3 TRAILING:3 MINLEN:36' } + + // + // Alignment options + // + withName: 'BWA_MEM' { + ext.args = "" + publishDir = [ + path: { "${params.outdir}/bwa" }, + enabled: false, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: "*.bam" + ] + } + + withName: 'SAMTOOLS_INDEX_BAM' { + ext.args = "" + publishDir = [ + path: { "${params.outdir}/bwa" }, + enabled: true, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: "*.bai" + ] + } + + withName: 'SAMTOOLS_SORT_BAM' { + ext.args = "" + ext.prefix = { "${meta.id}.sorted" } + publishDir = [ + path: { "${params.outdir}/bwa" }, + enabled: true, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: "*sorted.bam" + ] + } + + // + // Picard MarkDuplicates and Filtering options + // + + withName: '.*BAM_MARKDUPLICATES_PICARD:PICARD_MARKDUPLICATES' { + ext.args = '--ASSUME_SORTED true --REMOVE_DUPLICATES true --VALIDATION_STRINGENCY LENIENT --TMP_DIR tmp' + ext.prefix = { "${meta.id}.md" } + publishDir = [ + [ + path: { "${params.outdir}/reports/markduplicates" }, + mode: params.publish_dir_mode, + pattern: '*metrics.txt', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ], + [ + path: { "${params.outdir}/markduplicates/bam" }, + mode: params.publish_dir_mode, + pattern: '*.md.{bam,bai}', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + //enable: params.save_markduplicates_bam + ] + ] + } + + withName: '.*BAM_MARKDUPLICATES_PICARD:SAMTOOLS_INDEX' { + ext.prefix = { "${meta.id}.markdup.sorted" } + publishDir = [ + path: { "${params.outdir}/markduplicates/bam" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: '*.{bai,csi}' + ] + } + + withName: '.*BAM_MARKDUPLICATES_PICARD:BAM_STATS_SAMTOOLS.*' { + ext.args = "" + publishDir = [ + path: { "${params.outdir}/reports/samtools_stats/${meta.id}/md/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: '*.{stats,flagstat,idxstats}' + ] + } + + withName: 'SAMTOOLS_VIEW_FILTER' { + ext.args = '-h -F 0x0400' + publishDir = [ + path: { "${params.outdir}/markduplicates/duplicates_removed" }, + mode: params.publish_dir_mode, + pattern: '*filtered.{bai,bam}', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: true + ] + ext.prefix = { "${meta.id}.md.filtered" } + } + + withName: 'SAMTOOLS_SORT_FILTERED' { + ext.args = "" + ext.prefix = { "${meta.id}.md.filtered.sorted" } + publishDir = [ + path: { "${params.outdir}/markduplicates/duplicates_removed" }, + mode: params.publish_dir_mode, + pattern: '*filtered.sorted.bam', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: true + ] + } + + withName: 'SAMTOOLS_INDEX_FILTERED' { + ext.args = "" + ext.prefix = { "${meta.id}.md.filtered.sorted" } + publishDir = [ + path: { "${params.outdir}/markduplicates/duplicates_removed" }, + mode: params.publish_dir_mode, + pattern: '*filtered.sorted.bai', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: true + ] + } + + + } diff --git a/modules.json b/modules.json index 64491b8..ecb2175 100644 --- a/modules.json +++ b/modules.json @@ -35,6 +35,11 @@ "git_sha": "76cc4938c1f6ea5c7d83fed1eeffc146787f9543", "installed_by": ["modules"] }, + "deeptools/bamcoverage": { + "branch": "master", + "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", + "installed_by": ["modules"] + }, "fastqc": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", @@ -50,10 +55,35 @@ "git_sha": "f2d63bd5b68925f98f572eed70993d205cc694b7", "installed_by": ["modules"] }, + "picard/markduplicates": { + "branch": "master", + "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", + "installed_by": ["bam_markduplicates_picard", "modules"] + }, + "samtools/faidx": { + "branch": "master", + "git_sha": "fd742419940e01ba1c5ecb172c3e32ec840662fe", + "installed_by": ["modules"] + }, + "samtools/flagstat": { + "branch": "master", + "git_sha": "570ec5bcfe19c49e16c9ca35a7a116563af6cc1c", + "installed_by": ["bam_stats_samtools"] + }, + "samtools/idxstats": { + "branch": "master", + "git_sha": "e662ab16e0c11f1e62983e21de9871f59371a639", + "installed_by": ["bam_stats_samtools"] + }, "samtools/index": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["fastq_align_bwaaln"] + "installed_by": ["bam_markduplicates_picard", "fastq_align_bwaaln"] + }, + "samtools/stats": { + "branch": "master", + "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", + "installed_by": ["bam_stats_samtools"] }, "trimmomatic": { "branch": "master", @@ -69,6 +99,16 @@ }, "subworkflows": { "nf-core": { + "bam_markduplicates_picard": { + "branch": "master", + "git_sha": "dedc0e31087f3306101c38835d051bf49789445a", + "installed_by": ["subworkflows"] + }, + "bam_stats_samtools": { + "branch": "master", + "git_sha": "dedc0e31087f3306101c38835d051bf49789445a", + "installed_by": ["bam_markduplicates_picard"] + }, "fastq_align_bwaaln": { "branch": "master", "git_sha": "dedc0e31087f3306101c38835d051bf49789445a", diff --git a/modules/nf-core/deeptools/bamcoverage/main.nf b/modules/nf-core/deeptools/bamcoverage/main.nf new file mode 100644 index 0000000..680fc5b --- /dev/null +++ b/modules/nf-core/deeptools/bamcoverage/main.nf @@ -0,0 +1,67 @@ +process DEEPTOOLS_BAMCOVERAGE { + tag "$meta.id" + label 'process_low' + + conda "bioconda::deeptools=3.5.1 bioconda::samtools=1.16.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:62d1ebe2d3a2a9d1a7ad31e0b902983fa7c25fa7-0': + 'biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:62d1ebe2d3a2a9d1a7ad31e0b902983fa7c25fa7-0' }" + + input: + tuple val(meta), path(input), path(input_index) + path(fasta) + path(fasta_fai) + + output: + tuple val(meta), path("*.bigWig") , emit: bigwig, optional: true + tuple val(meta), path("*.bedgraph") , emit: bedgraph, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}.bigWig" + + // cram_input is currently not working with deeptools + // therefore it's required to convert cram to bam first + def is_cram = input.Extension == "cram" ? true : false + def input_out = is_cram ? input.BaseName + ".bam" : "${input}" + def fai_reference = fasta_fai ? "--fai-reference ${fasta_fai}" : "" + + if (is_cram){ + """ + samtools view -T $fasta $input $fai_reference -@ $task.cpus -o $input_out + samtools index -b $input_out -@ $task.cpus + + bamCoverage \\ + --bam $input_out \\ + $args \\ + --numberOfProcessors ${task.cpus} \\ + --outFileName ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + deeptools: \$(bamCoverage --version | sed -e "s/bamCoverage //g") + END_VERSIONS + """ + + } + else { + """ + bamCoverage \\ + --bam $input_out \\ + $args \\ + --numberOfProcessors ${task.cpus} \\ + --outFileName ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + deeptools: \$(bamCoverage --version | sed -e "s/bamCoverage //g") + END_VERSIONS + """ + } + +} diff --git a/modules/nf-core/deeptools/bamcoverage/meta.yml b/modules/nf-core/deeptools/bamcoverage/meta.yml new file mode 100644 index 0000000..769592c --- /dev/null +++ b/modules/nf-core/deeptools/bamcoverage/meta.yml @@ -0,0 +1,60 @@ +name: deeptools_bamcoverage +description: This tool takes an alignment of reads or fragments as input (BAM file) and generates a coverage track (bigWig or bedGraph) as output. +keywords: + - coverage + - depth + - track +tools: + - deeptools: + description: A set of user-friendly tools for normalization and visualzation of deep-sequencing data + homepage: https://deeptools.readthedocs.io/en/develop/content/tools/bamCoverage.html + documentation: https://deeptools.readthedocs.io/en/develop/content/tools/bamCoverage.html + tool_dev_url: https://github.com/deeptools/deepTools/ + doi: "10.1093/nar/gkw257" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM file + pattern: "*.{bam,cram}" + - input_index: + type: file + description: BAM/CRAM index file + pattern: "*.{bai,crai}" + - fasta: + type: file + description: Reference file the CRAM file was created with (required with CRAM input) + pattern: "*.{fasta,fa}" + - fasta_fai: + type: file + description: Index of the reference file (optional, but recommended) + pattern: "*.{fai}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - bigWig: + type: file + description: BigWig file + pattern: "*.bigWig" + - bedgraph: + type: file + description: Bedgraph file + pattern: "*.bedgraph" + +authors: + - "@FriederikeHanssen" + - "@SusiJo" diff --git a/modules/nf-core/picard/markduplicates/main.nf b/modules/nf-core/picard/markduplicates/main.nf new file mode 100644 index 0000000..facd7ef --- /dev/null +++ b/modules/nf-core/picard/markduplicates/main.nf @@ -0,0 +1,61 @@ +process PICARD_MARKDUPLICATES { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::picard=3.0.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/picard:3.0.0--hdfd78af_1' : + 'biocontainers/picard:3.0.0--hdfd78af_1' }" + + input: + tuple val(meta), path(bam) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + + output: + tuple val(meta), path("*.bam") , emit: bam + tuple val(meta), path("*.bai") , optional:true, emit: bai + tuple val(meta), path("*.metrics.txt"), emit: metrics + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def avail_mem = 3072 + if (!task.memory) { + log.info '[Picard MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + picard \\ + -Xmx${avail_mem}M \\ + MarkDuplicates \\ + $args \\ + --INPUT $bam \\ + --OUTPUT ${prefix}.bam \\ + --REFERENCE_SEQUENCE $fasta \\ + --METRICS_FILE ${prefix}.MarkDuplicates.metrics.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + picard: \$(echo \$(picard MarkDuplicates --version 2>&1) | grep -o 'Version:.*' | cut -f2- -d:) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + touch ${prefix}.bam.bai + touch ${prefix}.MarkDuplicates.metrics.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + picard: \$(echo \$(picard MarkDuplicates --version 2>&1) | grep -o 'Version:.*' | cut -f2- -d:) + END_VERSIONS + """ +} diff --git a/modules/nf-core/picard/markduplicates/meta.yml b/modules/nf-core/picard/markduplicates/meta.yml new file mode 100644 index 0000000..f7693d2 --- /dev/null +++ b/modules/nf-core/picard/markduplicates/meta.yml @@ -0,0 +1,71 @@ +name: picard_markduplicates +description: Locate and tag duplicate reads in a BAM file +keywords: + - markduplicates + - pcr + - duplicates + - bam + - sam + - cram +tools: + - picard: + description: | + A set of command line tools (in Java) for manipulating high-throughput sequencing (HTS) + data and formats such as SAM/BAM/CRAM and VCF. + homepage: https://broadinstitute.github.io/picard/ + documentation: https://broadinstitute.github.io/picard/ + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM file + pattern: "*.{bam,cram,sam}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: Reference genome fasta file + pattern: "*.{fasta,fa}" + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fai: + type: file + description: Reference genome fasta index + pattern: "*.{fai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM file with duplicate reads marked/removed + pattern: "*.{bam}" + - bai: + type: file + description: An optional BAM index file. If desired, --CREATE_INDEX must be passed as a flag + pattern: "*.{bai}" + - metrics: + type: file + description: Duplicate metrics file generated by picard + pattern: "*.{metrics.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@projectoriented" + - "@ramprasadn" diff --git a/modules/nf-core/samtools/faidx/main.nf b/modules/nf-core/samtools/faidx/main.nf new file mode 100644 index 0000000..59ed308 --- /dev/null +++ b/modules/nf-core/samtools/faidx/main.nf @@ -0,0 +1,50 @@ +process SAMTOOLS_FAIDX { + tag "$fasta" + label 'process_single' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(fasta) + tuple val(meta2), path(fai) + + output: + tuple val(meta), path ("*.{fa,fasta}") , emit: fa , optional: true + tuple val(meta), path ("*.fai") , emit: fai, optional: true + tuple val(meta), path ("*.gzi") , emit: gzi, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools \\ + faidx \\ + $fasta \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def match = (task.ext.args =~ /-o(?:utput)?\s(.*)\s?/).findAll() + def fastacmd = match[0] ? "touch ${match[0][1]}" : '' + """ + ${fastacmd} + touch ${fasta}.fai + + cat <<-END_VERSIONS > versions.yml + + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/faidx/meta.yml b/modules/nf-core/samtools/faidx/meta.yml new file mode 100644 index 0000000..957b25e --- /dev/null +++ b/modules/nf-core/samtools/faidx/meta.yml @@ -0,0 +1,57 @@ +name: samtools_faidx +description: Index FASTA file +keywords: + - index + - fasta + - faidx +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fasta: + type: file + description: FASTA file + pattern: "*.{fa,fasta}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fai: + type: file + description: FASTA index file + pattern: "*.{fai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fai: + type: file + description: FASTA index file + pattern: "*.{fai}" + - gzi: + type: file + description: Optional gzip index file for compressed inputs + pattern: "*.gzi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@phue" diff --git a/modules/nf-core/samtools/flagstat/main.nf b/modules/nf-core/samtools/flagstat/main.nf new file mode 100644 index 0000000..b75707e --- /dev/null +++ b/modules/nf-core/samtools/flagstat/main.nf @@ -0,0 +1,46 @@ +process SAMTOOLS_FLAGSTAT { + tag "$meta.id" + label 'process_single' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(bam), path(bai) + + output: + tuple val(meta), path("*.flagstat"), emit: flagstat + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + samtools \\ + flagstat \\ + --threads ${task.cpus} \\ + $bam \\ + > ${prefix}.flagstat + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.flagstat + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/flagstat/meta.yml b/modules/nf-core/samtools/flagstat/meta.yml new file mode 100644 index 0000000..954225d --- /dev/null +++ b/modules/nf-core/samtools/flagstat/meta.yml @@ -0,0 +1,49 @@ +name: samtools_flagstat +description: Counts the number of alignments in a BAM/CRAM/SAM file for each FLAG type +keywords: + - stats + - mapping + - counts + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - bai: + type: file + description: Index for BAM/CRAM/SAM file + pattern: "*.{bai,crai,sai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - flagstat: + type: file + description: File containing samtools flagstat output + pattern: "*.{flagstat}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" diff --git a/modules/nf-core/samtools/idxstats/main.nf b/modules/nf-core/samtools/idxstats/main.nf new file mode 100644 index 0000000..83c7c34 --- /dev/null +++ b/modules/nf-core/samtools/idxstats/main.nf @@ -0,0 +1,48 @@ +process SAMTOOLS_IDXSTATS { + tag "$meta.id" + label 'process_single' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(bam), path(bai) + + output: + tuple val(meta), path("*.idxstats"), emit: idxstats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + samtools \\ + idxstats \\ + --threads ${task.cpus-1} \\ + $bam \\ + > ${prefix}.idxstats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.idxstats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/idxstats/meta.yml b/modules/nf-core/samtools/idxstats/meta.yml new file mode 100644 index 0000000..dda87e1 --- /dev/null +++ b/modules/nf-core/samtools/idxstats/meta.yml @@ -0,0 +1,50 @@ +name: samtools_idxstats +description: Reports alignment summary statistics for a BAM/CRAM/SAM file +keywords: + - stats + - mapping + - counts + - chromosome + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - bai: + type: file + description: Index for BAM/CRAM/SAM file + pattern: "*.{bai,crai,sai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - idxstats: + type: file + description: File containing samtools idxstats output + pattern: "*.{idxstats}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" diff --git a/modules/nf-core/samtools/stats/main.nf b/modules/nf-core/samtools/stats/main.nf new file mode 100644 index 0000000..4a2607d --- /dev/null +++ b/modules/nf-core/samtools/stats/main.nf @@ -0,0 +1,49 @@ +process SAMTOOLS_STATS { + tag "$meta.id" + label 'process_single' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input), path(input_index) + tuple val(meta2), path(fasta) + + output: + tuple val(meta), path("*.stats"), emit: stats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--reference ${fasta}" : "" + """ + samtools \\ + stats \\ + --threads ${task.cpus} \\ + ${reference} \\ + ${input} \\ + > ${prefix}.stats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.stats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/stats/meta.yml b/modules/nf-core/samtools/stats/meta.yml new file mode 100644 index 0000000..90e6345 --- /dev/null +++ b/modules/nf-core/samtools/stats/meta.yml @@ -0,0 +1,59 @@ +name: samtools_stats +description: Produces comprehensive statistics from SAM/BAM/CRAM file +keywords: + - statistics + - counts + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM file from alignment + pattern: "*.{bam,cram}" + - input_index: + type: file + description: BAI/CRAI file from alignment + pattern: "*.{bai,crai}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: Reference file the CRAM was created with (optional) + pattern: "*.{fasta,fa}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - stats: + type: file + description: File containing samtools stats output + pattern: "*.{stats}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@FriederikeHanssen" + - "@ramprasadn" diff --git a/subworkflows/nf-core/bam_markduplicates_picard/main.nf b/subworkflows/nf-core/bam_markduplicates_picard/main.nf new file mode 100644 index 0000000..6e3df33 --- /dev/null +++ b/subworkflows/nf-core/bam_markduplicates_picard/main.nf @@ -0,0 +1,52 @@ +// +// Picard MarkDuplicates, index BAM file and run samtools stats, flagstat and idxstats +// + +include { PICARD_MARKDUPLICATES } from '../../../modules/nf-core/picard/markduplicates/main' +include { SAMTOOLS_INDEX } from '../../../modules/nf-core/samtools/index/main' +include { BAM_STATS_SAMTOOLS } from '../bam_stats_samtools/main' + +workflow BAM_MARKDUPLICATES_PICARD { + + take: + ch_bam // channel: [ val(meta), path(bam) ] + ch_fasta // channel: [ path(fasta) ] + ch_fai // channel: [ path(fai) ] + + main: + + ch_versions = Channel.empty() + + PICARD_MARKDUPLICATES ( ch_bam, ch_fasta, ch_fai ) + ch_versions = ch_versions.mix(PICARD_MARKDUPLICATES.out.versions.first()) + + SAMTOOLS_INDEX ( PICARD_MARKDUPLICATES.out.bam ) + ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions.first()) + + ch_bam_bai = PICARD_MARKDUPLICATES.out.bam + .join(SAMTOOLS_INDEX.out.bai, by: [0], remainder: true) + .join(SAMTOOLS_INDEX.out.csi, by: [0], remainder: true) + .map { + meta, bam, bai, csi -> + if (bai) { + [ meta, bam, bai ] + } else { + [ meta, bam, csi ] + } + } + + BAM_STATS_SAMTOOLS ( ch_bam_bai, ch_fasta ) + ch_versions = ch_versions.mix(BAM_STATS_SAMTOOLS.out.versions) + + emit: + bam = PICARD_MARKDUPLICATES.out.bam // channel: [ val(meta), path(bam) ] + metrics = PICARD_MARKDUPLICATES.out.metrics // channel: [ val(meta), path(bam) ] + bai = SAMTOOLS_INDEX.out.bai // channel: [ val(meta), path(bai) ] + csi = SAMTOOLS_INDEX.out.csi // channel: [ val(meta), path(csi) ] + + stats = BAM_STATS_SAMTOOLS.out.stats // channel: [ val(meta), path(stats) ] + flagstat = BAM_STATS_SAMTOOLS.out.flagstat // channel: [ val(meta), path(flagstat) ] + idxstats = BAM_STATS_SAMTOOLS.out.idxstats // channel: [ val(meta), path(idxstats) ] + + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/nf-core/bam_markduplicates_picard/meta.yml b/subworkflows/nf-core/bam_markduplicates_picard/meta.yml new file mode 100644 index 0000000..b924596 --- /dev/null +++ b/subworkflows/nf-core/bam_markduplicates_picard/meta.yml @@ -0,0 +1,62 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "bam_markduplicates_picard" +description: Picard MarkDuplicates, index BAM file and run samtools stats, flagstat and idxstats +keywords: + - markduplicates + - bam + - sam + - cram + +components: + - picard/markduplicates + - samtools/index + - samtools/stats + - samtools/idxstats + - samtools/flagstat + - bam_stats_samtools + +input: + - ch_bam: + description: | + BAM/CRAM/SAM file + Structure: [ val(meta), path(bam) ] + - ch_fasta: + description: | + Reference genome fasta file + Structure: [ path(fasta) ] + - ch_fasta: + description: | + Index of the reference genome fasta file + Structure: [ path(fai) ] +output: + - bam: + description: | + processed BAM/CRAM/SAM file + Structure: [ val(meta), path(bam) ] + - bai: + description: | + BAM/CRAM/SAM samtools index + Structure: [ val(meta), path(bai) ] + - csi: + description: | + CSI samtools index + Structure: [ val(meta), path(csi) ] + - stats: + description: | + File containing samtools stats output + Structure: [ val(meta), path(stats) ] + - flagstat: + description: | + File containing samtools flagstat output + Structure: [ val(meta), path(flagstat) ] + - idxstats: + description: | + File containing samtools idxstats output + Structure: [ val(meta), path(idxstats) ] + - versions: + description: | + Files containing software versions + Structure: [ path(versions.yml) ] +authors: + - "@dmarron" + - "@drpatelh" diff --git a/subworkflows/nf-core/bam_stats_samtools/main.nf b/subworkflows/nf-core/bam_stats_samtools/main.nf new file mode 100644 index 0000000..44d4c01 --- /dev/null +++ b/subworkflows/nf-core/bam_stats_samtools/main.nf @@ -0,0 +1,32 @@ +// +// Run SAMtools stats, flagstat and idxstats +// + +include { SAMTOOLS_STATS } from '../../../modules/nf-core/samtools/stats/main' +include { SAMTOOLS_IDXSTATS } from '../../../modules/nf-core/samtools/idxstats/main' +include { SAMTOOLS_FLAGSTAT } from '../../../modules/nf-core/samtools/flagstat/main' + +workflow BAM_STATS_SAMTOOLS { + take: + ch_bam_bai // channel: [ val(meta), path(bam), path(bai) ] + ch_fasta // channel: [ val(meta), path(fasta) ] + + main: + ch_versions = Channel.empty() + + SAMTOOLS_STATS ( ch_bam_bai, ch_fasta ) + ch_versions = ch_versions.mix(SAMTOOLS_STATS.out.versions) + + SAMTOOLS_FLAGSTAT ( ch_bam_bai ) + ch_versions = ch_versions.mix(SAMTOOLS_FLAGSTAT.out.versions) + + SAMTOOLS_IDXSTATS ( ch_bam_bai ) + ch_versions = ch_versions.mix(SAMTOOLS_IDXSTATS.out.versions) + + emit: + stats = SAMTOOLS_STATS.out.stats // channel: [ val(meta), path(stats) ] + flagstat = SAMTOOLS_FLAGSTAT.out.flagstat // channel: [ val(meta), path(flagstat) ] + idxstats = SAMTOOLS_IDXSTATS.out.idxstats // channel: [ val(meta), path(idxstats) ] + + versions = ch_versions // channel: [ path(versions.yml) ] +} diff --git a/subworkflows/nf-core/bam_stats_samtools/meta.yml b/subworkflows/nf-core/bam_stats_samtools/meta.yml new file mode 100644 index 0000000..87863b1 --- /dev/null +++ b/subworkflows/nf-core/bam_stats_samtools/meta.yml @@ -0,0 +1,41 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: bam_stats_samtools +description: Produces comprehensive statistics from SAM/BAM/CRAM file +keywords: + - statistics + - counts + - bam + - sam + - cram +components: + - samtools/stats + - samtools/idxstats + - samtools/flagstat +input: + - ch_bam_bai: + description: | + The input channel containing the BAM/CRAM and it's index + Structure: [ val(meta), path(bam), path(bai) ] + - ch_fasta: + description: | + Reference genome fasta file + Structure: [ path(fasta) ] +output: + - stats: + description: | + File containing samtools stats output + Structure: [ val(meta), path(stats) ] + - flagstat: + description: | + File containing samtools flagstat output + Structure: [ val(meta), path(flagstat) ] + - idxstats: + description: | + File containing samtools idxstats output + Structure: [ val(meta), path(idxstats)] + - versions: + description: | + Files containing software versions + Structure: [ path(versions.yml) ] +authors: + - "@drpatelh" diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf index f9e0fe3..ba30723 100644 --- a/workflows/sammyseq.nf +++ b/workflows/sammyseq.nf @@ -1,3 +1,14 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + VALIDATE INPUTS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +if (params.fasta) { ch_fasta = Channel.fromPath(params.fasta) } else { exit 1, 'Fasta reference genome not specified!' } + +// Modify fasta channel to include meta data +ch_fasta_meta = ch_fasta.map{ it -> [[id:it[0].baseName], it] }.collect() + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PRINT PARAMS SUMMARY @@ -38,6 +49,7 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil include { INPUT_CHECK } from '../subworkflows/local/input_check' include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome' include { FASTQ_ALIGN_BWAALN } from '../subworkflows/nf-core/fastq_align_bwaaln/main.nf' +include { BAM_MARKDUPLICATES_PICARD } from '../subworkflows/nf-core/bam_markduplicates_picard' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -54,7 +66,11 @@ include { FASTQC } from '../modules/nf-core/fastqc/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' include { TRIMMOMATIC } from '../modules/nf-core/trimmomatic' +include { SAMTOOLS_FAIDX } from '../modules/nf-core/samtools/faidx' +// include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_FILTER } from '../modules/nf-core/samtools/view/main' +// include { SAMTOOLS_SORT as SAMTOOLS_SORT_FILTERED } from '../modules/nf-core/samtools/sort/main' +// include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_FILTERED } from '../modules/nf-core/samtools/index/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -125,6 +141,31 @@ workflow SAMMYSEQ { PREPARE_GENOME.out.bwa_index ) + // PICARD MARK_DUPLICATES + // Index Fasta File for Markduplicates + SAMTOOLS_FAIDX ( + ch_fasta_meta, + [[], []] + ) + + // MARK DUPLICATES IN BAM FILE + BAM_MARKDUPLICATES_PICARD ( + FASTQ_ALIGN_BWAALN.out.bam, + ch_fasta_meta, + SAMTOOLS_FAIDX.out.fai.collect() + ) + + // SAMTOOLS_VIEW_FILTER ( + // ch_bam_sorted.join(ch_bam_sorted_bai), + // ch_fasta_meta, + // [] + // ) + // ch_versions = ch_versions.mix(SAMTOOLS_VIEW_FILTER.out.versions) + + ch_bam_from_markduplicates = BAM_MARKDUPLICATES_PICARD.bam + + + CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') ) From 9b0b040e36513eab40d30f84328acc6710fb589b Mon Sep 17 00:00:00 2001 From: Lucio Di Filippo Date: Thu, 7 Sep 2023 20:23:18 +0200 Subject: [PATCH 03/14] bugfix include removal --- workflows/sammyseq.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf index ba30723..85d643d 100644 --- a/workflows/sammyseq.nf +++ b/workflows/sammyseq.nf @@ -67,6 +67,7 @@ include { MULTIQC } from '../modules/nf-core/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' include { TRIMMOMATIC } from '../modules/nf-core/trimmomatic' include { SAMTOOLS_FAIDX } from '../modules/nf-core/samtools/faidx' +// include { } // include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_FILTER } from '../modules/nf-core/samtools/view/main' // include { SAMTOOLS_SORT as SAMTOOLS_SORT_FILTERED } from '../modules/nf-core/samtools/sort/main' From 0528894df06014c6471e278fdd313197ae0aa042 Mon Sep 17 00:00:00 2001 From: Lucio Di Filippo Date: Tue, 12 Sep 2023 09:04:50 +0200 Subject: [PATCH 04/14] add stopAt parameter --- nextflow.config | 2 ++ nextflow_schema.json | 5 +++++ workflows/sammyseq.nf | 51 ++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 55 insertions(+), 3 deletions(-) diff --git a/nextflow.config b/nextflow.config index 1fbf588..cb04645 100644 --- a/nextflow.config +++ b/nextflow.config @@ -9,6 +9,8 @@ // Global default params, used in configs params { + stopAt = '' + // TODO nf-core: Specify your pipeline's command line flags // Input options input = null diff --git a/nextflow_schema.json b/nextflow_schema.json index b5c8a9d..9352576 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -12,6 +12,11 @@ "description": "Define where the pipeline should find input data and save output data.", "required": ["input", "outdir"], "properties": { + "stopAt": { + "type": "string", + "description": "Specify after which step the pipeline should stop.", + "fa_icon": "fas fa-stop-circle" + }, "input": { "type": "string", "format": "file-path", diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf index 85d643d..1ed3eec 100644 --- a/workflows/sammyseq.nf +++ b/workflows/sammyseq.nf @@ -67,8 +67,8 @@ include { MULTIQC } from '../modules/nf-core/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' include { TRIMMOMATIC } from '../modules/nf-core/trimmomatic' include { SAMTOOLS_FAIDX } from '../modules/nf-core/samtools/faidx' -// include { } - +include { DEEPTOOLS_BAMCOVERAGE } from '../modules/nf-core/deeptools/bamcoverage' + // include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_FILTER } from '../modules/nf-core/samtools/view/main' // include { SAMTOOLS_SORT as SAMTOOLS_SORT_FILTERED } from '../modules/nf-core/samtools/sort/main' // include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_FILTERED } from '../modules/nf-core/samtools/index/main' @@ -132,6 +132,11 @@ workflow SAMMYSEQ { merged_reads ) + if (params.stopAt == 'TRIMMOMATIC') { + return + } + + //BWA_ALN ( // TRIMMOMATIC.out.trimmed_reads, // PREPARE_GENOME.out.bwa_index @@ -142,6 +147,11 @@ workflow SAMMYSEQ { PREPARE_GENOME.out.bwa_index ) + if (params.stopAt == 'FASTQ_ALIGN_BWAALN') { + return + } + + // PICARD MARK_DUPLICATES // Index Fasta File for Markduplicates SAMTOOLS_FAIDX ( @@ -149,6 +159,8 @@ workflow SAMMYSEQ { [[], []] ) + + // MARK DUPLICATES IN BAM FILE BAM_MARKDUPLICATES_PICARD ( FASTQ_ALIGN_BWAALN.out.bam, @@ -156,6 +168,9 @@ workflow SAMMYSEQ { SAMTOOLS_FAIDX.out.fai.collect() ) + if (params.stopAt == 'BAM_MARKDUPLICATES_PICARD') { + return + } // SAMTOOLS_VIEW_FILTER ( // ch_bam_sorted.join(ch_bam_sorted_bai), // ch_fasta_meta, @@ -163,8 +178,38 @@ workflow SAMMYSEQ { // ) // ch_versions = ch_versions.mix(SAMTOOLS_VIEW_FILTER.out.versions) - ch_bam_from_markduplicates = BAM_MARKDUPLICATES_PICARD.bam + //ch_bam_from_markduplicates = BAM_MARKDUPLICATES_PICARD.bam + //BAM_MARKDUPLICATES_PICARD.out.bam.view() + //BAM_MARKDUPLICATES_PICARD.out.bai.view() + + //ch_bam_bai_combined = BAM_MARKDUPLICATES_PICARD.out.bam.join(BAM_MARKDUPLICATES_PICARD.out.bai, by: [0]) + + ch_bam_bai_combined = BAM_MARKDUPLICATES_PICARD.out.bam + .join(BAM_MARKDUPLICATES_PICARD.out.bai, by: [0], remainder: true) + .map { + meta, bam, bai -> + [ meta, bam, bai ] + + } + + //ch_bam_bai_combined.view() + + //ch_for_bamcoverage = ch_bam_bai_combined.map { meta1, bam, meta2, bai -> + // tuple(meta1, bam, bai) + //} + + //ch_for_bamcoverage.view() + + DEEPTOOLS_BAMCOVERAGE ( + ch_bam_bai_combined, + ch_fasta_meta.map { it[2] }, + SAMTOOLS_FAIDX.out.fai.collect() + ) + + if (params.stopAt == 'DEEPTOOLS_BAMCOVERAGE') { + return + } CUSTOM_DUMPSOFTWAREVERSIONS ( From 8ff8d478251a2a69324976505dc259c4179a5839 Mon Sep 17 00:00:00 2001 From: Lucio Di Filippo Date: Wed, 13 Sep 2023 10:39:53 +0200 Subject: [PATCH 05/14] deeptools coverage bugfix --- subworkflows/local/prepare_genome.nf | 15 ++++++++------- workflows/sammyseq.nf | 26 +++++++++++++++----------- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf index 678d970..03d6795 100644 --- a/subworkflows/local/prepare_genome.nf +++ b/subworkflows/local/prepare_genome.nf @@ -19,8 +19,9 @@ include { include { BWA_INDEX } from '../../modules/nf-core/bwa/index/main' workflow PREPARE_GENOME { - //take: - + take: + input_fasta + input_bwaindex main: @@ -30,11 +31,11 @@ workflow PREPARE_GENOME { // Uncompress genome fasta file if required // ch_fasta = Channel.empty() - if (params.fasta.endsWith('.gz')) { + if (input_fasta.endsWith('.gz')) { ch_fasta = GUNZIP_FASTA ( [ [:], params.fasta ] ).gunzip.map{ it[1] } ch_versions = ch_versions.mix(GUNZIP_FASTA.out.versions) } else { - ch_fasta = [ [:], file(params.fasta) ] + ch_fasta = [ [:], file(input_fasta) ] } //println(ch_fasta) @@ -56,11 +57,11 @@ workflow PREPARE_GENOME { ch_bwa_index = Channel.empty() if (params.bwa_index) { - if (params.bwa_index.endsWith('.tar.gz')) { - ch_bwa_index = UNTAR_BWA_INDEX ( [ [:], params.bwa_index ] ).untar.map{ it[1] } + if (input_bwaindex.endsWith('.tar.gz')) { + ch_bwa_index = UNTAR_BWA_INDEX ( [ [:], input_bwaindex ] ).untar.map{ it[1] } ch_versions = ch_versions.mix(UNTAR_BWA_INDEX.out.versions) } else { - ch_bwa_index = file(params.bwa_index) + ch_bwa_index = file(input_bwaindex) } } else { ch_bwa_index = BWA_INDEX ( ch_fasta ).index diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf index 1ed3eec..1052ecc 100644 --- a/workflows/sammyseq.nf +++ b/workflows/sammyseq.nf @@ -89,7 +89,9 @@ workflow SAMMYSEQ { // // SUBWORKFLOW: Read in samplesheet, validate and stage input files // - PREPARE_GENOME () + PREPARE_GENOME (params.fasta, + params.bwa_index) + ch_versions = ch_versions.mix(PREPARE_GENOME.out.versions) INPUT_CHECK ( @@ -141,7 +143,7 @@ workflow SAMMYSEQ { // TRIMMOMATIC.out.trimmed_reads, // PREPARE_GENOME.out.bwa_index //) - + FASTQ_ALIGN_BWAALN ( TRIMMOMATIC.out.trimmed_reads, PREPARE_GENOME.out.bwa_index @@ -194,17 +196,19 @@ workflow SAMMYSEQ { } //ch_bam_bai_combined.view() - - //ch_for_bamcoverage = ch_bam_bai_combined.map { meta1, bam, meta2, bai -> - // tuple(meta1, bam, bai) - //} - - //ch_for_bamcoverage.view() + //ch_fasta_meta.map { it[2] }.view + //ch_fasta_meta.collect().view() + //SAMTOOLS_FAIDX.out.fai.collect().view() + //SAMTOOLS_FAIDX.out.fai.map { it['path'] }.collect().view() + ch_fai_path = SAMTOOLS_FAIDX.out.fai.map { it[1] } + //ch_fai_path.view() + ch_fasta_path = ch_fasta_meta.map { it[1] } + //ch_fasta_path.view() DEEPTOOLS_BAMCOVERAGE ( - ch_bam_bai_combined, - ch_fasta_meta.map { it[2] }, - SAMTOOLS_FAIDX.out.fai.collect() + ch_bam_bai_combined, + ch_fasta_path, + ch_fai_path ) if (params.stopAt == 'DEEPTOOLS_BAMCOVERAGE') { From 3bcb7221e5a9154edbe61341d9cafe4ec6dc780b Mon Sep 17 00:00:00 2001 From: Lucio Di Filippo Date: Thu, 21 Sep 2023 16:42:56 +0200 Subject: [PATCH 06/14] docker loading --- modules/local/rtwosamplesmle/main.nf | 86 +++++++++++++++++++ .../templates/two_samples_mle.R | 45 ++++++++++ workflows/sammyseq.nf | 13 +++ 3 files changed, 144 insertions(+) create mode 100644 modules/local/rtwosamplesmle/main.nf create mode 100644 modules/local/rtwosamplesmle/templates/two_samples_mle.R diff --git a/modules/local/rtwosamplesmle/main.nf b/modules/local/rtwosamplesmle/main.nf new file mode 100644 index 0000000..495df66 --- /dev/null +++ b/modules/local/rtwosamplesmle/main.nf @@ -0,0 +1,86 @@ +// TODO nf-core: If in doubt look at other nf-core/modules to see how we are doing things! :) +// https://github.com/nf-core/modules/tree/master/modules/nf-core/ +// You can also ask for help via your pull request or on the #modules channel on the nf-core Slack workspace: +// https://nf-co.re/join +// TODO nf-core: A module file SHOULD only define input and output files as command-line parameters. +// All other parameters MUST be provided using the "task.ext" directive, see here: +// https://www.nextflow.io/docs/latest/process.html#ext +// where "task.ext" is a string. +// Any parameters that need to be evaluated in the context of a particular sample +// e.g. single-end/paired-end data MUST also be defined and evaluated appropriately. +// TODO nf-core: Software that can be piped together SHOULD be added to separate module files +// unless there is a run-time, storage advantage in implementing in this way +// e.g. it's ok to have a single module for bwa to output BAM instead of SAM: +// bwa mem | samtools view -B -T ref.fasta +// TODO nf-core: Optional inputs are not currently supported by Nextflow. However, using an empty +// list (`[]`) instead of a file can be used to work around this issue. + +process RTWOSAMPLESMLE { + //tag "$meta.id" + label 'process_medium' + + // TODO nf-core: List required Conda package(s). + // Software MUST be pinned to channel (i.e. "bioconda"), version (i.e. "1.10"). + // For Conda, the build (i.e. "h9402c20_2") must be EXCLUDED to support installation on different operating systems. + // TODO nf-core: See section in main README for further information regarding finding and adding container addresses to the section below. + + //conda "YOUR-TOOL-HERE" + + //container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + // 'https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE': + // 'biocontainers/YOUR-TOOL-HERE' }" + container 'lucidif/r_two_samples_mle:latest' + + + //input: + // TODO nf-core: Where applicable all sample-specific information e.g. "id", "single_end", "read_group" + // MUST be provided as an input via a Groovy Map called "meta". + // This information may not be required in some instances e.g. indexing reference genome files: + // https://github.com/nf-core/modules/blob/master/modules/nf-core/bwa/index/main.nf + // TODO nf-core: Where applicable please provide/convert compressed files as input/output + // e.g. "*.fastq.gz" and NOT "*.fastq", "*.bam" and NOT "*.sam" etc. + //tuple val(meta), path(bam) + + //output: + // TODO nf-core: Named file extensions MUST be emitted for ALL output channels + //tuple val(meta), path("*.bam"), emit: bam + // TODO nf-core: List additional required output channels/values here + //path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + + //def args = task.ext.args ?: '' + //def prefix = task.ext.prefix ?: "${meta.id}" + + // TODO nf-core: Where possible, a command MUST be provided to obtain the version number of the software e.g. 1.10 + // If the software is unable to output a version number on the command-line then it can be manually specified + // e.g. https://github.com/nf-core/modules/blob/master/modules/nf-core/homer/annotatepeaks/main.nf + // Each software used MUST provide the software name and version number in the YAML version file (versions.yml) + // TODO nf-core: It MUST be possible to pass additional parameters to the tool as a command-line string via the "task.ext.args" directive + // TODO nf-core: If the tool supports multi-threading then you MUST provide the appropriate parameter + // using the Nextflow "task" variable e.g. "--threads $task.cpus" + // TODO nf-core: Please replace the example samtools command below with your module's command + // TODO nf-core: Please indent the command appropriately (4 spaces!!) to help with readability ;) + template 'two_samples_mle.R' + + //stub: + + //def args = task.ext.args ?: '' + //def prefix = task.ext.prefix ?: "${meta.id}" + + // TODO nf-core: A stub section should mimic the execution of the original module as best as possible + // Have a look at the following examples: + // Simple example: https://github.com/nf-core/modules/blob/818474a292b4860ae8ff88e149fbcda68814114d/modules/nf-core/bcftools/annotate/main.nf#L47-L63 + // Complex example: https://github.com/nf-core/modules/blob/818474a292b4860ae8ff88e149fbcda68814114d/modules/nf-core/bedtools/split/main.nf#L38-L54 + // """ + // touch ${prefix}.bam + + // cat <<-END_VERSIONS > versions.yml + // "${task.process}": + // : \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )) + // END_VERSIONS + // """ +} diff --git a/modules/local/rtwosamplesmle/templates/two_samples_mle.R b/modules/local/rtwosamplesmle/templates/two_samples_mle.R new file mode 100644 index 0000000..e226cc0 --- /dev/null +++ b/modules/local/rtwosamplesmle/templates/two_samples_mle.R @@ -0,0 +1,45 @@ +#!/usr/bin/env Rscript + +# usage: +# Rscript /path/to/make_twosample_mle_spp.R sample_bam control_bam chromsizes_file mle_output_file" + +## LIBRARIES + +require(Rcpp) +require(data.table) +require(spp) +require(rtracklayer) + +print("Hello World") + +################################################ +################################################ +## R SESSION INFO ## +################################################ +################################################ + +##sink(paste(output_prefix, "R_sessionInfo.log", sep = '.')) +##print(sessionInfo()) +##sink() + +################################################ +################################################ +## VERSIONS FILE ## +################################################ +################################################ + +# r.version <- strsplit(version[['version.string']], ' ')[[1]][3] +# deseq2.version <- as.character(packageVersion('DESeq2')) + +# writeLines( +# c( +# '"${task.process}":', +# paste(' r-base:', r.version), +# paste(' bioconductor-deseq2:', deseq2.version) +# ), +# 'versions.yml') + +################################################ +################################################ +################################################ +################################################ \ No newline at end of file diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf index 1052ecc..ff59892 100644 --- a/workflows/sammyseq.nf +++ b/workflows/sammyseq.nf @@ -50,6 +50,7 @@ include { INPUT_CHECK } from '../subworkflows/local/input_check' include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome' include { FASTQ_ALIGN_BWAALN } from '../subworkflows/nf-core/fastq_align_bwaaln/main.nf' include { BAM_MARKDUPLICATES_PICARD } from '../subworkflows/nf-core/bam_markduplicates_picard' +include { RTWOSAMPLESMLE } from '../modules/local/rtwosamplesmle' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -89,11 +90,23 @@ workflow SAMMYSEQ { // // SUBWORKFLOW: Read in samplesheet, validate and stage input files // + + RTWOSAMPLESMLE() + + if (params.stopAt == 'BEGIN') { + return + } + + PREPARE_GENOME (params.fasta, params.bwa_index) ch_versions = ch_versions.mix(PREPARE_GENOME.out.versions) + if (params.stopAt == 'PREPARE_GENOME') { + return + } + INPUT_CHECK ( file(params.input) ) From ff1bd86ac96c4615fa5a08302d15c5b1825d70e4 Mon Sep 17 00:00:00 2001 From: Lucio Di Filippo Date: Mon, 25 Sep 2023 17:35:05 +0200 Subject: [PATCH 07/14] subworkflow faidx (incomplete) --- modules/local/chromosomes_size.nf | 16 ++++++++++++ subworkflows/local/faidx_cut_chr_size.nf | 32 ++++++++++++++++++++++++ subworkflows/local/prepare_genome.nf | 5 ++++ workflows/sammyseq.nf | 7 +++--- 4 files changed, 57 insertions(+), 3 deletions(-) create mode 100644 modules/local/chromosomes_size.nf create mode 100644 subworkflows/local/faidx_cut_chr_size.nf diff --git a/modules/local/chromosomes_size.nf b/modules/local/chromosomes_size.nf new file mode 100644 index 0000000..6c4151e --- /dev/null +++ b/modules/local/chromosomes_size.nf @@ -0,0 +1,16 @@ +process CUT_SIZES_GENOME { + tag "${sample_id}" + publishDir "${params.outdir}/genome", mode: 'copy' + + input: + tuple val(meta), path(fai) + + output: + path("sizes.genome") , emit: ch_sizes_genome + + script: + """ + cut -f1,2 ${fai} > sizes.genome + """ +} + diff --git a/subworkflows/local/faidx_cut_chr_size.nf b/subworkflows/local/faidx_cut_chr_size.nf new file mode 100644 index 0000000..22b1038 --- /dev/null +++ b/subworkflows/local/faidx_cut_chr_size.nf @@ -0,0 +1,32 @@ +// Import SAMTOOLS_FAIDX module +include { SAMTOOLS_FAIDX } from '../modules/nf-core/samtools/faidx' + +// Definition of the CUT_SIZES_GENOME process +process CUT_SIZES_GENOME { + tag "${sample_id}" + publishDir "${params.outdir}/genome", mode: 'copy' + + input: + tuple val(meta), path(fai) + + output: + path("sizes.genome") , emit: ch_sizes_genome + + script: + """ + cut -f1,2 ${fai} > sizes.genome + """ +} + +// Definition of the subworkflow +workflow FAIDX_SUBWORKFLOW { + take: + path fasta_file + + main: + SAMTOOLS_FAIDX(fasta_file) + CUT_SIZES_GENOME(SAMTOOLS_FAIDX.out.fai) + + emit: + path("sizes.genome"), from: CUT_SIZES_GENOME.out.ch_sizes_genome +} \ No newline at end of file diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf index 03d6795..bc58e65 100644 --- a/subworkflows/local/prepare_genome.nf +++ b/subworkflows/local/prepare_genome.nf @@ -17,6 +17,7 @@ include { //include { GFFREAD } from '../../modules/nf-core/gffread/main' include { BWA_INDEX } from '../../modules/nf-core/bwa/index/main' +//include { SAMTOOLS_FAIDX } from '../../modules/nf-core/samtools/faidx' workflow PREPARE_GENOME { take: @@ -69,6 +70,10 @@ workflow PREPARE_GENOME { } + //make chromosome size index + + + // // Uncompress Bowtie2 index or generate from scratch if required // diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf index ff59892..f6a38d8 100644 --- a/workflows/sammyseq.nf +++ b/workflows/sammyseq.nf @@ -50,6 +50,7 @@ include { INPUT_CHECK } from '../subworkflows/local/input_check' include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome' include { FASTQ_ALIGN_BWAALN } from '../subworkflows/nf-core/fastq_align_bwaaln/main.nf' include { BAM_MARKDUPLICATES_PICARD } from '../subworkflows/nf-core/bam_markduplicates_picard' +include { CUT_SIZES_GENOME } from "../modules/local/chromosomes_size" include { RTWOSAMPLESMLE } from '../modules/local/rtwosamplesmle' /* @@ -91,8 +92,6 @@ workflow SAMMYSEQ { // SUBWORKFLOW: Read in samplesheet, validate and stage input files // - RTWOSAMPLESMLE() - if (params.stopAt == 'BEGIN') { return } @@ -174,7 +173,9 @@ workflow SAMMYSEQ { [[], []] ) - + ch_fai_for_cut = SAMTOOLS_FAIDX.out.fai.collect() + + CUT_SIZES_GENOME(ch_fai_for_cut) // MARK DUPLICATES IN BAM FILE BAM_MARKDUPLICATES_PICARD ( From 117c237efcd94181d4b965037b2d281380dc5e14 Mon Sep 17 00:00:00 2001 From: Lucio Di Filippo Date: Wed, 27 Sep 2023 19:37:02 +0200 Subject: [PATCH 08/14] comparison test (not work with multiple comparison with same sample) --- modules/local/rtwosamplesmle/main.nf | 12 ++- .../templates/two_samples_mle.R | 32 +++++++- nextflow.config | 2 + nextflow_schema.json | 8 ++ workflows/sammyseq.nf | 75 ++++++++++++++++++- 5 files changed, 120 insertions(+), 9 deletions(-) diff --git a/modules/local/rtwosamplesmle/main.nf b/modules/local/rtwosamplesmle/main.nf index 495df66..996e72a 100644 --- a/modules/local/rtwosamplesmle/main.nf +++ b/modules/local/rtwosamplesmle/main.nf @@ -29,18 +29,22 @@ process RTWOSAMPLESMLE { //container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? // 'https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE': // 'biocontainers/YOUR-TOOL-HERE' }" - container 'lucidif/r_two_samples_mle:latest' + //docker pull lucidif/r_two_samples_mle:0.0.1 + //docker image tag 96ceba68eb6d quay.io/lucidif/r_two_samples_mle:0.0.1 + container 'lucidif/r_two_samples_mle:0.0.1' - //input: + + input: // TODO nf-core: Where applicable all sample-specific information e.g. "id", "single_end", "read_group" // MUST be provided as an input via a Groovy Map called "meta". // This information may not be required in some instances e.g. indexing reference genome files: // https://github.com/nf-core/modules/blob/master/modules/nf-core/bwa/index/main.nf // TODO nf-core: Where applicable please provide/convert compressed files as input/output // e.g. "*.fastq.gz" and NOT "*.fastq", "*.bam" and NOT "*.sam" etc. - //tuple val(meta), path(bam) - + tuple val(meta1), val(meta2), path(bam1), path(bam2) + path chromsizes_file + //output: // TODO nf-core: Named file extensions MUST be emitted for ALL output channels //tuple val(meta), path("*.bam"), emit: bam diff --git a/modules/local/rtwosamplesmle/templates/two_samples_mle.R b/modules/local/rtwosamplesmle/templates/two_samples_mle.R index e226cc0..c9dc805 100644 --- a/modules/local/rtwosamplesmle/templates/two_samples_mle.R +++ b/modules/local/rtwosamplesmle/templates/two_samples_mle.R @@ -10,7 +10,37 @@ require(data.table) require(spp) require(rtracklayer) -print("Hello World") +################################################ +## PARAMS +################################################ + +# ip_file <- args[1] +# input_file <- args[2] +# chromsizes_file <- args[3] +# mle_output_file <- args[4] + +ip_file <- "${bam1}" +input_file <- "${bam2}" +chromsizes_file <- "${chromsizes_file}" +mle_output_file <- "${bam1.baseName}VS${bam2.baseName}_mle.txt" + +################################################ +# DEFAULT PARAMETERS +################################################ + +remove_anomalies <- FALSE +debug_mode <- TRUE + +################################################ +# CORE PROCESSES +################################################ + +if (debug_mode){ + print(c("ip_file=",ip_file)) + print(c("input_file=",input_file)) + print(c("chromsizes_file=",chromsizes_file)) + print(c("mle_output_file",mle_output_file)) +} ################################################ ################################################ diff --git a/nextflow.config b/nextflow.config index cb04645..06ae7c3 100644 --- a/nextflow.config +++ b/nextflow.config @@ -14,6 +14,8 @@ params { // TODO nf-core: Specify your pipeline's command line flags // Input options input = null + comparisonFile = null + // References genome = null igenomes_base = 's3://ngi-igenomes/igenomes' diff --git a/nextflow_schema.json b/nextflow_schema.json index 9352576..e833cf5 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -44,6 +44,14 @@ "type": "string", "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", "fa_icon": "fas fa-file-signature" + }, + "comparisonFile": { + "type" : "string", + "format": "file-path", + "description": "Path to comma-separated file containing information about samples comparisons", + "mimetype": "text/csv", + "help_text": "You will need to create a comparisons file with information about the comparisons in your experiment before running the pipeline. Use this parameter to specify the necessary comparisons. It has to be a comma-separated file with 2 columns, one for the first sample to compair and the second one to sample against which to do it." + } } }, diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf index f6a38d8..5dec07d 100644 --- a/workflows/sammyseq.nf +++ b/workflows/sammyseq.nf @@ -109,6 +109,9 @@ workflow SAMMYSEQ { INPUT_CHECK ( file(params.input) ) + + //INPUT_CHECK.out.reads.view() + ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) // TODO: OPTIONAL, you can use nf-validation plugin to create an input channel from the samplesheet with Channel.fromSamplesheet("input") // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/ @@ -137,6 +140,9 @@ workflow SAMMYSEQ { // // MODULE: Run FastQC // + //merged_reads.view() + + FASTQC ( merged_reads ) @@ -151,10 +157,7 @@ workflow SAMMYSEQ { } - //BWA_ALN ( - // TRIMMOMATIC.out.trimmed_reads, - // PREPARE_GENOME.out.bwa_index - //) + //TRIMMOMATIC.out.trimmed_reads.view() FASTQ_ALIGN_BWAALN ( TRIMMOMATIC.out.trimmed_reads, @@ -177,6 +180,8 @@ workflow SAMMYSEQ { CUT_SIZES_GENOME(ch_fai_for_cut) + //FASTQ_ALIGN_BWAALN.out.bam.view() + // MARK DUPLICATES IN BAM FILE BAM_MARKDUPLICATES_PICARD ( FASTQ_ALIGN_BWAALN.out.bam, @@ -229,6 +234,68 @@ workflow SAMMYSEQ { return } + if (params.comparisonFile) { + + //BAM_MARKDUPLICATES_PICARD.out.bam.view() + + // Declare the dictionary that will store the comparisons + def comparisons = [:] + + // Read the CSV file + csvFile = file(params.comparisonFile) + csvFile.eachLine { line -> + // Skip the first line (header) + if (line.startsWith("sample1")) { + return + } + + // Extract the samples and add the "_T1" suffix + def (sample1, sample2) = line.split(',') + comparisons[sample1.trim() + "_T1"] = sample2.trim() + "_T1" + println("Comparisons Map: $comparisons") + } + + //println("Comparisons Map: $comparisons") // Print the comparison map + + // Filter BAMs using the updated comparisons map + + + ch_filtered_bams = BAM_MARKDUPLICATES_PICARD.out.bam + .filter { meta, bam -> + boolean isPresent = comparisons.keySet().contains(meta.id) || comparisons.values().contains(meta.id) + if (!isPresent) { + //println("BAM not in comparisons list: $bam") // Print BAMs not in the comparison list + } + return isPresent + } + + // Print filtered BAMs + //ch_filtered_bams.view() + + // Create BAM pairs for comparison + + ch_filtered_bams + .groupTuple(by: [0]) // Group by the metadata + .map { meta, bams -> + def comparisonValue = comparisons[meta.id] + println("Comparison Value for ${meta.id}: $comparisonValue") + } + .filter { it != null } // Remove any null values from the channel + .set { ch_bam_pairs_for_comparison } + println("Before view") + + ch_bam_pairs_for_comparison.count().view() + + ch_bam_pairs_for_comparison.view() + + println("after view") + + if (params.stopAt == 'RTWOSAMPLESMLE') { + return + } + } + + CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') From 03d69fb3066ea7c988db64be5315f84382ecce48 Mon Sep 17 00:00:00 2001 From: Lucio Di Filippo Date: Fri, 29 Sep 2023 11:13:02 +0200 Subject: [PATCH 09/14] second test for comparison --- workflows/sammyseq.nf | 172 ++++++++++++++++++++++++++++++++---------- 1 file changed, 134 insertions(+), 38 deletions(-) diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf index 5dec07d..4bb536e 100644 --- a/workflows/sammyseq.nf +++ b/workflows/sammyseq.nf @@ -189,6 +189,12 @@ workflow SAMMYSEQ { SAMTOOLS_FAIDX.out.fai.collect() ) + + //BAM_MARKDUPLICATES_PICARD.out.bam.view() + + ch_mle_in = BAM_MARKDUPLICATES_PICARD.out.bam + //ch_mle_in.view() + if (params.stopAt == 'BAM_MARKDUPLICATES_PICARD') { return } @@ -206,7 +212,7 @@ workflow SAMMYSEQ { //ch_bam_bai_combined = BAM_MARKDUPLICATES_PICARD.out.bam.join(BAM_MARKDUPLICATES_PICARD.out.bai, by: [0]) - ch_bam_bai_combined = BAM_MARKDUPLICATES_PICARD.out.bam + ch_bam_bai_combined = BAM_MARKDUPLICATES_PICARD.out.bam .join(BAM_MARKDUPLICATES_PICARD.out.bai, by: [0], remainder: true) .map { meta, bam, bai -> @@ -219,6 +225,12 @@ workflow SAMMYSEQ { //ch_fasta_meta.collect().view() //SAMTOOLS_FAIDX.out.fai.collect().view() //SAMTOOLS_FAIDX.out.fai.map { it['path'] }.collect().view() + // println("first BAM_MARKDUPLICATES_PICARD.out.bam.view()") + // BAM_MARKDUPLICATES_PICARD.out.bam.view() + // ch_bam_bai_combined.view() + // println("first END BAM_MARKDUPLICATES_PICARD.out.bam.view()") + + ch_fai_path = SAMTOOLS_FAIDX.out.fai.map { it[1] } //ch_fai_path.view() ch_fasta_path = ch_fasta_meta.map { it[1] } @@ -235,60 +247,144 @@ workflow SAMMYSEQ { } if (params.comparisonFile) { - - //BAM_MARKDUPLICATES_PICARD.out.bam.view() - - // Declare the dictionary that will store the comparisons + // Add the suffix "_T1" to each sample ID in the comparison file def comparisons = [:] - - // Read the CSV file + def isFirstLine = true csvFile = file(params.comparisonFile) csvFile.eachLine { line -> - // Skip the first line (header) - if (line.startsWith("sample1")) { + if (isFirstLine) { + isFirstLine = false return } - - // Extract the samples and add the "_T1" suffix - def (sample1, sample2) = line.split(',') - comparisons[sample1.trim() + "_T1"] = sample2.trim() + "_T1" - println("Comparisons Map: $comparisons") + def (sample1, sample2) = line.split(',') + comparisons[sample1.trim() + "_T1"] = sample2.trim() + "_T1" } - //println("Comparisons Map: $comparisons") // Print the comparison map - // Filter BAMs using the updated comparisons map + // ch_filtered_bams = ch_mle_in.filter { meta, bam -> + // boolean isInKeys = comparisons.keySet().contains(meta.id) + // boolean isInValues = comparisons.values().contains(meta.id) + + // boolean isPresent = isInKeys || isInValues + + // //println("Is ${meta.id} present? $isPresent") + + // return isPresent + // } + +def comparison_channels = [:] + + + // comparisons.each { sample1, sample2 -> + // def ch_name = "${sample1}_VS_${sample2}" + // comparison_channels[ch_name] = ch_mle_in.filter { meta, bam -> + // println("meta.id = $meta.id") + // println("sample1 = $sample1") + // println("sample2 = $sample2") + // meta.id == sample1 || meta.id == sample2 + // } + // } + + // comparison_channels.each { comparison, channel -> + // println("Checking channel for: $comparison") + // channel.view() + // } + + // comparison_channels.each { comparison, channel -> + // channel + // .map { ... } // Qualsiasi operazione desideri eseguire + // .set { ... } // Definisci la variabile di output per ulteriori fasi + // } + + + //ch_mle_in.map{meta, bam -> return tuple(meta.id, bam)},set{test_ch} + + //ch_filtered_bams.flatten().view() + + //ch_filtered_bams.toList().transpose().set { all_filtered_bams } + + //all_filtered_bams.view() + + // 1. Create a Comparisons Channel + Channel + .fromPath(params.comparisonFile) + .splitCsv(header: true, sep: ',') + .map { row -> [row.sample1 + "_T1", row.sample2 + "_T1"] } + .set { comparisons_ch } + +// comparisons_ch.map { sample1, sample2 -> +// def filtered_channel = ch_mle_in.filter { meta, bam -> +// meta.id == sample1 || meta.id == sample2 +// }.collect() +// return [ "${sample1}_VS_${sample2}", filtered_channel ] +// }.set { mapped_comparison_channels } + +comparisons_ch.map { sample1, sample2 -> + def ch_name = "${sample1}_VS_${sample2}" + def filtered_channel = ch_mle_in.filter { meta, bam -> + meta.id == sample1 || meta.id == sample2 + }.toList() + [ ch_name, filtered_channel ] +}.set { mapped_comparison_channels } + +mapped_comparison_channels.view() - ch_filtered_bams = BAM_MARKDUPLICATES_PICARD.out.bam - .filter { meta, bam -> - boolean isPresent = comparisons.keySet().contains(meta.id) || comparisons.values().contains(meta.id) - if (!isPresent) { - //println("BAM not in comparisons list: $bam") // Print BAMs not in the comparison list - } - return isPresent - } - // Print filtered BAMs + + //BAM_MARKDUPLICATES_PICARD.out.bam.view() //ch_filtered_bams.view() + //all_filtered_bams.view() - // Create BAM pairs for comparison + //test_ch=BAM_MARKDUPLICATES_PICARD.out.bam - ch_filtered_bams - .groupTuple(by: [0]) // Group by the metadata - .map { meta, bams -> - def comparisonValue = comparisons[meta.id] - println("Comparison Value for ${meta.id}: $comparisonValue") - } - .filter { it != null } // Remove any null values from the channel - .set { ch_bam_pairs_for_comparison } - println("Before view") - ch_bam_pairs_for_comparison.count().view() + //simplified_ch = channel.fromList ( all_filtered_bams ) + // simplified_ch = all_filtered_bams.map { it -> + + // meta=it + // path=it + // println("meta: $meta") + // println("path: $path") + // //println("bamfile: $bamfile") + // return tuple(meta, path) + // } + // simplified_ch.view() + + //comparisons_ch.view() + + //all_filtered_bams.join(comparisons_ch).view() + + + // Creiamo una mappa da 'id' a 'path' +// def filterBamsByComparison(metaList, pathList, comparison) { +// def ids = comparison.collect() +// def selectedMetas = metaList.findAll { it['id'] in ids } +// def selectedPaths = pathList.findAll { path -> +// def matchingMeta = metaList.find { it['id'] in ids } +// path.contains(matchingMeta['id']) +// } +// return tuple(selectedMetas, selectedPaths) +// } + +// Channel +// .combine(all_filtered_bams, comparisons_ch) +// .map { allMeta, allPath, comparison -> +// return filterBamsByComparison(allMeta, allPath, comparison) +// } set {iltered_bams_ch} + +// filtered_bams_ch.view() // Solo per debug + + + + + //simplified_ch.view() + + + - ch_bam_pairs_for_comparison.view() - println("after view") + //all_filtered_bams.view() if (params.stopAt == 'RTWOSAMPLESMLE') { return From f01de83e2d53d53f737fdbd66c1334644a133116 Mon Sep 17 00:00:00 2001 From: Lucio Di Filippo Date: Tue, 3 Oct 2023 09:40:34 +0200 Subject: [PATCH 10/14] make comparison channel --- modules/local/rtwosamplesmle/main.nf | 14 +- workflows/sammyseq.nf | 203 +++++++++++++++------------ 2 files changed, 129 insertions(+), 88 deletions(-) diff --git a/modules/local/rtwosamplesmle/main.nf b/modules/local/rtwosamplesmle/main.nf index 996e72a..fe691e0 100644 --- a/modules/local/rtwosamplesmle/main.nf +++ b/modules/local/rtwosamplesmle/main.nf @@ -31,7 +31,7 @@ process RTWOSAMPLESMLE { // 'biocontainers/YOUR-TOOL-HERE' }" //docker pull lucidif/r_two_samples_mle:0.0.1 - //docker image tag 96ceba68eb6d quay.io/lucidif/r_two_samples_mle:0.0.1 + //docker image tag ae028b17f7d3 quay.io/lucidif/r_two_samples_mle:0.0.1 container 'lucidif/r_two_samples_mle:0.0.1' @@ -42,9 +42,11 @@ process RTWOSAMPLESMLE { // https://github.com/nf-core/modules/blob/master/modules/nf-core/bwa/index/main.nf // TODO nf-core: Where applicable please provide/convert compressed files as input/output // e.g. "*.fastq.gz" and NOT "*.fastq", "*.bam" and NOT "*.sam" etc. - tuple val(meta1), val(meta2), path(bam1), path(bam2) + //tuple val(meta1), val(meta2), path(bam1), path(bam2) + tuple val(meta), path(bam1), path(bam2) path chromsizes_file + //output: // TODO nf-core: Named file extensions MUST be emitted for ALL output channels //tuple val(meta), path("*.bam"), emit: bam @@ -68,6 +70,14 @@ process RTWOSAMPLESMLE { // using the Nextflow "task" variable e.g. "--threads $task.cpus" // TODO nf-core: Please replace the example samtools command below with your module's command // TODO nf-core: Please indent the command appropriately (4 spaces!!) to help with readability ;) + + + + // bam_list_ch + // .map { it[0].collate(2) } // Raggruppa ogni due elementi + // .set { paired_bam_list_ch } + + template 'two_samples_mle.R' //stub: diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf index 4bb536e..5aca99c 100644 --- a/workflows/sammyseq.nf +++ b/workflows/sammyseq.nf @@ -179,6 +179,7 @@ workflow SAMMYSEQ { ch_fai_for_cut = SAMTOOLS_FAIDX.out.fai.collect() CUT_SIZES_GENOME(ch_fai_for_cut) + //CUT_SIZES_GENOME.out.ch_sizes_genome.view() //FASTQ_ALIGN_BWAALN.out.bam.view() @@ -261,130 +262,160 @@ workflow SAMMYSEQ { } - // ch_filtered_bams = ch_mle_in.filter { meta, bam -> - // boolean isInKeys = comparisons.keySet().contains(meta.id) - // boolean isInValues = comparisons.values().contains(meta.id) - - // boolean isPresent = isInKeys || isInValues - - // //println("Is ${meta.id} present? $isPresent") + // 1. Create a Comparisons Channel - // return isPresent - // } + Channel + .fromPath(params.comparisonFile) + .splitCsv(header: true, sep: ',') + .map { row -> [row.sample1 + "_T1", row.sample2 + "_T1"] } + .set { comparisons_ch } -def comparison_channels = [:] - // comparisons.each { sample1, sample2 -> - // def ch_name = "${sample1}_VS_${sample2}" - // comparison_channels[ch_name] = ch_mle_in.filter { meta, bam -> - // println("meta.id = $meta.id") - // println("sample1 = $sample1") - // println("sample2 = $sample2") - // meta.id == sample1 || meta.id == sample2 - // } - // } + // ch_mle_in.collect { meta, bam -> [meta.id, bam] } + // .toList() + // .set { bam_list_ch } - // comparison_channels.each { comparison, channel -> - // println("Checking channel for: $comparison") - // channel.view() - // } + //bam_list_ch.view() - // comparison_channels.each { comparison, channel -> - // channel - // .map { ... } // Qualsiasi operazione desideri eseguire - // .set { ... } // Definisci la variabile di output per ulteriori fasi - // } + // bam_list_ch + // .map { it[0].collate(2) } // Raggruppa ogni due elementi + // .set { paired_bam_list_ch } - //ch_mle_in.map{meta, bam -> return tuple(meta.id, bam)},set{test_ch} + //paired_bam_list_ch.view() - //ch_filtered_bams.flatten().view() + // paired_bam_list_ch + // .transpose() + // .set { bam_pairs_ch } - //ch_filtered_bams.toList().transpose().set { all_filtered_bams } + // bam_pairs_ch.merge().view() - //all_filtered_bams.view() +ch_bam_input=BAM_MARKDUPLICATES_PICARD.out.bam//.view{"get class : ${it.getClass()}"} +//comparisons_ch.view() +//CUT_SIZES_GENOME.out.ch_sizes_genome.view{"get class : ${it.getClass()}"} - // 1. Create a Comparisons Channel - Channel - .fromPath(params.comparisonFile) - .splitCsv(header: true, sep: ',') - .map { row -> [row.sample1 + "_T1", row.sample2 + "_T1"] } - .set { comparisons_ch } -// comparisons_ch.map { sample1, sample2 -> -// def filtered_channel = ch_mle_in.filter { meta, bam -> -// meta.id == sample1 || meta.id == sample2 -// }.collect() -// return [ "${sample1}_VS_${sample2}", filtered_channel ] -// }.set { mapped_comparison_channels } +Channel.fromPath(params.comparisonFile) + .splitCsv(header : true) + .map{ row -> + //sample1 = [sample1:row.sample1 + "_T1"] + //sample2 = [sample2:row.sample2 + "_T1"] + [ row.sample1 + "_T1", row.sample2 + "_T1",row.sample1 + "_T1_VS_" + row.sample2 + "_T1"] + } + .set { comparisons_ch_s1 } + //.view() + +Channel.fromPath(params.comparisonFile) + .splitCsv(header : true) + .map{ row -> + //sample1 = [sample1:row.sample1 + "_T1"] + //sample2 = [sample2:row.sample2 + "_T1"] + [ row.sample2 + "_T1", row.sample1 + "_T1",row.sample1 + "_T1_VS_" + row.sample2 + "_T1"] + } + .set { comparisons_ch_s2 } -comparisons_ch.map { sample1, sample2 -> - def ch_name = "${sample1}_VS_${sample2}" - def filtered_channel = ch_mle_in.filter { meta, bam -> - meta.id == sample1 || meta.id == sample2 - }.toList() - [ ch_name, filtered_channel ] -}.set { mapped_comparison_channels } -mapped_comparison_channels.view() +//Channel.fromList(ch_bam_input).view() + // ch_bam_input.collect { meta, bam -> [meta.id, bam] } + // .toList() + // .set { ch_bam_fix } - //BAM_MARKDUPLICATES_PICARD.out.bam.view() - //ch_filtered_bams.view() - //all_filtered_bams.view() - //test_ch=BAM_MARKDUPLICATES_PICARD.out.bam + +ch_bam_input + .map {meta, bam -> + id=meta.subMap('id') + [id.id, bam] + + } + //.view{"ch_bam+convert: ${it}"} + .set { ch_bam_reformat } + + +//ch_bam_test +//ch_bam_reformat.combine(comparisons_ch_s1, by:0).view{ "combine: ${it}" } + + +// ch_bam_reformat +// .toList() +// .flatten() +// .set { fix_ch_bam } + - //simplified_ch = channel.fromList ( all_filtered_bams ) - // simplified_ch = all_filtered_bams.map { it -> +//comparisons_ch.view{"comparisons_ch= ${it}"} +//ch_bam_reformat.view{"ch_bam_reformat= ${it}"} +//ch_bam_input.view{"ch_bam_input= ${it}"} - // meta=it - // path=it - // println("meta: $meta") - // println("path: $path") - // //println("bamfile: $bamfile") - // return tuple(meta, path) - // } - // simplified_ch.view() - //comparisons_ch.view() +comparisons_ch_s1.combine(ch_bam_reformat , by:0) + .map { sample1, sample2, comparison, bam -> + //[ comparison:comparison, sample1:sample1, sample2:sample2, bam1:bam1] + [ comparison, bam] + }.set { bam1_comparison } - //all_filtered_bams.join(comparisons_ch).view() + //.view{"join= ${it}"} +comparisons_ch_s2.combine(ch_bam_reformat, by:0) + .map{ sample2, sample1, comparison, bam -> + //[ comparison:comparison , sample1:sample1, sample2:sample2 ,bam2:bam2 ] + [ comparison, bam] + } + .set{ bam2_comparison } +bam1_comparison.join(bam2_comparison, remainder: false, by: 0 ).set{comparisons_merge_ch} - // Creiamo una mappa da 'id' a 'path' -// def filterBamsByComparison(metaList, pathList, comparison) { -// def ids = comparison.collect() -// def selectedMetas = metaList.findAll { it['id'] in ids } -// def selectedPaths = pathList.findAll { path -> -// def matchingMeta = metaList.find { it['id'] in ids } -// path.contains(matchingMeta['id']) -// } -// return tuple(selectedMetas, selectedPaths) -// } +//comparisons_ch_s1.view{ "comparisons_ch_s1: ${it}" } +//comparisons_ch_s2.view{ "comparisons_ch_s2: ${it}" } -// Channel -// .combine(all_filtered_bams, comparisons_ch) -// .map { allMeta, allPath, comparison -> -// return filterBamsByComparison(allMeta, allPath, comparison) -// } set {iltered_bams_ch} +//bam1_comparison.view{ "bam1_comparison: ${it}" } +//bam2_comparison.view{ "bam2_comparison: ${it}" } -// filtered_bams_ch.view() // Solo per debug +comparisons_merge_ch.view{ "comparisons_merge_ch: ${it}" } +//ch_bam_reformat.view{ "ch_bam_reformat: ${it}" } +//ch_bam_reformat.join(comparisons_ch, remainder:true ,by:[1]).view{"join= ${it}"} - //simplified_ch.view() +// canale1.flatten() +// .combine(canale2, by: 0) +// .groupTuple(by: [0,1]) +// .map { it.flatten() } +// .view() + + +//ch_bam_reformat.view() +//comparisons_ch.view() + +//comparisons_ch.cross( ch_bam_reformat ).view() + +// first_join = comparisons_ch +// .join(ch_bam_reformat, by: { it[0] }).view() +// //.map { it -> [it[0], it[2], it[3]] }.view() + + +// Channel +// .of( 'alpha,beta,gamma\n10,20,30\n70,80,90' ) +// .splitCsv(header: true) +// .view { row -> "${row.alpha} - ${row.beta} - ${row.gamma}" } +// comparisons_ch +// .flatMap { sample1, sample2 -> [ [sample1, sample2] ] } // perché hai bisogno di entrambe le combinazioni? +// //.join(ch_bam_input, by: { it[0] }) +// //.join(ch_bam_input, by: { it[1] }) +// .set { ch_final_input } +// ch_final_input.view() - //all_filtered_bams.view() + RTWOSAMPLESMLE (comparisons_merge_ch, + CUT_SIZES_GENOME.out.ch_sizes_genome + ) if (params.stopAt == 'RTWOSAMPLESMLE') { return From 99f4ad0405e8d30b74a12fbfd4dffc87af892aa0 Mon Sep 17 00:00:00 2001 From: Lucio Di Filippo Date: Tue, 3 Oct 2023 09:56:50 +0200 Subject: [PATCH 11/14] cleaning after comparison channel creation --- workflows/sammyseq.nf | 183 +++++++++--------------------------------- 1 file changed, 40 insertions(+), 143 deletions(-) diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf index 5aca99c..006a4ca 100644 --- a/workflows/sammyseq.nf +++ b/workflows/sammyseq.nf @@ -221,16 +221,6 @@ workflow SAMMYSEQ { } - //ch_bam_bai_combined.view() - //ch_fasta_meta.map { it[2] }.view - //ch_fasta_meta.collect().view() - //SAMTOOLS_FAIDX.out.fai.collect().view() - //SAMTOOLS_FAIDX.out.fai.map { it['path'] }.collect().view() - // println("first BAM_MARKDUPLICATES_PICARD.out.bam.view()") - // BAM_MARKDUPLICATES_PICARD.out.bam.view() - // ch_bam_bai_combined.view() - // println("first END BAM_MARKDUPLICATES_PICARD.out.bam.view()") - ch_fai_path = SAMTOOLS_FAIDX.out.fai.map { it[1] } //ch_fai_path.view() @@ -249,6 +239,9 @@ workflow SAMMYSEQ { if (params.comparisonFile) { // Add the suffix "_T1" to each sample ID in the comparison file + + ch_bam_input=BAM_MARKDUPLICATES_PICARD.out.bam + def comparisons = [:] def isFirstLine = true csvFile = file(params.comparisonFile) @@ -262,156 +255,60 @@ workflow SAMMYSEQ { } - // 1. Create a Comparisons Channel + // 1. Create a Comparisons Channels (one for sample 1 in comparison and another for sample 2 in comparison) Channel .fromPath(params.comparisonFile) - .splitCsv(header: true, sep: ',') - .map { row -> [row.sample1 + "_T1", row.sample2 + "_T1"] } - .set { comparisons_ch } - - - - - // ch_mle_in.collect { meta, bam -> [meta.id, bam] } - // .toList() - // .set { bam_list_ch } - - //bam_list_ch.view() - - // bam_list_ch - // .map { it[0].collate(2) } // Raggruppa ogni due elementi - // .set { paired_bam_list_ch } - - - //paired_bam_list_ch.view() - - // paired_bam_list_ch - // .transpose() - // .set { bam_pairs_ch } - - // bam_pairs_ch.merge().view() - -ch_bam_input=BAM_MARKDUPLICATES_PICARD.out.bam//.view{"get class : ${it.getClass()}"} -//comparisons_ch.view() -//CUT_SIZES_GENOME.out.ch_sizes_genome.view{"get class : ${it.getClass()}"} - - -Channel.fromPath(params.comparisonFile) - .splitCsv(header : true) - .map{ row -> - //sample1 = [sample1:row.sample1 + "_T1"] - //sample2 = [sample2:row.sample2 + "_T1"] - [ row.sample1 + "_T1", row.sample2 + "_T1",row.sample1 + "_T1_VS_" + row.sample2 + "_T1"] - } - .set { comparisons_ch_s1 } - //.view() - -Channel.fromPath(params.comparisonFile) - .splitCsv(header : true) - .map{ row -> - //sample1 = [sample1:row.sample1 + "_T1"] - //sample2 = [sample2:row.sample2 + "_T1"] - [ row.sample2 + "_T1", row.sample1 + "_T1",row.sample1 + "_T1_VS_" + row.sample2 + "_T1"] - } - .set { comparisons_ch_s2 } - - -//Channel.fromList(ch_bam_input).view() - - // ch_bam_input.collect { meta, bam -> [meta.id, bam] } - // .toList() - // .set { ch_bam_fix } - - - - -ch_bam_input - .map {meta, bam -> - id=meta.subMap('id') - [id.id, bam] - - } - //.view{"ch_bam+convert: ${it}"} - .set { ch_bam_reformat } - - -//ch_bam_test -//ch_bam_reformat.combine(comparisons_ch_s1, by:0).view{ "combine: ${it}" } - - -// ch_bam_reformat -// .toList() -// .flatten() -// .set { fix_ch_bam } + .splitCsv(header : true) + .map{ row -> + [ row.sample1 + "_T1", row.sample2 + "_T1",row.sample1 + "_T1_VS_" + row.sample2 + "_T1"] + } + .set { comparisons_ch_s1 } + //.view() - + Channel.fromPath(params.comparisonFile) + .splitCsv(header : true) + .map{ row -> + [ row.sample2 + "_T1", row.sample1 + "_T1",row.sample1 + "_T1_VS_" + row.sample2 + "_T1"] + } + .set { comparisons_ch_s2 } -//comparisons_ch.view{"comparisons_ch= ${it}"} -//ch_bam_reformat.view{"ch_bam_reformat= ${it}"} -//ch_bam_input.view{"ch_bam_input= ${it}"} + //2. convert bam file to input + ch_bam_input + .map {meta, bam -> + id=meta.subMap('id') + [id.id, bam] + + } + .set { ch_bam_reformat } -comparisons_ch_s1.combine(ch_bam_reformat , by:0) + //3. combine comparison channel with bam list channel + comparisons_ch_s1 + .combine(ch_bam_reformat , by:0) .map { sample1, sample2, comparison, bam -> //[ comparison:comparison, sample1:sample1, sample2:sample2, bam1:bam1] - [ comparison, bam] - }.set { bam1_comparison } + [ comparison, bam] + } + .set { bam1_comparison } - //.view{"join= ${it}"} -comparisons_ch_s2.combine(ch_bam_reformat, by:0) + //.view{"join= ${it}"} + comparisons_ch_s2 + .combine(ch_bam_reformat, by:0) .map{ sample2, sample1, comparison, bam -> //[ comparison:comparison , sample1:sample1, sample2:sample2 ,bam2:bam2 ] [ comparison, bam] - } + } .set{ bam2_comparison } -bam1_comparison.join(bam2_comparison, remainder: false, by: 0 ).set{comparisons_merge_ch} - -//comparisons_ch_s1.view{ "comparisons_ch_s1: ${it}" } -//comparisons_ch_s2.view{ "comparisons_ch_s2: ${it}" } - -//bam1_comparison.view{ "bam1_comparison: ${it}" } -//bam2_comparison.view{ "bam2_comparison: ${it}" } - - -comparisons_merge_ch.view{ "comparisons_merge_ch: ${it}" } - -//ch_bam_reformat.view{ "ch_bam_reformat: ${it}" } - -//ch_bam_reformat.join(comparisons_ch, remainder:true ,by:[1]).view{"join= ${it}"} - -// canale1.flatten() -// .combine(canale2, by: 0) -// .groupTuple(by: [0,1]) -// .map { it.flatten() } -// .view() - - -//ch_bam_reformat.view() -//comparisons_ch.view() - -//comparisons_ch.cross( ch_bam_reformat ).view() - -// first_join = comparisons_ch -// .join(ch_bam_reformat, by: { it[0] }).view() -// //.map { it -> [it[0], it[2], it[3]] }.view() - - -// Channel -// .of( 'alpha,beta,gamma\n10,20,30\n70,80,90' ) -// .splitCsv(header: true) -// .view { row -> "${row.alpha} - ${row.beta} - ${row.gamma}" } - - -// comparisons_ch -// .flatMap { sample1, sample2 -> [ [sample1, sample2] ] } // perché hai bisogno di entrambe le combinazioni? -// //.join(ch_bam_input, by: { it[0] }) -// //.join(ch_bam_input, by: { it[1] }) -// .set { ch_final_input } + bam1_comparison + .join(bam2_comparison, remainder: false, by: 0 ) + .set{comparisons_merge_ch} -// ch_final_input.view() + //comparisons_merge_ch + // .view{ "comparisons_merge_ch: ${it}" } + //4.run rscript RTWOSAMPLESMLE (comparisons_merge_ch, CUT_SIZES_GENOME.out.ch_sizes_genome From 0d28684aaade476a783d1c221fac63325ad90a6f Mon Sep 17 00:00:00 2001 From: Lucio Di Filippo Date: Wed, 4 Oct 2023 19:16:10 +0200 Subject: [PATCH 12/14] R mle comparison integration --- modules/local/rtwosamplesmle/main.nf | 6 +- .../templates/two_samples_mle.R | 76 ++++++++++++++++++- workflows/sammyseq.nf | 38 +++++----- 3 files changed, 96 insertions(+), 24 deletions(-) diff --git a/modules/local/rtwosamplesmle/main.nf b/modules/local/rtwosamplesmle/main.nf index fe691e0..4d981bd 100644 --- a/modules/local/rtwosamplesmle/main.nf +++ b/modules/local/rtwosamplesmle/main.nf @@ -26,9 +26,9 @@ process RTWOSAMPLESMLE { //conda "YOUR-TOOL-HERE" - //container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - // 'https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE': - // 'biocontainers/YOUR-TOOL-HERE' }" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE': + 'biocontainers/YOUR-TOOL-HERE' }" //docker pull lucidif/r_two_samples_mle:0.0.1 //docker image tag ae028b17f7d3 quay.io/lucidif/r_two_samples_mle:0.0.1 diff --git a/modules/local/rtwosamplesmle/templates/two_samples_mle.R b/modules/local/rtwosamplesmle/templates/two_samples_mle.R index c9dc805..4d6eda4 100644 --- a/modules/local/rtwosamplesmle/templates/two_samples_mle.R +++ b/modules/local/rtwosamplesmle/templates/two_samples_mle.R @@ -10,6 +10,20 @@ require(data.table) require(spp) require(rtracklayer) +################################################ +# FUNCTIONS +################################################ + +sortbychr <- function(x, chrcol="chr", stcol="start", endcol=NULL, chrorder=paste("chr", c(seq(22), "X", "Y"), sep="")) { + if (!(is.null(endcol))) { + x <- x[order(x[,endcol]),] + } + x <- x[order(x[,stcol]),] + chrs <- ordered(x[,chrcol], levels=chrorder) + # chrs <- ordered(tmp, levels=paste("chr", c(seq(22), "X", "Y"), sep="")) + x <- x[order(chrs),] +} + ################################################ ## PARAMS ################################################ @@ -23,6 +37,7 @@ ip_file <- "${bam1}" input_file <- "${bam2}" chromsizes_file <- "${chromsizes_file}" mle_output_file <- "${bam1.baseName}VS${bam2.baseName}_mle.txt" +remove_anomalies <- FALSE ################################################ # DEFAULT PARAMETERS @@ -42,15 +57,70 @@ if (debug_mode){ print(c("mle_output_file",mle_output_file)) } +## PROCESSING + +### Make the list of chromosomes + +print("first part") +chromsizes <- fread(chromsizes_file, data.table = FALSE) +#chromsizes <- sortbychr(chromsizes, chrcol="V1", stcol="V2", chrorder=paste("chr", c(seq(19), "X", "Y"), sep="")) +#chrs <- as.character(sub('chr', '', chromsizes[, 1])) +chrs <- chromsizes[, 1] + +### Import the data + +ip <- read.bam.tags(ip_file) +input <- read.bam.tags(input_file) + +### Select the informative tags +ip_tags <- ip[['tags']] +input_tags <- input[['tags']] +print("second part") +### Remove the tags with anomalies +if(remove_anomalies==TRUE){ + ip_rm <- remove.local.tag.anomalies(ip_tags[chrs]) + input_rm <- remove.local.tag.anomalies(input_tags[chrs]) +}else{ + ip_rm <- ip_tags + input_rm <- input_tags +} + +print("third part") +### Make the smoothing with the loglikelihood function +print("lane 90") +#print(paste0("ip_rm=",head(ip_rm))) +#print(paste0("input_rm=",head(ip_rm))) +chip_smoothed_mle <- get.smoothed.enrichment.mle(ip_rm, input_rm, tag.shift = 0, background.density.scaling = FALSE) +print("lane 91") +#chip_smoothed_mle <- get.smoothed.enrichment.mle(ip_tags, input_tags, tag.shift = 0, background.density.scaling = FALSE) + +ccl <- sapply(chip_smoothed_mle, nrow) +cpos <- unlist(sapply(chip_smoothed_mle, function(csm) csm\$x)) +ccov <- unlist(sapply(chip_smoothed_mle, function(csm) csm\$y)) +print("line 97") +gr.mle <- GRanges(seqnames = rep(names(chip_smoothed_mle), ccl), ranges = IRanges(start = cpos, end = cpos), strand = Rle('*', sum(ccl)), score = ccov) +seqlengths(gr.mle) <- chromsizes[, 2] + +#rename score col +col_names <- names(mcols(gr.mle)) +#col_names <- sub("\\..*\$", "", col_names) +col_names <- "score" +names(mcols(gr.mle)) <- col_names + +print("fouth part") +#export out file +export.bw(gr.mle, mle_output_file) + + ################################################ ################################################ ## R SESSION INFO ## ################################################ ################################################ -##sink(paste(output_prefix, "R_sessionInfo.log", sep = '.')) -##print(sessionInfo()) -##sink() +# sink(paste(output_prefix, "R_sessionInfo.log", sep = '.')) +# print(sessionInfo()) +# sink() ################################################ ################################################ diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf index 006a4ca..ef0e3ce 100644 --- a/workflows/sammyseq.nf +++ b/workflows/sammyseq.nf @@ -242,17 +242,17 @@ workflow SAMMYSEQ { ch_bam_input=BAM_MARKDUPLICATES_PICARD.out.bam - def comparisons = [:] - def isFirstLine = true - csvFile = file(params.comparisonFile) - csvFile.eachLine { line -> - if (isFirstLine) { - isFirstLine = false - return - } - def (sample1, sample2) = line.split(',') - comparisons[sample1.trim() + "_T1"] = sample2.trim() + "_T1" - } + // def comparisons = [:] + // def isFirstLine = true + // csvFile = file(params.comparisonFile) + // csvFile.eachLine { line -> + // if (isFirstLine) { + // isFirstLine = false + // return + // } + // def (sample1, sample2) = line.split(',') + // comparisons[sample1.trim() + "_T1"] = sample2.trim() + "_T1" + // } // 1. Create a Comparisons Channels (one for sample 1 in comparison and another for sample 2 in comparison) @@ -261,7 +261,8 @@ workflow SAMMYSEQ { .fromPath(params.comparisonFile) .splitCsv(header : true) .map{ row -> - [ row.sample1 + "_T1", row.sample2 + "_T1",row.sample1 + "_T1_VS_" + row.sample2 + "_T1"] + //[ row.sample1 + "_T1", row.sample2 + "_T1",row.sample1 + "_T1_VS_" + row.sample2 + "_T1"] + [ row.sample1 + "_T1", row.sample1 + "_T1_VS_" + row.sample2 + "_T1"] } .set { comparisons_ch_s1 } //.view() @@ -269,12 +270,13 @@ workflow SAMMYSEQ { Channel.fromPath(params.comparisonFile) .splitCsv(header : true) .map{ row -> - [ row.sample2 + "_T1", row.sample1 + "_T1",row.sample1 + "_T1_VS_" + row.sample2 + "_T1"] + // [ row.sample2 + "_T1", row.sample1 + "_T1",row.sample1 + "_T1_VS_" + row.sample2 + "_T1"] + [ row.sample2 + "_T1", row.sample1 + "_T1_VS_" + row.sample2 + "_T1"] } .set { comparisons_ch_s2 } //2. convert bam file to input - + // [[id:ggg, paired:true],path.bam] ch_bam_input .map {meta, bam -> id=meta.subMap('id') @@ -286,7 +288,7 @@ workflow SAMMYSEQ { //3. combine comparison channel with bam list channel comparisons_ch_s1 .combine(ch_bam_reformat , by:0) - .map { sample1, sample2, comparison, bam -> + .map { sample1, comparison, bam -> //[ comparison:comparison, sample1:sample1, sample2:sample2, bam1:bam1] [ comparison, bam] } @@ -295,7 +297,7 @@ workflow SAMMYSEQ { //.view{"join= ${it}"} comparisons_ch_s2 .combine(ch_bam_reformat, by:0) - .map{ sample2, sample1, comparison, bam -> + .map{ sample2, comparison, bam -> //[ comparison:comparison , sample1:sample1, sample2:sample2 ,bam2:bam2 ] [ comparison, bam] } @@ -311,8 +313,8 @@ workflow SAMMYSEQ { //4.run rscript RTWOSAMPLESMLE (comparisons_merge_ch, - CUT_SIZES_GENOME.out.ch_sizes_genome - ) + CUT_SIZES_GENOME.out.ch_sizes_genome + ) if (params.stopAt == 'RTWOSAMPLESMLE') { return From 99ccc8f8767f5470a5e691544b60c3ba463b3eca Mon Sep 17 00:00:00 2001 From: Lucio Di Filippo Date: Thu, 5 Oct 2023 18:00:38 +0200 Subject: [PATCH 13/14] clear code and add samtools to multiqc --- conf/modules.config | 70 +++------------------------- modules/local/rtwosamplesmle/main.nf | 10 ++-- workflows/sammyseq.nf | 15 ++---- 3 files changed, 15 insertions(+), 80 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 0b65a9c..d76dcc9 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -51,39 +51,11 @@ process { // // Alignment options // - withName: 'BWA_MEM' { - ext.args = "" - publishDir = [ - path: { "${params.outdir}/bwa" }, - enabled: false, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - pattern: "*.bam" - ] - } - withName: 'SAMTOOLS_INDEX_BAM' { - ext.args = "" - publishDir = [ - path: { "${params.outdir}/bwa" }, - enabled: true, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - pattern: "*.bai" - ] - } - withName: 'SAMTOOLS_SORT_BAM' { - ext.args = "" - ext.prefix = { "${meta.id}.sorted" } - publishDir = [ - path: { "${params.outdir}/bwa" }, - enabled: true, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - pattern: "*sorted.bam" - ] - } + + + // // Picard MarkDuplicates and Filtering options @@ -129,41 +101,11 @@ process { ] } - withName: 'SAMTOOLS_VIEW_FILTER' { - ext.args = '-h -F 0x0400' - publishDir = [ - path: { "${params.outdir}/markduplicates/duplicates_removed" }, - mode: params.publish_dir_mode, - pattern: '*filtered.{bai,bam}', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true - ] - ext.prefix = { "${meta.id}.md.filtered" } - } - withName: 'SAMTOOLS_SORT_FILTERED' { - ext.args = "" - ext.prefix = { "${meta.id}.md.filtered.sorted" } - publishDir = [ - path: { "${params.outdir}/markduplicates/duplicates_removed" }, - mode: params.publish_dir_mode, - pattern: '*filtered.sorted.bam', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true - ] - } - withName: 'SAMTOOLS_INDEX_FILTERED' { - ext.args = "" - ext.prefix = { "${meta.id}.md.filtered.sorted" } - publishDir = [ - path: { "${params.outdir}/markduplicates/duplicates_removed" }, - mode: params.publish_dir_mode, - pattern: '*filtered.sorted.bai', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true - ] - } + + + diff --git a/modules/local/rtwosamplesmle/main.nf b/modules/local/rtwosamplesmle/main.nf index 4d981bd..f0c05a3 100644 --- a/modules/local/rtwosamplesmle/main.nf +++ b/modules/local/rtwosamplesmle/main.nf @@ -26,14 +26,14 @@ process RTWOSAMPLESMLE { //conda "YOUR-TOOL-HERE" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE': - 'biocontainers/YOUR-TOOL-HERE' }" + //container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + // 'https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE': + // 'biocontainers/YOUR-TOOL-HERE' }" //docker pull lucidif/r_two_samples_mle:0.0.1 //docker image tag ae028b17f7d3 quay.io/lucidif/r_two_samples_mle:0.0.1 - container 'lucidif/r_two_samples_mle:0.0.1' - + //container 'lucidif/r_two_samples_mle:0.0.1' + //container 'https://registry-1.docker.io/v2/lucidif/r_two_samples_mle/blobs/sha256:038e9a5189bd7f946bc85db99ea5c43cce8b643c4e3723d52a2a763b8cf21304' input: // TODO nf-core: Where applicable all sample-specific information e.g. "id", "single_end", "read_group" diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf index ef0e3ce..1b7d95e 100644 --- a/workflows/sammyseq.nf +++ b/workflows/sammyseq.nf @@ -242,17 +242,6 @@ workflow SAMMYSEQ { ch_bam_input=BAM_MARKDUPLICATES_PICARD.out.bam - // def comparisons = [:] - // def isFirstLine = true - // csvFile = file(params.comparisonFile) - // csvFile.eachLine { line -> - // if (isFirstLine) { - // isFirstLine = false - // return - // } - // def (sample1, sample2) = line.split(',') - // comparisons[sample1.trim() + "_T1"] = sample2.trim() + "_T1" - // } // 1. Create a Comparisons Channels (one for sample 1 in comparison and another for sample 2 in comparison) @@ -342,6 +331,10 @@ workflow SAMMYSEQ { ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(BAM_MARKDUPLICATES_PICARD.out.stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(BAM_MARKDUPLICATES_PICARD.out.flagstat.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(BAM_MARKDUPLICATES_PICARD.out.idxstats.collect{it[1]}.ifEmpty([])) + MULTIQC ( ch_multiqc_files.collect(), ch_multiqc_config.toList(), From 8ec7d758333a89b292532fdb60cc3d3937dcf433 Mon Sep 17 00:00:00 2001 From: daisymut Date: Thu, 5 Oct 2023 18:53:28 +0200 Subject: [PATCH 14/14] fixed prettier --- nextflow_schema.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index e833cf5..1ff736c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -46,12 +46,11 @@ "fa_icon": "fas fa-file-signature" }, "comparisonFile": { - "type" : "string", + "type": "string", "format": "file-path", "description": "Path to comma-separated file containing information about samples comparisons", "mimetype": "text/csv", "help_text": "You will need to create a comparisons file with information about the comparisons in your experiment before running the pipeline. Use this parameter to specify the necessary comparisons. It has to be a comma-separated file with 2 columns, one for the first sample to compair and the second one to sample against which to do it." - } } },