diff --git a/conf/modules/germlinecnvcaller_cohort.config b/conf/modules/germlinecnvcaller_cohort.config index b7204f5..2785d89 100644 --- a/conf/modules/germlinecnvcaller_cohort.config +++ b/conf/modules/germlinecnvcaller_cohort.config @@ -36,6 +36,14 @@ process { ] } + withName: '.*GERMLINECNVCALLER_COHORT:GATK4_BEDTOINTERVALLIST_TARGETS' { + ext.when = { params.analysis_type.equals("wes") && !params.target_interval_list && params.target_bed } // WES only: convert target_bed when no target_interval_list is set; Groovy truth matches how the workflow decides whether to build the interval_list channel + } + + withName: '.*GERMLINECNVCALLER_COHORT:GATK4_BEDTOINTERVALLIST_EXCLUDE' { + ext.when = { params.analysis_type.equals("wes") && !params.exclude_interval_list && params.exclude_bed } // WES only: convert exclude_bed when no exclude_interval_list is set; Groovy truth matches how the workflow decides whether to build the interval_list channel + } + withName: '.*GERMLINECNVCALLER_COHORT:GATK4_PREPROCESSINTERVALS' { ext.args = { ["--imr OVERLAPPING_ONLY", "--padding ${params.padding}", diff --git a/docs/usage.md b/docs/usage.md index c2ac6a6..97d9d07 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -111,10 +111,18 @@ If you wish to share such profile (such as upload as supplementary material for If you are running the pipeline to generate references for the GATK's germlinecnvcalling workflow, you should ensure that you have provided all the mandatory options specified in the table below. -| Mandatory | Optional | -| ------------------------- | -------- | -| fasta/genomes | fai | -| ploidy_priors1 | dict | +| Mandatory | Optional | +| ------------------------- | --------------------------------- | +| fasta/genomes | fai | +| ploidy_priors1 | dict | +| | target_bed/target_interval_list | +| | exclude_bed/exclude_interval_list | +| | bin_length | +| | mappable_regions | +| | padding | +| | readcount_format | +| | scatter_content | +| | segmental_duplications | 1 To learn more about this file, see [this comment](https://gatk.broadinstitute.org/hc/en-us/community/posts/360074399831/comments/13441240230299) on GATK forum.
diff --git a/main.nf b/main.nf index c06659e..71b8149 100644 --- a/main.nf +++ b/main.nf @@ -16,7 +16,6 @@ nextflow.enable.dsl = 2 IMPORT FUNCTIONS / MODULES / SUBWORKFLOWS / WORKFLOWS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ - include { CREATEPANELREFS } from './workflows/createpanelrefs' include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_createpanelrefs_pipeline' include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_createpanelrefs_pipeline' @@ -31,9 +30,13 @@ include { getGenomeAttribute } from './subworkflows/local/utils_nfcore_crea // This is an example of how to use getGenomeAttribute() to fetch parameters // from igenomes.config using `--genome` -params.fasta = getGenomeAttribute('fasta') -params.fai = getGenomeAttribute('fai') -params.dict = getGenomeAttribute('dict') +params.fasta = getGenomeAttribute('fasta') +params.fai = getGenomeAttribute('fai') +params.dict = getGenomeAttribute('dict') +params.target_bed = getGenomeAttribute('target_bed') +params.target_interval_list = getGenomeAttribute('target_interval_list') +params.exclude_bed = getGenomeAttribute('exclude_bed') +params.exclude_interval_list = getGenomeAttribute('exclude_interval_list') /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/modules.json b/modules.json index 02edf9b..0479098 100644 --- a/modules.json +++ b/modules.json @@ -15,6 +15,11 @@ "git_sha": "42ae163c3c6eb23646189c30c07a889ad39c9b0e", "installed_by": ["modules"] }, + "gatk4/bedtointervallist": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, "gatk4/collectreadcounts": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", diff --git a/modules/nf-core/gatk4/bedtointervallist/environment.yml b/modules/nf-core/gatk4/bedtointervallist/environment.yml new file mode 100644 index 0000000..e7cb428 --- 
/dev/null +++ b/modules/nf-core/gatk4/bedtointervallist/environment.yml @@ -0,0 +1,7 @@ +name: gatk4_bedtointervallist +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.4.0.0 diff --git a/modules/nf-core/gatk4/bedtointervallist/main.nf b/modules/nf-core/gatk4/bedtointervallist/main.nf new file mode 100644 index 0000000..88b24b1 --- /dev/null +++ b/modules/nf-core/gatk4/bedtointervallist/main.nf @@ -0,0 +1,56 @@ +process GATK4_BEDTOINTERVALLIST { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(bed) + tuple val(meta2), path(dict) + + output: + tuple val(meta), path('*.interval_list'), emit: interval_list + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK BedToIntervalList] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + BedToIntervalList \\ + --INPUT $bed \\ + --OUTPUT ${prefix}.interval_list \\ + --SEQUENCE_DICTIONARY $dict \\ + --TMP_DIR . 
\\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.interval_list + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/bedtointervallist/meta.yml b/modules/nf-core/gatk4/bedtointervallist/meta.yml new file mode 100644 index 0000000..187da88 --- /dev/null +++ b/modules/nf-core/gatk4/bedtointervallist/meta.yml @@ -0,0 +1,51 @@ +name: gatk4_bedtointervallist +description: Creates an interval list from a bed file and a reference dict +keywords: + - bed + - bedtointervallist + - gatk4 + - interval list +tools: + - gatk4: + description: | + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + doi: 10.1158/1538-7445.AM2017-3590 + licence: ["Apache-2.0"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test'] + - bed: + type: file + description: Input bed file + pattern: "*.bed" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'genome' ] + - dict: + type: file + description: Sequence dictionary + pattern: "*.dict" +output: + - interval_list: + type: file + description: gatk interval list file + pattern: "*.interval_list" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@kevinmenden" + - "@ramprasadn" +maintainers: + - "@kevinmenden" + - "@ramprasadn" diff --git a/nextflow.config b/nextflow.config index d32f96a..7964229 100644 --- a/nextflow.config +++ b/nextflow.config @@ -23,6 +23,7 @@ params { tools = null // No default, must be specified // Germlinecnvcaller options + analysis_type = 'wgs' bin_length = 1000 mappable_regions = null padding = 0 diff --git a/nextflow_schema.json b/nextflow_schema.json index 5a0a915..37f5795 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -11,6 +11,13 @@ "description": "Options used by the germlinecnvcaller subworkflow", "default": "", "properties": { + "analysis_type": { + "type": "string", + "default": "wgs", + "description": "Specifies the analysis type for the pipeline: either 'wgs' or 'wes'.", + "fa_icon": "fas fa-align-center", + "enum": ["wgs", "wes"] + }, "bin_length": { "type": "number", "default": 1000, @@ -18,6 +25,24 @@ "fa_icon": "fas fa-sort-numeric-down", "help_text": "Used by GATK's PreprocessIntervals. GATK recommends a bin length of 1000 for WGS analysis, and 0 for WES analysis. " }, + "exclude_bed": { + "type": "string", + "exists": true, + "format": "path", + "fa_icon": "fas fa-file", + "pattern": "^\\S+\\.bed$", + "description": "Path to a bed file containing regions to be excluded from the analysis.", + "help_text": "If the regions you would like to exclude are in bed format, use this option. If you have an interval_list file, use `exclude_interval_list` parameter instead."
+ }, + "exclude_interval_list": { + "type": "string", + "exists": true, + "format": "path", + "fa_icon": "fas fa-file", + "pattern": "^\\S+\\.interval_list$", + "description": "Path to an interval_list file containing regions to be excluded from the analysis.", + "help_text": "If the regions you would like to exclude are in interval_list format, use this option. If you have a bed file, use `exclude_bed` parameter instead." + }, "mappable_regions": { "type": "string", "exists": true, @@ -63,6 +88,24 @@ "format": "file-path", "fa_icon": "fas fa-file", "help_text": "Used by GATK's AnnotateIntervals." + }, + "target_bed": { + "type": "string", + "exists": true, + "format": "path", + "fa_icon": "fas fa-file", + "pattern": "^\\S+\\.bed$", + "description": "Path to target bed file.", + "help_text": "If the regions you would like to analyse are in bed format, use this option. If you have an interval_list file, use `target_interval_list` parameter instead." + }, + "target_interval_list": { + "type": "string", + "exists": true, + "format": "path", + "fa_icon": "fas fa-file", + "pattern": "^\\S+\\.interval_list$", + "description": "Path to target interval_list file.", + "help_text": "If the regions you would like to analyse are in interval_list format, use this option. If you have a bed file, use `target_bed` parameter instead."
} } }, diff --git a/subworkflows/local/germlinecnvcaller_cohort.nf b/subworkflows/local/germlinecnvcaller_cohort.nf index 750764b..77303b6 100644 --- a/subworkflows/local/germlinecnvcaller_cohort.nf +++ b/subworkflows/local/germlinecnvcaller_cohort.nf @@ -1,21 +1,27 @@ -include { GATK4_ANNOTATEINTERVALS } from '../../modules/nf-core/gatk4/annotateintervals/main' -include { GATK4_COLLECTREADCOUNTS } from '../../modules/nf-core/gatk4/collectreadcounts/main' -include { GATK4_DETERMINEGERMLINECONTIGPLOIDY } from '../../modules/nf-core/gatk4/determinegermlinecontigploidy/main' -include { GATK4_FILTERINTERVALS } from '../../modules/nf-core/gatk4/filterintervals/main' -include { GATK4_GERMLINECNVCALLER } from '../../modules/nf-core/gatk4/germlinecnvcaller/main' -include { GATK4_INTERVALLISTTOOLS } from '../../modules/nf-core/gatk4/intervallisttools/main' -include { GATK4_PREPROCESSINTERVALS } from '../../modules/nf-core/gatk4/preprocessintervals/main' -include { PICARD_CREATESEQUENCEDICTIONARY } from '../../modules/nf-core/picard/createsequencedictionary/main' -include { SAMTOOLS_FAIDX } from '../../modules/nf-core/samtools/faidx/main' -include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main' +include { GATK4_ANNOTATEINTERVALS } from '../../modules/nf-core/gatk4/annotateintervals/main' +include { GATK4_BEDTOINTERVALLIST as GATK4_BEDTOINTERVALLIST_TARGETS } from '../../modules/nf-core/gatk4/bedtointervallist/main' +include { GATK4_BEDTOINTERVALLIST as GATK4_BEDTOINTERVALLIST_EXCLUDE } from '../../modules/nf-core/gatk4/bedtointervallist/main' +include { GATK4_COLLECTREADCOUNTS } from '../../modules/nf-core/gatk4/collectreadcounts/main' +include { GATK4_DETERMINEGERMLINECONTIGPLOIDY } from '../../modules/nf-core/gatk4/determinegermlinecontigploidy/main' +include { GATK4_FILTERINTERVALS } from '../../modules/nf-core/gatk4/filterintervals/main' +include { GATK4_GERMLINECNVCALLER } from '../../modules/nf-core/gatk4/germlinecnvcaller/main' +include { 
GATK4_INTERVALLISTTOOLS } from '../../modules/nf-core/gatk4/intervallisttools/main' +include { GATK4_PREPROCESSINTERVALS } from '../../modules/nf-core/gatk4/preprocessintervals/main' +include { PICARD_CREATESEQUENCEDICTIONARY } from '../../modules/nf-core/picard/createsequencedictionary/main' +include { SAMTOOLS_FAIDX } from '../../modules/nf-core/samtools/faidx/main' +include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main' workflow GERMLINECNVCALLER_COHORT { take: - ch_user_dict // channel: [mandatory] [ val(meta), path(dict) ] - ch_user_fai // channel: [mandatory] [ val(meta), path(fai) ] - ch_fasta // channel: [mandatory] [ val(meta), path(fasta) ] - ch_input // channel: [mandatory] [ val(meta), path(bam/cram), path(bai/crai) ] - ch_ploidy_priors // channel: [mandatory] [ path(tsv) ] + ch_user_dict // channel: [mandatory] [ val(meta), path(dict) ] + ch_user_fai // channel: [mandatory] [ val(meta), path(fai) ] + ch_fasta // channel: [mandatory] [ val(meta), path(fasta) ] + ch_input // channel: [mandatory] [ val(meta), path(bam/cram), path(bai/crai) ] + ch_ploidy_priors // channel: [mandatory] [ path(tsv) ] + ch_target_bed // channel: [optional] [ val(meta), path(bed) ] + ch_user_target_interval_list // channel: [optional] [ val(meta), path(intervals) ] + ch_exclude_bed // channel: [optional] [ val(meta), path(bed) ] + ch_user_exclude_interval_list // channel: [optional] [ val(meta), path(intervals) ] main: ch_versions = Channel.empty() @@ -37,10 +43,42 @@ workflow GERMLINECNVCALLER_COHORT { .collect() .set { ch_fai } + GATK4_BEDTOINTERVALLIST_TARGETS (ch_target_bed, ch_dict) // Runs for wes analysis, when a target_bed file is provided instead of a target_interval_list + GATK4_BEDTOINTERVALLIST_EXCLUDE (ch_exclude_bed, ch_dict) // Runs for wes analysis, when an exclude_bed file is provided instead of an exclude_interval_list + + ch_user_target_interval_list + .combine(GATK4_BEDTOINTERVALLIST_TARGETS.out.interval_list.ifEmpty(null)) + .branch { it -> + 
intervallistfrompath: it[2].equals(null) + return [it[0], it[1]] + intervallistfrombed: !(it[2].equals(null)) + return [it[2], it[3]] + } + .set { ch_targets_for_mix } // it[2] is null when the BED conversion emitted nothing: the user-supplied tuple (it[0], it[1]) is kept; otherwise the converted tuple (it[2], it[3]) replaces it + + ch_targets_for_mix.intervallistfrompath.mix(ch_targets_for_mix.intervallistfrombed) + .collect() + .set {ch_target_interval_list} + + ch_user_exclude_interval_list + .combine(GATK4_BEDTOINTERVALLIST_EXCLUDE.out.interval_list.ifEmpty(null)) + .branch { it -> + intervallistfrompath: it[2].equals(null) + return [it[0], it[1]] + intervallistfrombed: !(it[2].equals(null)) + return [it[2], it[3]] + } + .set { ch_exclude_for_mix } // same selection as for the targets: user-supplied exclude list unless a list converted from exclude_bed is present + + ch_exclude_for_mix.intervallistfrompath.mix(ch_exclude_for_mix.intervallistfrombed) + .collect() + .set { ch_exclude_interval_list } + GATK4_PREPROCESSINTERVALS ( ch_fasta, ch_fai, ch_dict, - [[:],[]], [[:],[]] ) + ch_target_interval_list, + ch_exclude_interval_list) GATK4_ANNOTATEINTERVALS ( GATK4_PREPROCESSINTERVALS.out.interval_list, ch_fasta, @@ -121,6 +159,8 @@ workflow GERMLINECNVCALLER_COHORT { ch_versions = ch_versions.mix(PICARD_CREATESEQUENCEDICTIONARY.out.versions) ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions.first()) ch_versions = ch_versions.mix(GATK4_PREPROCESSINTERVALS.out.versions) + ch_versions = ch_versions.mix(GATK4_BEDTOINTERVALLIST_TARGETS.out.versions) + ch_versions = ch_versions.mix(GATK4_BEDTOINTERVALLIST_EXCLUDE.out.versions) ch_versions = ch_versions.mix(GATK4_COLLECTREADCOUNTS.out.versions.first()) ch_versions = ch_versions.mix(GATK4_ANNOTATEINTERVALS.out.versions) ch_versions = ch_versions.mix(GATK4_FILTERINTERVALS.out.versions) diff --git a/tests/pipeline/germlinecnvcaller_cohort.nf.test b/tests/pipeline/germlinecnvcaller_cohort.nf.test index ecd7833..1239160 100644 --- a/tests/pipeline/germlinecnvcaller_cohort.nf.test +++ b/tests/pipeline/germlinecnvcaller_cohort.nf.test @@ -25,6 +25,10 @@ nextflow_workflow { [[ id:'test' ], 
file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam"),[]], [[ id:'test2' ], file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bam/test2.paired_end.sorted.bam"),[]]) input[4] = Channel.fromPath(params.ploidy_priors) + input[5] = Channel.value([[:],[]]) + input[6] = Channel.value([[:],[]]) + input[7] = Channel.value([[:],[]]) + input[8] = Channel.value([[:],[]]) """ } } diff --git a/workflows/createpanelrefs.nf b/workflows/createpanelrefs.nf index 2c87546..d9b383b 100644 --- a/workflows/createpanelrefs.nf +++ b/workflows/createpanelrefs.nf @@ -28,16 +28,24 @@ include { MULTIQC } from '../modules/nf-core/multiqc/main' // Initialize file channels based on params, defined in the params.genomes[params.genome] scope -ch_dict = params.dict ? Channel.fromPath(params.dict).map { dict -> [[id:dict.baseName],dict]}.collect() - : Channel.empty() -ch_fai = params.fai ? Channel.fromPath(params.fai).map { fai -> [[id:fai.baseName],fai]}.collect() - : Channel.empty() -ch_fasta = params.fasta ? Channel.fromPath(params.fasta).map { fasta -> [[id:fasta.baseName],fasta]}.collect() - : Channel.empty() -ch_ploidy_priors = params.ploidy_priors ? Channel.fromPath(params.ploidy_priors).collect() - : Channel.empty() -ch_cnvkit_targets = params.cnvkit_targets ? Channel.fromPath(params.cnvkit_targets).map { targets -> [[id:targets.baseName],targets]}.collect() - : Channel.value([[:],[]]) +ch_cnvkit_targets = params.cnvkit_targets ? Channel.fromPath(params.cnvkit_targets).map { targets -> [[id:targets.baseName],targets]}.collect() + : Channel.value([[:],[]]) +ch_dict = params.dict ? Channel.fromPath(params.dict).map { dict -> [[id:dict.baseName],dict]}.collect() + : Channel.empty() +ch_exclude_bed = params.exclude_bed ? 
Channel.fromPath(params.exclude_bed).map { exclude -> [[id:exclude.baseName],exclude]}.collect() + : Channel.value([[:],[]]) +ch_exclude_interval_list = params.exclude_interval_list ? Channel.fromPath(params.exclude_interval_list).map { exclude -> [[id:exclude.baseName],exclude]}.collect() + : Channel.value([[:],[]]) +ch_fai = params.fai ? Channel.fromPath(params.fai).map { fai -> [[id:fai.baseName],fai]}.collect() + : Channel.empty() +ch_fasta = params.fasta ? Channel.fromPath(params.fasta).map { fasta -> [[id:fasta.baseName],fasta]}.collect() + : Channel.empty() +ch_ploidy_priors = params.ploidy_priors ? Channel.fromPath(params.ploidy_priors).collect() + : Channel.empty() +ch_target_bed = params.target_bed ? Channel.fromPath(params.target_bed).map { targets -> [[id:targets.baseName],targets]}.collect() + : Channel.value([[:],[]]) +ch_target_interval_list = params.target_interval_list ? Channel.fromPath(params.target_interval_list).map { targets -> [[id:targets.baseName],targets]}.collect() + : Channel.value([[:],[]]) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -93,7 +101,11 @@ workflow CREATEPANELREFS { ch_fai, ch_fasta, ch_germlinecnvcaller_input, - ch_ploidy_priors ) + ch_ploidy_priors, + ch_target_bed, + ch_target_interval_list, + ch_exclude_bed, + ch_exclude_interval_list ) ch_versions = ch_versions.mix(GERMLINECNVCALLER_COHORT.out.versions) }