Merge pull request #10 from ramprasadn/gcnvcaller

Add germlinecnvcaller
nf-core · Sep 11, 2023 · 315c0fb · 315c0fb
2 parents 7fb1753 + 4572dea
commit 315c0fb
Show file tree

Hide file tree

Showing 35 changed files with 1,852 additions and 56 deletions.
diff --git a/CITATIONS.md b/CITATIONS.md
@@ -14,6 +14,10 @@
 
   > Talevich E, Shain AH, Botton T, Bastian BC (2016) CNVkit: Genome-Wide Copy Number Detection and Visualization from Targeted DNA Sequencing. PLoS Comput Biol 12(4): e1004873. doi: 10.1371/journal.pcbi.1004873. PubMed PMID: 27100738. PubMed Central PMCID: PMC4839673.
 
+- [GATK](https://genome.cshlp.org/content/20/9/1297)
+
+  > McKenna A, Hanna M, Banks E, et al. The Genome Analysis Toolkit: A MapReduce framework for analyzing next-generation DNA sequencing data. Genome Res. 2010;20(9):1297-1303. doi:10.1101/gr.107524.110
+
 - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/)
 
   > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.

diff --git a/README.md b/README.md
@@ -20,7 +20,8 @@
 
 1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
 2. Build Panel of Normals for [`CNVKIT`](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1004873)
-3. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))
+3. Build ploidy and cnv calling models for [`GATK's germlinecnvcaller workflow`](https://genome.cshlp.org/content/20/9/1297)
+4. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))
 
 ## Usage
 
@@ -34,21 +35,27 @@ First, prepare a samplesheet with your input data that looks as follows:
 `samplesheet.csv`:
 
 ```csv
-sample,bam
-sample1,sample1.bam
-sample2,sample2.bam
-sample3,sample3.bam
-sample4,sample4.bam
+sample,bam,bai,cram,crai
+sample1,sample1.bam,sample1.bai,,
+sample2,sample2.bam,,,
+sample3,sample3.bam,sample3.bai,,
+sample4,sample4.bam,,,
 ```
 
-Each row represents a bam file.
+Each row in the samplesheet represents an alignment file, and it is important that you provide the files in the right format for the analysis you want to run.
+
+| Tool              | Alignment format             |
+| ----------------- | ---------------------------- |
+| cnvkit            | bam                          |
+| germlinecnvcaller | bam or cram or a mix of both |
 
 Now, you can run the pipeline using:
 
 ```bash
 nextflow run nf-core/createpanelrefs \
    -profile <docker/singularity/.../institute> \
    --input samplesheet.csv \
+   --tools <cnvkit/germlinecnvcaller> \
    --genome GATK.GRCh38 \
    --outdir <OUTDIR>
 ```

diff --git a/conf/modules/germlinecnvcaller_cohort.config b/conf/modules/germlinecnvcaller_cohort.config
@@ -0,0 +1,88 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Config file for defining DSL2 per module options and publishing paths
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Available keys to override module options:
+        ext.args   = Additional arguments appended to command in module.
+        ext.args2  = Second set of arguments appended to command in module (multi-tool modules).
+        ext.args3  = Third set of arguments appended to command in module (multi-tool modules).
+        ext.prefix = File name prefix for output files.
+----------------------------------------------------------------------------------------
+*/
+
+process {
+
+    // Catch-all for every process under GERMLINECNVCALLER_COHORT: publishing is disabled.
+    // The selectors below that define their own publishDir are the ones whose outputs are
+    // meant to be kept (NOTE(review): relies on Nextflow applying later matching selectors
+    // over earlier ones — confirm against the Nextflow selector-priority rules).
+    withName: '.*GERMLINECNVCALLER_COHORT.*' {
+        publishDir = [
+            enabled: false
+        ]
+    }
+
+    // ext.when is true only when no fasta index was supplied via params.fai.
+    // Outputs other than versions.yml are published under <outdir>/germlinecnvcaller/references.
+    withName: '.*GERMLINECNVCALLER_COHORT:SAMTOOLS_FAIDX' {
+        ext.when = { params.fai.equals(null) }
+        publishDir = [
+                mode: params.publish_dir_mode,
+                path: { "${params.outdir}/germlinecnvcaller/references" },
+                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
+    // ext.when is true only when no sequence dictionary was supplied via params.dict.
+    // Outputs other than versions.yml are published under <outdir>/germlinecnvcaller/references.
+    withName: '.*GERMLINECNVCALLER_COHORT:PICARD_CREATESEQUENCEDICTIONARY' {
+        ext.when = { params.dict.equals(null) }
+        publishDir = [
+                mode: params.publish_dir_mode,
+                path: { "${params.outdir}/germlinecnvcaller/references" },
+                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
+    // Padding and bin length are taken from the pipeline parameters; no publishDir is set
+    // here, so only the catch-all selector above applies to this process.
+    withName: '.*GERMLINECNVCALLER_COHORT:GATK4_PREPROCESSINTERVALS' {
+        ext.args = { ["--imr OVERLAPPING_ONLY",
+                        "--padding ${params.padding}",
+                        "--bin-length ${params.bin_length}"].join(" ")
+                    }
+    }
+
+    // Read-count output format comes from params.readcount_format.
+    // Outputs other than versions.yml are published under <outdir>/germlinecnvcaller/readcounts.
+    withName: '.*GERMLINECNVCALLER_COHORT:GATK4_COLLECTREADCOUNTS' {
+        ext.args = {"--format ${params.readcount_format} --imr OVERLAPPING_ONLY"}
+        publishDir = [
+                mode: params.publish_dir_mode,
+                path: { "${params.outdir}/germlinecnvcaller/readcounts" },
+                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
+    // ext.prefix is a file name prefix, so it must not start with whitespace.
+    withName: '.*GERMLINECNVCALLER_COHORT:GATK4_ANNOTATEINTERVALS' {
+        ext.args   = {"--imr OVERLAPPING_ONLY"}
+        ext.prefix = {"${meta.id}_annotated"}
+    }
+
+    // ext.prefix is a file name prefix, so it must not start with whitespace.
+    withName: '.*GERMLINECNVCALLER_COHORT:GATK4_FILTERINTERVALS' {
+        ext.args   = {"--imr OVERLAPPING_ONLY"}
+        ext.prefix = {"${meta.id}_filtered"}
+    }
+
+    // The value passed to --SCATTER_CONTENT comes from params.scatter_content.
+    withName: '.*GERMLINECNVCALLER_COHORT:GATK4_INTERVALLISTTOOLS' {
+        ext.args   = {"--SUBDIVISION_MODE INTERVAL_COUNT --SCATTER_CONTENT ${params.scatter_content}"}
+    }
+
+    // Only outputs matching "*-model" are published, under
+    // <outdir>/germlinecnvcaller/determinegermlinecontigploidy.
+    withName: '.*GERMLINECNVCALLER_COHORT:GATK4_DETERMINEGERMLINECONTIGPLOIDY' {
+        ext.args   = {"--imr OVERLAPPING_ONLY"}
+        publishDir = [
+                mode: params.publish_dir_mode,
+                path: { "${params.outdir}/germlinecnvcaller/determinegermlinecontigploidy" },
+                pattern: "*-model",
+                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
+    // Runs the caller with --run-mode COHORT.
+    // Outputs other than versions.yml are published under <outdir>/germlinecnvcaller/germlinecnvcaller.
+    withName: '.*GERMLINECNVCALLER_COHORT:GATK4_GERMLINECNVCALLER' {
+        ext.args   = {"--imr OVERLAPPING_ONLY --run-mode COHORT"}
+        publishDir = [
+                mode: params.publish_dir_mode,
+                path: { "${params.outdir}/germlinecnvcaller/germlinecnvcaller" },
+                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
+}
diff --git a/conf/test.config b/conf/test.config
@@ -25,6 +25,10 @@ params {
     // Main options
     tools = 'cnvkit'
 
+    //Germlinecnvcaller options
+    scatter_content = 2
+    ploidy_priors     = "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/gatk/contig_ploidy_priors_table.tsv"
+
     // Small reference genome
     genome            = null
     igenomes_ignore   = true

diff --git a/docs/output.md b/docs/output.md
@@ -12,8 +12,8 @@ The directories listed below will be created in the results directory after the
 
 The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
 
-- [FastQC](#fastqc) - Raw read QC
 - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
+- [GATK's germlinecnvcaller](#germlinecnvcaller) - Publish read counts, ploidy and CNV calling models that can be used to call CNVs in case mode.
 - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
 
 ### FastQC
@@ -37,6 +37,26 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 
 > **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality.
 
+### GATK germlinecnvcaller
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `results/germlinecnvcaller/`
+  - `determinegermlinecontigploidy`
+    - `cohort-model`: Contig ploidy model.
+  - `germlinecnvcaller`
+    - `*_model`: CNV caller model for each scattered shard.
+  - `readcounts`
+    - `*.hdf5|.tsv`: Read count statistics for each sample.
+  - `references`
+    - `*.dict`: Sequence dictionary file. This file is not published if user supplies this file to the pipeline using the `--dict` parameter.
+    - `*.fai`: Fasta index file. This file is not published if user supplies this file to the pipeline using the `--fai` parameter.
+
+</details>
+
+[GATK](https://github.com/broadinstitute/gatk) is a toolkit which offers a wide variety of tools with a primary focus on variant discovery and genotyping. In this pipeline we have implemented GATK's germlinecnvcalling workflow for analysing a cohort of samples. The output files generated from this analysis can be used for analysing samples in case mode. For more information about the workflow and output files, see [GATK's documentation](https://gatk.broadinstitute.org/hc/en-us/articles/360035531152--How-to-Call-common-and-rare-germline-copy-number-variants).
+
 ### MultiQC
 
 <details markdown="1">

diff --git a/docs/usage.md b/docs/usage.md
@@ -6,61 +6,48 @@
 
 ## Introduction
 
-<!-- TODO nf-core: Add documentation about anything specific to running your pipeline. For general topics, please point to (and add to) the main nf-core website. -->
-
 ## Samplesheet input
 
-You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below.
+You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use the `--input` parameter to specify its location. It has to be a comma-separated file and recognizes the following fields as column headers.
+
+| Fields   | Description                    |
+| -------- | ------------------------------ |
+| `sample` | Custom sample name.            |
+| `bam`    | Alignment file in bam format.  |
+| `bai`    | bam file index.                |
+| `cram`   | Alignment file in cram format. |
+| `crai`   | cram file index.               |
 
 ```bash
 --input '[path to samplesheet file]'
 ```
 
-### Multiple runs of the same sample
-
-The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes:
-
-```console
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
-CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz
-CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz
-```
-
 ### Full samplesheet
 
-The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below.
+The pipeline will auto-detect whether a sample is provided in bam or cram format using the information provided in the samplesheet. The samplesheet can contain bam files, cram files, or both, with or without their indices.
 
-A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice.
+A final samplesheet file may look something like the one below. This is for 3 samples, where one sample, `SAMPLE_1`, is missing its index file.
 
 ```console
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
-CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz
-CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz
-TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz,
-TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz,
+sample,bam,bai
+SAMPLE_1,sample1.bam,
+SAMPLE_2,sample2.bam,sample2.bam.bai
+SAMPLE_3,sample3.bam,sample3.bam.bai
 ```
 
-| Column    | Description                                                                                                                                                                            |
-| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `sample`  | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
-| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz".                                                             |
-| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz".                                                             |
-
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
 
 ## Running the pipeline
 
 The typical command for running the pipeline is as follows:
 
 ```bash
-nextflow run nf-core/createpanelrefs --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker
+nextflow run nf-core/createpanelrefs --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker --tools cnvkit,germlinecnvcaller
 ```
 
-This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.
+This will launch the pipeline with the `docker` configuration profile, and generate reference files necessary for cnvkit and germlinecnvcaller. To learn more about what tool options are recognized by the pipeline, check the pipeline's documentation on the [nf-core website](https://nf-co.re/createpanelrefs/dev/parameters/).
+
+See below for more information about profiles.
 
 Note that the pipeline will create the following files in your working directory:
 
@@ -114,6 +101,19 @@ To further assist in reproducbility, you can use share and re-use [parameter fil
 
 > 💡 If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles.
 
+## Workflow specific arguments
+
+### germlinecnvcaller
+
+If you are running the pipeline to generate references for GATK's germlinecnvcaller workflow, you should ensure that you have provided all the mandatory options specified in the table below.
+
+| Mandatory                 | Optional |
+| ------------------------- | -------- |
+| fasta/genome              | fai      |
+| ploidy_priors<sup>1</sup> | dict     |
+
+<sup>1</sup> To learn more about this file, see [this comment](https://gatk.broadinstitute.org/hc/en-us/community/posts/360074399831/comments/13441240230299) on the GATK forum.<br />
+
 ## Core Nextflow arguments
 
 > **NB:** These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen).

diff --git a/main.nf b/main.nf
@@ -16,9 +16,9 @@ nextflow.enable.dsl = 2
     GENOME PARAMETER VALUES
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
-
 params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta')
-
+params.fai   = WorkflowMain.getGenomeAttribute(params, 'fai')
+params.dict  = WorkflowMain.getGenomeAttribute(params, 'dict')
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     VALIDATE & PRINT PARAMETER SUMMARY

diff --git a/modules.json b/modules.json
@@ -15,10 +15,60 @@
                         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
                         "installed_by": ["modules"]
                     },
+                    "gatk4/annotateintervals": {
+                        "branch": "master",
+                        "git_sha": "016397249f05f5af7b97e3ea8d64458a07df2928",
+                        "installed_by": ["modules"]
+                    },
+                    "gatk4/collectreadcounts": {
+                        "branch": "master",
+                        "git_sha": "d25bf48327e86a7f737047a57ec264b90e22ce3d",
+                        "installed_by": ["modules"]
+                    },
+                    "gatk4/determinegermlinecontigploidy": {
+                        "branch": "master",
+                        "git_sha": "8c4542e5d421c4690cf1fa6ec729e9304763fdaf",
+                        "installed_by": ["modules"]
+                    },
+                    "gatk4/filterintervals": {
+                        "branch": "master",
+                        "git_sha": "016397249f05f5af7b97e3ea8d64458a07df2928",
+                        "installed_by": ["modules"]
+                    },
+                    "gatk4/germlinecnvcaller": {
+                        "branch": "master",
+                        "git_sha": "16bda00336e449b83d9b62abaa614f3880664ffb",
+                        "installed_by": ["modules"]
+                    },
+                    "gatk4/intervallisttools": {
+                        "branch": "master",
+                        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
+                        "installed_by": ["modules"]
+                    },
+                    "gatk4/preprocessintervals": {
+                        "branch": "master",
+                        "git_sha": "1226419498a14d17f98d12d6488d333b0dbd0418",
+                        "installed_by": ["modules"]
+                    },
                     "multiqc": {
                         "branch": "master",
                         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
                         "installed_by": ["modules"]
+                    },
+                    "picard/createsequencedictionary": {
+                        "branch": "master",
+                        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
+                        "installed_by": ["modules"]
+                    },
+                    "samtools/faidx": {
+                        "branch": "master",
+                        "git_sha": "fd742419940e01ba1c5ecb172c3e32ec840662fe",
+                        "installed_by": ["modules"]
+                    },
+                    "samtools/index": {
+                        "branch": "master",
+                        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
+                        "installed_by": ["modules"]
                     }
                 }
             }