Merge pull request #16 from TRON-Bioinformatics/master

Merge master into develop
TRON-Bioinformatics · Feb 9, 2024 · 5a1ee6b · 5a1ee6b
2 parents 6be9be4 + a9c44b2
commit 5a1ee6b
Show file tree

Hide file tree

Showing 28 changed files with 67,764 additions and 363 deletions.
diff --git a/.github/ISSUE_TEMPLATE/issue-template.md b/.github/ISSUE_TEMPLATE/issue-template.md
@@ -0,0 +1,23 @@
+---
+name: Issue template
+about: Template to report an issue
+title: ''
+labels: ''
+assignees: priesgo
+
+---
+
+**Describe the issue**
+A clear and concise description of what the bug is.
+
+**To Reproduce**
+Steps to reproduce the behavior.
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Screenshots**
+If applicable, add screenshots to help explain your problem.
+
+**Additional context**
+Add any other context about the problem here.
diff --git a/.github/workflows/automated_tests.yml b/.github/workflows/automated_tests.yml
@@ -0,0 +1,33 @@
+name: Automated tests
+
+on: [push]
+
+jobs:
+  test:
+    runs-on: ubuntu-20.04
+
+    steps:
+    - uses: actions/checkout@v2
+    - uses: actions/setup-java@v2
+      with:
+        distribution: 'zulu' # See 'Supported distributions' for available options
+        java-version: '11'
+    - uses: conda-incubator/setup-miniconda@v2
+      with:
+        auto-update-conda: true
+        channels: defaults,conda-forge,bioconda
+    - name: Install dependencies
+      run: |
+        apt-get update && apt-get --assume-yes install wget make procps software-properties-common
+        wget -qO- https://get.nextflow.io | bash && cp nextflow /usr/local/bin/nextflow
+        conda update conda
+    - name: Cache conda environments
+      uses: actions/cache@v2
+      with:
+        path: |
+          /home/runner/work/tronflow-bam-preprocessing/tronflow-bam-preprocessing/work/conda
+        key: ${{ runner.os }}-tronflow-bam-preprocessing
+    - name: Run tests
+      run: |
+        export NXF_VER=22.04.5
+        make
diff --git a/.gitignore b/.gitignore
@@ -6,4 +6,6 @@ report.html*
 timeline.html*
 trace.txt*
 dag.dot*
-*.swp
+*.swp
+/test_data/ucsc.hg19.minimal.without_indices.dict
+/test_data/ucsc.hg19.minimal.without_indices.fasta.fai
diff --git a/Makefile b/Makefile
@@ -1,56 +1,20 @@
-all : clean test check
+all : clean test
 
 clean:
 	rm -rf output
 	rm -f .nextflow.log*
 	rm -rf .nextflow*
 
 test:
-	nextflow main.nf --help
-	nextflow main.nf -profile test,conda --output output/test1
-	nextflow main.nf -profile test,conda --skip_bqsr --output output/test2
-	nextflow main.nf -profile test,conda --skip_realignment --output output/test3
-	nextflow main.nf -profile test,conda --skip_deduplication --output output/test4
-	nextflow main.nf -profile test,conda --output output/test5 --skip_deduplication --skip_bqsr --skip_metrics --known_indels1 false --known_indels2 false
-	nextflow main.nf -profile test,conda --output output/test6 --intervals false --skip_deduplication --skip_bqsr --skip_realignment
-	nextflow main.nf -profile test,conda --output output/test7 --skip_bqsr --skip_realignment
-	nextflow main.nf -profile test,conda --output output/test8  --collect_hs_metrics_min_base_quality 10 --collect_hs_metrics_min_mapping_quality 10 --remove_duplicates false --skip_bqsr --skip_realignment
-	nextflow main.nf -profile test,conda --output output/test9 --skip_deduplication --skip_bqsr --skip_realignment  --input_files false --input_bam test_data/TESTX_S1_L001.bam
-
-check:
-	test -s output/test1/sample1/TESTX_S1_L001.preprocessed.bam || { echo "Missing test 1 output file!"; exit 1; }
-	test -s output/test1/sample1/TESTX_S1_L001.preprocessed.bai || { echo "Missing test 1 output file!"; exit 1; }
-	test -s output/test1/sample2/TESTX_S1_L002.preprocessed.bam || { echo "Missing test 1 output file!"; exit 1; }
-	test -s output/test1/sample2/TESTX_S1_L002.preprocessed.bai || { echo "Missing test 1 output file!"; exit 1; }
-	test -s output/test2/sample1/TESTX_S1_L001.preprocessed.bam || { echo "Missing test 2 output file!"; exit 1; }
-	test -s output/test2/sample1/TESTX_S1_L001.preprocessed.bai || { echo "Missing test 2 output file!"; exit 1; }
-	test -s output/test2/sample2/TESTX_S1_L002.preprocessed.bam || { echo "Missing test 2 output file!"; exit 1; }
-	test -s output/test2/sample2/TESTX_S1_L002.preprocessed.bai || { echo "Missing test 2 output file!"; exit 1; }
-	test -s output/test3/sample1/TESTX_S1_L001.preprocessed.bam || { echo "Missing test 3 output file!"; exit 1; }
-	test -s output/test3/sample1/TESTX_S1_L001.preprocessed.bai || { echo "Missing test 3 output file!"; exit 1; }
-	test -s output/test3/sample2/TESTX_S1_L002.preprocessed.bam || { echo "Missing test 3 output file!"; exit 1; }
-	test -s output/test3/sample2/TESTX_S1_L002.preprocessed.bai || { echo "Missing test 3 output file!"; exit 1; }
-	test -s output/test4/sample1/TESTX_S1_L001.preprocessed.bam || { echo "Missing test 4 output file!"; exit 1; }
-	test -s output/test4/sample1/TESTX_S1_L001.preprocessed.bai || { echo "Missing test 4 output file!"; exit 1; }
-	test -s output/test4/sample2/TESTX_S1_L002.preprocessed.bam || { echo "Missing test 4 output file!"; exit 1; }
-	test -s output/test4/sample2/TESTX_S1_L002.preprocessed.bai || { echo "Missing test 4 output file!"; exit 1; }
-	test -s output/test5/sample1/TESTX_S1_L001.preprocessed.bam || { echo "Missing test 5 output file!"; exit 1; }
-	test -s output/test5/sample1/TESTX_S1_L001.preprocessed.bai || { echo "Missing test 5 output file!"; exit 1; }
-	test -s output/test5/sample2/TESTX_S1_L002.preprocessed.bam || { echo "Missing test 5 output file!"; exit 1; }
-	test -s output/test5/sample2/TESTX_S1_L002.preprocessed.bai || { echo "Missing test 5 output file!"; exit 1; }
-	test -s output/test6/sample1/TESTX_S1_L001.preprocessed.bam || { echo "Missing test 6 output file!"; exit 1; }
-	test -s output/test6/sample1/TESTX_S1_L001.preprocessed.bai || { echo "Missing test 6 output file!"; exit 1; }
-	test -s output/test6/sample2/TESTX_S1_L002.preprocessed.bam || { echo "Missing test 6 output file!"; exit 1; }
-	test -s output/test6/sample2/TESTX_S1_L002.preprocessed.bai || { echo "Missing test 6 output file!"; exit 1; }
-	test -s output/test7/sample1/TESTX_S1_L001.preprocessed.bam || { echo "Missing test 7 output file!"; exit 1; }
-	test -s output/test7/sample1/TESTX_S1_L001.preprocessed.bai || { echo "Missing test 7 output file!"; exit 1; }
-	test -s output/test7/sample2/TESTX_S1_L002.preprocessed.bam || { echo "Missing test 7 output file!"; exit 1; }
-	test -s output/test7/sample2/TESTX_S1_L002.preprocessed.bai || { echo "Missing test 7 output file!"; exit 1; }
-	test -s output/test8/sample1/TESTX_S1_L001.preprocessed.bam || { echo "Missing test 8 output file!"; exit 1; }
-	test -s output/test8/sample1/TESTX_S1_L001.preprocessed.bai || { echo "Missing test 8 output file!"; exit 1; }
-	test -s output/test8/sample1/metrics/TESTX_S1_L001.prepared.dedup.hs_metrics.txt || { echo "Missing test 8 output file!"; exit 1; }
-	test -s output/test8/sample1/metrics/TESTX_S1_L001.prepared.dedup_metrics.txt || { echo "Missing test 8 output file!"; exit 1; }
-	test -s output/test8/sample2/TESTX_S1_L002.preprocessed.bam || { echo "Missing test 8 output file!"; exit 1; }
-	test -s output/test8/sample2/TESTX_S1_L002.preprocessed.bai || { echo "Missing test 8 output file!"; exit 1; }
-	test -s output/test9/TESTX_S1_L001/TESTX_S1_L001.preprocessed.bam || { echo "Missing test 9 output file!"; exit 1; }
-	test -s output/test9/TESTX_S1_L001/TESTX_S1_L001.preprocessed.bai || { echo "Missing test 9 output file!"; exit 1; }
+	bash tests/test_00.sh
+	bash tests/test_01.sh
+	bash tests/test_02.sh
+	bash tests/test_03.sh
+	bash tests/test_04.sh
+	bash tests/test_05.sh
+	bash tests/test_06.sh
+	bash tests/test_07.sh
+	bash tests/test_08.sh
+	bash tests/test_09.sh
+	bash tests/test_10.sh
+	bash tests/test_11.sh
diff --git a/README.md b/README.md
@@ -1,22 +1,36 @@
 # TronFlow BAM preprocessing pipeline
 
+![GitHub tag (latest SemVer)](https://img.shields.io/github/v/release/tron-bioinformatics/tronflow-bam-preprocessing?sort=semver)
+[![Automated tests](https://github.com/TRON-Bioinformatics/tronflow-bam-preprocessing/actions/workflows/automated_tests.yml/badge.svg)](https://github.com/TRON-Bioinformatics/tronflow-bam-preprocessing/actions/workflows/automated_tests.yml)
 [![DOI](https://zenodo.org/badge/358400957.svg)](https://zenodo.org/badge/latestdoi/358400957)
+[![License](https://img.shields.io/badge/license-MIT-green)](https://opensource.org/licenses/MIT)
+[![Powered by Nextflow](https://img.shields.io/badge/powered%20by-Nextflow-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](https://www.nextflow.io/)
 
-Nextflow (Di Tommaso, 2017) pipeline for the preprocessing of BAM files based on Picard and GATK (DePristo, 2011).
+The TronFlow BAM preprocessing pipeline is part of a collection of computational workflows for tumor-normal pair 
+somatic variant calling. These workflows are implemented in the Nextflow (Di Tommaso, 2017) framework.
+
+Find the documentation here [![Documentation Status](https://readthedocs.org/projects/tronflow-docs/badge/?version=latest)](https://tronflow-docs.readthedocs.io/en/latest/?badge=latest)
+
+
+The aim of this workflow is to preprocess BAM files based on Picard and GATK (DePristo, 2011) best practices.
 
 
 ## Background
 
-In order to have a variant calling ready BAM file there are a number of operations that need to be applied on the BAM. This pipeline depends on the particular variant caller, but there are some common operations.
+In order to have a variant calling ready BAM file there are a number of operations that need to be applied on the BAM. 
+This pipeline depends on the particular variant caller, but there are some common operations.
 
-GATK has been providing a well known best practices document on BAM preprocessing, the latest best practices for GATK4 (https://software.broadinstitute.org/gatk/best-practices/workflow?id=11165) does not perform anymore realignment around indels as opposed to best practices for GATK3 (https://software.broadinstitute.org/gatk/documentation/article?id=3238). This pipeline is based on both Picard and GATK. These best practices have been implemented a number of times, see for instance this implementation in Workflow Definition Language https://github.com/gatk-workflows/gatk4-data-processing/blob/master/processing-for-variant-discovery-gatk4.wdl.
+GATK has been providing a well known best practices document on BAM preprocessing, the latest best practices for 
+GATK4 (https://software.broadinstitute.org/gatk/best-practices/workflow?id=11165) does not perform anymore realignment around indels as opposed to best practices for GATK3 (https://software.broadinstitute.org/gatk/documentation/article?id=3238). This pipeline is based on both Picard and GATK. These best practices have been implemented a number of times, see for instance this implementation in Workflow Definition Language https://github.com/gatk-workflows/gatk4-data-processing/blob/master/processing-for-variant-discovery-gatk4.wdl.
 
 
 ## Objectives
 
-We aim at providing a single implementation of the BAM preprocessing pipeline that can be used across different situations. For this purpose there are some required steps and some optional steps. This is implemented as a Nextflow pipeline to simplify parallelization of execution in the cluster. The default configuration uses reference genome hg19, if another reference is needed the adequate resources must be provided. The reference genome resources  for hg19 were downloaded from https://software.broadinstitute.org/gatk/download/bundle
+We aim at providing a single implementation of the BAM preprocessing pipeline that can be used across different 
+use cases. 
+For this purpose there are some required steps and some optional steps.  
 
-The input is a tab-separated values file where each line corresponds to one input BAM. The output is another tab-separated values file with the absolute paths of the preprocessed and indexed BAMs. 
+The input can be either a tab-separated values file (`--input_files`) where each line corresponds to one input BAM or a single BAM (`--input_bam` and `--input_name`).
 
 ## Implementation
 
@@ -28,19 +42,10 @@ Steps:
 * **Mark duplicates** (optional). Identify the PCR and the optical duplications and marks those reads. This uses the parallelized version on Spark, it is reported to scale linearly up to 16 CPUs.
 * **Realignment around indels** (optional). This procedure is important for locus based variant callers, but for any variant caller doing haplotype assembly it is not needed. This is computing intensive as it first finds regions for realignment where there are indication of indels  and then it performs a local realignment over those regions. Implemented in GATK3, deprecated in GATK4
 * **Base Quality Score Recalibration (BQSR)** (optional). It aims at correcting systematic errors in the sequencer when assigning the base call quality errors, as these scores are used by variant callers it improves variant calling in some situations. Implemented in GATK4
-* **Metrics** (optional). A number of metrics are obtained over the BAM file with Picard's CollectMetrics (eg: duplication, insert size, alignment, etc.).
+* **Metrics** (optional). A number of metrics are obtained from the BAM file with Picard's CollectMetrics, CollectHsMetrics and samtools' coverage and depth.
 
 ![Pipeline](figures/bam_preprocessing2.png)
 
-## References
-
-The bam preprocessing workflow requires the human reference genome (`--reference`)
-Base Quality Score Recalibration (BQSR) requires dbSNP to avoid extracting error metrics from polymorphic sites (`--dbsnp`)
-Realignment around indels requires a set of known indels (`--known_indels1` and `--known_indels2`).
-These resources can be fetched from the GATK bundle https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle.
-
-Optionally, in order to run Picard's CollectHsMetrics an intervals file will need to be provided (`--intervals`). 
-This can be built from a BED file using Picard's BedToIntervalList (https://gatk.broadinstitute.org/hc/en-us/articles/360036883931-BedToIntervalList-Picard-)
 
 ## How to run it
 
@@ -55,21 +60,21 @@ Usage:
 
 Input:
     * --input_bam: the path to a single BAM (this option is not compatible with --input_files)
-    * --input_files: the path to a tab-separated values file containing in each row the sample name, sample type (eg: tumor or normal) and path to the BAM file (this option is not compatible with --input_bam)
+    * --input_files: the path to a tab-separated values file containing in each row the sample name, sample type (eg: tumor or normal) and path to the BAM file. The sample name should be unique for each bam file. (this option is not compatible with --input_bam)
     Sample type will be added to the BAM header @SN sample name
     The input file does not have header!
     Example input file:
-    name1       tumor   tumor.1.bam
-    name1       normal  normal.1.bam
-    name2       tumor   tumor.2.bam
+    name1_t   tumor   tumor.1.bam
+    name1_n   normal  normal.1.bam
+    name2_t   tumor   tumor.2.bam
     * --reference: path to the FASTA genome reference (indexes expected *.fai, *.dict)
 
 Optional input:
     * --input_name: the name of the sample. Only used when --input_bam is provided (default: normal)
     * --dbsnp: path to the dbSNP VCF (required to perform BQSR)
     * --known_indels1: path to a VCF of known indels (optional to perform realignment around indels)
     * --known_indels2: path to a second VCF of known indels (optional to perform realignment around indels)
-    * --intervals: path to an intervals file to collect HS metrics from, this can be built with Picard's BedToIntervalList (default: None)
+    * --intervals: path to a BED file to collect coverage and HS metrics from (default: None)
     * --collect_hs_minimum_base_quality: minimum base quality for a base to contribute coverage (default: 20).
     * --collect_hs_minimum_mapping_quality: minimum mapping quality for a read to contribute coverage (default: 20).
     * --skip_bqsr: optionally skip BQSR (default: false)
@@ -98,12 +103,46 @@ Computational resources:
 
 Optional output:
     * Recalibration report
+    * Deduplication metrics
     * Realignment intervals
-    * Metrics
+    * GATK multiple metrics
+    * HS metrics
+    * Horizontal and vertical coverage metrics
 ```
 
+### Input table
+
+The table with FASTQ files expects two tab-separated columns **without a header**
+
+| Sample name          | Sample type                      | BAM                  |
+|----------------------|---------------------------------|------------------------------|
+| sample_1             | normal      |    /path/to/sample_1.normal.bam   |
+| sample_1             | tumor      |    /path/to/sample_1.tumor.bam   |
+| sample_2             | normal      |    /path/to/sample_2.normal.bam   |
+| sample_2             | tumor      |    /path/to/sample_2.tumor.bam   |
+
+The values used in `sample type` are arbitrary. These will be set in the BAM header tag @RG:SM for sample. There may be some downstream constraints, eg: Mutect2 pipeline requires that the sample type between normal and tumor samples of the same pair are not the same.
+
+### References
+
+The BAM preprocessing workflow requires the human reference genome (`--reference`)
+Base Quality Score Recalibration (BQSR) requires dbSNP to avoid extracting error metrics from polymorphic sites (`--dbsnp`)
+Realignment around indels requires a set of known indels (`--known_indels1` and `--known_indels2`).
+These resources can be fetched from the GATK bundle https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle.
+
+Optionally, in order to run Picard's CollectHsMetrics a BED file will need to be provided (`--intervals`).
+This BED file will also be used for `samtools coverage`.
+
+## Troubleshooting
+
+### Too new Java version for MarkDuplicatesSpark
+
+When using Java 11 the cryptic error messsage `java.lang.IllegalArgumentException: Unsupported class file major version 55` has been observed.
+This issue is described here and the solution is to use Java 8 https://gatk.broadinstitute.org/hc/en-us/community/posts/360056174592-MarkDuplicatesSpark-crash.
+
+
 
-## References
+## Bibliography
 
 * DePristo M, Banks E, Poplin R, Garimella K, Maguire J, Hartl C, Philippakis A, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell T, Kernytsky A, Sivachenko A, Cibulskis K, Gabriel S, Altshuler D, Daly M. (2011). A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet, 43:491-498. DOI: 10.1038/ng.806.
 * Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature Biotechnology, 35(4), 316–319. 10.1038/nbt.3820
diff --git a/environment.yml b/environment.yml