diff --git a/.nf-core.yml b/.nf-core.yml index 3805dc8..5085ace 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1 +1,4 @@ repository_type: pipeline +lint: + multiqc_config: + report_comment: False diff --git a/CITATIONS.md b/CITATIONS.md index 6411d58..fc47df4 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -12,7 +12,7 @@ - [BWA](https://www.ncbi.nlm.nih.gov/pubmed/19451168/) -> Li H, Durbin R. Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics. 2009 Jul 15;25(14):1754-60. doi: 10.1093/bioinformatics/btp324. Epub 2009 May 18. PubMed PMID: 19451168; PubMed Central PMCID: PMC2705234. + > Li H, Durbin R. Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics. 2009 Jul 15;25(14):1754-60. doi: 10.1093/bioinformatics/btp324. Epub 2009 May 18. PubMed PMID: 19451168; PubMed Central PMCID: PMC2705234. - [deepTools](https://www.ncbi.nlm.nih.gov/pubmed/27079975/) @@ -32,7 +32,9 @@ > Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup. The Sequence Alignment/Map format and SAMtools. Bioinformatics. 2009 Aug 15;25(16):2078-9. doi: 10.1093/bioinformatics/btp352. Epub 2009 Jun 8. PubMed PMID: 19505943; PubMed Central PMCID: PMC2723002. -- [Trim Galore!](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/) +- [Trimmomatic](https://pubmed.ncbi.nlm.nih.gov/24695404/) + + > Bolger AM, Lohse M, Usadel B. Trimmomatic: a flexible trimmer for Illumina sequence data. Bioinformatics. 2014 Aug 1;30(15):2114-20. doi: 10.1093/bioinformatics/btu170. Epub 2014 Apr 1. PMID: 24695404; PMCID: PMC4103590. ## R packages diff --git a/README.md b/README.md index 1f2b3fa..340504e 100644 --- a/README.md +++ b/README.md @@ -46,9 +46,6 @@ to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/i with `-profile test` before running the workflow on actual data. ::: - - First, prepare a samplesheet with your input data that looks as follows: `samplesheet.csv`: @@ -64,11 +61,10 @@ Each row represents a fastq file (single-end) or a pair of fastq files (paired e Now, you can run the pipeline using: - - ```bash nextflow run nf-core/sammyseq \ -profile \ + --fasta reference_genome.fa \ --input samplesheet.csv \ --outdir ``` @@ -78,19 +74,12 @@ or ```bash nextflow run nf-core/sammyseq \ -profile \ + --fasta reference_genome.fa \ --input samplesheet.csv \ --outdir \ --comparisonFile comparisons.csv ``` -`comparisons.csv`: - -```csv -sample1,sample2 -CTRL004_S2,CTRL004_S3 -CTRL004_S2,CTRL004_S4 -``` - :::warning Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 541c831..7dfa5ed 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,8 +1,8 @@ report_comment: > - This report has been generated by the nf-core/sammyseq + This report has been generated by the nf-core/sammyseq analysis pipeline. For information about how to interpret these results, please see the - documentation. + documentation. report_section_order: "nf-core-sammyseq-methods-description": diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 5f653ab..3bc0416 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,3 @@ -sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, +sample,fastq_1,fastq_2,experimentalID,fraction +SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz,SAMPLE_PAIRED_END_EXPID,S2 +SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz,,SAMPLE_SINGLE_END_EXPID,S2 diff --git a/conf/base.config b/conf/base.config index f0a8691..05cea2b 100644 --- a/conf/base.config +++ b/conf/base.config @@ -10,7 +10,7 @@ process { - // TODO nf-core: Check the defaults for all processes + // nf-core: Check the defaults for all processes cpus = { check_max( 1 * task.attempt, 'cpus' ) } memory = { check_max( 6.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } @@ -24,7 +24,7 @@ process { // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. // If possible, it would be nice to keep the same label naming convention when // adding in your local modules too. - // TODO nf-core: Customise requirements for specific processes. + // nf-core: Customise requirements for specific processes. // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_single { cpus = { check_max( 1 , 'cpus' ) } diff --git a/conf/modules.config b/conf/modules.config index ef99c6d..715dc71 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -26,6 +26,14 @@ process { ] } + withName : ".*PREPARE_GENOME:.*" { + publishDir = [ + path: { "${params.outdir}/genome" }, + mode: params.publish_dir_mode, + enabled: params.save_reference + ] + } + withName: FASTQC { ext.args = '--quiet' } @@ -58,20 +66,20 @@ process { // Alignment, Picard MarkDuplicates and Filtering options // - withName: '.*FASTQ_ALIGN_BWAALN:BWA_.*' { + withName: '.*FASTQ_ALIGN_BWAALN:.*' { publishDir = [ [ - path: { "${params.outdir}/BWA" }, + path: { "${params.outdir}/alignment/bwa" }, mode: params.publish_dir_mode, - pattern: '*.bam*', + pattern: '*.{bam,bai}', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] ] } - withName: '.*BAM_MARKDUPLICATES_PICARD:PICARD_MARKDUPLICATES' { - ext.args = '--ASSUME_SORTED true --REMOVE_DUPLICATES true --VALIDATION_STRINGENCY LENIENT --TMP_DIR tmp' + withName: '.*BAM_MARKDUPLICATES_PICARD:PICARD_MARKDUPLICATES.*' { + ext.args = '--ASSUME_SORTED true --REMOVE_DUPLICATES false --VALIDATION_STRINGENCY LENIENT --TMP_DIR tmp' ext.prefix = { "${meta.id}.md" } publishDir = [ [ @@ -81,7 +89,7 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ], [ - path: { "${params.outdir}/markduplicates/bam" }, + path: { "${params.outdir}/alignment/markduplicates" }, mode: params.publish_dir_mode, pattern: '*.md.{bam,bai}', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, @@ -90,10 +98,10 @@ process { ] } - withName: '.*BAM_MARKDUPLICATES_PICARD:SAMTOOLS_INDEX' { + withName: '.*BAM_MARKDUPLICATES_PICARD:SAMTOOLS_INDEX.*' { ext.prefix = { "${meta.id}.markdup.sorted" } publishDir = [ - path: { "${params.outdir}/markduplicates/bam" }, + path: { "${params.outdir}/alignment/markduplicates" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, pattern: '*.{bai,csi}' @@ -110,4 +118,39 @@ process { ] } + withName : ".*SAMTOOLS_FAIDX.*" { + publishDir = [ + path: { "${params.outdir}/genome" }, + mode: params.publish_dir_mode, + ] + } + + withName : ".*DEEPTOOLS_BAMCOVERAGE.*" { + + publishDir = [ + path: { "${params.outdir}/single_tracks/deeptools" }, + mode: params.publish_dir_mode, + pattern: '*.bigWig' + ] + } + + withName : ".*RTWOSAMPLESMLE.*" { + publishDir = [ + path: { "${params.outdir}/comparisons/spp_mle" }, + mode: params.publish_dir_mode, + ] + } + + +} + +if (params.blackListFile != null) { + process { + withName: '.*DEEPTOOLS_BAMCOVERAGE.*' { + ext.args = "–blackListFileName ${params.blackListFile}" + } + } } + + + diff --git a/conf/test.config b/conf/test.config index ed15af0..74c4282 100644 --- a/conf/test.config +++ b/conf/test.config @@ -23,5 +23,5 @@ params { input = 'https://genome.isasi.cnr.it/biocomp/test-datasets/sammyseq/testdata/chr22/samplesheet_test_github_chr22_tinier.csv' // Genome references - fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + fasta = 'https://genome.isasi.cnr.it/biocomp/test-datasets/sammyseq/testdata/chr22/chr22.fa' } diff --git a/docs/output.md b/docs/output.md index 0a7fdf3..4d92099 100644 --- a/docs/output.md +++ b/docs/output.md @@ -6,17 +6,28 @@ This document describes the output produced by the pipeline. Most of the plots a The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. - - ## Pipeline overview The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [FastQC](#fastqc) - Raw read QC -- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline -- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +- [FastQC](#fastqc) +- [Trim reads](#trim-reads) +- [Alignment on Reference](#alignment-on-reference) +- [Mark Duplicate reads](#mark-duplicate-reads) +- [Signal track generation](#signal-track-generation) +- [Comparisons](#comparisons) +- [MultiQC](#multiqc) +- [Pipeline information](#pipeline-information) + +### Read quality check + +#### FastQC + +[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about the sequenced reads. It provides information about the quality score distribution across reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). -### FastQC +#### Trim reads + +[`Trimmomatic`](http://www.usadellab.org/cms/?page=trimmomatic) is a software used to trim adapter sequences and low quality bases from the end of reads and quality check after this step is performed again with Fastqc.
Output files @@ -24,23 +35,71 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - `fastqc/` - `*_fastqc.html`: FastQC report containing quality metrics. - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. + - `*_trim_fastqc.html`: FastQC report containing quality metrics for trimmed reads. + - `*_trim_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images for trimmed reads.
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +:::note +The FastQC plots displayed in the MultiQC report shows both _untrimmed_ and _trimmed_ reads so they can be directly compared. +::: + +### Alignment on Reference -![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png) +The alignment is performed using [BWA](https://github.com/lh3/bwa) and the aligned reads are then sorted by chromosome coordinates with [samtools](https://www.htslib.org/doc/samtools.html). -![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png) +
+Output files -![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png) +- `alignment/bwa/` + - `.bam` and `.bam.bai` -:::note -The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. -::: +
+ +### Mark Duplicate reads + +Read pairs that are likely to have originated from duplicates of the same original DNA fragments through some artificial processes are identified. These are considered to be non-independent observations, so all but a single read pair within each set of duplicates are marked, not removed from the bam file. + +
+Output files + +- `alignment/markduplicates/` + - `.md.bam` and `.md.bam.bai` +- `reports/markduplicates/` + - `.md.MarkDuplicates.metrics.txt` + +
+ +### Signal track generation + +[deepTools](https://deeptools.readthedocs.io/en/develop/content/list_of_tools.html) is used to generate single fraction signals in [bigWig](https://genome.ucsc.edu/goldenpath/help/bigWig.html) format, an indexed binary format useful for displaying dense, continuous data in Genome Browsers such as the [UCSC](https://genome.ucsc.edu/cgi-bin/hgTracks) and [IGV](http://software.broadinstitute.org/software/igv/). The bigWig format is also supported by various bioinformatics software for downstream processing such as meta-profile plotting. + +
+Output files + +- `single_tracks/deeptools/` + - `.bigWig` + +
+ +### Comparisons + +When `--comparisonFile` is set, the difference between sample1 and sample2 read density profile smoothed by the Gaussian kernel is calculated and saved in bigwig format, as described in Kharchenko PK, Tolstorukov MY, Park PJ "Design and analysis of ChIP-seq experiments for DNA-binding proteins" Nat. Biotech. doi:10.1038/nbt.1508 + +
+Output files + +- `comparisons/spp_mle/` + - `.md_VS_.md.bw` + +
### MultiQC +[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. + +Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see . +
Output files @@ -51,12 +110,22 @@ The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They m
-[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. +### Reference genome files -Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see . +A number of genome-specific files if required by some of the analysis steps. If the `--save_reference` parameter is provided then the alignment indices generated by the pipeline will be saved in this directory. + +
+Output files + +- `genome/` + - `bwa/`: Directory containing BWA indices. + +
### Pipeline information +[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. +
Output files @@ -67,5 +136,3 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ - Parameters used by the pipeline run: `params.json`.
- -[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. diff --git a/docs/usage.md b/docs/usage.md index 30ecc2c..5cca560 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -6,31 +6,79 @@ ## Introduction - +sammyseqis a workflow designed for the analysis of Sequential Analysis of MacroMolecules accessibilitY sequencing (SAMMY-seq) data, a cheap and effective methodology to analyze chromatin state in cells. SAMMY-seq is an innovative technique based on the separation of chromatin in fractions, each progressively based on their solubility and accessibility, and extraction and sequencing of the DNA present in each of them. + +## Running the pipeline + +The typical command for running the pipeline is as follows: + +```bash +nextflow run nf-core/sammyseq -r dev \ + -profile docker \ + --fasta ./reference_genome.fa\ + --input ./samplesheet.csv \ + --outdir ./results +``` + +This will launch the pipeline with the `docker` configuration profile. See [below](#profile) for more information about profiles. + +Note that the pipeline will create the following files in your working directory: + +```bash +work # Directory containing the nextflow working files + # Finished results in specified location (defined with --outdir) +.nextflow_log # Log file from Nextflow +# Other nextflow hidden files, eg. history of pipeline runs and old logs. +``` + +If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file. + +Pipeline settings can be provided in a `yaml` or `json` file via `-params-file `. + +:::warning +Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args). +::: + +The above pipeline run specified with a params file in yaml format: + +```bash +nextflow run nf-core/sammyseq -profile docker -params-file params.yaml +``` + +with `params.yaml` containing: + +```yaml +input: './samplesheet.csv' +outdir: './results/' +fasta: './genome.fa' +<...> +``` + +You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). ## Samplesheet input -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. +Before running the pipeline, you will need to create a samplesheet with information about the samples you would like to analyze. Use this parameter to specify its location: ```bash ---input '[path to samplesheet file]' +--input '[full path to samplesheet file]' ``` +It has to be a comma-separated file with 5 columns, and a header row as shown in the examples below. + ### Multiple runs of the same sample -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 2 lanes: +The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample fraction sequenced across 2 lanes: ```console sample,fastq_1,fastq_2,experimentalID,fraction -CONTROL_REP1_S2,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz,CONTROL,S2 -CONTROL_REP1_S2,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz,CONTROL,S2 +CONTROL_REP1_S2,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz,CONTROL_REP1,S2 +CONTROL_REP1_S2,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz,CONTROL_REP1,S2 ``` ### Full samplesheet -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 5 columns to match those defined in the table below. - - +The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can contain a mixture of single- and paired-end but in case of multiple runs of the same `sample` they have to be of the same type to be correctly merged. There can be additional columns but the first 5 have to match those defined in the table below. ```console ssample,fastq_1,fastq_2,experimentalID,fraction @@ -45,55 +93,37 @@ CTRL004_S4,/home/sammy/test_data/CTRL004_S4_chr22only.fq.gz,,CTRL004,S4 | `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | | `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | | `experimentalID` | Experimental sample identifier. This represents the biological specimen of interest and will be the same for all fractions exctracted. | -| `fraction` | Fraction derived from SAMMY protocol, e.g. depending on the protocol it can be S2, S2L, S2S, S3, S4 . | - -An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. - -## Running the pipeline +| `fraction` | Fraction derived from SAMMY protocol, e.g. depending on the protocol it can be S2, S2L, S2S, S3, S4. | -The typical command for running the pipeline is as follows: +An [example samplesheet](../tests/samplesheet_test.csv) has been provided with the pipeline. -```bash -nextflow run nf-core/sammyseq --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker -``` +### Pairwise comparisons -This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. +It is possible to generate pairwise comparisons between two samples by providing a list with the parameter `--comparisonFile` to indicate the full path to a comma-separated file with 2 columns: -Note that the pipeline will create the following files in your working directory: +`comparisons.csv`: -```bash -work # Directory containing the nextflow working files - # Finished results in specified location (defined with --outdir) -.nextflow_log # Log file from Nextflow -# Other nextflow hidden files, eg. history of pipeline runs and old logs. +```csv +sample1,sample2 +CTRL004_S2,CTRL004_S3 +CTRL004_S2,CTRL004_S4 ``` -If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file. +It can contain any combination of sample identifiers, they have to correspond to identifiers present in the `sample` column in the input file. When `--comparisonFile` is set, the difference between sample1 and sample2 read density profile, smoothed by the Gaussian kernel, is calculated and saved in bigwig format, as described in Kharchenko PK, Tolstorukov MY, Park PJ "Design and analysis of ChIP-seq experiments for DNA-binding proteins" Nat. Biotech. doi:10.1038/nbt.1508 -Pipeline settings can be provided in a `yaml` or `json` file via `-params-file `. +### Combine fractions -:::warning -Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args). -::: - -The above pipeline run specified with a params file in yaml format: +Optionally, the fractions extracted from the same `experimentalID` can be combined together for later use by setting the parameter `--combine_fractions`. -```bash -nextflow run nf-core/sammyseq -profile docker -params-file params.yaml -``` +## Reference genome files -with `params.yaml` containing: +The minimum reference genome requirements is the FASTA file, provided with the mandatory parameter `--fasta`, the bwa index will be generated by the pipeline and can be saved for later reuse if the `--save_reference` parameter is passed. The index building step can be quite a time-consuming process and it permits their reuse for future runs of the pipeline to save disk space, if already present it can be passed using the `--bwa_index '/path/to/bwa/index/'` parameter. -```yaml -input: './samplesheet.csv' -outdir: './results/' -genome: 'GRCh37' -<...> -``` +### Blacklist bed files -You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). +A blacklist of regions that will be excluded by signal tracks can be provided using the optional parameter `--blackListFile` with full path to a coordinate file in bed format. Blacklist files for several genome builds can be found in the [ENCODE Blacklist Project](https://github.com/Boyle-Lab/Blacklist). -### Updating the pipeline +## Updating the pipeline When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: @@ -101,7 +131,7 @@ When you run the above command, Nextflow automatically pulls the pipeline code f nextflow pull nf-core/sammyseq ``` -### Reproducibility +## Reproducibility It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. diff --git a/main.nf b/main.nf index 615706d..f1fd011 100644 --- a/main.nf +++ b/main.nf @@ -17,7 +17,6 @@ nextflow.enable.dsl = 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// TODO nf-core: Remove this line if you don't need a FASTA file // This is an example of how to use getGenomeAttribute() to fetch parameters // from igenomes.config using `--genome` params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta') diff --git a/modules/local/rtwosamplesmle/main.nf b/modules/local/rtwosamplesmle/main.nf index 2161cc5..a5d1e55 100644 --- a/modules/local/rtwosamplesmle/main.nf +++ b/modules/local/rtwosamplesmle/main.nf @@ -48,11 +48,12 @@ process RTWOSAMPLESMLE { path chromsizes_file - //output: + output: // TODO nf-core: Named file extensions MUST be emitted for ALL output channels //tuple val(meta), path("*.bam"), emit: bam // TODO nf-core: List additional required output channels/values here //path "versions.yml" , emit: versions + tuple val(meta), path("*_mle.bw") , emit: results when: task.ext.when == null || task.ext.when diff --git a/modules/local/rtwosamplesmle/templates/two_samples_mle.R b/modules/local/rtwosamplesmle/templates/two_samples_mle.R index 5eb2b3e..805e72a 100644 --- a/modules/local/rtwosamplesmle/templates/two_samples_mle.R +++ b/modules/local/rtwosamplesmle/templates/two_samples_mle.R @@ -14,15 +14,15 @@ require(rtracklayer) # FUNCTIONS ################################################ -sortbychr <- function(x, chrcol="chr", stcol="start", endcol=NULL, chrorder=paste("chr", c(seq(22), "X", "Y"), sep="")) { - if (!(is.null(endcol))) { - x <- x[order(x[,endcol]),] - } - x <- x[order(x[,stcol]),] - chrs <- ordered(x[,chrcol], levels=chrorder) - # chrs <- ordered(tmp, levels=paste("chr", c(seq(22), "X", "Y"), sep="")) - x <- x[order(chrs),] -} +# sortbychr <- function(x, chrcol="chr", stcol="start", endcol=NULL, chrorder=paste("chr", c(seq(22), "X", "Y"), sep="")) { +# if (!(is.null(endcol))) { +# x <- x[order(x[,endcol]),] +# } +# x <- x[order(x[,stcol]),] +# chrs <- ordered(x[,chrcol], levels=chrorder) +# # chrs <- ordered(tmp, levels=paste("chr", c(seq(22), "X", "Y"), sep="")) +# x <- x[order(chrs),] +# } ################################################ ## PARAMS @@ -36,8 +36,7 @@ sortbychr <- function(x, chrcol="chr", stcol="start", endcol=NULL, chrorder=past ip_file <- "${bam1}" input_file <- "${bam2}" chromsizes_file <- "${chromsizes_file}" -mle_output_file <- "${bam1.baseName}VS${bam2.baseName}_mle.txt" -remove_anomalies <- FALSE +mle_output_file <- "${bam1.baseName}_VS_${bam2.baseName}_mle.bw" ################################################ # DEFAULT PARAMETERS @@ -45,6 +44,7 @@ remove_anomalies <- FALSE remove_anomalies <- FALSE debug_mode <- TRUE +stebp = 50 ################################################ # CORE PROCESSES @@ -66,6 +66,7 @@ chromsizes <- fread(chromsizes_file, data.table = FALSE) #chromsizes <- sortbychr(chromsizes, chrcol="V1", stcol="V2", chrorder=paste("chr", c(seq(19), "X", "Y"), sep="")) #chrs <- as.character(sub('chr', '', chromsizes[, 1])) chrs <- chromsizes[, 1] +rownames(chromsizes) <- chrs ### Import the data @@ -75,31 +76,40 @@ input <- read.bam.tags(input_file) ### Select the informative tags ip_tags <- ip[['tags']] input_tags <- input[['tags']] + print("second part") ### Remove the tags with anomalies if(remove_anomalies==TRUE){ ip_rm <- remove.local.tag.anomalies(ip_tags[chrs]) input_rm <- remove.local.tag.anomalies(input_tags[chrs]) }else{ - ip_rm <- ip_tags - input_rm <- input_tags + ip_rm <- ip_tags[chrs] + input_rm <- input_tags[chrs] } print("third part") ### Make the smoothing with the loglikelihood function -print("lane 90") +print("line 90") #print(paste0("ip_rm=",head(ip_rm))) #print(paste0("input_rm=",head(ip_rm))) -chip_smoothed_mle <- get.smoothed.enrichment.mle(ip_rm, input_rm, tag.shift = 0, background.density.scaling = FALSE) -print("lane 91") +# chip_smoothed_mle <- get.smoothed.enrichment.mle(ip_rm, input_rm, tag.shift = 0, background.density.scaling = FALSE) +ip_chrs <- names(ip_rm)[sapply(ip_rm, length) > 10] +input_chrs <- names(input_rm)[sapply(input_rm, length) > 10] +chr2keep <- intersect(ip_chrs, input_chrs) +# length(chr2keep) + +chip_smoothed_mle <- get.smoothed.enrichment.mle(ip_rm[chr2keep], input_rm[chr2keep], tag.shift = 0, background.density.scaling = FALSE, step = stebp) + +print("line 91") #chip_smoothed_mle <- get.smoothed.enrichment.mle(ip_tags, input_tags, tag.shift = 0, background.density.scaling = FALSE) ccl <- sapply(chip_smoothed_mle, nrow) cpos <- unlist(sapply(chip_smoothed_mle, function(csm) csm\$x)) +cend <- cpos + stebp - 1 ccov <- unlist(sapply(chip_smoothed_mle, function(csm) csm\$y)) print("line 97") -gr.mle <- GRanges(seqnames = rep(names(chip_smoothed_mle), ccl), ranges = IRanges(start = cpos, end = cpos), strand = Rle('*', sum(ccl)), score = ccov) -seqlengths(gr.mle) <- chromsizes[, 2] +gr.mle <- GRanges(seqnames = rep(names(chip_smoothed_mle), ccl), ranges = IRanges(start = cpos, end = cend), strand = Rle('*', sum(ccl)), score = ccov) +seqlengths(gr.mle) <- chromsizes[chr2keep, 2] #rename score col col_names <- names(mcols(gr.mle)) diff --git a/nextflow.config b/nextflow.config index 5140b09..ded694b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -9,9 +9,6 @@ // Global default params, used in configs params { - - - // TODO nf-core: Specify your pipeline's command line flags // Input options input = null comparisonFile = null @@ -19,11 +16,11 @@ params { // References genome = null igenomes_base = 's3://ngi-igenomes/igenomes' - igenomes_ignore = false + igenomes_ignore = true bwa_index = null save_reference = false - - //overall analysis errorStrategy + + //overall analysis options stopAt = '' combine_fractions = false @@ -34,6 +31,9 @@ params { max_multiqc_email_size = '25.MB' multiqc_methods_description = null + // bam generation + blackListFile = null + // Boilerplate options outdir = null publish_dir_mode = 'copy' @@ -52,7 +52,7 @@ params { custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" config_profile_contact = null config_profile_url = null - + // Max resource options // Defaults only, expecting to be overwritten @@ -238,7 +238,7 @@ manifest { description = """analysis pipeline for SAMMY-seq""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '0.01' + version = '0.01dev' doi = '' } diff --git a/nextflow_schema.json b/nextflow_schema.json index 86cb4cd..dbc2628 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -15,7 +15,21 @@ "stopAt": { "type": "string", "description": "Specify after which step the pipeline should stop.", - "fa_icon": "fas fa-stop-circle" + "fa_icon": "fas fa-stop-circle", + "hidden": true, + "enum": [ + "BEGIN", + "PREPARE_GENOME", + "INPUT_CHECK", + "CAT_FASTQ_lane", + "CAT_FRACTIONS", + "TRIMMOMATIC", + "FASTQ_ALIGN_BWAALN", + "SAMTOOLS_FAIDX", + "BAM_MARKDUPLICATES_PICARD", + "DEEPTOOLS_BAMCOVERAGE", + "RTWOSAMPLESMLE" + ] }, "input": { "type": "string", @@ -24,7 +38,7 @@ "mimetype": "text/csv", "pattern": "^\\S+\\.csv$", "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/sammyseq/usage#samplesheet-input).", + "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 5 columns, and a header row. See [usage docs](https://nf-co.re/sammyseq/usage#samplesheet-input).", "fa_icon": "fas fa-file-csv" }, "outdir": { @@ -48,14 +62,16 @@ "comparisonFile": { "type": "string", "format": "file-path", - "description": "Path to comma-separated file containing information about samples comparisons", + "description": "Path to comma-separated file containing the sample names for the desired paired comparisons", "mimetype": "text/csv", - "help_text": "You will need to create a comparisons file with information about the comparisons in your experiment before running the pipeline. Use this parameter to specify the necessary comparisons. It has to be a comma-separated file with 2 columns, one for the first sample to compair and the second one to sample against which to do it." + "help_text": "You will need to create a comparisons file with information about the desired comparisons in your experiment before running the pipeline. It has to be a comma-separated file with 2 columns: one for the first sample to compare and the second one to sample against which to do it. The sample names must correspond to sample names present in the first column of the samplesheet (--input parameter)", + "fa_icon": "fas fa-file-csv" }, "combine_fractions": { "type": "boolean", "description": "when combined, the fastqs of individual fractions from the same samples will be merged into a new fastq.", - "help_text": "the combined fastq file can be used in order to obtain information on the whole sample or to perform additional analyses not included" + "help_text": "the combined fastq file can be used in order to obtain information on the whole sample or to perform additional analyses not included", + "fa_icon": "fas fa-merge" } } }, @@ -64,6 +80,7 @@ "type": "object", "fa_icon": "fas fa-dna", "description": "Reference genome related files and options required for the workflow.", + "required": ["fasta"], "properties": { "genome": { "type": "string", @@ -99,7 +116,15 @@ "description": "Do not load the iGenomes reference config.", "fa_icon": "fas fa-ban", "hidden": true, - "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." + "default": true, + "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`. You can run `sammyseq` by specifying at least a FASTA genome file." + }, + "blackListFile": { + "type": "string", + "format": "file-path", + "fa_icon": "fas fa-ban", + "description": "A BED or GTF file containing regions that should be excluded from all analyses", + "help_text": "Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant" } } }, diff --git a/subworkflows/local/cat_fractions.nf b/subworkflows/local/cat_fractions.nf new file mode 100644 index 0000000..c752644 --- /dev/null +++ b/subworkflows/local/cat_fractions.nf @@ -0,0 +1,50 @@ +// Import SAMTOOLS_FAIDX module +include { CAT_FASTQ } from '../../modules/nf-core/cat/fastq/main' + +// Definition of the subworkflow +workflow CAT_FRACTIONS { + take: + //reads_to_merge + //all_reads + starting_files + + main: + + all_files = starting_files.map { meta, fastq -> + def expID = meta.expID // Prendi l'expID da meta + meta.remove('fraction') // Rimuovi il campo 'fraction' + [expID, meta, fastq] + } + + all_files.groupTuple( by:0 ) + .map{ expID, meta , fastq -> + meta2 = meta[0].clone() // Prendi solo la prima mappa e crea una copia + meta2.id = expID // Sostituisci il valore nel campo id con expID + [ [meta2], fastq.flatten().toList() ] + } + .set { reads_to_merge } + + reads_to_merge + .filter { meta, fastq -> fastq.size() > 1 } + .set { ch_reads_to_process_in_CAT_FASTQ } + + ch_reads_to_process_in_CAT_FASTQ + .flatMap { meta, fastq -> + meta.collect { m -> [m,fastq]} + } + .set { ch_to_CAT } + + //ch_to_CAT.view{"ch_to_CAT : ${it}"} + + CAT_FASTQ ( + ch_to_CAT + ).reads.set { cat_fastq_output } + + ch_cat_adjusted = CAT_FASTQ.out.reads.map { meta, fastq -> + return [meta, fastq] + } // make fastq of CAT_FASTQ a list of paths + merged_reads = starting_files.mix(ch_cat_adjusted) + + emit: + merged_reads = merged_reads +} \ No newline at end of file diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf index b39e726..eca6b1c 100644 --- a/subworkflows/local/prepare_genome.nf +++ b/subworkflows/local/prepare_genome.nf @@ -42,10 +42,11 @@ workflow PREPARE_GENOME { //println(ch_fasta) // Make fasta file available if reference saved or IGV is run //if (params.save_reference || !params.skip_igv) { - if (params.save_reference) { - file("${params.outdir}/genome/").mkdirs() - ch_fasta.copyTo("${params.outdir}/genome/") - } + + // if (params.save_reference) { + // file("${params.outdir}/genome/").mkdirs() + // ch_fasta.copyTo("${params.outdir}/genome/") + // } // diff --git a/tests/samplesheet_test_genome_isasi_chr22_tinier.csv b/tests/samplesheet_test.csv similarity index 100% rename from tests/samplesheet_test_genome_isasi_chr22_tinier.csv rename to tests/samplesheet_test.csv diff --git a/workflows/sammyseq.nf b/workflows/sammyseq.nf index 5f7b3bf..deb5e15 100644 --- a/workflows/sammyseq.nf +++ b/workflows/sammyseq.nf @@ -49,10 +49,12 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil include { INPUT_CHECK } from '../subworkflows/local/input_check' include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome' include { FASTQ_ALIGN_BWAALN } from '../subworkflows/nf-core/fastq_align_bwaaln/main.nf' +include { CAT_FRACTIONS } from '../subworkflows/local/cat_fractions' include { BAM_MARKDUPLICATES_PICARD } from '../subworkflows/nf-core/bam_markduplicates_picard' include { CUT_SIZES_GENOME } from "../modules/local/chromosomes_size" include { RTWOSAMPLESMLE } from '../modules/local/rtwosamplesmle' + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT NF-CORE MODULES/SUBWORKFLOWS @@ -112,16 +114,43 @@ workflow SAMMYSEQ { //INPUT_CHECK.out.reads.view() - // merge fastq with same lane - ch_merge_lane = INPUT_CHECK.out.reads + //TODO use branch + + ch_notmerge_lane = INPUT_CHECK.out.reads + .map{ meta, path -> + id=meta.subMap('id') + meta=meta + path=path + [id.id, meta, path] + } .groupTuple() - .map{ meta , path -> //if paired end ? - meta = meta - path = path.flatten() - [meta , path] + .filter{ it[1].size() == 1 } + .map{ id, meta, path -> + meta_notmerge=meta[0] + path_notmerge=path[0] + [meta_notmerge, path_notmerge] } - ch_merge_lane.view{"ch_merge_lane ${it}"} + //INPUT_CHECK.out.reads.view{"INPUT_CHECK.out.reads : ${it}"} + //ch_notmerge_lane.view{"ch_notmerge_lane: ${it}"} + + ch_merge_lane = INPUT_CHECK.out.reads + .map{ meta, path -> + id=meta.subMap('id') + meta=meta + path=path + [id.id, meta, path] + } + .groupTuple() + .filter{ it[1].size() >= 2 } //filtra per numero di meta presenti dopo il tupla se hai due meta vuol dire che devi unire due campioni + .map{ id, meta, path -> + single = meta[0].subMap('single_end') + meta = meta[0] + def flatPath = path.flatten() + [ meta , flatPath ] + } + + //ch_merge_lane.view{"ch_merge_lane ${it}"} ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) @@ -129,11 +158,20 @@ workflow SAMMYSEQ { return } + //ch_merge_lane.view{"ch_merge_lane : ${it}"} + CAT_FASTQ ( ch_merge_lane ).reads.set { cat_lane_output } //cat_lane_output.view() + ch_starter = cat_lane_output.mix(ch_notmerge_lane) + + //ch_starter.view{"ch_starter : ${it}"} + + if (params.stopAt == 'CAT_FASTQ_lane') { + return + } // TODO: OPTIONAL, you can use nf-validation plugin to create an input channel from the samplesheet with Channel.fromSamplesheet("input") @@ -145,57 +183,59 @@ workflow SAMMYSEQ { if(params.combine_fractions){ - INPUT_CHECK.out.reads_to_merge - .filter { meta, fastq -> fastq.size() > 1 } - .set { ch_reads_to_process_in_CAT_FASTQ } - - ch_reads_to_process_in_CAT_FASTQ - .flatMap { meta, fastq -> - meta.collect { m -> [m,fastq]} - } - .set { ch_to_CAT } - - //ch_to_CAT.view{"ch_to_CAT : ${it}"} - - CAT_FASTQ ( - ch_to_CAT - ).reads.set { cat_fastq_output } - - ch_cat_adjusted = CAT_FASTQ.out.reads.map { meta, fastq -> - return [meta, [fastq]] - } // make fastq of CAT_FASTQ a list of paths - merged_reads = INPUT_CHECK.out.reads.mix(ch_cat_adjusted) + merged_reads = CAT_FRACTIONS(//INPUT_CHECK.out.reads_to_merge, + //INPUT_CHECK.out.reads + ch_starter + )//.out.merged_reads } else { //merged_reads = INPUT_CHECK.out.reads - merged_reads = cat_lane_output + merged_reads = ch_starter } - if (params.stopAt == 'CAT_FASTQ') { + //merged_reads.view{"merged_reads: ${it}"} + + if (params.stopAt == 'CAT_FRACTIONS') { return } // // MODULE: Run FastQC // - //merged_reads.view() + TRIMMOMATIC ( + merged_reads + ) + + //TRIMMOMATIC.out.trimmed_reads.view() + + //a channel is created for the trimmed files and the id is renamed to meta, so that when passed to fastqc it does not overwrite the output files with non-trimmed ones + ch_fastqc_trim=TRIMMOMATIC.out.trimmed_reads + .map{ meta, path -> + def id=meta.subMap('id') + newid=id.id + "_trim" + sng=meta.subMap('single_end').single_end + newmeta=[id: newid, single_end: sng] + [ newmeta ,path] + } + + //trimmed and untrimmed fastq channels are merged and the resulting channel is passed to FASTQC + ch_fastqc_in = ch_fastqc_trim.mix(merged_reads) + //ch_fastqc_in.view() FASTQC ( - merged_reads + ch_fastqc_in + //merged_reads ) + ch_versions = ch_versions.mix(FASTQC.out.versions.first()) - TRIMMOMATIC ( - merged_reads - ) + if (params.stopAt == 'TRIMMOMATIC') { return } - - //TRIMMOMATIC.out.trimmed_reads.view() FASTQ_ALIGN_BWAALN ( TRIMMOMATIC.out.trimmed_reads, @@ -216,6 +256,10 @@ workflow SAMMYSEQ { ch_fai_for_cut = SAMTOOLS_FAIDX.out.fai.collect() + if (params.stopAt == 'SAMTOOLS_FAIDX') { + return + } + CUT_SIZES_GENOME(ch_fai_for_cut) //CUT_SIZES_GENOME.out.ch_sizes_genome.view() @@ -372,6 +416,7 @@ workflow SAMMYSEQ { ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(BAM_MARKDUPLICATES_PICARD.out.stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(BAM_MARKDUPLICATES_PICARD.out.metrics.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(BAM_MARKDUPLICATES_PICARD.out.flagstat.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(BAM_MARKDUPLICATES_PICARD.out.idxstats.collect{it[1]}.ifEmpty([]))