diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml
deleted file mode 100644
index 6448b91a..00000000
--- a/.github/workflows/awsfulltest.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-name: nf-core AWS full size tests
-# This workflow is triggered on published releases.
-# It can be additionally triggered manually with GitHub actions workflow dispatch button.
-# It runs the -profile 'test_full' on AWS batch
-
-on:
- release:
- types: [published]
- workflow_dispatch:
-jobs:
- run-tower:
- name: Run AWS full tests
- if: github.repository == 'sanger-tol/genomeassembly'
- runs-on: ubuntu-latest
- steps:
- - name: Launch workflow via tower
- uses: nf-core/tower-action@v3
- # TODO nf-core: You can customise AWS full pipeline tests as required
- # Add full size test data (but still relatively small datasets for few samples)
- # on the `test_full.config` test runs with only one set of parameters
- with:
- workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
- access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
- compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
- workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/sanger-tol-genomeassembly/work-${{ github.sha }}
- parameters: |
- {
- "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/sanger-tol-genomeassembly/results-${{ github.sha }}"
- }
- profiles: test_full,aws_tower
diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml
deleted file mode 100644
index 47d3f42e..00000000
--- a/.github/workflows/awstest.yml
+++ /dev/null
@@ -1,25 +0,0 @@
-name: nf-core AWS test
-# This workflow can be triggered manually with the GitHub actions workflow dispatch button.
-# It runs the -profile 'test' on AWS batch
-
-on:
- workflow_dispatch:
-jobs:
- run-tower:
- name: Run AWS tests
- if: github.repository == 'sanger-tol/genomeassembly'
- runs-on: ubuntu-latest
- steps:
- # Launch workflow using Tower CLI tool action
- - name: Launch workflow via tower
- uses: nf-core/tower-action@v3
- with:
- workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
- access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
- compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
- workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/sanger-tol-genomeassembly/work-${{ github.sha }}
- parameters: |
- {
- "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/sanger-tol-genomeassembly/results-test-${{ github.sha }}"
- }
- profiles: test,aws_tower
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c5134b17..b6373300 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -41,4 +41,4 @@ jobs:
- name: Run pipeline with test data
run: |
- nextflow run ${GITHUB_WORKSPACE} -profile test_github,docker -c conf/hifiasm.config --outdir ./results
+ nextflow run ${GITHUB_WORKSPACE} -profile test_github,docker -c conf/hifiasm_test.config --outdir ./results
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
index 3f27dab4..eefd788d 100644
--- a/.github/workflows/linting.yml
+++ b/.github/workflows/linting.yml
@@ -22,7 +22,7 @@ jobs:
run: npm install -g editorconfig-checker
- name: Run ECLint check
- run: editorconfig-checker -exclude README.md $(find .* -type f | grep -v '.git\|.py\|.md\|json\|yml\|yaml\|html\|css\|work\|.nextflow\|build\|nf_core.egg-info\|log.txt\|Makefile')
+ run: editorconfig-checker -exclude README.md $(find .* -type f | grep -v '.git\|.py\|.md\|json\|yml\|yaml\|html\|css\|work\|.nextflow\|build\|nf_core.egg-info\|log.txt\|Makefile\|drawio')
Prettier:
runs-on: ubuntu-latest
diff --git a/.github/workflows/sanger_test.yml b/.github/workflows/sanger_test.yml
new file mode 100644
index 00000000..e69af1ef
--- /dev/null
+++ b/.github/workflows/sanger_test.yml
@@ -0,0 +1,29 @@
+name: sanger-tol LSF tests
+
+on:
+ workflow_dispatch:
+jobs:
+ run-tower:
+ name: Run LSF tests
+ runs-on: ubuntu-latest
+ steps:
+ - name: Launch workflow via tower
+ uses: seqeralabs/action-tower-launch@v2
+ with:
+ workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
+ access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
+ compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
+ revision: ${{ github.sha }}
+ workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ github.sha }}
+ parameters: |
+ {
+ "outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ github.sha }}",
+ }
+ profiles: test,sanger,singularity,cleanup
+
+ - uses: actions/upload-artifact@v3
+ with:
+ name: Tower debug log file
+ path: |
+ tower_action_*.log
+ tower_action_*.json
diff --git a/.github/workflows/sanger_test_full.yml b/.github/workflows/sanger_test_full.yml
new file mode 100644
index 00000000..e028c6b6
--- /dev/null
+++ b/.github/workflows/sanger_test_full.yml
@@ -0,0 +1,43 @@
+name: sanger-tol LSF full size tests
+
+on:
+ push:
+ branches:
+ - main
+ - dev
+ workflow_dispatch:
+jobs:
+ run-tower:
+ name: Run LSF full size tests
+ runs-on: ubuntu-latest
+ steps:
+ - name: Sets env vars for push
+ run: |
+ echo "REVISION=${{ github.sha }}" >> $GITHUB_ENV
+ if: github.event_name == 'push'
+
+ - name: Sets env vars for workflow_dispatch
+ run: |
+ echo "REVISION=${{ github.sha }}" >> $GITHUB_ENV
+ if: github.event_name == 'workflow_dispatch'
+
+ - name: Launch workflow via tower
+ uses: seqeralabs/action-tower-launch@v2
+ with:
+ workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
+ access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
+ compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
+ revision: ${{ env.REVISION }}
+ workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ env.REVISION }}
+ parameters: |
+ {
+ "outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ env.REVISION }}",
+ }
+ profiles: test_full,sanger,singularity,cleanup
+
+ - uses: actions/upload-artifact@v3
+ with:
+ name: Tower debug log file
+ path: |
+ tower_action_*.log
+ tower_action_*.json
diff --git a/CITATIONS.md b/CITATIONS.md
index b6756d75..76cac2b4 100644
--- a/CITATIONS.md
+++ b/CITATIONS.md
@@ -2,18 +2,86 @@
## [nf-core](https://pubmed.ncbi.nlm.nih.gov/32055031/)
-> Ewels PA, Peltzer A, Fillinger S, Patel H, Alneberg J, Wilm A, Garcia MU, Di Tommaso P, Nahnsen S. The nf-core framework for community-curated bioinformatics pipelines. Nat Biotechnol. 2020 Mar;38(3):276-278. doi: 10.1038/s41587-020-0439-x. PubMed PMID: 32055031.
+> Ewels PA, Peltzer A, Fillinger S, Patel H, Alneberg J, Wilm A, Garcia MU, Di Tommaso P, Nahnsen S. The nf-core framework for community-curated bioinformatics pipelines. Nat Biotechnol. 2020 Mar;38(3):276-278. doi: https://doi.org/10.1038/s41587-020-0439-x. PubMed PMID: 32055031.
## [Nextflow](https://pubmed.ncbi.nlm.nih.gov/28398311/)
-> Di Tommaso P, Chatzou M, Floden EW, Barja PP, Palumbo E, Notredame C. Nextflow enables reproducible computational workflows. Nat Biotechnol. 2017 Apr 11;35(4):316-319. doi: 10.1038/nbt.3820. PubMed PMID: 28398311.
+> Di Tommaso P, Chatzou M, Floden EW, Barja PP, Palumbo E, Notredame C. Nextflow enables reproducible computational workflows. Nat Biotechnol. 2017 Apr 11;35(4):316-319. doi: https://doi.org/10.1038/nbt.3820. PubMed PMID: 28398311.
## Pipeline tools
-- [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
+- [Hifiasm](https://hifiasm.readthedocs.io/en/latest/)
-- [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/)
- > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.
+ > Cheng, H., Concepcion, G.T., Feng, X. et al. Haplotype-resolved de novo assembly using phased assembly graphs with hifiasm. Nat Methods 18, 170–175 (2021). doi:
+ > https://doi.org/10.1038/s41592-020-01056-5
+
+- [purge_dups](https://pubmed.ncbi.nlm.nih.gov/31971576/)
+
+ > Guan D, McCarthy SA, Wood J, Howe K, Wang Y, Durbin R. Identifying and removing haplotypic duplication in primary genome assemblies. Bioinformatics. 2020 May 1;36(9):2896-2898. doi: https://doi.org/10.1093/bioinformatics/btaa025. PMID: 31971576; PMCID: PMC7203741.
+
+- [Longranger](https://github.com/10XGenomics/longranger)
+
+- [Freebayes](https://arxiv.org/abs/1207.3907)
+
+ > Garrison E, Marth G. Haplotype-based variant detection from short-read sequencing. arXiv preprint arXiv:1207.3907 [q-bio.GN] 2012
+
+- [bwa-mem2](https://ieeexplore.ieee.org/document/8820962)
+
+ > Vasimuddin Md, Sanchit Misra, Heng Li, Srinivas Aluru. Efficient Architecture-Aware Acceleration of BWA-MEM for Multicore Systems. IEEE Parallel and Distributed Processing Symposium (IPDPS), 2019. doi: https://doi.org/10.1109/IPDPS.2019.00041
+
+- [YaHS](https://academic.oup.com/bioinformatics/article/39/1/btac808/6917071)
+
+ > Chenxi Zhou and others, YaHS: yet another Hi-C scaffolding tool, Bioinformatics, Volume 39, Issue 1, January 2023, btac808, doi: https://doi.org/10.1093/bioinformatics/btac808
+
+- [Minimap2](https://pubmed.ncbi.nlm.nih.gov/34623391/)
+
+ > Li H. New strategies to improve minimap2 alignment accuracy. Bioinformatics. 2021 Oct 8;37(23):4572–4. doi: https://doi.org/10.1093/bioinformatics/btab705. Epub ahead of print. PMID: 34623391; PMCID: PMC8652018.
+
+- [Samtools](https://pubmed.ncbi.nlm.nih.gov/33590861/)
+
+ > Danecek P, Bonfield JK, Liddle J, Marshall J, Ohan V, Pollard MO, Whitwham A, Keane T, McCarthy SA, Davies RM, Li H. Twelve years of SAMtools and BCFtools. Gigascience. 2021 Feb 16;10(2):giab008. doi: https://doi.org/10.1093/gigascience/giab008. PMID: 33590861; PMCID: PMC7931819.
+
+- [Bcftools](https://samtools.github.io/bcftools/bcftools.html)
+
+ > Danecek P, Bonfield JK, et al. Twelve years of SAMtools and BCFtools. Gigascience (2021) 10(2):giab008. doi: https://doi.org/10.1093/gigascience/giab008
+
+- [GATK4](https://gatk.broadinstitute.org/hc/en-us)
+
+ > Van der Auwera GA & O'Connor BD. (2020). Genomics in the Cloud: Using Docker, GATK, and WDL in Terra (1st Edition). O'Reilly Media.
+
+- [Bedtools](https://bedtools.readthedocs.io/en/latest/)
+
+ > Quinlan AR, Hall IM. BEDTools: a flexible suite of utilities for comparing genomic features. Bioinformatics. 2010 Mar 15;26(6):841-2. doi:
+ > https://doi.org/10.1093/bioinformatics/btq033. Epub 2010 Jan 28. PMID: 20110278; PMCID: PMC2832824.
+
+- [Juicer](https://github.com/aidenlab/juicer)
+
+ > Durand NC, Shamim MS, Machol I, Rao SS, Huntley MH, Lander ES, Aiden EL. Juicer Provides a One-Click System for Analyzing Loop-Resolution Hi-C Experiments. Cell Syst. 2016 Jul;3(1):95-8. doi: https://doi.org/10.1016/j.cels.2016.07.002. PMID: 27467249; PMCID: PMC5846465.
+
+- [PretextMap](https://github.com/wtsi-hpag/PretextMap)
+
+- [Cooler](https://github.com/open2c/cooler)
+ > Abdennur N, Mirny LA. Cooler: scalable storage for Hi-C data and other genomically labeled arrays. Bioinformatics. 2020 Jan 1;36(1):311-316. doi: https://doi.org/10.1093/bioinformatics/btz540. PMID: 31290943; PMCID: PMC8205516.
+- [MitoHiFi](https://github.com/marcelauliano/MitoHiFi)
+
+ > MitoHiFi: a python pipeline for mitochondrial genome assembly from PacBio High Fidelity reads. Marcela Uliano-Silva, João Gabriel R. N. Ferreira, Ksenia Krasheninnikova, Darwin Tree of Life Consortium, Giulio Formenti, Linelle Abueg, James Torrance, Eugene W. Myers, Richard Durbin, Mark Blaxter, Shane A. McCarthy. bioRxiv 2022.12.23.521667; doi: https://doi.org/10.1101/2022.12.23.521667
+
+- [MitoFinder](https://github.com/RemiAllio/MitoFinder)
+
+ > Allio, R, Schomaker‐Bastos, A, Romiguier, J, Prosdocimi, F, Nabholz, B, Delsuc, F. MitoFinder: Efficient automated large‐scale extraction of mitogenomic data in target enrichment phylogenomics. Mol Ecol Resour. 2020; 00: 1– 14. doi: https://doi.org/10.1111/1755-0998.13160
+
+- [MITOS](https://anaconda.org/bioconda/mitos)
+
+ > M. Bernt, A. Donath, F. Jühling, F. Externbrink, C. Florentz, G. Fritzsch, J. Pütz, M. Middendorf, P. F. Stadler MITOS: Improved de novo Metazoan Mitochondrial Genome Annotation Molecular Phylogenetics and Evolution 2013, 69(2):313-319.
+
+- [MerquryFK](https://github.com/thegenemyers/MERQURY.FK)
+
+- [BUSCO](https://busco.ezlab.org)
+
+ > Mosè Manni, Matthew R Berkeley, Mathieu Seppey, Felipe A Simão, Evgeny M Zdobnov, BUSCO Update: Novel and Streamlined Workflows along with Broader and Deeper Phylogenetic Coverage for Scoring of Eukaryotic, Prokaryotic, and Viral Genomes. Molecular Biology and Evolution, Volume 38, Issue 10, October 2021, Pages 4647–4654
+
+- [GFASTATS](https://github.com/vgl-hub/gfastats)
+ > Giulio Formenti and others, Gfastats: conversion, evaluation and manipulation of genome sequences using assembly graphs, Bioinformatics, Volume 38, Issue 17, September 2022, Pages 4214–4216, doi: https://doi.org/10.1093/bioinformatics/btac460
## Software packaging/containerisation tools
@@ -23,13 +91,13 @@
- [Bioconda](https://pubmed.ncbi.nlm.nih.gov/29967506/)
- > Grüning B, Dale R, Sjödin A, Chapman BA, Rowe J, Tomkins-Tinch CH, Valieris R, Köster J; Bioconda Team. Bioconda: sustainable and comprehensive software distribution for the life sciences. Nat Methods. 2018 Jul;15(7):475-476. doi: 10.1038/s41592-018-0046-7. PubMed PMID: 29967506.
+ > Grüning B, Dale R, Sjödin A, Chapman BA, Rowe J, Tomkins-Tinch CH, Valieris R, Köster J; Bioconda Team. Bioconda: sustainable and comprehensive software distribution for the life sciences. Nat Methods. 2018 Jul;15(7):475-476. doi: https://doi.org/10.1038/s41592-018-0046-7. PubMed PMID: 29967506.
- [BioContainers](https://pubmed.ncbi.nlm.nih.gov/28379341/)
- > da Veiga Leprevost F, Grüning B, Aflitos SA, Röst HL, Uszkoreit J, Barsnes H, Vaudel M, Moreno P, Gatto L, Weber J, Bai M, Jimenez RC, Sachsenberg T, Pfeuffer J, Alvarez RV, Griss J, Nesvizhskii AI, Perez-Riverol Y. BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics. 2017 Aug 15;33(16):2580-2582. doi: 10.1093/bioinformatics/btx192. PubMed PMID: 28379341; PubMed Central PMCID: PMC5870671.
+ > da Veiga Leprevost F, Grüning B, Aflitos SA, Röst HL, Uszkoreit J, Barsnes H, Vaudel M, Moreno P, Gatto L, Weber J, Bai M, Jimenez RC, Sachsenberg T, Pfeuffer J, Alvarez RV, Griss J, Nesvizhskii AI, Perez-Riverol Y. BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics. 2017 Aug 15;33(16):2580-2582. doi: https://doi.org/10.1093/bioinformatics/btx192. PubMed PMID: 28379341; PubMed Central PMCID: PMC5870671.
- [Docker](https://dl.acm.org/doi/10.5555/2600239.2600241)
- [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/)
- > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675.
+ > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: https://doi.org/10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675.
diff --git a/README.md b/README.md
index be185169..ad9cbc38 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,5 @@
[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)
+
[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.10.1-23aa62.svg)](https://www.nextflow.io/)
[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)
[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)
@@ -9,8 +10,36 @@
**sanger-tol/genomeassembly** is a bioinformatics pipeline for a genome assembly for HiFi, Illumina 10x (optional), and HiC data. It performs the following steps: raw assembly, purging from haplotigs, optional polishing, and scaffolding.
+The initial assembly of HiFi reads is produced with the [hifiasm](https://hifiasm.readthedocs.io) assembler in two modes: original and, optionally, using HiC data. The assembly is then purged of alternative haplotigs using [purge_dups](https://github.com/dfguan/purge_dups). An optional next step polishes the purged assembly with Illumina 10X sequencing: the 10X reads are mapped to the full assembly (purged contigs plus haplotigs) using [Longranger](https://support.10xgenomics.com/genome-exome/software/pipelines/latest/what-is-long-ranger) and polishing is performed with [Freebayes](https://github.com/freebayes/freebayes). HiC reads are then mapped to the primary contigs with [bwa-mem2](https://github.com/bwa-mem2/bwa-mem2), and the contigs are scaffolded with [YaHS](https://github.com/c-zhou/yahs) using the provided Hi-C data.
+Polished and scaffolded assemblies are evaluated using [GFASTATS](https://github.com/vgl-hub/gfastats), [BUSCO](https://busco.ezlab.org/) and [MERQURY.FK](https://github.com/thegenemyers/MERQURY.FK).
+
The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!
+On release, automated continuous integration tests run the pipeline on a full-sized dataset on the LSF infrastructure. This ensures that the pipeline runs on LSF, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources.
+
+## Pipeline summary
+
+While the steps are described in a sequential order, many of them can be executed as parallel jobs.
+
+1. Parse the input YAML file and combine its entries into the data structures required further down the pipeline.
+2. Run the organelles subworkflow on the HiFi reads.
+3. Run hifiasm in the original mode.
+4. Produce numerical stats, BUSCO score and QV, completeness metrics, and kmer spectra for [3].
+5. If the `hifiasm_hic_on` option is set (see the example command after this list):
+ 1. run hifiasm in HiC mode.
+ 2. produce numerical stats, BUSCO score and QV, completeness metrics, and kmer spectra for [5i].
+6. Run the purging subworkflow on the primary contigs from [3], i.e. produce the purged assembly and a set of haplotigs. Consider the purged contigs as the primary assembly for further steps.
+7. Take the haplotigs from [6], merge them with the haplotigs from [3], and run the purging subworkflow on the result. Discard the contigs that were purged away and continue with the purged haplotigs as the representation of the haplotig assembly.
+8. Produce numerical stats, BUSCO score and QV, completeness metrics, and kmer spectra for the primary and haplotigs from [6] and [7].
+9. If the `polishing_on` option is set:
+ 1. map Illumina 10X reads to the joined primary and alt contigs.
+ 2. polish the initial assembly based on the alignment produced in [9i]. Set the polished primary contigs as the primary assembly and the polished haplotigs as the haplotig assembly.
+ 3. produce numerical stats, BUSCO score and QV, completeness metrics, and kmer spectra for [9ii].
+10. Run the organelles subworkflow on the joined primary and haplotig contigs.
+11. Map HiC data onto primary contigs.
+12. Run scaffolding for primary contigs.
+13. Produce numerical stats, BUSCO score and QV, completeness metrics, and kmer spectra for [12].
+
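+The optional branches above are controlled by pipeline parameters of the same names (`hifiasm_hic_on` and `polishing_on`, as referenced in `conf/modules.config`). The command below is a purely illustrative sketch, not a tested invocation, and uses placeholder file names:
+
+```bash
+# Hypothetical invocation with the optional branches enabled;
+# --input points at the run-description YAML, as in conf/test.config.
+nextflow run sanger-tol/genomeassembly \
+    -profile singularity \
+    --input assembly.yaml \
+    --hifiasm_hic_on --polishing_on \
+    --outdir <OUTDIR>
+```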
## Usage
> **Note**
@@ -18,40 +47,33 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool
> to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline)
> with `-profile test` before running the workflow on actual data.
-Currently, it is advised to run the pipeline with docker or singularity as a small number of major modules do not currently have a conda env associated with them.
+Currently, it is advised to run the pipeline with Docker or Singularity, as some modules do not have a conda environment associated with them.
-1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=21.10.3`)
+To run the pipeline, use a command line such as:
-2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) (you can follow [this tutorial](https://singularity-tutorial.github.io/01-installation/)), [`Podman`](https://podman.io/), [`Shifter`](https://nersc.gitlab.io/development/shifter/how-to-use/) or [`Charliecloud`](https://hpc.github.io/charliecloud/) for full pipeline reproducibility _(you can use [`Conda`](https://conda.io/miniconda.html) both to install Nextflow itself and also to manage software within pipelines. Please only use it within pipelines as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_.
+```bash
+nextflow run sanger-tol/genomeassembly -profile singularity,YOURPROFILE --outdir <OUTDIR>
+```
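+
+As recommended in the note above, you can first check your setup on the bundled minimal dataset by running the `test` profile (shown here with Singularity; substitute the profile matching your container or package manager):
+
+```bash
+nextflow run sanger-tol/genomeassembly -profile test,singularity --outdir <OUTDIR>
+```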
-3. Download the pipeline and test it on a minimal dataset with a single command:
+For more details on how to run the pipeline and interpret the results, see the [usage](usage.md) and [output](output.md) sections of the documentation.
- ```bash
- nextflow run sanger-tol/genomeassembly -profile test,YOURPROFILE --outdir
- ```
+## Credits
- Note that some form of configuration will be needed so that Nextflow knows how to fetch the required software. This is usually done in the form of a config profile (`YOURPROFILE` in the example command above). You can chain multiple config profiles in a comma-separated string.
+sanger-tol/genomeassembly was originally written by @ksenia-krasheninnikova.
- > - The pipeline comes with config profiles called `docker`, `singularity`, `podman`, `shifter`, `charliecloud` and `conda` which instruct the pipeline to use the named tool for software management. For example, `-profile test,docker`.
- > - Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment.
- > - If you are using `singularity`, please use the [`nf-core download`](https://nf-co.re/tools/#downloading-pipelines-for-offline-use) command to download images first, before running the pipeline. Setting the [`NXF_SINGULARITY_CACHEDIR` or `singularity.cacheDir`](https://www.nextflow.io/docs/latest/singularity.html?#singularity-docker-hub) Nextflow options enables you to store and re-use the images from a central location for future pipeline runs.
- > - If you are using `conda`, it is highly recommended to use the [`NXF_CONDA_CACHEDIR` or `conda.cacheDir`](https://www.nextflow.io/docs/latest/conda.html) settings to store the environments in a central location for future pipeline runs.
+We thank the following people for their extensive assistance in the development of this pipeline:
-> **Warning:**
-> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those
-> provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_;
-> see [docs](https://nf-co.re/usage/configuration#custom-configuration-files).
+@priyanka-surana for the code review, very helpful coding suggestions, and assistance with pushing this pipeline forward through development.
-## Credits
+@mcshane and @c-zhou for the design and implementation of the original pipelines for purging (@mcshane), polishing (@mcshane) and scaffolding (@c-zhou).
-sanger-tol/genomeassembly was originally written by @ksenia-krasheninnikova based on the ToL Genome Engine procedures.
+The TreeVal team, Damon-Lee Pointon (@DLBPointon), Yumi Sims (@yumisims) and William Eagles (@weaglesBio), for the implementation of the hic-mapping pipeline.
-We thank the following people for their extensive assistance in the development of this pipeline:
+@muffato for help with nf-core integration, infrastructure and troubleshooting, and for the code reviews and valuable suggestions at different stages of the pipeline development.
+
+@mahesh-panchal for the Nextflow implementation of the purging pipeline, code review and valuable suggestions on the nf-core modules implementation.
-@mcshane - For the original implementation of the genomeassembly pipeline
-@priyanka-surana - For code reviews and code support
-@mahesh-panchal - For nextflow implementation of the purge_dups pipeline that was re-used here,
- as well as for the implementation of input parsing subworkflow which was further adapted for the current pipeline
+@gq1 for the code review and valuable suggestions.
## Contributions and Support
@@ -59,15 +81,9 @@ If you would like to contribute to this pipeline, please see the [contributing g
## Citations
-
-
-If you use sanger-tol/genomeassembly for your analysis, please cite it using the following doi: [10.5281/zenodo.XXXXXX](https://doi.org/10.5281/zenodo.XXXXXX)
-
-### Tools
-
An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.
-You can cite the `nf-core` publication as follows:
+This pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/master/LICENSE).
> **The nf-core framework for community-curated bioinformatics pipelines.**
>
diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml
deleted file mode 100644
index 3e870d34..00000000
--- a/assets/methods_description_template.yml
+++ /dev/null
@@ -1,25 +0,0 @@
-id: "sanger-tol-genomeassembly-methods-description"
-description: "Suggested text and references to use when describing pipeline usage within the methods section of a publication."
-section_name: "sanger-tol/genomeassembly Methods Description"
-section_href: "https://github.com/sanger-tol/genomeassembly"
-plot_type: "html"
-## TODO nf-core: Update the HTML below to your prefered methods description, e.g. add publication citation for this pipeline
-## You inject any metadata in the Nextflow '${workflow}' object
-data: |
- Methods
- Data was processed using sanger-tol/genomeassembly v${workflow.manifest.version} ${doi_text} of the sanger-tol collection of workflows, created using nf-core (Ewels et al., 2020).
- The pipeline was executed with Nextflow v${workflow.nextflow.version} (Di Tommaso et al., 2017) with the following command:
- ${workflow.commandLine}
- References
-
- - Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature Biotechnology, 35(4), 316-319. https://doi.org/10.1038/nbt.3820
- - Ewels, P. A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M. U., Di Tommaso, P., & Nahnsen, S. (2020). The nf-core framework for community-curated bioinformatics pipelines. Nature Biotechnology, 38(3), 276-278. https://doi.org/10.1038/s41587-020-0439-x
-
-
-
-    Notes:
-
- ${nodoi_text}
- - The command above does not include parameters contained in any configs or profiles that may have been used. Ensure the config file is also uploaded with your publication!
- - You should also cite all software used within this run. Check the "Software Versions" of this report to get version information.
-
-
diff --git a/bin/bed_chunks.sh b/bin/bed_chunks.sh
index ec406012..77af6254 100755
--- a/bin/bed_chunks.sh
+++ b/bin/bed_chunks.sh
@@ -1,4 +1,26 @@
#!/bin/bash
+#
+# Copyright (C) 2022-2023 Genome Research Ltd.
+#
+# Author: Ksenia Krasheninnikova
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
if [ $# -ne 2 ]; then echo -e "Script to split genome into chunks.\nUsage: $0 .\nVersion: 1.0"; exit 1; fi
diff --git a/bin/coverage.sh b/bin/coverage.sh
index e7784064..eb1edffb 100755
--- a/bin/coverage.sh
+++ b/bin/coverage.sh
@@ -1,4 +1,26 @@
#!/bin/bash
+#
+# Copyright (C) 2022-2023 Genome Research Ltd.
+#
+# Author: Ksenia Krasheninnikova
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
if [ $# -ne 1 ]; then echo -e "Script to extract coverage threshold.\nUsage: $0 .\nVersion: 1.0"; exit 1; fi
diff --git a/bin/generate_cram_csv.sh b/bin/generate_cram_csv.sh
index b87a56a5..81eaad34 100755
--- a/bin/generate_cram_csv.sh
+++ b/bin/generate_cram_csv.sh
@@ -1,13 +1,12 @@
#!/bin/bash
+# Copyright (C) 2022-2023 Genome Research Ltd.
#
# Based on https://github.com/sanger-tol/treeval/blob/80554a803903183613d49690d5770eeadb3c42c9/bin/generate_cram_csv.sh
# from Sanger TOL treeval pipeline
#
-#cram_path=$1
chunkn=0
-#for cram in ${cram_path}/*.cram; do
for cram in "$@"; do
rgline=$(samtools view -H $cram|grep "RG"|sed 's/\t/\\t/g'|sed "s/'//g")
diff --git a/bin/get_calcuts_params_from_model_fk.py b/bin/get_calcuts_params_from_model_fk.py
index 3be34d49..bf7d27cf 100755
--- a/bin/get_calcuts_params_from_model_fk.py
+++ b/bin/get_calcuts_params_from_model_fk.py
@@ -1,4 +1,26 @@
#!/usr/bin/env python
+#
+# Copyright (C) 2022-2023 Genome Research Ltd.
+#
+# Author: Ksenia Krasheninnikova
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
import sys
diff --git a/bin/gnu_sort.sh b/bin/gnu_sort.sh
deleted file mode 100755
index 740b0cf3..00000000
--- a/bin/gnu_sort.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-
-args=$1
-cpus=$2
-buffer=$3
-bed=$4
-outfile=$5
-
-sort $args --parallel=$cpus -S${buffer} $bed > $outfile
diff --git a/bin/prepare_pretext.sh b/bin/prepare_pretext.sh
index f59eae99..99df0ab5 100755
--- a/bin/prepare_pretext.sh
+++ b/bin/prepare_pretext.sh
@@ -1,4 +1,26 @@
#!/bin/bash
+#
+# Copyright (C) 2022-2023 Genome Research Ltd.
+#
+# Author: Ksenia Krasheninnikova
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
FAI=$1
PAIRS=$2
diff --git a/conf/base.config b/conf/base.config
index cdbb7333..3355e30e 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -24,7 +24,6 @@ process {
// These labels are used and recognised by default in DSL2 files hosted on nf-core/modules.
// If possible, it would be nice to keep the same label naming convention when
// adding in your local modules too.
- }
withLabel:process_single {
cpus = { check_max( 1 , 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
@@ -60,10 +59,22 @@ process {
}
// TODO nf-core: Customise requirements for specific processes.
// See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
- withName:HIFIASM {
- cpus = { check_max( 28 * task.attempt, 'cpus' ) }
+ withName:'.*:HIFIASM.*' {
+ cpus = { check_max( 28 * task.attempt, 'cpus' ) }
memory = { check_max( 200.GB * task.attempt, 'memory' ) }
- time = { check_max( 48.h * task.attempt, 'time' ) }
+ time = { check_max( 48.h * task.attempt, 'time' ) }
+ }
+
+ withName:'.*MINIMAP2_ALIGN.*' {
+ cpus = { check_max( 16 * task.attempt, 'cpus' ) }
+ memory = { check_max( 50.GB * task.attempt, 'memory' ) }
+ time = { check_max( 16.h * task.attempt, 'time' ) }
+ }
+
+ withName:'.*BWAMEM2_INDEX.*' {
+ memory = { check_max( 72.GB * task.attempt, 'memory' ) }
+ }
+
withName:CUSTOM_DUMPSOFTWAREVERSIONS {
cache = false
}
diff --git a/conf/hifiasm.config b/conf/hifiasm_test.config
similarity index 100%
rename from conf/hifiasm.config
rename to conf/hifiasm_test.config
diff --git a/conf/modules.config b/conf/modules.config
index c5023fa8..318525ad 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -28,7 +28,7 @@ process {
// Set up of kmer profile
withName: FASTK_FASTK {
- ext.args = "-k31 -t"
+ ext.args = "-k31 -t -P."
publishDir = [
path: { "${params.outdir}/kmer" },
mode: params.publish_dir_mode,
@@ -66,15 +66,6 @@ process {
]
}
- withName: HIFIASM_HIC {
- ext.args = "--primary"
- publishDir = [
- path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}" },
- mode: params.publish_dir_mode,
- saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
- ]
- }
-
withName: '.*RAW_ASSEMBLY:GFA_TO_FASTA_PRI' {
ext.prefix = { "${meta.id}.asm.p_ctg" }
publishDir = [
@@ -93,32 +84,6 @@ process {
]
}
- withName: '.*RAW_ASSEMBLY:GFA_TO_FASTA_PRI_HIC' {
- ext.prefix = { "${meta.id}.asm.hic.p_ctg" }
- publishDir = [
- path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}" },
- mode: params.publish_dir_mode,
- saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
- ]
- }
-
- withName: '.*RAW_ASSEMBLY:GFA_TO_FASTA_ALT_HIC' {
- ext.prefix = { "${meta.id}.asm.hic.a_ctg" }
- publishDir = [
- path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}" },
- mode: params.publish_dir_mode,
- saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
- ]
- }
-
- withName: '.*RAW_ASSEMBLY:GFA_TO_FASTA_.*HIC' {
- publishDir = [
- path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}" },
- mode: params.publish_dir_mode,
- saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
- ]
- }
-
withName: '.*GENOME_STATISTICS_RAW:GFASTATS_PRI' {
ext.prefix = { "${meta.id}.asm.p_ctg" }
publishDir = [
@@ -137,29 +102,16 @@ process {
]
}
- withName: '.*GENOME_STATISTICS_RAW_HIC:GFASTATS_PRI' {
- ext.prefix = { "${meta.id}.asm.hic.p_ctg" }
- publishDir = [
- path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}" },
- mode: params.publish_dir_mode,
- pattern: '*assembly_summary'
- ]
- }
-
- withName: '.*GENOME_STATISTICS_RAW_HIC:GFASTATS_HAP' {
- ext.prefix = { "${meta.id}.asm.hic.a_ctg" }
- publishDir = [
- path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}" },
- mode: params.publish_dir_mode,
- pattern: '*assembly_summary'
- ]
- }
-
withName: '.*GENOME_STATISTICS_RAW:BUSCO' {
publishDir = [
path: { "${params.outdir}/${meta.id}.${params.hifiasm}/${meta.id}.p_ctg.${meta.lineage}.busco" },
mode: params.publish_dir_mode,
- saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ saveAs: { filename -> filename.endsWith('busco.log') ? filename :
+ filename.endsWith('full_table.tsv') ? filename :
+ filename.endsWith('missing_busco_list.tsv') ? filename :
+ filename.startsWith('short_summary') ? filename :
+ filename.endsWith('busco.batch_summary.txt') ? filename :
+ null }
]
}
@@ -170,33 +122,109 @@ process {
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}
+
+ if (params.hifiasm_hic_on) {
+ withName: HIFIASM_HIC {
+ ext.args = "--primary"
+ publishDir = [
+ path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+ withName: '.*RAW_ASSEMBLY:GFA_TO_FASTA_PRI_HIC' {
+ ext.prefix = { "${meta.id}.asm.hic.p_ctg" }
+ publishDir = [
+ path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: '.*RAW_ASSEMBLY:GFA_TO_FASTA_ALT_HIC' {
+ ext.prefix = { "${meta.id}.asm.hic.a_ctg" }
+ publishDir = [
+ path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ withName: '.*RAW_ASSEMBLY:GFA_TO_FASTA_.*HIC' {
+ publishDir = [
+ path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+ withName: '.*GENOME_STATISTICS_RAW_HIC:GFASTATS_PRI' {
+ ext.prefix = { "${meta.id}.asm.hic.p_ctg" }
+ publishDir = [
+ path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}" },
+ mode: params.publish_dir_mode,
+ pattern: '*assembly_summary'
+ ]
+ }
+
+ withName: '.*GENOME_STATISTICS_RAW_HIC:GFASTATS_HAP' {
+ ext.prefix = { "${meta.id}.asm.hic.a_ctg" }
+ publishDir = [
+ path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}" },
+ mode: params.publish_dir_mode,
+ pattern: '*assembly_summary'
+ ]
+ }
+ withName: '.*GENOME_STATISTICS_RAW_HIC:BUSCO' {
+ publishDir = [
+ path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/${meta.id}.p_ctg.${meta.lineage}.busco" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.endsWith('busco.log') ? filename :
+ filename.endsWith('full_table.tsv') ? filename :
+ filename.endsWith('missing_busco_list.tsv') ? filename :
+ filename.startsWith('short_summary') ? filename :
+ filename.endsWith('busco.batch_summary.txt') ? filename :
+ null }
+ ]
+ }
+
+ withName: '.*GENOME_STATISTICS_RAW_HIC:MERQURYFK_MERQURYFK' {
+ publishDir = [
+ path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/${meta.id}.p_ctg.ccs.merquryk" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+ }
+
// End of Set up of the raw assembly pipeline
// Set up of organelles pipeline
- withName: '.*ORGANELLES_READS:MITOHIFI_MITOHIFI' {
- ext.args2 = '-r'
- publishDir = [
- path: { "${params.outdir}/${meta.id}.${params.hifiasm}/mito.reads" },
- mode: params.publish_dir_mode,
- saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
- ]
- }
+ if (params.organelles_on) {
+ withName: '.*ORGANELLES_READS:MITOHIFI_MITOHIFI' {
+ ext.args2 = '-r'
+ publishDir = [
+ path: { "${params.outdir}/${meta.id}.${params.hifiasm}/mito.reads" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
- withName: '.*ORGANELLES_CONTIGS:MITOHIFI_MITOHIFI' {
- ext.args2 = '-c'
- publishDir = [
- path: { "${params.outdir}/${meta.id}.${params.hifiasm}/mito" },
- mode: params.publish_dir_mode,
- saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
- ]
+ withName: '.*ORGANELLES_CONTIGS:MITOHIFI_MITOHIFI' {
+ ext.args2 = '-c'
+ publishDir = [
+ path: { "${params.outdir}/${meta.id}.${params.hifiasm}/mito" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
}
// End of set up of organelles pipeline
// Set up of the purging pipeline
withName: '.*PURGE_DUPS_PRI:MINIMAP2_ALIGN_READS' {
- //these options from pbmm2 CSS preset
+ //these options are from pbmm2 CSS preset
ext.args = "-k19 -w10 -O5,56 -E4,1 -A2 -B5 -z400,50 -r2000 --lj-min-ratio 0.5"
ext.prefix = { "${meta.id}.reads" }
publishDir = [
@@ -275,6 +303,8 @@ process {
}
withName: '.*PURGE_DUPS_ALT:MINIMAP2_ALIGN_READS' {
+ //these options are from pbmm2 CSS preset
+ ext.args = "-k19 -w10 -O5,56 -E4,1 -A2 -B5 -z400,50 -r2000 --lj-min-ratio 0.5"
ext.prefix = { "${meta.id}.reads" }
publishDir = [
path: { "${params.outdir}/${meta.id}.${params.hifiasm}/purging/coverage.htigs" },
@@ -318,6 +348,7 @@ process {
}
withName: '.*PURGE_DUPS_ALT:MINIMAP2_ALIGN_ASSEMBLY' {
+ ext.args = "-xasm5 -DP"
ext.prefix = "self_aln"
publishDir = [
path: { "${params.outdir}/${meta.id}.${params.hifiasm}/purging/split_aln.htigs" },
@@ -327,6 +358,7 @@ process {
}
withName: '.*PURGE_DUPS_ALT:PURGEDUPS_PURGEDUPS' {
+ ext.args = "-2"
publishDir = [
path: { "${params.outdir}/${meta.id}.${params.hifiasm}/purging/purge_dups.htigs" },
mode: params.publish_dir_mode,
@@ -371,7 +403,12 @@ process {
publishDir = [
path: { "${params.outdir}/${meta.id}.${params.hifiasm}/purging/${meta.id}.purged.${meta.lineage}.busco" },
mode: params.publish_dir_mode,
- saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ saveAs: { filename -> filename.endsWith('busco.log') ? filename :
+ filename.endsWith('full_table.tsv') ? filename :
+ filename.endsWith('missing_busco_list.tsv') ? filename :
+ filename.startsWith('short_summary') ? filename :
+ filename.endsWith('busco.batch_summary.txt') ? filename :
+ null }
]
}
@@ -386,129 +423,134 @@ process {
// Set up of the polishing pipeline
-
- withName: LONGRANGER_MKREF {
- if(System.getenv('GITHUB_ACTION') != null ) {
- container = "ghcr.io/sanger-tol/longranger:2.2.2-c3"
+ if (params.polishing_on) {
+ withName: LONGRANGER_MKREF {
+ if(System.getenv('GITHUB_ACTION') != null ) {
+ container = "ghcr.io/sanger-tol/longranger:2.2.2-c3"
+ }
+ publishDir = [
+ path: { "${params.outdir}/${meta.id}.${params.hifiasm}/polishing" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
}
- publishDir = [
- path: { "${params.outdir}/${meta.id}.${params.hifiasm}/polishing" },
- mode: params.publish_dir_mode,
- saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
- ]
- }
- withName: LONGRANGER_ALIGN {
- // Keep in sync with `longranger_lsf_sanger.config`
- ext.args = "--disable-ui --nopreflight"
- if(System.getenv('GITHUB_ACTION') != null ) {
- container = "ghcr.io/sanger-tol/longranger:2.2.2-c3"
+ withName: LONGRANGER_ALIGN {
+ // Keep in sync with `longranger_lsf_sanger.config`
+ ext.args = "--disable-ui --nopreflight"
+ if(System.getenv('GITHUB_ACTION') != null ) {
+ container = "ghcr.io/sanger-tol/longranger:2.2.2-c3"
+ }
+ publishDir = [
+ path: { "${params.outdir}/${meta.id}.${params.hifiasm}/polishing" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
}
- publishDir = [
- path: { "${params.outdir}/${meta.id}.${params.hifiasm}/polishing" },
- mode: params.publish_dir_mode,
- saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
- ]
- }
- withName: BED_CHUNKS {
- publishDir = [
- path: { "${params.outdir}/${meta.id}.${params.hifiasm}/polishing/chunks" },
- mode: params.publish_dir_mode,
- saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
- ]
- }
-
- withName: MERGE_FREEBAYES {
- ext.prefix = 'merged'
- publishDir = [
- path: { "${params.outdir}/${meta.id}.${params.hifiasm}/polishing/" },
- mode: params.publish_dir_mode,
- pattern: "*merged*"
- ]
- }
+ withName: BED_CHUNKS {
+ publishDir = [
+ path: { "${params.outdir}/${meta.id}.${params.hifiasm}/polishing/chunks" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
- withName: BCFTOOLS_CONSENSUS {
- // Filter by mapping quality, keep alt-alt het and alt-alt hom,
- // keep longer allele
- ext.args = '-i\'QUAL>1 && (GT="AA" || GT="Aa")\' -Hla'
- ext.prefix = { "${meta.id}.consensus" }
- publishDir = [
- path: { "${params.outdir}/${meta.id}.${params.hifiasm}/polishing/" },
- mode: params.publish_dir_mode,
- saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
- ]
- }
+ withName: MERGE_FREEBAYES {
+ ext.prefix = 'merged'
+ publishDir = [
+ path: { "${params.outdir}/${meta.id}.${params.hifiasm}/polishing/" },
+ mode: params.publish_dir_mode,
+ pattern: "*merged*"
+ ]
+ }
- withName: BCFTOOLS_INDEX {
- ext.args = '--tbi'
- }
+ withName: BCFTOOLS_CONSENSUS {
+ // Filter by mapping quality, keep alt-alt het and alt-alt hom,
+ // keep longer allele
+ ext.args = '-i\'QUAL>1 && (GT="AA" || GT="Aa")\' -Hla'
+ ext.prefix = { "${meta.id}.consensus" }
+ publishDir = [
+ path: { "${params.outdir}/${meta.id}.${params.hifiasm}/polishing/" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
- withName: BCFTOOLS_VIEW {
- // Dont keep command line information, keep reference allele or
- // suggested alternative if reference is N
- ext.args = '--no-version -e\'type="ref"||REF~"N"\''
- }
+ withName: BCFTOOLS_INDEX {
+ ext.args = '--tbi'
+ }
- withName: BCFTOOLS_NORM {
- ext.args = '--no-version'
- }
+ withName: BCFTOOLS_VIEW {
+ // Dont keep command line information, keep reference allele or
+ // suggested alternative if reference is N
+ ext.args = '--no-version -e\'type="ref"||REF~"N"\''
+ }
- withName: BCFTOOLS_SORT {
- scratch = true
- }
+ withName: BCFTOOLS_NORM {
+ ext.args = '--no-version'
+ }
- withName: SEQTK_SUBSEQ_PRIMARY {
- ext.prefix = 'primary'
- publishDir = [
- path: { "${params.outdir}/${meta.id}.${params.hifiasm}/polishing" },
- mode: params.publish_dir_mode,
- saveAs: { filename -> filename.equals('versions.yml') ? null : "primary.fa" }
- ]
- }
+ withName: BCFTOOLS_SORT {
+ scratch = true
+ }
- withName: SEQTK_SUBSEQ_HAPLOTIGS {
- ext.prefix = 'haplotigs'
- publishDir = [
- path: { "${params.outdir}/${meta.id}.${params.hifiasm}/polishing" },
- mode: params.publish_dir_mode,
- saveAs: { filename -> filename.equals('versions.yml') ? null : "haplotigs.fa" }
- ]
- }
+ withName: SEQTK_SUBSEQ_PRIMARY {
+ ext.prefix = 'primary'
+ publishDir = [
+ path: { "${params.outdir}/${meta.id}.${params.hifiasm}/polishing" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : "primary.fa" }
+ ]
+ }
- withName: '.*GENOME_STATISTICS_POLISHED:GFASTATS_PRI' {
- ext.prefix = { "primary" }
- publishDir = [
- path: { "${params.outdir}/${meta.id}.${params.hifiasm}/polishing" },
- mode: params.publish_dir_mode,
- pattern: '*assembly_summary'
- ]
- }
+ withName: SEQTK_SUBSEQ_HAPLOTIGS {
+ ext.prefix = 'haplotigs'
+ publishDir = [
+ path: { "${params.outdir}/${meta.id}.${params.hifiasm}/polishing" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : "haplotigs.fa" }
+ ]
+ }
- withName: '.*GENOME_STATISTICS_POLISHED:GFASTATS_HAP' {
- ext.prefix = { "haplotigs" }
- publishDir = [
- path: { "${params.outdir}/${meta.id}.${params.hifiasm}/polishing" },
- mode: params.publish_dir_mode,
- pattern: '*assembly_summary'
- ]
- }
+ withName: '.*GENOME_STATISTICS_POLISHED:GFASTATS_PRI' {
+ ext.prefix = { "primary" }
+ publishDir = [
+ path: { "${params.outdir}/${meta.id}.${params.hifiasm}/polishing" },
+ mode: params.publish_dir_mode,
+ pattern: '*assembly_summary'
+ ]
+ }
+ withName: '.*GENOME_STATISTICS_POLISHED:GFASTATS_HAP' {
+ ext.prefix = { "haplotigs" }
+ publishDir = [
+ path: { "${params.outdir}/${meta.id}.${params.hifiasm}/polishing" },
+ mode: params.publish_dir_mode,
+ pattern: '*assembly_summary'
+ ]
+ }
- withName: '.*GENOME_STATISTICS_POLISHED:BUSCO' {
- publishDir = [
- path: { "${params.outdir}/${meta.id}.${params.hifiasm}/polishing/${meta.id}.polished.${meta.lineage}.busco" },
- mode: params.publish_dir_mode,
- saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
- ]
- }
+ withName: '.*GENOME_STATISTICS_POLISHED:BUSCO' {
+ publishDir = [
+ path: { "${params.outdir}/${meta.id}.${params.hifiasm}/polishing/${meta.id}.polished.${meta.lineage}.busco" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.endsWith('busco.log') ? filename :
+ filename.endsWith('full_table.tsv') ? filename :
+ filename.endsWith('missing_busco_list.tsv') ? filename :
+ filename.startsWith('short_summary') ? filename :
+ filename.endsWith('busco.batch_summary.txt') ? filename :
+ null }
+ ]
+ }
- withName: '.*GENOME_STATISTICS_POLISHED:MERQURYFK_MERQURYFK' {
- publishDir = [
- path: { "${params.outdir}/${meta.id}.${params.hifiasm}/polishing/${meta.id}.polished.ccs.merquryk" },
- mode: params.publish_dir_mode,
- saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
- ]
+ withName: '.*GENOME_STATISTICS_POLISHED:MERQURYFK_MERQURYFK' {
+ publishDir = [
+ path: { "${params.outdir}/${meta.id}.${params.hifiasm}/polishing/${meta.id}.polished.ccs.merquryk" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
}
// End of Set up of the polishing pipeline
@@ -550,14 +592,6 @@ process {
ext.args = "--output-fmt cram"
}
- withName: '.*HIC_MAPPING:CONVERT_STATS:SAMTOOLS_INDEX' {
- publishDir = [
- path: { "${params.outdir}/${meta.id}.${params.hifiasm}/scaffolding" },
- mode: params.publish_dir_mode,
- saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
- ]
- }
-
withName: '.*HIC_MAPPING:CONVERT_STATS:SAMTOOLS_STATS' {
publishDir = [
path: { "${params.outdir}/${meta.id}.${params.hifiasm}/scaffolding" },
@@ -648,7 +682,12 @@ process {
publishDir = [
path: { "${params.outdir}/${meta.id}.${params.hifiasm}/scaffolding/yahs/out.break.yahs/out_scaffolds_final.${meta.lineage}.busco" },
mode: params.publish_dir_mode,
- saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ saveAs: { filename -> filename.endsWith('busco.log') ? filename :
+ filename.endsWith('full_table.tsv') ? filename :
+ filename.endsWith('missing_busco_list.tsv') ? filename :
+ filename.startsWith('short_summary') ? filename :
+ filename.endsWith('busco.batch_summary.txt') ? filename :
+ null }
]
}
@@ -661,11 +700,11 @@ process {
}
// End of Set up of the scaffolding pipeline
- //Set up of assmebly stats subworkflow
+ //Set up of assembly stats subworkflow
withName: 'BUSCO' {
ext.args = "--mode genome"
}
- //End of Set up of assmebly stats subworkflow
+ //End of Set up of assembly stats subworkflow
}
diff --git a/conf/test.config b/conf/test.config
index ba53ac72..0cbb3cae 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -14,10 +14,10 @@ params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset to check pipeline function'
+ // Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'
- // Limit resources so that this can run on GitHub Actions
// Input data
input = "${projectDir}/assets/test.yaml"
diff --git a/docs/images/mqc_fastqc_adapter.png b/docs/images/mqc_fastqc_adapter.png
deleted file mode 100755
index 361d0e47..00000000
Binary files a/docs/images/mqc_fastqc_adapter.png and /dev/null differ
diff --git a/docs/images/mqc_fastqc_counts.png b/docs/images/mqc_fastqc_counts.png
deleted file mode 100755
index cb39ebb8..00000000
Binary files a/docs/images/mqc_fastqc_counts.png and /dev/null differ
diff --git a/docs/images/mqc_fastqc_quality.png b/docs/images/mqc_fastqc_quality.png
deleted file mode 100755
index a4b89bf5..00000000
Binary files a/docs/images/mqc_fastqc_quality.png and /dev/null differ
diff --git a/docs/images/v1/genome_statistics.drawio b/docs/images/v1/genome_statistics.drawio
new file mode 100644
index 00000000..1c4832f0
--- /dev/null
+++ b/docs/images/v1/genome_statistics.drawio
@@ -0,0 +1,143 @@
+[drawio diagram XML omitted]
diff --git a/docs/images/v1/genome_statistics.png b/docs/images/v1/genome_statistics.png
new file mode 100644
index 00000000..1da359b3
Binary files /dev/null and b/docs/images/v1/genome_statistics.png differ
diff --git a/docs/images/v1/genomescope_model.drawio b/docs/images/v1/genomescope_model.drawio
new file mode 100644
index 00000000..b71f5da2
--- /dev/null
+++ b/docs/images/v1/genomescope_model.drawio
@@ -0,0 +1,77 @@
+[drawio diagram XML omitted]
diff --git a/docs/images/v1/genomescope_model.png b/docs/images/v1/genomescope_model.png
new file mode 100644
index 00000000..004a180c
Binary files /dev/null and b/docs/images/v1/genomescope_model.png differ
diff --git a/docs/images/v1/hic-mapping.drawio b/docs/images/v1/hic-mapping.drawio
new file mode 100644
index 00000000..7806b879
--- /dev/null
+++ b/docs/images/v1/hic-mapping.drawio
@@ -0,0 +1,271 @@
+[drawio diagram XML omitted]
diff --git a/docs/images/v1/hic-mapping.png b/docs/images/v1/hic-mapping.png
new file mode 100644
index 00000000..7a1cf69e
Binary files /dev/null and b/docs/images/v1/hic-mapping.png differ
diff --git a/docs/images/v1/organelles.drawio b/docs/images/v1/organelles.drawio
new file mode 100644
index 00000000..1c4ca31d
--- /dev/null
+++ b/docs/images/v1/organelles.drawio
@@ -0,0 +1,93 @@
+[drawio diagram XML omitted]
diff --git a/docs/images/v1/organelles.png b/docs/images/v1/organelles.png
new file mode 100644
index 00000000..36fa4dc7
Binary files /dev/null and b/docs/images/v1/organelles.png differ
diff --git a/docs/images/v1/polishing.drawio b/docs/images/v1/polishing.drawio
new file mode 100644
index 00000000..bfa0e4f5
--- /dev/null
+++ b/docs/images/v1/polishing.drawio
@@ -0,0 +1,312 @@
[draw.io XML diagram content not shown]
diff --git a/docs/images/v1/polishing.png b/docs/images/v1/polishing.png
new file mode 100644
index 00000000..3f638ce2
Binary files /dev/null and b/docs/images/v1/polishing.png differ
diff --git a/docs/images/v1/purge_dups.drawio b/docs/images/v1/purge_dups.drawio
new file mode 100644
index 00000000..64f4decb
--- /dev/null
+++ b/docs/images/v1/purge_dups.drawio
@@ -0,0 +1,213 @@
[draw.io XML diagram content not shown]
diff --git a/docs/images/v1/purge_dups.png b/docs/images/v1/purge_dups.png
new file mode 100644
index 00000000..901a46e0
Binary files /dev/null and b/docs/images/v1/purge_dups.png differ
diff --git a/docs/images/v1/raw_assembly.drawio b/docs/images/v1/raw_assembly.drawio
new file mode 100644
index 00000000..7d314c47
--- /dev/null
+++ b/docs/images/v1/raw_assembly.drawio
@@ -0,0 +1,108 @@
[draw.io XML diagram content not shown]
diff --git a/docs/images/v1/raw_assembly.png b/docs/images/v1/raw_assembly.png
new file mode 100644
index 00000000..aa8407b1
Binary files /dev/null and b/docs/images/v1/raw_assembly.png differ
diff --git a/docs/images/v1/scaffolding.drawio b/docs/images/v1/scaffolding.drawio
new file mode 100644
index 00000000..f7b97c87
--- /dev/null
+++ b/docs/images/v1/scaffolding.drawio
@@ -0,0 +1,395 @@
[draw.io XML diagram content not shown]
diff --git a/docs/images/v1/scaffolding.png b/docs/images/v1/scaffolding.png
new file mode 100644
index 00000000..2f1809fa
Binary files /dev/null and b/docs/images/v1/scaffolding.png differ
diff --git a/docs/output.md b/docs/output.md
index 64968fda..a858863b 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -2,59 +2,191 @@
## Introduction
-This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline.
+This document describes the output produced by the genomeassembly pipeline.
The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory.
-
+## Subworkflows
-## Pipeline overview
+The pipeline is built using [Nextflow](https://www.nextflow.io/) DSL2.
-The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
+### PREPARE_INPUT
-- [FastQC](#fastqc) - Raw read QC
-- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
-- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
+Here the input YAML is processed. This subworkflow generates the input channels used by the other subworkflows.
-### FastQC
+### GENOMESCOPE_MODEL
-Output files
+ Output files
+
+ - kmer/*ktab
+ - kmer table file
+ - kmer/*hist
+ - kmer histogram file
+ - kmer/*model.txt
+ - genomescope model in text format
+ - kmer/*[linear,log]_plot.png
+ - genomescope kmer plots
+
+
+
+This subworkflow generates a kmer database and coverage model used in [PURGE_DUPS](#purge_dups) and [GENOME_STATISTICS](#genome_statistics).
-- `fastqc/`
- - `*_fastqc.html`: FastQC report containing quality metrics.
- - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images.
+![Subworkflow for kmer profile](images/v1/genomescope_model.png)
+### RAW_ASSEMBLY
+
+
+ Output files
+
+ - .\*hifiasm.\*/.*p_ctg.[g]fa
+ - primary assembly in GFA and FASTA format; for more details refer to [hifiasm output](https://hifiasm.readthedocs.io/en/latest/interpreting-output.html)
+ - .\*hifiasm.\*/.*a_ctg.[g]fa
+ - haplotigs in GFA and FASTA format; for more details refer to [hifiasm output](https://hifiasm.readthedocs.io/en/latest/interpreting-output.html)
+ - .\*hifiasm.\*/.*bin
+ - internal binary hifiasm files; for more details refer [here](https://hifiasm.readthedocs.io/en/latest/faq.html#id12)
+
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).
+This subworkflow generates the raw assembly (or assemblies). First, hifiasm is run on the input HiFi reads, then the raw contigs are converted from GFA into FASTA format; this assembly is subject to purging, polishing (optional) and scaffolding further down the pipeline.
+If hifiasm HiC mode is switched on, it is performed as an extra step, with results stored in the hifiasm-hic folder.
+
+![Raw assembly subworkflow](images/v1/raw_assembly.png)
-![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png)
+### PURGE_DUPS
-![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png)
+
+ Output files
+
+ - \*.hifiasm..\*/purged.fa
+ - purged primary contigs
+ - \*.hifiasm..\*/purged.htigs.fa
+ - haplotigs after purging
+ - other files from the purge_dups pipeline
+ - for details refer [here](https://github.com/dfguan/purge_dups)
+
+
+Retained haplotypic duplications are identified in the primary assembly, and the alternate contigs are updated correspondingly.
+The subworkflow relies on the kmer coverage model to identify coverage thresholds. For more details see [purge_dups](https://github.com/dfguan/purge_dups).
-![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png)
+
-> **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality.
+![Subworkflow for purging haplotigs](images/v1/purge_dups.png)
-### MultiQC
+### POLISHING
-Output files
+ Output files
+
+ - \*.hifiasm..\*/polishing/.*consensus.fa
+ - polished joined primary and haplotigs assembly
+ - \*.hifiasm..\*/polishing/merged.vcf.gz
+ - unfiltered variants
+ - \*.hifiasm..\*/polishing/merged.vcf.gz.tbi
+ - index file
+ - \*.hifiasm..\*/polishing/refdata-*
+ - Longranger assembly indices
+
+
+
+This subworkflow uses read mapping of the Illumina 10X short read data to fix short errors in primary contigs and haplotigs.
+
+![Polishing subworkflow](images/v1/polishing.png)
+
+### HIC_MAPPING
+
+
+ Output files
+
+ - \*.hifiasm..\*/scaffolding/.*_merged_sorted.bed
+ - bed file obtained from merged mkdup bam
+ - \*.hifiasm..\*/scaffolding/.*mkdup.bam
+ - final read mapping bam with mapped reads
+
+
+This subworkflow implements alignment of the Illumina HiC short reads to the primary assembly. It uses [`CONVERT_STATS`](#convert_stats) as an internal subworkflow to calculate read mapping statistics.
+
+![HiC mapping subworkflow](images/v1/hic-mapping.png)
+
+### CONVERT_STATS
+
+
+ Output files
+
+ - \*.hifiasm..\*/scaffolding/.*.stats
+ - output of samtools stats
+ - \*.hifiasm..\*/scaffolding/.*.idxstats
+ - output of samtools idxstats
+ - \*.hifiasm..\*/scaffolding/.*.flagstat
+ - output of samtools flagstat
+
+
+This subworkflow produces statistics for a BAM file containing read mappings. It is executed within the [`HIC_MAPPING`](#hic_mapping) subworkflow.
+
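+In practice this amounts to roughly the following samtools calls on the merged, duplicate-marked alignment; the sketch below is illustrative only and the file names are placeholders, not actual pipeline outputs:
+
+```console
+samtools view -C -T assembly.fa -o mapping.cram mapping.bam
+samtools index mapping.cram
+samtools stats -r assembly.fa mapping.cram > mapping.stats
+samtools flagstat mapping.cram > mapping.flagstat
+samtools idxstats mapping.cram > mapping.idxstats
+```
+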
+### SCAFFOLDING
+
+
+ Output files
+
+ - \*.hifiasm..\*/scaffolding/yahs/out.break.yahs/out_scaffolds_final.fa
+ - scaffolds in FASTA format
+ - \*.hifiasm..\*/scaffolding/yahs/out.break.yahs/out_scaffolds_final.agp
+ - coordinates of contigs relative to scaffolds
+ - \*.hifiasm..\*/scaffolding/yahs/out.break.yahs/alignments_sorted.txt
+ - Alignments for Juicer in text format
+ - \*.hifiasm..\*/scaffolding/yahs/out.break.yahs/yahs_scaffolds.hic
+ - Juicer HiC map
+ - \*.hifiasm..\*/scaffolding/yahs/out.break.yahs/*cool
+ - HiC map for cooler
+ - \*.hifiasm..\*/scaffolding/yahs/out.break.yahs/*.FullMap.png
+ - Pretext snapshot
+
+
+The subworkflow performs scaffolding of the primary contigs using the HiC mapping generated in [`HIC_MAPPING`](#hic_mapping). It also performs postprocessing steps such as generating cooler and pretext files.
+
+![Scaffolding subworkflow](images/v1/scaffolding.png)
+
+### GENOME_STATISTICS
-- `multiqc/`
- - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser.
- - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline.
- - `multiqc_plots/`: directory containing static images from the report in various formats.
+
+ Output files
+
+- .\*.assembly_summary
+ - numeric statistics for pri and alt sequences
+- .\*ccs.merquryk
+ - folder with merqury plots and kmer statistics
+- .\*busco
+ - folder with BUSCO results
-[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory.
+This subworkflow is used to evaluate the quality of the assembled sequences. It is performed after intermediate steps, such as raw assembly generation, purging and polishing, and also at the end of the pipeline when scaffolds are produced.
+
+![Genome statistics subworkflow](images/v1/genome_statistics.png)
-Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see .
+### ORGANELLES
+
+
+ Output files
+
+- \*.hifiasm.\*/mito..\*/final_mitogenome.fasta
+ - organelle assembly
+- \*.hifiasm.\*/mito..\*/final_mitogenome.[gb,gff]
+ - organelle gene annotation
+- \*.hifiasm.\*/mito..\*/contigs_stats.tsv
+ - summary of mitochondrial findings
+- output also includes other output files produced by MitoHiFi
+
+
+
+This subworkflow implements assembly of organelles. In the main pipeline it is called twice: once to assemble the mitochondrion from the HiFi reads and once to identify the mitochondrion in the genome assembly.
+
+![Organelles subworkflow](images/v1/organelles.png)
### Pipeline information
+[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage.
+
Output files
@@ -64,5 +196,3 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ
- Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`.
-
-[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage.
diff --git a/docs/usage.md b/docs/usage.md
index 62718d03..b374006a 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -1,102 +1,131 @@
# sanger-tol/genomeassembly: Usage
-> _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._
-
## Introduction
-
-
-## Samplesheet input
+## Workflow input
-You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below.
+### Parameters summary
-```bash
---input '[path to samplesheet file]'
-```
+
+ Details
+
+Workflow accepts the following parameters:
+
+* input
+ - (required) YAML file containing description of the dataset, incl. ToLID, paths to the raw data etc.
+* bed_chunks_polishing
+ - a number of chunks to split contigs for polishing (default 100)
+* cool_bin
+ - a bin size for cooler (default 1000)
+* organelles_on
+ - set `True` for running the organelles subworkflow
+* polishing_on
+ - set `True` for polishing
+* hifiasm_hic_on
+ - set `True` to run hifiasm in HiC mode
+
+NB: hifiasm in the original mode is used as the main assembly even if the `hifiasm_hic_on` flag is set.
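+
+For illustration, a run with polishing and the organelles subworkflow switched on might be launched as follows; the input path, output directory and profile are placeholders:
+
+```console
+nextflow run sanger-tol/genomeassembly \
+    --input assets/dataset.yaml \
+    --outdir ./results \
+    --polishing_on true \
+    --organelles_on true \
+    --bed_chunks_polishing 100 \
+    --cool_bin 1000 \
+    -profile docker
+```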
-### Multiple runs of the same sample
+
-The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes:
+### Full samplesheet
-```console
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
-CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz
-CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz
+The input dataset is described in YAML format, which stands for "YAML Ain't Markup Language". It is a human-readable file which contains information
+about the location paths for the raw data (HiFi, 10X, HiC) used for the genome assembly. It can also contain meta information such as HiC restriction motifs,
+BUSCO lineage, mitochondrial code etc. For more information see [Input YAML definition](#input-yaml-definition).
+
+### Input YAML definition
+
+- dataset.id
+ - is used as the sample id throughout the pipeline. ToLID should be used in ToL datasets.
+- dataset.illumina_10X.reads
+ - is necessary in case polishing is applied; this field should point to the path of the folder containing the 10X reads. The sample identifier in the Illumina reads should coincide with the top-level ID. For the use of the Longranger software the reads should follow [the 10X FASTQ file naming convention](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/fastq-input).
+- dataset.pacbio.reads
+ - contains the list (`reads`) of the HiFi reads in FASTA (or gzipped FASTA) format. The pipeline implementation is based on the assumption that reads have gone through adapter/barcode checks.
+- dataset.HiC.reads
+ - contains the list (`reads`) of the HiC reads in the indexed CRAM format.
+- dataset.hic_motif
+ - is a comma-separated list of restriction sites. The pipeline was tested with the Arima dataset, but it should also work with other HiC libraries.
+- dataset.busco.lineage
+ - specifies the name of the BUSCO dataset (i.e. bacteria_odb10).
+- dataset.busco.lineage_path
+ - is an optional field containing the path to the folder with pre-downloaded BUSCO lineages.
+- dataset.mito.species
+ - is the latin name of the species to look for the mitogenome reference in the organelles subworkflow. Normally this parameter will contain the latin name of the species whose genome is being assembled.
+- dataset.mito.min_length
+ - sets the minimum length of the mitogenome, e.g. 15 kb.
+- dataset.mito.code
+ - is a mitochondrial code for the mitogenome annotation. See [here](https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi) for reference.
+
+### An example of the input YAML
+
+
+ Details
+
+The example is based on [test.yaml](../assets/test.yaml).
+```yaml
+dataset:
+  id: baUndUnlc1
+  illumina_10X:
+    reads: /lustre/scratch123/tol/resources/nextflow/test-data/Undibacterium_unclassified/genomic_data/baUndUnlc1/10x/
+  pacbio:
+    reads:
+      - reads: /lustre/scratch123/tol/resources/nextflow/test-data/Undibacterium_unclassified/genomic_data/baUndUnlc1/pacbio/fasta/HiFi.reads.fasta
+  HiC:
+    reads:
+      - reads: /lustre/scratch123/tol/resources/nextflow/test-data/Undibacterium_unclassified/genomic_data/baUndUnlc1/hic-arima2/41741_2#7.sub.cram
+hic_motif: GATC,GANTC,CTNAG,TTAA
+busco:
+  lineage: bacteria_odb10
+mito:
+  species: Caradrina clavipalpis
+  min_length: 15000
+  code: 5
+```
+
-### Full samplesheet
+## Usage
-The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below.
+### Local testing
-A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice.
+
+ Details
-```console
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
-CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz
-CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz
-TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz,
-TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz,
+The pipeline can be tested locally using a provided small test dataset:
+
+```
+cd ${GENOMEASSEMBLY_TEST_DATA}
+curl https://darwin.cog.sanger.ac.uk/genomeassembly_test_data.tar.gz | tar xzf -
+
+git clone git@github.com:sanger-tol/genomeassembly.git
+cd genomeassembly/
+sed -i "s|/home/runner/work/genomeassembly/genomeassembly|${GENOMEASSEMBLY_TEST_DATA}|" assets/test_github.yaml
+nextflow run main.nf -profile test_github,singularity --outdir ${OUTDIR} {OTHER ARGUMENTS}
```
-| Column | Description |
-| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
-| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
-| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
+These command-line steps first download and decompress the test data, then clone the pipeline and modify the YAML so that it matches the dataset location in your file system.
+The last command runs the test.
-An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
+You should now be able to run the pipeline as you see fit.
-## Running the pipeline
+
+
+### Running the pipeline
The typical command for running the pipeline is as follows:
-```bash
-nextflow run sanger-tol/genomeassembly --input samplesheet.csv --outdir --genome GRCh37 -profile docker
+```console
+nextflow run sanger-tol/genomeassembly --input assets/dataset.yaml --outdir <OUTDIR> -profile docker,sanger
```
-This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.
+This will launch the pipeline with the `docker` configuration profile, also using your institution profile if available (see [nf-core/configs](#nf-core_configs)). See below for more information about profiles.
Note that the pipeline will create the following files in your working directory:
-```bash
+```console
work # Directory containing the nextflow working files
<OUTDIR> # Finished results in specified location (defined with --outdir)
.nextflow_log # Log file from Nextflow
# Other nextflow hidden files, eg. history of pipeline runs and old logs.
```
-If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file.
-
-Pipeline settings can be provided in a `yaml` or `json` file via `-params-file `.
-
-> ⚠️ Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args).
-> The above pipeline run specified with a params file in yaml format:
-
-```bash
-nextflow run sanger-tol/genomeassembly -profile docker -params-file params.yaml
-```
-
-with `params.yaml` containing:
-
-```yaml
-input: './samplesheet.csv'
-outdir: './results/'
-genome: 'GRCh37'
-input: 'data'
-<...>
-```
-
-You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch).
-
### Updating the pipeline
When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline:
-```bash
+```console
nextflow pull sanger-tol/genomeassembly
```
@@ -104,13 +133,9 @@ nextflow pull sanger-tol/genomeassembly
It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since.
-First, go to the [sanger-tol/genomeassembly releases page](https://github.com/sanger-tol/genomeassembly/releases) and find the latest pipeline version - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. Of course, you can switch to another version by changing the number after the `-r` flag.
+First, go to the [sanger-tol/genomeassembly releases page](https://github.com/sanger-tol/genomeassembly/releases) and find the latest version number - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`.
-This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. For example, at the bottom of the MultiQC reports.
-
-To further assist in reproducbility, you can use share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter.
-
-> 💡 If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles.
+This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future.
## Core Nextflow arguments
@@ -120,7 +145,7 @@ To further assist in reproducbility, you can use share and re-use [parameter fil
Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments.
-Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below.
+Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Conda) - see below. When using Biocontainers, most of these software packaging methods pull Docker containers from quay.io e.g [FastQC](https://quay.io/repository/biocontainers/fastqc) except for Singularity which directly downloads Singularity images via https hosted by the [Galaxy project](https://depot.galaxyproject.org/singularity/) and Conda which downloads and installs software locally from [Bioconda](https://bioconda.github.io/).
> We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported.
@@ -129,11 +154,8 @@ The pipeline also dynamically loads configurations from [https://github.com/nf-c
Note that multiple profiles can be loaded, for example: `-profile test,docker` - the order of arguments is important!
They are loaded in sequence, so later profiles can overwrite earlier profiles.
-If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended, since it can lead to different results on different machines dependent on the computer enviroment.
+If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended.
-- `test`
- - A profile with a complete configuration for automated testing
- - Includes links to test data so needs no other parameters
- `docker`
- A generic configuration profile to be used with [Docker](https://docker.com/)
- `singularity`
@@ -144,10 +166,11 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof
- A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/)
- `charliecloud`
- A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/)
-- `apptainer`
- - A generic configuration profile to be used with [Apptainer](https://apptainer.org/)
- `conda`
- - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter, Charliecloud, or Apptainer.
+ - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter or Charliecloud.
+- `test`
+ - A profile with a complete configuration for automated testing
+ - Includes links to test data so needs no other parameters
### `-resume`
@@ -163,23 +186,11 @@ Specify the path to a specific config file (this is a core Nextflow command). Se
### Resource requests
-Whilst the default requirements set within the pipeline will hopefully work for most people and with most input data, you may find that you want to customise the compute resources that the pipeline requests. Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with any of the error codes specified [here](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L18) it will automatically be resubmitted with higher requests (2 x original, then 3 x original). If it still fails after the third attempt then the pipeline execution is stopped.
+Whilst the default requirements set within the pipeline will hopefully work for most people and with most input data, you may find that you want to customise the compute resources that the pipeline requests. Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with any of the error codes specified [here](../conf/base.config#L18) it will automatically be resubmitted with higher requests (2 x original, then 3 x original). If it still fails after the third attempt then the pipeline execution is stopped.
To change the resource requests, please see the [max resources](https://nf-co.re/docs/usage/configuration#max-resources) and [tuning workflow resources](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources) section of the nf-core website.
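+
+As an illustration, a minimal custom config passed with `-c` that raises the resources of a single step might look like the sketch below; the process name and values are placeholders and should be adapted to your data and scheduler:
+
+```groovy
+// custom.config: request more resources for the BUSCO process
+process {
+    withName: 'BUSCO' {
+        cpus   = 16
+        memory = 64.GB
+        time   = 48.h
+    }
+}
+```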
-### Custom Containers
-
-In some cases you may wish to change which container or conda environment a step of the pipeline uses for a particular tool. By default nf-core pipelines use containers and software from the [biocontainers](https://biocontainers.pro/) or [bioconda](https://bioconda.github.io/) projects. However in some cases the pipeline specified version maybe out of date.
-
-To use a different container from the default container or conda environment specified in a pipeline, please see the [updating tool versions](https://nf-co.re/docs/usage/configuration#updating-tool-versions) section of the nf-core website.
-
-### Custom Tool Arguments
-
-A pipeline might not always support every possible argument or option of a particular tool used in pipeline. Fortunately, nf-core pipelines provide some freedom to users to insert additional parameters that the pipeline does not include by default.
-
-To learn how to provide additional arguments to a particular tool of the pipeline, please see the [customising tool arguments](https://nf-co.re/docs/usage/configuration#customising-tool-arguments) section of the nf-core website.
-
-### nf-core/configs
+### nf-core/configs
In most cases, you will only need to create a custom config as a one-off but if you and others within your organisation are likely to be running nf-core pipelines regularly and need to use the same settings regularly it may be a good idea to request that your custom config file is uploaded to the `nf-core/configs` git repository. Before you do this please can you test that the config file works with your pipeline of choice using the `-c` parameter. You can then create a pull request to the `nf-core/configs` repository with the addition of your config file, associated documentation file (see examples in [`nf-core/configs/docs`](https://github.com/nf-core/configs/tree/master/docs)), and amending [`nfcore_custom.config`](https://github.com/nf-core/configs/blob/master/nfcore_custom.config) to include your custom profile.
@@ -187,14 +198,6 @@ See the main [Nextflow documentation](https://www.nextflow.io/docs/latest/config
If you have any questions or issues please send us a message on [Slack](https://nf-co.re/join/slack) on the [`#configs` channel](https://nfcore.slack.com/channels/configs).
-## Azure Resource Requests
-
-To be used with the `azurebatch` profile by specifying the `-profile azurebatch`.
-We recommend providing a compute `params.vm_type` of `Standard_D16_v3` VMs by default but these options can be changed if required.
-
-Note that the choice of VM size depends on your quota and the overall workload during the analysis.
-For a thorough list, please refer the [Azure Sizes for virtual machines in Azure](https://docs.microsoft.com/en-us/azure/virtual-machines/sizes).
-
## Running in the background
Nextflow handles job submissions and supervises the running jobs. The Nextflow process must run until the pipeline is finished.
@@ -209,6 +212,6 @@ Some HPC setups also allow you to run nextflow within a cluster job submitted yo
In some cases, the Nextflow Java virtual machines can start to request a large amount of memory.
We recommend adding the following line to your environment to limit this (typically in `~/.bashrc` or `~./bash_profile`):
-```bash
+```console
NXF_OPTS='-Xms1g -Xmx4g'
```
diff --git a/lib/WorkflowGenomeassembly.groovy b/lib/WorkflowGenomeassembly.groovy
index 9d829808..48bd88d8 100755
--- a/lib/WorkflowGenomeassembly.groovy
+++ b/lib/WorkflowGenomeassembly.groovy
@@ -7,13 +7,6 @@ import groovy.text.SimpleTemplateEngine
class WorkflowGenomeassembly {
- //
- // Check and validate parameters
- //
- public static void initialise(params, log) {
- genomeExistsError(params, log)
- }
-
//
// Get workflow summary for MultiQC
//
@@ -58,17 +51,4 @@ class WorkflowGenomeassembly {
return description_html
}
- //
- // Exit pipeline if incorrect --genome key provided
- //
- private static void genomeExistsError(params, log) {
- if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) {
- def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" +
- " Genome '${params.genome}' not found in any config files provided to the pipeline.\n" +
- " Currently, the available genome keys are:\n" +
- " ${params.genomes.keySet().join(", ")}\n" +
- "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
- Nextflow.error(error_string)
- }
- }
}
diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy
index 9499b819..c3e4f726 100755
--- a/lib/WorkflowMain.groovy
+++ b/lib/WorkflowMain.groovy
@@ -86,15 +86,4 @@ class WorkflowMain {
Nextflow.error("Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'")
}
}
- //
- // Get attribute from genome config file e.g. fasta
- //
- public static Object getGenomeAttribute(params, attribute) {
- if (params.genomes && params.genome && params.genomes.containsKey(params.genome)) {
- if (params.genomes[ params.genome ].containsKey(attribute)) {
- return params.genomes[ params.genome ][ attribute ]
- }
- }
- return null
- }
}
diff --git a/lib/WorkflowSanger-tol-genomeassembly.groovy b/lib/WorkflowSanger-tol-genomeassembly.groovy
index a6a57e21..de4b7754 100755
--- a/lib/WorkflowSanger-tol-genomeassembly.groovy
+++ b/lib/WorkflowSanger-tol-genomeassembly.groovy
@@ -4,15 +4,6 @@
class WorkflowSanger-tol-genomeassembly {
- //
- // Check and validate parameters
- //
- public static void initialise(params, log) {
- genomeExistsError(params, log)
-
-
- }
-
//
// Get workflow summary for MultiQC
//
@@ -38,17 +29,5 @@ class WorkflowSanger-tol-genomeassembly {
yaml_file_text += "data: |\n"
yaml_file_text += "${summary_section}"
return yaml_file_text
- }//
- // Exit pipeline if incorrect --genome key provided
- //
- private static void genomeExistsError(params, log) {
- if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) {
- log.error "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" +
- " Genome '${params.genome}' not found in any config files provided to the pipeline.\n" +
- " Currently, the available genome keys are:\n" +
- " ${params.genomes.keySet().join(", ")}\n" +
- "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
- System.exit(1)
- }
}
}
diff --git a/modules.json b/modules.json
index 77812319..99c11583 100644
--- a/modules.json
+++ b/modules.json
@@ -81,11 +81,6 @@
"git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b",
"installed_by": ["modules"]
},
- "fastqc": {
- "branch": "master",
- "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
- "installed_by": ["modules"]
- },
"freebayes": {
"branch": "master",
"git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b",
@@ -141,11 +136,6 @@
"installed_by": ["modules"],
"patch": "modules/nf-core/mitohifi/mitohifi/mitohifi-mitohifi.diff"
},
- "multiqc": {
- "branch": "master",
- "git_sha": "ee80d14721e76e2e079103b8dcd5d57129e584ba",
- "installed_by": ["modules"]
- },
"pretextmap": {
"branch": "master",
"git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b",
@@ -154,7 +144,8 @@
"pretextsnapshot": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
- "installed_by": ["modules"]
+ "installed_by": ["modules"],
+ "patch": "modules/nf-core/pretextsnapshot/pretextsnapshot.diff"
},
"purgedups/calcuts": {
"branch": "master",
diff --git a/modules/local/bamtobed_sort.nf b/modules/local/bamtobed_sort.nf
index 8eff315c..c4d1db6e 100644
--- a/modules/local/bamtobed_sort.nf
+++ b/modules/local/bamtobed_sort.nf
@@ -24,7 +24,7 @@ process BAMTOBED_SORT {
def st_cores = task.cpus > 4 ? 4 : "${task.cpus}"
def buffer_mem = task.memory.toGiga() / 2
"""
- samtools view -@${st_cores} -u -F0x400 ${bam} | bamToBed | sort -k4 --parallel=${task.cpus} -S ${buffer_mem}G > ${prefix}_merged_sorted.bed
+ samtools view -@${st_cores} -u -F0x400 ${bam} | bamToBed | sort -k4 --parallel=${task.cpus} -S ${buffer_mem}G -T . > ${prefix}_merged_sorted.bed
cat <<-END_VERSIONS > versions.yml
"${task.process}":
diff --git a/modules/local/get_calcuts_params.nf b/modules/local/get_calcuts_params.nf
index 54b074b0..fac83cca 100644
--- a/modules/local/get_calcuts_params.nf
+++ b/modules/local/get_calcuts_params.nf
@@ -22,7 +22,7 @@ process GET_CALCUTS_PARAMS {
cat <<-END_VERSIONS > versions.yml
"${task.process}":
- \$(python --version)
+ python: \$(python --version)
END_VERSIONS
"""
}
diff --git a/modules/local/gfa_to_fasta.nf b/modules/local/gfa_to_fasta.nf
index e826e0c4..d273d9ab 100644
--- a/modules/local/gfa_to_fasta.nf
+++ b/modules/local/gfa_to_fasta.nf
@@ -2,7 +2,7 @@ process GFA_TO_FASTA {
tag "$meta.id"
label 'process_high'
- conda (params.enable_conda ? "conda-forge::gawk=5.1.0" : null)
+ conda "conda-forge::gawk=5.1.0"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/gawk:5.1.0' :
'quay.io/biocontainers/gawk:5.1.0' }"
diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf
deleted file mode 100644
index 9ae58381..00000000
--- a/modules/nf-core/fastqc/main.nf
+++ /dev/null
@@ -1,51 +0,0 @@
-process FASTQC {
- tag "$meta.id"
- label 'process_medium'
-
- conda "bioconda::fastqc=0.11.9"
- container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
- 'https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0' :
- 'quay.io/biocontainers/fastqc:0.11.9--0' }"
-
- input:
- tuple val(meta), path(reads)
-
- output:
- tuple val(meta), path("*.html"), emit: html
- tuple val(meta), path("*.zip") , emit: zip
- path "versions.yml" , emit: versions
-
- when:
- task.ext.when == null || task.ext.when
-
- script:
- def args = task.ext.args ?: ''
- def prefix = task.ext.prefix ?: "${meta.id}"
- // Make list of old name and new name pairs to use for renaming in the bash while loop
- def old_new_pairs = reads instanceof Path || reads.size() == 1 ? [[ reads, "${prefix}.${reads.extension}" ]] : reads.withIndex().collect { entry, index -> [ entry, "${prefix}_${index + 1}.${entry.extension}" ] }
- def rename_to = old_new_pairs*.join(' ').join(' ')
- def renamed_files = old_new_pairs.collect{ old_name, new_name -> new_name }.join(' ')
- """
- printf "%s %s\\n" $rename_to | while read old_name new_name; do
- [ -f "\${new_name}" ] || ln -s \$old_name \$new_name
- done
- fastqc $args --threads $task.cpus $renamed_files
-
- cat <<-END_VERSIONS > versions.yml
- "${task.process}":
- fastqc: \$( fastqc --version | sed -e "s/FastQC v//g" )
- END_VERSIONS
- """
-
- stub:
- def prefix = task.ext.prefix ?: "${meta.id}"
- """
- touch ${prefix}.html
- touch ${prefix}.zip
-
- cat <<-END_VERSIONS > versions.yml
- "${task.process}":
- fastqc: \$( fastqc --version | sed -e "s/FastQC v//g" )
- END_VERSIONS
- """
-}
diff --git a/modules/nf-core/fastqc/meta.yml b/modules/nf-core/fastqc/meta.yml
deleted file mode 100644
index 4da5bb5a..00000000
--- a/modules/nf-core/fastqc/meta.yml
+++ /dev/null
@@ -1,52 +0,0 @@
-name: fastqc
-description: Run FastQC on sequenced reads
-keywords:
- - quality control
- - qc
- - adapters
- - fastq
-tools:
- - fastqc:
- description: |
- FastQC gives general quality metrics about your reads.
- It provides information about the quality score distribution
- across your reads, the per base sequence content (%A/C/G/T).
- You get information about adapter contamination and other
- overrepresented sequences.
- homepage: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
- documentation: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/
- licence: ["GPL-2.0-only"]
-input:
- - meta:
- type: map
- description: |
- Groovy Map containing sample information
- e.g. [ id:'test', single_end:false ]
- - reads:
- type: file
- description: |
- List of input FastQ files of size 1 and 2 for single-end and paired-end data,
- respectively.
-output:
- - meta:
- type: map
- description: |
- Groovy Map containing sample information
- e.g. [ id:'test', single_end:false ]
- - html:
- type: file
- description: FastQC report
- pattern: "*_{fastqc.html}"
- - zip:
- type: file
- description: FastQC report archive
- pattern: "*_{fastqc.zip}"
- - versions:
- type: file
- description: File containing software versions
- pattern: "versions.yml"
-authors:
- - "@drpatelh"
- - "@grst"
- - "@ewels"
- - "@FelixKrueger"
diff --git a/modules/nf-core/pretextsnapshot/main.nf b/modules/nf-core/pretextsnapshot/main.nf
index a881fe0b..254ed22d 100644
--- a/modules/nf-core/pretextsnapshot/main.nf
+++ b/modules/nf-core/pretextsnapshot/main.nf
@@ -19,7 +19,7 @@ process PRETEXTSNAPSHOT {
script:
def args = task.ext.args ?: ''
- def prefix = task.ext.prefix ?: "${meta.id}"
+ def prefix = task.ext.prefix ?: "${meta.id}."
"""
PretextSnapshot \\
$args \\
diff --git a/modules/nf-core/pretextsnapshot/pretextsnapshot.diff b/modules/nf-core/pretextsnapshot/pretextsnapshot.diff
new file mode 100644
index 00000000..768880ba
--- /dev/null
+++ b/modules/nf-core/pretextsnapshot/pretextsnapshot.diff
@@ -0,0 +1,14 @@
+Changes in module 'nf-core/pretextsnapshot'
+--- modules/nf-core/pretextsnapshot/main.nf
++++ modules/nf-core/pretextsnapshot/main.nf
+@@ -19,7 +19,7 @@
+
+ script:
+ def args = task.ext.args ?: ''
+- def prefix = task.ext.prefix ?: "${meta.id}"
++ def prefix = task.ext.prefix ?: "${meta.id}."
+ """
+ PretextSnapshot \\
+ $args \\
+
+************************************************************
diff --git a/nextflow.config b/nextflow.config
index 68315f2b..bb8b0e2e 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -26,10 +26,10 @@ params {
bed_chunks_polishing = 10
// Scaffolding postprocessing
- cool_bin = null
+ cool_bin = 1000
// Boilerplate options
- outdir = null
+ outdir = "./results"
tracedir = "${params.outdir}/genomeassembly_info"
publish_dir_mode = 'copy'
email = null
@@ -81,6 +81,7 @@ try {
profiles {
+ cleanup { cleanup = true }
debug {
dumpHashes = true
process.beforeScript = 'echo $HOSTNAME'
@@ -204,6 +205,7 @@ report {
trace {
enabled = true
file = "${params.tracedir}/execution_trace_${trace_timestamp}.txt"
+ fields = 'name,status,module,cpus,memory,attempt,realtime,%cpu,%mem,peak_rss'
}
dag {
enabled = true
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 2a6d755e..0e0f0ded 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -59,15 +59,18 @@
},
"timestamp": {
"type": "string",
- "hidden": true
+ "hidden": true,
+ "description": "Used for output naming, defined at runtime"
},
"hifiasm": {
"type": "string",
- "hidden": true
+ "hidden": true,
+ "description": "Used for hifiasm output naming, defined at runtime"
},
"hifiasmhic": {
"type": "string",
- "hidden": true
+ "hidden": true,
+ "description": "Used for hifiasm-hic output naming, defined at runtime"
}
}
},
diff --git a/subworkflows/local/convert_stats.nf b/subworkflows/local/convert_stats.nf
index 47cb5a58..35cbe9e7 100644
--- a/subworkflows/local/convert_stats.nf
+++ b/subworkflows/local/convert_stats.nf
@@ -19,37 +19,43 @@ workflow CONVERT_STATS {
main:
ch_versions = Channel.empty()
- // Convert BAM to CRAM
+ //
+ // MODULE: CONVERT BAM TO CRAM
+ //
SAMTOOLS_VIEW ( bam, fasta, [] )
ch_versions = ch_versions.mix(SAMTOOLS_VIEW.out.versions.first())
- // Index CRAM file
+ //
+ // MODULE: INDEX CRAM FILE
+ //
SAMTOOLS_INDEX ( SAMTOOLS_VIEW.out.cram )
ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions.first())
- // Combine CRAM and CRAI into one channel
+ //
+ // LOGIC: COMBINE CRAM AND CRAI INTO ONE CHANNEL
+ //
SAMTOOLS_VIEW.out.cram
.join(SAMTOOLS_INDEX.out.crai, by: [0], remainder: true)
.set { ch_cram_crai }
- // Calculate statistics
+ //
+ // MODULE: CALCULATE STATS
+ //
SAMTOOLS_STATS ( ch_cram_crai, fasta )
ch_versions = ch_versions.mix(SAMTOOLS_STATS.out.versions.first())
- // Calculate statistics based on flag values
+ //
+ // MODULE: CALCULATE STATISTICS BASED ON FLAG VALUES
+ //
SAMTOOLS_FLAGSTAT ( ch_cram_crai )
ch_versions = ch_versions.mix(SAMTOOLS_FLAGSTAT.out.versions.first())
- // Calculate index statistics
+ //
+ // MODULE: CALCULATE INDEX STATISTICS
+ //
SAMTOOLS_IDXSTATS ( ch_cram_crai )
ch_versions = ch_versions.mix(SAMTOOLS_IDXSTATS.out.versions.first())
emit:
- cram = SAMTOOLS_VIEW.out.cram
- crai = SAMTOOLS_INDEX.out.crai
- stats = SAMTOOLS_STATS.out.stats
- flagstat = SAMTOOLS_FLAGSTAT.out.flagstat
- idxstats = SAMTOOLS_IDXSTATS.out.idxstats
-
versions = ch_versions
}
diff --git a/subworkflows/local/assembly_stats.nf b/subworkflows/local/genome_statistics.nf
similarity index 83%
rename from subworkflows/local/assembly_stats.nf
rename to subworkflows/local/genome_statistics.nf
index 4ca2c557..37524e45 100644
--- a/subworkflows/local/assembly_stats.nf
+++ b/subworkflows/local/genome_statistics.nf
@@ -22,17 +22,32 @@ workflow GENOME_STATISTICS {
main:
ch_versions = Channel.empty()
+ //
+ // LOGIC: SEPARATE PRIMARY INTO A CHANNEL
+ //
assembly.map{ meta, primary, haplotigs -> [meta, primary] }
.set{ primary_ch }
+ //
+ // MODULE: RUN GFASTATS ON PRIMARY ASSEMBLY
+ //
GFASTATS_PRI( primary_ch, 'fasta', [], [], [], [], [], [] )
ch_versions = ch_versions.mix(GFASTATS_PRI.out.versions.first())
+ //
+ // LOGIC: SEPARATE HAP INTO A CHANNEL
+ //
assembly.map{ meta, primary, haplotigs -> [meta, haplotigs] }
.set{ haplotigs_ch }
+
+ //
+ // MODULE: RUN GFASTATS ON HAPLOTIGS
+ //
GFASTATS_HAP( haplotigs_ch, 'fasta', [], [], [], [], [], [] )
- // BUSCO
+ //
+ // MODULE: RUN BUSCO ON PRIMARY ASSEMBLY
+ //
BUSCO ( primary_ch.join(lineage)
.map{ meta, primary, lineage_db, lineage_name ->
[[id:meta.id, lineage:lineage_name], primary]},
@@ -41,21 +56,22 @@ workflow GENOME_STATISTICS {
[] )
ch_versions = ch_versions.mix(BUSCO.out.versions.first())
- // MerquryFK
+ //
+ // LOGIC: JOIN ASSEMBLY AND KMER DATABASE INPUT
+ //
hist.join(ktab).join(assembly)
.map{ meta, hist, ktab, primary, hap ->
hap.size() ? [ meta, hist, ktab, primary, hap ] :
[ meta, hist, ktab, primary, [] ] }
.set{ ch_merq }
+
+ //
+ // MODULE: RUN KMER ANALYSIS WITH MERQURYFK
+ //
MERQURYFK_MERQURYFK ( ch_merq )
ch_versions = ch_versions.mix(MERQURYFK_MERQURYFK.out.versions.first())
emit:
- busco = BUSCO.out.short_summaries_json // meta, path("short_summary.*.json")
- merquryk_completeness = MERQURYFK_MERQURYFK.out.stats // meta, stats
- merquryk_qv = MERQURYFK_MERQURYFK.out.qv // meta, qv
- assembly_stats_pri = GFASTATS_PRI.out.assembly_summary // path("*.assembly_summary")
- assembly_stats_alt = GFASTATS_HAP.out.assembly_summary // path("*.assembly_summary")
versions = ch_versions
}
diff --git a/subworkflows/local/genomescope_model.nf b/subworkflows/local/genomescope_model.nf
index 37330640..f2be25d8 100644
--- a/subworkflows/local/genomescope_model.nf
+++ b/subworkflows/local/genomescope_model.nf
@@ -9,22 +9,50 @@ workflow GENOMESCOPE_MODEL {
reads // [meta, [reads]]
main:
- reads.flatMap { meta, reads -> reads instanceof List ? reads.collect{ [ meta, it ] } : [ [ meta, reads ] ] }
- .set{ reads_ch }
+ ch_versions = Channel.empty()
- CAT_CAT_READS( reads_ch )
- CAT_CAT_READS.out.file_out.map{ meta, reads -> reads.getName().endsWith('gz') ? [meta, reads.getParent().toString() + '/' + reads.getBaseName().toString() + '.fa.gz'] : [meta, reads.getParent().toString() + '/' + reads.getBaseName().toString() + '.fa'] }
- .set{ reads_merged_ch }
+ //
+ // MODULE: MERGE ALL READS IN ONE FILE
+ //
+ CAT_CAT_READS( reads )
+ ch_versions = ch_versions.mix(CAT_CAT_READS.out.versions)
+
+ //
+ // LOGIC: KEEP THE CORRECT EXTENSION
+ //
+ CAT_CAT_READS.out.file_out.map{ meta, reads_ch -> reads_ch.getName().endsWith('gz')
+ ? [meta, reads_ch.getParent().toString() + '/' + reads_ch.getBaseName().toString() + '.fa.gz'] : [meta, reads_ch.getParent().toString() + '/' + reads_ch.getBaseName().toString() + '.fa'] }
+ .set{ reads_merged_ch }
+
+ //
+ // LOGIC: MAKE SURE MERGED READS HAVE THE PROPER EXTENSION
+ //
CAT_CAT_READS.out.file_out.join(reads_merged_ch)
.map{ meta, reads_old, reads_new -> reads_old.renameTo(reads_new); }
+
+ //
+ // MODULE: GENERATE KMER DATABASE
+ //
FASTK_FASTK( reads_merged_ch )
+ ch_versions = ch_versions.mix(FASTK_FASTK.out.versions)
+
+ //
+ // MODULE: EXTRACT THE KMER HISTOGRAM
+ //
FASTK_HISTEX( FASTK_FASTK.out.hist )
- GENESCOPEFK ( FASTK_HISTEX.out.hist )
+ ch_versions = ch_versions.mix(FASTK_HISTEX.out.versions)
+
+ //
+ // MODULE: GENERATE GENOMESCOPE KMER COVERAGE MODEL
+ //
+ GENESCOPEFK( FASTK_HISTEX.out.hist )
+ ch_versions = ch_versions.mix(GENESCOPEFK.out.versions)
emit:
model = GENESCOPEFK.out.model
hist = FASTK_FASTK.out.hist
ktab = FASTK_FASTK.out.ktab
-
+
+ versions = ch_versions
}
diff --git a/subworkflows/local/hic_mapping.nf b/subworkflows/local/hic_mapping.nf
index 8ac720f5..5a84f792 100644
--- a/subworkflows/local/hic_mapping.nf
+++ b/subworkflows/local/hic_mapping.nf
@@ -11,6 +11,7 @@ include { BWAMEM2_INDEX } from '../../modu
include { SAMTOOLS_FAIDX as SAMTOOLS_FAIDX_HIC_MAPPING } from '../../modules/nf-core/samtools/faidx/main'
include { SAMTOOLS_MERGE as SAMTOOLS_MERGE_HIC_MAPPING } from '../../modules/nf-core/samtools/merge/main'
include { SAMTOOLS_MARKDUP as SAMTOOLS_MARKDUP_HIC_MAPPING } from '../../modules/nf-core/samtools/markdup/main'
+
include { BAMTOBED_SORT } from '../../modules/local/bamtobed_sort'
include { GENERATE_CRAM_CSV } from '../../modules/local/generate_cram_csv'
include { CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT } from '../../modules/local/cram_filter_align_bwamem2_fixmate_sort'
@@ -26,13 +27,13 @@ workflow HIC_MAPPING {
ch_versions = Channel.empty()
//
- // MODULE: Indexing on reference output the folder of indexing files
+ // MODULE: INDEX REFERENCE FASTA
//
BWAMEM2_INDEX (reference_tuple)
- ch_versions = ch_versions.mix(BWAMEM2_INDEX.out.versions)
+ ch_versions = ch_versions.mix(BWAMEM2_INDEX.out.versions)
//
- // LOGIC: make channel of hic reads as input for GENERATE_CRAM_CSV
+ // LOGIC: JOIN HIC READS AND REFERENCE INTO A NEW CHANNEL
//
reference_tuple
.join( hic_reads_path )
@@ -41,13 +42,13 @@ workflow HIC_MAPPING {
.set { get_reads_input }
//
- // MODULE: generate a cram csv file containing the required parametres for CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT
+ // MODULE: GENERATE A CSV LISTING CRAM CHUNKS
//
GENERATE_CRAM_CSV ( get_reads_input )
- ch_versions = ch_versions.mix(GENERATE_CRAM_CSV.out.versions)
+ ch_versions = ch_versions.mix(GENERATE_CRAM_CSV.out.versions)
//
- // LOGIC: organise all parametres into a channel for CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT
+ // LOGIC: REFACTOR CHANNELS TO GET INPUT FOR CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT
//
ch_filtering_input = GENERATE_CRAM_CSV.out.csv
.splitCsv()
@@ -68,13 +69,13 @@ workflow HIC_MAPPING {
}
//
- // MODULE: parallel proccessing bwa-mem2 alignment by given interval of containers from cram files
+ // MODULE: PERFORM READ MAPPING IN PARALLEL USING CRAM INTERVALS
//
CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT ( ch_filtering_input )
- ch_versions = ch_versions.mix(CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT.out.versions)
+ ch_versions = ch_versions.mix(CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT.out.versions)
//
- // LOGIC: PREPARING BAMS FOR MERGE
+ // LOGIC: PREPARE BAMS FOR MERGE
//
CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT.out.mappedbam
.map{ meta, file ->
@@ -97,7 +98,7 @@ workflow HIC_MAPPING {
SAMTOOLS_FAIDX_HIC_MAPPING( reference_tuple, [[],[]] )
//
- // LOGIC: PREPARING MERGE INPUT
+ // LOGIC: PREPARE MERGE INPUT
//
reference_tuple
.combine( SAMTOOLS_FAIDX_HIC_MAPPING.out.fai )
@@ -109,36 +110,38 @@ workflow HIC_MAPPING {
.set { ref_files }
//
- // MODULE: MERGE POSITION SORTED BAM FILES AND MARK DUPLICATES
+ // MODULE: MERGE POSITION SORTED BAM FILES
//
SAMTOOLS_MERGE_HIC_MAPPING ( collected_files_for_merge, ref_files.reference_meta, ref_files.ref_idx )
- ch_versions = ch_versions.mix ( SAMTOOLS_MERGE_HIC_MAPPING.out.versions.first() )
+ ch_versions = ch_versions.mix ( SAMTOOLS_MERGE_HIC_MAPPING.out.versions.first() )
//
- // MODULE: MERGE POSITION SORTED BAM FILES AND MARK DUPLICATES
+ // MODULE: MARK DUPLICATES ON THE MERGED BAM
//
SAMTOOLS_MARKDUP_HIC_MAPPING ( SAMTOOLS_MERGE_HIC_MAPPING.out.bam, ref_files.reference )
- ch_versions = ch_versions.mix ( SAMTOOLS_MARKDUP_HIC_MAPPING.out.versions )
+ ch_versions = ch_versions.mix ( SAMTOOLS_MARKDUP_HIC_MAPPING.out.versions )
//
- // MODULE: SAMTOOLS FILTER OUT DUPLICATE READS | BAMTOBED | SORT BED FILE
+ // MODULE: FILTER OUT DUPLICATE READS, CONVERT BAM TO BED AND SORT BED FILE
//
BAMTOBED_SORT( SAMTOOLS_MARKDUP_HIC_MAPPING.out.bam )
- ch_versions = ch_versions.mix( BAMTOBED_SORT.out.versions )
+ ch_versions = ch_versions.mix( BAMTOBED_SORT.out.versions )
+ //
+ // LOGIC: GENERATE INPUT FOR STATS SUBWORKFLOW
+ //
SAMTOOLS_MARKDUP_HIC_MAPPING.out.bam
.map { meta, bam -> [ meta, bam, [] ] }
.set { ch_stat }
+ //
+ // SUBWORKFLOW: PRODUCE READ MAPPING STATS
+ //
CONVERT_STATS ( ch_stat, ref_files.reference )
+ ch_versions = ch_versions.mix( CONVERT_STATS.out.versions )
emit:
bed = BAMTOBED_SORT.out.sorted_bed
- cram = CONVERT_STATS.out.cram
- crai = CONVERT_STATS.out.crai
- stats = CONVERT_STATS.out.stats
- idxstats = CONVERT_STATS.out.idxstats
- flagstat = CONVERT_STATS.out.flagstat
versions = ch_versions
diff --git a/subworkflows/local/organelles.nf b/subworkflows/local/organelles.nf
index 363e35e3..9f20f8ff 100644
--- a/subworkflows/local/organelles.nf
+++ b/subworkflows/local/organelles.nf
@@ -1,5 +1,5 @@
-include { MITOHIFI_FINDMITOREFERENCE } from '../../modules/nf-core/mitohifi/findmitoreference/main'
-include { MITOHIFI_MITOHIFI } from '../../modules/nf-core/mitohifi/mitohifi/main'
+include { MITOHIFI_FINDMITOREFERENCE } from '../../modules/nf-core/mitohifi/findmitoreference/main'
+include { MITOHIFI_MITOHIFI } from '../../modules/nf-core/mitohifi/mitohifi/main'
workflow ORGANELLES {
take:
@@ -9,16 +9,29 @@ workflow ORGANELLES {
main:
ch_versions = Channel.empty()
+ //
+ // LOGIC: SEPARATE INPUT INTO CHANNELS
+ //
mito_info.map{ species, min_length, code, email -> species}.set{species}
mito_info.map{ species, min_length, code, email -> min_length}.set{min_length}
mito_info.map{ species, min_length, code, email -> code}.set{code}
mito_info.map{ species, min_length, code, email -> email}.set{email}
+
+ //
+ // MODULE: DOWNLOAD REFERENCE ORGANELLE ASSEMBLY
+ //
MITOHIFI_FINDMITOREFERENCE(species, email, min_length)
+ ch_versions = ch_versions.mix(MITOHIFI_FINDMITOREFERENCE.out.versions.first())
+ //
+ // MODULE: IDENTIFY ORGANELLE IN THE DATASET
+ //
MITOHIFI_MITOHIFI( input,
MITOHIFI_FINDMITOREFERENCE.out.fasta,
MITOHIFI_FINDMITOREFERENCE.out.gb,
code)
+ ch_versions = ch_versions.mix(MITOHIFI_MITOHIFI.out.versions.first())
+
emit:
versions = ch_versions
diff --git a/subworkflows/local/polishing.nf b/subworkflows/local/polishing.nf
index b863b758..9d036f28 100644
--- a/subworkflows/local/polishing.nf
+++ b/subworkflows/local/polishing.nf
@@ -7,6 +7,7 @@ include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_FB } from '../../modules/nf-c
include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_NORM } from '../../modules/nf-core/bcftools/index/main'
include { GATK4_MERGEVCFS as MERGE_FREEBAYES } from '../../modules/nf-core/gatk4/mergevcfs/main'
include { FREEBAYES } from '../../modules/nf-core/freebayes/main'
+
include { BED_CHUNKS } from '../../modules/local/bed_chunks'
include { LONGRANGER_COVERAGE } from '../../modules/local/longranger_coverage'
include { LONGRANGER_MKREF } from '../../modules/local/longranger/mkref/main'
@@ -22,80 +23,158 @@ workflow POLISHING {
ch_versions = Channel.empty()
//
- // Polishing step 1: map reads to the reference
+ // LOGIC: SEPARATE ASSEMBLY FILE INTO CHANNEL
//
fasta_in.map{ meta, fasta, fai -> [meta, fasta] }
.set{ fasta_ch }
+ //
+ // MODULE: GENERATE LONGRANGER REFERENCE
+ //
LONGRANGER_MKREF(fasta_ch)
ch_versions = ch_versions.mix(LONGRANGER_MKREF.out.versions)
+ //
+ // MODULE: MAP 10X READS TO THE REFERENCE
+ //
LONGRANGER_ALIGN( LONGRANGER_MKREF.out.folder, reads_10X )
ch_versions = ch_versions.mix(LONGRANGER_ALIGN.out.versions)
//
- // Polishing step 2: apply freebayes consensus based on longranger alignments
+ // LOGIC: SEPARATE INDEX FILE INTO CHANNEL
//
// Split genome into chunks
fasta_in.map{ meta, fasta, fai -> [meta, fai] }
.set{chunks_ch}
+
+ //
+ // MODULE: SPLIT ASSEMBLY INTO CHUNKS
+ //
BED_CHUNKS (chunks_ch, bed_chunks_polishing)
ch_versions = ch_versions.mix(BED_CHUNKS.out.versions)
+
+ //
+ // LOGIC: TRANSFORM CHUNKS CHANNEL INTO LIST OF INTERVALS
+ //
intervals_structured = BED_CHUNKS.out.coords.toList().transpose()
+
+ //
+ // LOGIC: JOIN READ MAPPING BAM WITH ITS INDEX
+ //
LONGRANGER_ALIGN.out.bam.join(LONGRANGER_ALIGN.out.bai)
.set{ bam_ch }
+
+ //
+ // LOGIC: CREATE DATA STRUCTURE FOR SCATTERING
+ //
intervals_freebayes = bam_ch.combine(intervals_structured)
.map{ meta, bam, bai, bed -> [ [id: bed.getSimpleName()], bam, bai, [], [], bed] }
- // In case the average coverage from Longranger is provided use it for defining
- // max coverage cut-off then scatter Freebayes over the genome chunks
+ //
+ // LOGIC: SEPARATE ASSEMBLY AND ITS INDEX INTO CHANNELS
+ //
fasta = fasta_in.collect{it[1]}
fai = fasta_in.collect{it[2]}
+
+ //
+ // LOGIC: EXTRACT ALIGNMENT SUMMARY FROM LONGRANGER RESULTS
+ //
LONGRANGER_ALIGN.out.csv.collect{it[1]}
.set{summary}
+
+ //
+ // MODULE: EXTRACT COVERAGE INFORMATION
+ //
LONGRANGER_COVERAGE(summary)
ch_versions = ch_versions.mix(LONGRANGER_COVERAGE.out.versions)
+
+ //
+ // MODULE: SCATTER FREEBAYES OVER THE CHUNKS
+ //
FREEBAYES(intervals_freebayes, fasta, fai, [], [], [], LONGRANGER_COVERAGE.out.cov)
ch_versions = ch_versions.mix(FREEBAYES.out.versions)
+
+ //
+ // MODULE: INDEX FREEBAYES OUTPUT
+ //
BCFTOOLS_INDEX_FB(FREEBAYES.out.vcf)
ch_versions = ch_versions.mix(BCFTOOLS_INDEX_FB.out.versions)
- // Filter and sort vcf for each genome chunk
+ //
+ // LOGIC: REFACTOR AND COMBINE VCF CHANNELS FOR FURTHER PROCESSING
+ //
FREEBAYES.out.vcf.map{ meta, vcf -> [meta.id.toString(), vcf]}
.join(BCFTOOLS_INDEX_FB.out.tbi.map {meta, tbi -> [meta.id.toString(), tbi]})
.map{ id, vcf, tbi -> [[ id: id.toString()+'_view'], vcf, tbi ]}
.set{ input_view }
+
+ //
+ // MODULE: FILTER FREEBAYES RESULTS
+ //
BCFTOOLS_VIEW(input_view, [], [], [])
ch_versions = ch_versions.mix(BCFTOOLS_VIEW.out.versions)
+
+ //
+ // LOGIC: REFACTOR CHANNEL TO AVOID NAME COLLISION
+ //
input_sort = BCFTOOLS_VIEW.out.vcf.map{ meta, vcf -> [ [id: meta.id.toString()+'_sorted'], vcf ]}
+
+ //
+ // MODULE: SORT FILTERED VCF
+ //
BCFTOOLS_SORT(input_sort)
ch_versions = ch_versions.mix(BCFTOOLS_SORT.out.versions)
- // Merge vcf files into one
+ //
+ // LOGIC: SEPARATE META INTO CHANNEL
+ //
meta_ch = fasta_in.collect{it[0]}
+
+ //
+ // MODULE: MERGE FREEBAYES RESULTS ON CHUNKS
+ //
MERGE_FREEBAYES(BCFTOOLS_SORT.out.vcf.combine(fasta_in)
.map{ meta, vcf, meta_fin, fa, fai -> [[id: meta_fin.id], vcf]}.groupTuple(),
[ [id:'merged'], [] ] )
ch_versions = ch_versions.mix(MERGE_FREEBAYES.out.versions)
- // Normalize variants and index normalized vcf
+ //
+ // LOGIC: REFACTOR AND COMBINE CHANNELS FOR FURTHER PROCESSING
+ //
MERGE_FREEBAYES.out.vcf.map{ meta, vcf -> [meta.id.toString(), vcf]}
.join(MERGE_FREEBAYES.out.tbi.map{ meta, tbi -> [meta.id.toString(), tbi]})
.combine(fasta_in)
.map{ id_norm, vcf, tbi, meta, fasta, fai -> [meta, vcf, tbi] }
.set{ input_norm }
+
+ //
+ // LOGIC: CREATE CHANNEL FROM REFERENCE FILE AND META
+ //
fasta_in.map{ meta, fasta, fai -> [meta, fasta] }
.set{ fasta_meta_ch }
+
+ //
+ // MODULE: LEFT-ALIGN AND NORMALIZE INDELS
+ //
BCFTOOLS_NORM(input_norm, fasta_meta_ch)
ch_versions = ch_versions.mix(BCFTOOLS_NORM.out.versions)
+
+ //
+ // MODULE: INDEX NORMALIZED VARIANTS
+ //
BCFTOOLS_INDEX_NORM(BCFTOOLS_NORM.out.vcf)
ch_versions = ch_versions.mix(BCFTOOLS_INDEX_NORM.out.versions)
- // Generate consensus fasta file
+ //
+ // LOGIC: JOIN VCF CHANNEL WITH ITS INDEX AND FASTA REFERENCE CHANNELS
+ //
BCFTOOLS_NORM.out.vcf
.join(BCFTOOLS_INDEX_NORM.out.tbi, by: [0], remainder: true)
.join(fasta_ch, by: [0], remainder: true)
.set{ ch_merge }
- //ch_merge.view()
+
+ //
+ // MODULE: GENERATE CONSENSUS FASTA FILE
+ //
BCFTOOLS_CONSENSUS(ch_merge)
ch_versions = ch_versions.mix(BCFTOOLS_CONSENSUS.out.versions)
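Several steps above pair a VCF with its index by keying both channels on meta.id as a string before joining, which avoids mismatches when the meta maps differ. A minimal sketch of that pattern with hypothetical chunk names:

workflow {
    vcf_ch = Channel.of( [ [ id: 'chunk_1' ], file('chunk_1.vcf.gz') ] )
    tbi_ch = Channel.of( [ [ id: 'chunk_1' ], file('chunk_1.vcf.gz.tbi') ] )

    vcf_ch.map { meta, vcf -> [ meta.id.toString(), vcf ] }
          .join( tbi_ch.map { meta, tbi -> [ meta.id.toString(), tbi ] } )
          .map  { id, vcf, tbi -> [ [ id: id + '_view' ], vcf, tbi ] }
          .view()   // -> [ [id:chunk_1_view], chunk_1.vcf.gz, chunk_1.vcf.gz.tbi ]
}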
diff --git a/subworkflows/local/prepare_input.nf b/subworkflows/local/prepare_input.nf
index abd838c6..15bd7c0d 100644
--- a/subworkflows/local/prepare_input.nf
+++ b/subworkflows/local/prepare_input.nf
@@ -11,6 +11,7 @@ include { GUNZIP as GUNZIP_HAP } from '../../modules/nf-c
include { SAMTOOLS_FAIDX as SAMTOOLS_FAIDX_PRIMARY } from '../../modules/nf-core/samtools/faidx/main'
include { SAMTOOLS_FAIDX as SAMTOOLS_FAIDX_HAPLOTIGS } from '../../modules/nf-core/samtools/faidx/main'
include { SAMTOOLS_FAIDX as SAMTOOLS_FAIDX_MERGED } from '../../modules/nf-core/samtools/faidx/main'
+
include { FASTA_CONCAT } from '../../modules/local/concat'
workflow PREPARE_INPUT {
@@ -21,9 +22,15 @@ workflow PREPARE_INPUT {
main:
ch_versions = Channel.empty()
+ //
+ // LOGIC: LOAD YAML
+ //
Channel.of(ch_input).map { file -> readYAML( file ) }
.set { ymlfile }
+ //
+ // LOGIC: DIVIDE INPUT INTO BLOCKS BY SEMANTICS
+ //
ymlfile.multiMap{ data ->
dataset : (data.dataset ? data.dataset : [])
busco : (data.busco ? data.busco : [])
@@ -32,6 +39,9 @@ workflow PREPARE_INPUT {
}
.set{ ch_yml_data }
+ //
+ // LOGIC: DIVIDE DATASET INTO BLOCKS BY DATATYPE
+ //
ch_yml_data.dataset.flatten()
.multiMap { data ->
id_ch : (data.id ? [id: data.id] : [])
@@ -48,9 +58,15 @@ workflow PREPARE_INPUT {
}
.set{ dataset_ch }
+ //
+ // LOGIC: ADD HIC MOTIF TO DATASET HIC CHANNEL
+ //
dataset_ch.hic_ch.combine(ch_yml_data.hic_motif)
.set{ hic_ch }
+ //
+ // LOGIC: REFACTOR BUSCO CHANNEL TO ADD META
+ //
dataset_ch.id_ch.combine (
ch_yml_data.busco.flatten()
.map { data -> [
diff --git a/subworkflows/local/purge_dups.nf b/subworkflows/local/purge_dups.nf
index d6d4af21..cec0691c 100644
--- a/subworkflows/local/purge_dups.nf
+++ b/subworkflows/local/purge_dups.nf
@@ -10,7 +10,6 @@ https://github.com/NBISweden/Earth-Biogenome-Project-pilot/blob/5ec2002638055bb8
include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_READS } from "../../modules/nf-core/minimap2/align/main"
include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_ASSEMBLY } from "../../modules/nf-core/minimap2/align/main"
-
include { PURGEDUPS_CALCUTS } from '../../modules/nf-core/purgedups/calcuts/main'
include { PURGEDUPS_GETSEQS } from '../../modules/nf-core/purgedups/getseqs/main'
include { PURGEDUPS_PBCSTAT } from '../../modules/nf-core/purgedups/pbcstat/main'
@@ -26,6 +25,11 @@ workflow PURGE_DUPS {
prefix // [ prefix ] prefix for the output files
main:
+ ch_versions = Channel.empty()
+
+ //
+ // LOGIC: TRANSFORM INPUT DATA STRUCTURE
+ //
reads_plus_assembly_ch
.flatMap { meta, reads, assembly, model -> reads instanceof List ? reads.collect{ [ meta, reads, assembly, model ] } : [ [ meta, reads, assembly, model ] ] }
.multiMap { meta, reads, assembly, model ->
@@ -35,7 +39,9 @@ workflow PURGE_DUPS {
}
.set { input }
- // Map pacbio reads
+ //
+ // MODULE: MAP HIFI READS TO CONTIGS
+ //
MINIMAP2_ALIGN_READS(
input.reads_ch,
input.assembly_ch,
@@ -44,18 +50,47 @@ workflow PURGE_DUPS {
false, // cigar in bam file
false // no split index
)
+ ch_versions = ch_versions.mix(MINIMAP2_ALIGN_READS.out.versions)
+
+ //
+ // MODULE: CREATE READ DEPTH HISTOGRAM
+ //
PURGEDUPS_PBCSTAT( MINIMAP2_ALIGN_READS.out.paf.groupTuple() )
+ ch_versions = ch_versions.mix(PURGEDUPS_PBCSTAT.out.versions)
+ //
+ // MODULE: PARSE KMER COVERAGE
+ //
GET_CALCUTS_PARAMS( input.model_ch )
+ ch_versions = ch_versions.mix(GET_CALCUTS_PARAMS.out.versions)
+ //
+ // MODULE: GENERATE CUTOFFS BASED ON THE HISTOGRAM AND KMER COVERAGE
+ //
PURGEDUPS_CALCUTS( PURGEDUPS_PBCSTAT.out.stat, GET_CALCUTS_PARAMS.out.cutoffs )
+ ch_versions = ch_versions.mix(PURGEDUPS_CALCUTS.out.versions)
- // Split assembly and do self alignment
+ //
+ // LOGIC: TRANSFORM ASSEMBLY INPUT
+ //
reads_plus_assembly_ch
.map { meta, reads, assembly, model -> [ meta, assembly ] }
.set { minimal_assembly_ch }
+
+ //
+ // MODULE: SPLIT ASSEMBLY
+ //
PURGEDUPS_SPLITFA( minimal_assembly_ch )
+ ch_versions = ch_versions.mix(PURGEDUPS_SPLITFA.out.versions)
+
+ //
+ // LOGIC: ESTIMATE THE NUMBER OF MINIMAP2 INDEX CHUNKS FROM THE ASSEMBLY SIZE
+ //
minimal_assembly_ch.map{ meta, asm -> Math.ceil(asm.size()/1e9).round() }.set{ idx_num }
+
+ //
+ // MODULE: PERFORM SELF ALIGNMENT
+ //
MINIMAP2_ALIGN_ASSEMBLY (
PURGEDUPS_SPLITFA.out.split_fasta,
[], // Trigger read to read alignment
@@ -64,20 +99,35 @@ workflow PURGE_DUPS {
false, // cigar in bam file
idx_num
)
+ ch_versions = ch_versions.mix(MINIMAP2_ALIGN_ASSEMBLY.out.versions)
+ //
+ // MODULE: PURGE HAPLOTIGS
+ //
PURGEDUPS_PURGEDUPS(
PURGEDUPS_PBCSTAT.out.basecov
.join( PURGEDUPS_CALCUTS.out.cutoff )
.map { meta, cov, cutoff -> [ meta.findAll { !(it.key in [ 'single_end' ]) }, cov, cutoff ] }
.join( MINIMAP2_ALIGN_ASSEMBLY.out.paf )
)
+ ch_versions = ch_versions.mix(PURGEDUPS_PURGEDUPS.out.versions)
+ //
+ // LOGIC: PREPARE INPUT FOR GETSEQS
+ //
minimal_assembly_ch.join( PURGEDUPS_PURGEDUPS.out.bed )
.map { meta, assembly, bed -> [[id:meta.id, prefix:prefix], assembly, bed] }
.set { ch_getseqs_input }
+
+ //
+ // MODULE: GENERATE PRIMARY AND ALT CONTIGS
+ //
PURGEDUPS_GETSEQS( ch_getseqs_input )
+ ch_versions = ch_versions.mix(PURGEDUPS_GETSEQS.out.versions)
emit:
pri = PURGEDUPS_GETSEQS.out.purged
alt = PURGEDUPS_GETSEQS.out.haplotigs
+
+ versions = ch_versions
}
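The idx_num value computed above turns the assembly file size into a whole number of gigabases, which is then passed to MINIMAP2_ALIGN_ASSEMBLY as the split-index count. A small Groovy sketch of the calculation, using an assumed byte count in place of asm.size():

// hypothetical 2.4 Gb assembly, i.e. what asm.size() would return in bytes
def asm_bytes = 2_400_000_000L
def idx_num   = Math.ceil( asm_bytes / 1e9 ).round()   // -> 3 index chunks
println idx_num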
diff --git a/subworkflows/local/raw_assembly.nf b/subworkflows/local/raw_assembly.nf
index 6ef65926..37aeb349 100644
--- a/subworkflows/local/raw_assembly.nf
+++ b/subworkflows/local/raw_assembly.nf
@@ -15,33 +15,44 @@ workflow RAW_ASSEMBLY {
main:
ch_versions = Channel.empty()
+ //
+ // MODULE: RUN HIFIASM IN STANDARD MODE
+ //
HIFIASM_PRI(hifi_reads, [], [], [], [], [])
ch_versions = ch_versions.mix(HIFIASM_PRI.out.versions)
+ //
+ // MODULE: CONVERT PRIMARY CONTIGS TO FASTA
+ //
GFA_TO_FASTA_PRI( HIFIASM_PRI.out.primary_contigs )
+
+ //
+ // MODULE: CONVERT ALT CONTIGS TO FASTA
+ //
GFA_TO_FASTA_ALT( HIFIASM_PRI.out.alternate_contigs )
ch_versions = ch_versions.mix(GFA_TO_FASTA_PRI.out.versions)
+ //
+ // LOGIC: IF THE FLAG IS SWITCHED ON, RUN HIFIASM IN HIC MODE
+ //
if ( hifiasm_hic_on ) {
+ //
+ // MODULE: RUN HIFIASM IN HIC MODE
+ //
HIFIASM_HIC(hifi_reads, [], [], [], [], hic_reads)
+
+ //
+ // MODULE: CONVERT HIFIASM-HIC PRIMARY CONTIGS TO FASTA
+ //
GFA_TO_FASTA_PRI_HIC( HIFIASM_HIC.out.hic_primary_contigs )
+
+ //
+ // MODULE: CONVERT HIFIASM-HIC ALT CONTIGS TO FASTA
+ //
GFA_TO_FASTA_ALT_HIC( HIFIASM_HIC.out.hic_alternate_contigs )
}
emit:
- raw_unitigs = HIFIASM_PRI.out.raw_unitigs
- source_overlaps = HIFIASM_PRI.out.source_overlaps
- reverse_overlaps = HIFIASM_PRI.out.reverse_overlaps
- corrected_reads = HIFIASM_PRI.out.corrected_reads
- primary_contigs_gfa = HIFIASM_PRI.out.primary_contigs
- alternate_contigs_gfa = HIFIASM_PRI.out.alternate_contigs
- processed_unitigs = HIFIASM_PRI.out.processed_unitigs
-
- primary_hic_contigs_gfa = hifiasm_hic_on ? HIFIASM_HIC.out.hic_primary_contigs : null
- alternate_hic_contigs_gfa = hifiasm_hic_on ? HIFIASM_HIC.out.hic_alternate_contigs : null
- phased_hic_contigs_hap1_gfa = hifiasm_hic_on ? HIFIASM_HIC.out.paternal_contigs : null
- phased_hic_contigs_hap2_gfa = hifiasm_hic_on ? HIFIASM_HIC.out.maternal_contigs : null
-
primary_contigs = GFA_TO_FASTA_PRI.out.fasta
alternate_contigs = GFA_TO_FASTA_ALT.out.fasta
primary_hic_contigs = hifiasm_hic_on ? GFA_TO_FASTA_PRI_HIC.out.fasta : null
diff --git a/subworkflows/local/scaffolding.nf b/subworkflows/local/scaffolding.nf
index 3d355546..b15b4439 100644
--- a/subworkflows/local/scaffolding.nf
+++ b/subworkflows/local/scaffolding.nf
@@ -3,11 +3,12 @@ include { COOLER_ZOOMIFY } from '../../modules/nf-core/cool
include { SAMTOOLS_FAIDX as CONTIGS_FAIDX } from '../../modules/nf-core/samtools/faidx/main.nf'
include { SAMTOOLS_FAIDX as SCAFFOLDS_FAIDX } from '../../modules/nf-core/samtools/faidx/main.nf'
include { YAHS } from '../../modules/nf-core/yahs/main'
+include { PRETEXTMAP } from '../../modules/nf-core/pretextmap/main.nf'
+include { PRETEXTSNAPSHOT } from '../../modules/nf-core/pretextsnapshot/main'
+
include { JUICER_PRE } from '../../modules/local/juicer_pre.nf'
include { JUICER_TOOLS_PRE } from '../../modules/local/juicer_tools_pre.nf'
include { PREPARE_PRETEXTMAP_INPUT } from '../../modules/local/prepare_pretext_input.nf'
-include { PRETEXTMAP } from '../../modules/nf-core/pretextmap/main.nf'
-include { PRETEXTSNAPSHOT } from '../../modules/nf-core/pretextsnapshot/main'
include { CHROM_SIZES } from '../../modules/local/chrom_sizes.nf'
workflow SCAFFOLDING {
@@ -18,64 +19,117 @@ workflow SCAFFOLDING {
main:
ch_versions = Channel.empty()
+
+ //
+ // MODULE: INDEX INPUT ASSEMBLY
+ //
CONTIGS_FAIDX( fasta_in, [[],[]] )
ch_versions = ch_versions.mix(CONTIGS_FAIDX.out.versions)
+
+ //
+ // LOGIC: SEPARATE INPUT CHANNELS FOR YAHS
+ //
CONTIGS_FAIDX.out.fai.join( fasta_in )
.map{ meta, fai, fasta -> fasta }
.set{ scaf_ref }
CONTIGS_FAIDX.out.fai.join( fasta_in )
.map{ meta, fai, fasta -> fai }
.set{ scaf_ref_fai }
+
+ //
+ // MODULE: PERFORM SCAFFOLDING WITH YAHS
+ //
YAHS( bed_in, scaf_ref, scaf_ref_fai )
ch_versions = ch_versions.mix(YAHS.out.versions)
+
+ //
+ // MODULE: INDEX SCAFFOLDS
+ //
SCAFFOLDS_FAIDX(YAHS.out.scaffolds_fasta, [[],[]])
ch_versions = ch_versions.mix(SCAFFOLDS_FAIDX.out.versions)
+
+ //
+ // LOGIC: KEEP META
+ //
bed_in.map{ meta, bed -> meta}.set{ch_meta}
- // Prepare contact pairs for cooler
+ //
+ // LOGIC: PREPARE CONTACT PAIRS FOR COOLER
+ //
YAHS.out.binary.join(YAHS.out.scaffolds_agp)
.combine(scaf_ref)
.combine(scaf_ref_fai)
.map{meta, binary, agp, fa, fai -> [meta, binary, agp, fai]}
.set{ch_merge}
+
+ //
+ // MODULE: PREPARE INPUT FOR COOLER
+ //
JUICER_PRE(ch_merge)
ch_versions = ch_versions.mix(JUICER_PRE.out.versions)
- // Bin contact pairs
+ //
+ // LOGIC: COMBINE CONTACT PAIRS WITH THE BED FILE AND COOLER BIN SIZE
+ //
JUICER_PRE.out.pairs.join(bed_in)
.combine(Channel.of(cool_bin))
.set{ch_juicer}
+
+ //
+ // MODULE: GENERATE SCAFFOLD SIZES
+ //
CHROM_SIZES(SCAFFOLDS_FAIDX.out.fai)
+ ch_versions = ch_versions.mix(CHROM_SIZES.out.versions)
+
+ //
+ // MODULE: LOAD AND BIN CONTACT PAIRS INTO A COOLER FILE
+ //
COOLER_CLOAD(ch_juicer, CHROM_SIZES.out.chrom_sizes)
ch_versions = ch_versions.mix(COOLER_CLOAD.out.versions)
- // Generate a multi-resolution cooler file by coarsening
+ //
+ // LOGIC: REFACTOR CHANNEL FOR ZOOMIFY
+ //
COOLER_CLOAD.out.cool.map{ meta, cools, cool_bin-> [meta, cools]}
.set{ch_cool}
+
+ //
+ // MODULE: GENERATE A MULTI-RESOLUTION MCOOL FILE BY COARSENING
+ //
COOLER_ZOOMIFY(ch_cool)
ch_versions = ch_versions.mix(COOLER_ZOOMIFY.out.versions)
- // Create contact map in pretext format
+ //
+ // LOGIC: EXTRACT INDEX FILE
+ //
SCAFFOLDS_FAIDX.out.fai.map{ meta, fai -> fai }.set{fai}
+
+ //
+ // MODULE: COMBINE SCAFFOLD SIZES AND PAIRS FOR PRETEXT
+ //
PREPARE_PRETEXTMAP_INPUT(JUICER_PRE.out.pairs, fai)
ch_versions = ch_versions.mix(PREPARE_PRETEXTMAP_INPUT.out.versions)
+
+ //
+ // MODULE: GENERATE PRETEXT MAP FROM UPDATED PAIRS
+ //
PRETEXTMAP(PREPARE_PRETEXTMAP_INPUT.out.pairs, [])
ch_versions = ch_versions.mix(PRETEXTMAP.out.versions)
+ //
+ // MODULE: GENERATE PNG FROM STANDARD PRETEXT
+ //
PRETEXTSNAPSHOT(PRETEXTMAP.out.pretext)
ch_versions = ch_versions.mix(PRETEXTSNAPSHOT.out.versions)
- // Generate HiC Map
+ //
+ // MODULE: GENERATE HIC MAP
+ //
JUICER_TOOLS_PRE(JUICER_PRE.out.pairs, CHROM_SIZES.out.chrom_sizes, 'yahs_scaffolds')
ch_versions = ch_versions.mix(JUICER_TOOLS_PRE.out.versions)
emit:
- alignments_sorted = JUICER_PRE.out.pairs
fasta = YAHS.out.scaffolds_fasta
- chrom_sizes = CHROM_SIZES.out.chrom_sizes
- cool = COOLER_CLOAD.out.cool
- mcool = COOLER_ZOOMIFY.out.mcool
- snapshots = PRETEXTSNAPSHOT.out.image
- hic = JUICER_TOOLS_PRE.out.hic
+
versions = ch_versions.ifEmpty(null)
}
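The channel fed to COOLER_CLOAD above joins the contact pairs with the input bed on meta and then tags each tuple with the cooler bin size (cool_bin, 1000 by default). A minimal sketch of that channel shape with hypothetical file names:

workflow {
    pairs_ch = Channel.of( [ [ id: 'asm' ], file('scaffolds.pairs') ] )
    bed_ch   = Channel.of( [ [ id: 'asm' ], file('mapped.bed') ] )

    pairs_ch.join( bed_ch )
            .combine( Channel.of( 1000 ) )   // cool_bin
            .view()   // -> [ [id:asm], scaffolds.pairs, mapped.bed, 1000 ]
}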
diff --git a/workflows/genomeassembly.nf b/workflows/genomeassembly.nf
index 8f4981f5..04b19f51 100644
--- a/workflows/genomeassembly.nf
+++ b/workflows/genomeassembly.nf
@@ -6,16 +6,12 @@
def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params)
-// Validate input parameters
-WorkflowGenomeassembly.initialise(params, log)
-
-// TODO nf-core: Add all file path parameters for the pipeline to the list below
// Check input path parameters to see if they exist
def checkPathParamList = [ params.input ]
for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
// Check mandatory parameters
-if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }
+if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input YAML not specified!' }
if (params.bed_chunks_polishing) { bed_chunks_polishing = params.bed_chunks_polishing } else { bed_chunks_polishing = 100; }
@@ -23,7 +19,7 @@ if (params.cool_bin) { cool_bin = params.cool_bin } else { cool_bin = 1000; }
if (params.polishing_on) { polishing_on = params.polishing_on } else { polishing_on = false; }
if (params.hifiasm_hic_on) { hifiasm_hic_on = params.hifiasm_hic_on } else { hifiasm_hic_on = false; }
-if ('organelles_on' in params.keySet() && !params.organelles_on) { organelles_on = false } else { organelles_on = true; }
+if (params.organelles_on) { organelles_on = params.organelles_on } else { organelles_on = false; }
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
CONFIG FILES
@@ -39,37 +35,36 @@ if ('organelles_on' in params.keySet() && !params.organelles_on) { organelles_o
//
// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
//
-include { PREPARE_INPUT } from '../subworkflows/local/prepare_input'
-include { RAW_ASSEMBLY } from '../subworkflows/local/raw_assembly'
-include { ORGANELLES as ORGANELLES_READS } from '../subworkflows/local/organelles'
-include { ORGANELLES as ORGANELLES_CONTIGS } from '../subworkflows/local/organelles'
-include { GENOMESCOPE_MODEL } from '../subworkflows/local/genomescope_model'
-include { PURGE_DUPS as PURGE_DUPS_PRI } from '../subworkflows/local/purge_dups'
-include { PURGE_DUPS as PURGE_DUPS_ALT } from '../subworkflows/local/purge_dups'
-include { POLISHING } from '../subworkflows/local/polishing'
-include { SCAFFOLDING } from '../subworkflows/local/scaffolding'
-include { KEEP_SEQNAMES as KEEP_SEQNAMES_PRIMARY } from '../modules/local/keep_seqnames'
-include { KEEP_SEQNAMES as KEEP_SEQNAMES_HAPLOTIGS } from '../modules/local/keep_seqnames'
-include { HIC_MAPPING } from '../subworkflows/local/hic_mapping'
-include { GENOME_STATISTICS as GENOME_STATISTICS_RAW } from '../subworkflows/local/assembly_stats'
-include { GENOME_STATISTICS as GENOME_STATISTICS_RAW_HIC } from '../subworkflows/local/assembly_stats'
-include { GENOME_STATISTICS as GENOME_STATISTICS_PURGED } from '../subworkflows/local/assembly_stats'
-include { GENOME_STATISTICS as GENOME_STATISTICS_POLISHED } from '../subworkflows/local/assembly_stats'
-include { GENOME_STATISTICS as GENOME_STATISTICS_SCAFFOLDS } from '../subworkflows/local/assembly_stats'
+include { PREPARE_INPUT } from '../subworkflows/local/prepare_input'
+include { RAW_ASSEMBLY } from '../subworkflows/local/raw_assembly'
+include { ORGANELLES as ORGANELLES_READS } from '../subworkflows/local/organelles'
+include { ORGANELLES as ORGANELLES_CONTIGS } from '../subworkflows/local/organelles'
+include { GENOMESCOPE_MODEL } from '../subworkflows/local/genomescope_model'
+include { PURGE_DUPS as PURGE_DUPS_PRI } from '../subworkflows/local/purge_dups'
+include { PURGE_DUPS as PURGE_DUPS_ALT } from '../subworkflows/local/purge_dups'
+include { POLISHING } from '../subworkflows/local/polishing'
+include { SCAFFOLDING } from '../subworkflows/local/scaffolding'
+include { KEEP_SEQNAMES as KEEP_SEQNAMES_PRIMARY } from '../modules/local/keep_seqnames'
+include { KEEP_SEQNAMES as KEEP_SEQNAMES_HAPLOTIGS } from '../modules/local/keep_seqnames'
+include { HIC_MAPPING } from '../subworkflows/local/hic_mapping'
+include { GENOME_STATISTICS as GENOME_STATISTICS_RAW } from '../subworkflows/local/genome_statistics'
+include { GENOME_STATISTICS as GENOME_STATISTICS_RAW_HIC } from '../subworkflows/local/genome_statistics'
+include { GENOME_STATISTICS as GENOME_STATISTICS_PURGED } from '../subworkflows/local/genome_statistics'
+include { GENOME_STATISTICS as GENOME_STATISTICS_POLISHED } from '../subworkflows/local/genome_statistics'
+include { GENOME_STATISTICS as GENOME_STATISTICS_SCAFFOLDS } from '../subworkflows/local/genome_statistics'
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
IMPORT NF-CORE MODULES/SUBWORKFLOWS
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
-include { CAT_CAT as CAT_CAT_MITOHIFI_READS } from "../modules/nf-core/cat/cat/main"
-include { CAT_CAT as CAT_CAT_HAPLOTIGS } from "../modules/nf-core/cat/cat/main"
-include { CAT_CAT as CAT_CAT_PURGEDUPS } from "../modules/nf-core/cat/cat/main"
-include { SAMTOOLS_FAIDX as SAMTOOLS_FAIDX_PURGEDUPS } from '../modules/nf-core/samtools/faidx/main'
-
-include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'
-include { SEQTK_SUBSEQ as SEQTK_SUBSEQ_PRIMARY } from '../modules/nf-core/seqtk/subseq/main'
-include { SEQTK_SUBSEQ as SEQTK_SUBSEQ_HAPLOTIGS } from '../modules/nf-core/seqtk/subseq/main'
+include { CAT_CAT as CAT_CAT_MITOHIFI_READS } from "../modules/nf-core/cat/cat/main"
+include { CAT_CAT as CAT_CAT_HAPLOTIGS } from "../modules/nf-core/cat/cat/main"
+include { CAT_CAT as CAT_CAT_PURGEDUPS } from "../modules/nf-core/cat/cat/main"
+include { SAMTOOLS_FAIDX as SAMTOOLS_FAIDX_PURGEDUPS } from '../modules/nf-core/samtools/faidx/main'
+include { SEQTK_SUBSEQ as SEQTK_SUBSEQ_HAPLOTIGS } from '../modules/nf-core/seqtk/subseq/main'
+include { SEQTK_SUBSEQ as SEQTK_SUBSEQ_PRIMARY } from '../modules/nf-core/seqtk/subseq/main'
+include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -84,33 +79,78 @@ workflow GENOMEASSEMBLY {
ch_versions = Channel.empty()
//
- // SUBWORKFLOW: Read in yaml, validate and prepare for further steps
+ // SUBWORKFLOW: READ IN YAML, VALIDATE AND PREPARE FOR FURTHER STEPS
//
PREPARE_INPUT(ch_input)
ch_versions = ch_versions.mix(PREPARE_INPUT.out.versions)
-
+
+ //
+ // LOGIC: CREATE A VARIABLE SERVING AS AN ALIAS FOR HIFI READS CHANNEL
+ //
PREPARE_INPUT.out.hifi.set{ hifi_reads_ch }
-
-
+
+ //
+ // LOGIC: SEPARATE THE HIC READ PATHS INTO THEIR OWN CHANNEL
+ //
PREPARE_INPUT.out.hic.map{ meta, reads, motif -> reads }.set{ hic_reads_ch }
+ //
+ // SUBWORKFLOW: GENERATE KMER DATABASE AND PROFILE MODEL
+ //
GENOMESCOPE_MODEL( hifi_reads_ch )
+ ch_versions = ch_versions.mix(GENOMESCOPE_MODEL.out.versions)
+ //
+ // LOGIC: ONLY LOOK FOR A MITO IF THE CORRESPONDING FLAG IS SET
+ //
if ( organelles_on ) {
+ //
+ // MODULE: MERGE THE INPUT PACBIO READ FILES INTO ONE FASTA
+ //
CAT_CAT_MITOHIFI_READS(hifi_reads_ch)
+ ch_versions = ch_versions.mix(CAT_CAT_MITOHIFI_READS.out.versions)
+
+ //
+ // SUBWORKFLOW: IDENTIFY ORGANELLES IN THE RAW READS
+ //
ORGANELLES_READS(CAT_CAT_MITOHIFI_READS.out.file_out, PREPARE_INPUT.out.mito)
+ ch_versions = ch_versions.mix(ORGANELLES_READS.out.versions)
}
- RAW_ASSEMBLY( hifi_reads_ch , hic_reads_ch, hifiasm_hic_on )
+ //
+ // SUBWORKFLOW: RUN A HIFIASM ASSEMBLY ON THE HIFI READS; ALSO RUN
+ // HIFIASM IN HIC MODE IF THE FLAG IS SWITCHED ON
+ //
+ RAW_ASSEMBLY( hifi_reads_ch, hic_reads_ch, hifiasm_hic_on )
+ ch_versions = ch_versions.mix(RAW_ASSEMBLY.out.versions)
+
+ //
+ // LOGIC: DEFINE THE PRIMARY CONTIGS CHANNEL
+ //
RAW_ASSEMBLY.out.primary_contigs.set{ primary_contigs_ch }
+
+ //
+ // LOGIC: DEFINE THE HAPLOTIGS CHANNEL
+ //
RAW_ASSEMBLY.out.alternate_contigs.set{ haplotigs_ch }
+
+ //
+ // SUBWORKFLOW: CALCULATE STATISTICS FOR THE RAW ASSEMBLY
+ //
GENOME_STATISTICS_RAW( primary_contigs_ch.join(haplotigs_ch),
PREPARE_INPUT.out.busco,
GENOMESCOPE_MODEL.out.hist,
GENOMESCOPE_MODEL.out.ktab
)
+ ch_versions = ch_versions.mix(GENOME_STATISTICS_RAW.out.versions)
+ //
+ // LOGIC: CHECK IF THE HIFIASM HIC MODE WAS SWITCHED ON
+ //
if ( hifiasm_hic_on ) {
+ //
+ // SUBWORKFLOW: CALCULATE RAW ASSEMBLY STATISTICS FOR THE HIFIASM RUN IN HIC MODE
+ //
GENOME_STATISTICS_RAW_HIC( RAW_ASSEMBLY.out.primary_hic_contigs
.join(RAW_ASSEMBLY.out.alternate_hic_contigs),
PREPARE_INPUT.out.busco,
@@ -118,106 +158,208 @@ workflow GENOMEASSEMBLY {
GENOMESCOPE_MODEL.out.ktab
)
}
+
+ //
+ // LOGIC: CREATE AN INPUT DATA STRUCTURE FOR PURGING
+ //
hifi_reads_ch.join(primary_contigs_ch)
.join(GENOMESCOPE_MODEL.out.model)
.set{ purge_dups_input }
+
+ //
+ // SUBWORKFLOW: RUN PURGE DUPS ON THE PRIMARY CONTIGS
+ //
PURGE_DUPS_PRI( purge_dups_input, 'primary' )
+ ch_versions = ch_versions.mix(PURGE_DUPS_PRI.out.versions)
+
+ //
+ // LOGIC: UPDATE THE PRIMARY CONTIGS CHANNEL
+ //
PURGE_DUPS_PRI.out.pri.map{ meta, fasta -> [[id:meta.id], fasta] }
.set{ primary_contigs_ch }
+ //
+ // LOGIC: GATHER THE SEQUENCES PURGED FROM THE PRIMARY AND THE HIFIASM HAPLOTIGS
+ //
haplotigs_ch.combine( PURGE_DUPS_PRI.out.alt )
.map{ meta_h, h, meta_h_purged, h_purged -> [meta_h, [h, h_purged]]}
.set{ haplotigs_to_merge }
- CAT_CAT_HAPLOTIGS{ haplotigs_to_merge }
+ //
+ // MODULE: COMBINE PURGED SEQUENCES WITH THE ORIGINAL HAPLOTIGS
+ //
+ CAT_CAT_HAPLOTIGS( haplotigs_to_merge )
+
+ //
+ // LOGIC: CREATE AN INPUT DATA STRUCTURE FOR THE SECOND ROUND OF PURGING
+ //
hifi_reads_ch.join(CAT_CAT_HAPLOTIGS.out.file_out)
.join(GENOMESCOPE_MODEL.out.model)
- .set{ purge_dups_haploitgs_input }
+ .set{ purge_dups_haplotigs_input }
- PURGE_DUPS_ALT( purge_dups_haploitgs_input, 'haplotigs' )
+ //
+ // SUBWORKFLOW: PURGE HAPLOTIGS
+ //
+ PURGE_DUPS_ALT( purge_dups_haplotigs_input, 'haplotigs' )
+ //
+ // LOGIC: UPDATE THE HAPLOTIGS CHANNEL
+ //
PURGE_DUPS_ALT.out.pri.map{ meta, fasta -> [[id:meta.id], fasta] }
.set{ haplotigs_ch }
+
+ //
+ // SUBWORKFLOW: CALCULATE STATISTICS FOR THE PURGED ASSEMBLY
+ //
GENOME_STATISTICS_PURGED( primary_contigs_ch.join(haplotigs_ch),
PREPARE_INPUT.out.busco,
GENOMESCOPE_MODEL.out.hist,
GENOMESCOPE_MODEL.out.ktab
)
+
+ //
+ // LOGIC: CREATE A CHANNEL FOR THE PURGED CONTIGS AND HAPLOTIGS
+ //
PURGE_DUPS_PRI.out.pri.combine(PURGE_DUPS_ALT.out.pri)
.map{ meta_pri, purged_pri, meta_alt, purged_alt -> [[id: meta_pri.id], [purged_pri, purged_alt]]}
.set{ purged_pri_alt_ch }
+ //
+ // MODULE: MERGE PURGED CONTIGS AND HAPLOTIGS INTO ONE FILE
+ //
CAT_CAT_PURGEDUPS( purged_pri_alt_ch )
- if ( organelles_on ) {
- if ( !polishing_on ) {
- ORGANELLES_CONTIGS(CAT_CAT_PURGEDUPS.out.file_out, PREPARE_INPUT.out.mito)
- }
- }
+
+ //
+ // LOGIC: DEFINE MERGED ASSEMBLY
+ //
+ merged_pri_alt = CAT_CAT_PURGEDUPS.out.file_out
if ( polishing_on ) {
+ //
+ // MODULE: INDEX FASTA FOR THE MERGED PRIMARY CONTIGS AND HAPLOTIGS
+ //
SAMTOOLS_FAIDX_PURGEDUPS( CAT_CAT_PURGEDUPS.out.file_out, [[],[]] )
+ ch_versions = ch_versions.mix(SAMTOOLS_FAIDX_PURGEDUPS.out.versions)
+
+ //
+ // LOGIC: CREATE AN ASSEMBLY CHANNEL FOR POLISHING
+ //
CAT_CAT_PURGEDUPS.out.file_out.join( SAMTOOLS_FAIDX_PURGEDUPS.out.fai )
.set{ reference_ch }
+ //
+ // LOGIC: REFACTOR ILLUMINA CHANNEL TO PASS IT INTO THE POLISHING SUBWORKFLOW
+ //
PREPARE_INPUT.out.illumina_10X.map{ meta, reads, kmers -> [reads] }
.set{ illumina_10X_ch }
+ //
+ // SUBWORKFLOW: POLISH THE MERGED PRIMARY CONTIGS AND HAPLOTIGS
+ //
POLISHING(reference_ch, illumina_10X_ch, bed_chunks_polishing)
ch_versions = ch_versions.mix(POLISHING.out.versions)
-
- if ( organelles_on ) {
- ORGANELLES_CONTIGS(POLISHING.out.fasta, PREPARE_INPUT.out.mito)
- }
-
- // Separate the primary and alternative contigs again after polishing
- // Separate primary contigs
+
+ //
+ // LOGIC: UPDATE MERGED ASSEMBLY
+ //
+ merged_pri_alt = POLISHING.out.fasta
+
+ //
+ // MODULE: EXTRACT THE NAMES OF THE PRIMARY CONTIGS
+ //
KEEP_SEQNAMES_PRIMARY(PURGE_DUPS_PRI.out.pri)
ch_versions = ch_versions.mix(KEEP_SEQNAMES_PRIMARY.out.versions)
+
+ //
+ // MODULE: SEPARATE POLISHED PRIMARY CONTIGS
+ //
SEQTK_SUBSEQ_PRIMARY(POLISHING.out.fasta, KEEP_SEQNAMES_PRIMARY.out.seqlist)
ch_versions = ch_versions.mix(SEQTK_SUBSEQ_PRIMARY.out.versions)
+
+ //
+ // LOGIC: UPDATE THE PRIMARY CONTIGS CHANNEL WITH THE POLISHED
+ // PRIMARY CONTIGS
+ //
POLISHING.out.fasta.map{ meta, f -> [id: meta.id] }
.combine(SEQTK_SUBSEQ_PRIMARY.out.sequences)
.set{ primary_contigs_ch }
- // Separate alt contigs
+ //
+ // MODULE: EXTRACT THE NAMES OF THE HAPLOTIGS
+ //
KEEP_SEQNAMES_HAPLOTIGS(PURGE_DUPS_ALT.out.pri)
- ch_versions = ch_versions.mix(KEEP_SEQNAMES_HAPLOTIGS.out.versions)
+
+ //
+ // MODULE: SEPARATE THE POLISHED HAPLOTIGS
+ //
SEQTK_SUBSEQ_HAPLOTIGS(POLISHING.out.fasta, KEEP_SEQNAMES_HAPLOTIGS.out.seqlist)
- ch_versions = ch_versions.mix(SEQTK_SUBSEQ_HAPLOTIGS.out.versions)
+
+ //
+ // LOGIC: UPDATE THE HAPLOTIGS CHANNEL WITH THE POLISHED HAPLOTIGS
+ //
POLISHING.out.fasta.map{ meta, f -> [id: meta.id] }
.combine(SEQTK_SUBSEQ_HAPLOTIGS.out.sequences)
.set{ haplotigs_contigs_ch }
- // Check genome stats for polished pri and alt
+ //
+ // LOGIC: COMBINE PRI AND ALT POLISHED CONTIGS INTO A CHANNEL
+ //
primary_contigs_ch.join(haplotigs_contigs_ch)
.map{ meta, pri, alt -> [[id:meta.id], pri, alt]}
.set{ polished_asm_stats_input_ch }
+
+ //
+ // SUBWORKFLOW: CALCULATE ASSEMBLY STATISTICS FOR THE POLISHED
+ // ASSEMBLY
+ //
GENOME_STATISTICS_POLISHED( polished_asm_stats_input_ch,
PREPARE_INPUT.out.busco,
GENOMESCOPE_MODEL.out.hist,
GENOMESCOPE_MODEL.out.ktab
)
- ch_versions = ch_versions.mix(GENOME_STATISTICS_POLISHED.out.versions)
}
+ if ( organelles_on ) {
+ //
+ // SUBWORKFLOW: IDENTIFY MITO IN THE ASSEMBLY CONTIGS
+ //
+ ORGANELLES_CONTIGS(merged_pri_alt, PREPARE_INPUT.out.mito)
+ }
+
+ //
+ // LOGIC: CREATE A CHANNEL FOR THE PATHS TO HIC DATA
+ //
PREPARE_INPUT.out.hic.map{ meta, crams, motif -> [meta, crams] }
.set{ crams_ch }
- // Map HiC data to the primary assembly
+ //
+ // SUBWORKFLOW: MAP HIC DATA TO THE PRIMARY ASSEMBLY
+ //
HIC_MAPPING ( primary_contigs_ch,crams_ch )
ch_versions = ch_versions.mix(HIC_MAPPING.out.versions)
+ //
+ // SUBWORKFLOW: SCAFFOLD THE PRIMARY ASSEMBLY
+ //
SCAFFOLDING( HIC_MAPPING.out.bed, primary_contigs_ch, cool_bin )
ch_versions = ch_versions.mix(SCAFFOLDING.out.versions)
+ //
+ // LOGIC: CREATE A CHANNEL FOR THE FINAL ASSEMBLY REPRESENTED BY
+ // THE SCAFFOLDS AND HAPLOTIGS
+ //
SCAFFOLDING.out.fasta.combine(haplotigs_ch)
.map{meta_s, fasta_s, meta_h, fasta_h -> [ meta_h, fasta_s, fasta_h ]}
.set{ stats_input_ch }
-
+
+ //
+ // SUBWORKFLOW: CALCULATE ASSEMBLY STATISTICS FOR THE FINAL ASSEMBLY
+ //
GENOME_STATISTICS_SCAFFOLDS( stats_input_ch,
PREPARE_INPUT.out.busco,
GENOMESCOPE_MODEL.out.hist,
GENOMESCOPE_MODEL.out.ktab
)
+
//
// MODULE: Collate versions.yml file
//