diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index ea27a584..4ecfbfe3 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -2,6 +2,7 @@ "name": "nfcore", "image": "nfcore/gitpod:latest", "remoteUser": "gitpod", + "runArgs": ["--privileged"], // Configure tool-specific properties. "customizations": { diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index c58b4779..c1642f76 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -9,7 +9,9 @@ Please use the pre-filled template to save time. However, don't be put off by this template - other more general issues and suggestions are welcome! Contributions to the code are even more welcome ;) -> If you need help using or modifying nf-core/ampliseq then the best place to ask is on the nf-core Slack [#ampliseq](https://nfcore.slack.com/channels/ampliseq) channel ([join our Slack here](https://nf-co.re/join/slack)). +:::info +If you need help using or modifying nf-core/ampliseq then the best place to ask is on the nf-core Slack [#ampliseq](https://nfcore.slack.com/channels/ampliseq) channel ([join our Slack here](https://nf-co.re/join/slack)). +::: ## Contribution workflow @@ -116,4 +118,3 @@ To get started: Devcontainer specs: - [DevContainer config](.devcontainer/devcontainer.json) -- [Dockerfile](.devcontainer/Dockerfile) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 79bc7c1a..3bd77263 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -42,7 +42,7 @@ body: attributes: label: System information description: | - * Nextflow version _(eg. 22.10.1)_ + * Nextflow version _(eg. 23.04.0)_ * Hardware _(eg. HPC, Desktop, Cloud)_ * Executor _(eg. slurm, local, awsbatch)_ * Container engine: _(e.g. 
Docker, Singularity, Conda, Podman, Shifter, Charliecloud, or Apptainer)_ diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index 503a5925..eafcf567 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -14,18 +14,23 @@ jobs: runs-on: ubuntu-latest steps: - name: Launch workflow via tower - uses: seqeralabs/action-tower-launch@v1 + uses: seqeralabs/action-tower-launch@v2 with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} + revision: ${{ github.sha }} workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/ampliseq/work-${{ github.sha }} parameters: | { + "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/ampliseq/results-${{ github.sha }}" } - profiles: test_full,aws_tower + profiles: test_full + - uses: actions/upload-artifact@v3 with: name: Tower debug log file - path: tower_action_*.log + path: | + tower_action_*.log + tower_action_*.json diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index d42ecdfd..7a4f39de 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -12,18 +12,22 @@ jobs: steps: # Launch workflow using Tower CLI tool action - name: Launch workflow via tower - uses: seqeralabs/action-tower-launch@v1 + uses: seqeralabs/action-tower-launch@v2 with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} + revision: ${{ github.sha }} workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/ampliseq/work-${{ github.sha }} parameters: | { "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/ampliseq/results-test-${{ github.sha }}" } - profiles: test,aws_tower + profiles: test + - uses: actions/upload-artifact@v3 with: name: Tower debug log file - path: tower_action_*.log + path: | + tower_action_*.log + tower_action_*.json diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 06ad337f..788582d9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -47,6 +47,7 @@ jobs: - "test" - "test_single" - "test_fasta" + - "test_failed" - "test_multi" - "test_reftaxcustom" - "test_doubleprimers" diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 888cb4bc..b8bdd214 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -78,7 +78,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: "3.8" + python-version: "3.11" architecture: "x64" - name: Install dependencies diff --git a/.github/workflows/release-announcments.yml b/.github/workflows/release-announcments.yml new file mode 100644 index 00000000..6ad33927 --- /dev/null +++ b/.github/workflows/release-announcments.yml @@ -0,0 +1,68 @@ +name: release-announcements +# Automatic release toot and tweet anouncements +on: + release: + types: [published] + workflow_dispatch: + +jobs: + toot: + runs-on: ubuntu-latest + steps: + - uses: rzr/fediverse-action@master + with: + access-token: ${{ secrets.MASTODON_ACCESS_TOKEN }} + host: "mstdn.science" # custom host if not "mastodon.social" (default) + # GitHub event payload + # https://docs.github.com/en/developers/webhooks-and-events/webhooks/webhook-events-and-payloads#release + message: | + Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! 
+ + Please see the changelog: ${{ github.event.release.html_url }} + + send-tweet: + runs-on: ubuntu-latest + + steps: + - uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: Install dependencies + run: pip install tweepy==4.14.0 + - name: Send tweet + shell: python + run: | + import os + import tweepy + + client = tweepy.Client( + access_token=os.getenv("TWITTER_ACCESS_TOKEN"), + access_token_secret=os.getenv("TWITTER_ACCESS_TOKEN_SECRET"), + consumer_key=os.getenv("TWITTER_CONSUMER_KEY"), + consumer_secret=os.getenv("TWITTER_CONSUMER_SECRET"), + ) + tweet = os.getenv("TWEET") + client.create_tweet(text=tweet) + env: + TWEET: | + Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! + + Please see the changelog: ${{ github.event.release.html_url }} + TWITTER_CONSUMER_KEY: ${{ secrets.TWITTER_CONSUMER_KEY }} + TWITTER_CONSUMER_SECRET: ${{ secrets.TWITTER_CONSUMER_SECRET }} + TWITTER_ACCESS_TOKEN: ${{ secrets.TWITTER_ACCESS_TOKEN }} + TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }} + + bsky-post: + runs-on: ubuntu-latest + steps: + - uses: zentered/bluesky-post-action@v0.0.2 + with: + post: | + Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! + + Please see the changelog: ${{ github.event.release.html_url }} + env: + BSKY_IDENTIFIER: ${{ secrets.BSKY_IDENTIFIER }} + BSKY_PASSWORD: ${{ secrets.BSKY_PASSWORD }} + # diff --git a/.gitpod.yml b/.gitpod.yml index 85d95ecc..25488dcc 100644 --- a/.gitpod.yml +++ b/.gitpod.yml @@ -1,4 +1,9 @@ image: nfcore/gitpod:latest +tasks: + - name: Update Nextflow and setup pre-commit + command: | + pre-commit install --install-hooks + nextflow self-update vscode: extensions: # based on nf-core.nf-core-extensionpack diff --git a/CHANGELOG.md b/CHANGELOG.md index db3140ab..3d4313bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,56 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
-## nf-core/ampliseq version 2.6.1 - 2023-06-27 +## nf-core/ampliseq version 2.7.0 - 2023-10-20 + +### `Added` + +- [#558](https://github.com/nf-core/ampliseq/pull/558),[#619](https://github.com/nf-core/ampliseq/pull/619),[#625](https://github.com/nf-core/ampliseq/pull/625),[#632](https://github.com/nf-core/ampliseq/pull/632),[#644](https://github.com/nf-core/ampliseq/pull/644) - Pipeline summary report +- [#615](https://github.com/nf-core/ampliseq/pull/615) - Phyloseq R object creation +- [#622](https://github.com/nf-core/ampliseq/pull/622) - ASV post-clustering with Vsearch +- [#637](https://github.com/nf-core/ampliseq/pull/637) - Taxonomic classification with Kraken2, parameter `--kraken2_ref_taxonomy`, `--kraken2_ref_tax_custom`, `--kraken2_assign_taxlevels`, `--kraken2_confidence` +- [#639](https://github.com/nf-core/ampliseq/pull/639) - GTDB release 214.1 for taxonomic classification with DADA2, using `--dada_ref_taxonomy gtdb` or `--dada_ref_taxonomy gtdb=R08-RS214` +- [#641](https://github.com/nf-core/ampliseq/pull/641) - Continue analysis even when individual files fail the filtering threshold, added parameter `--ignore_failed_filtering` + +### `Changed` + +- [#616](https://github.com/nf-core/ampliseq/pull/616) - When using a sample sheet with `--input` containing forward and reverse reads, specifying `--single_end` will only extract forward reads and treat the data as single ended instead of extracting forward and reverse reads. +- [#616](https://github.com/nf-core/ampliseq/pull/616) - `--input` was split into three params: (1) `--input` for samplesheet, (2) `--input_fasta` for ASV/OTU fasta input, (3) `--input_folder` direct FASTQ input + +| Param updated | Param old | Accepts | +| ------------- | --------- | ---------------------------------------- | +| input | input | samplesheet, .tsv/.csv/.yml/.yaml | +| input_fasta | input | ASV/OTU sequences, .fasta | +| input_folder | input | Folder containing compressed fastq files | + +- [#639](https://github.com/nf-core/ampliseq/pull/639) - `--dada_ref_taxonomy gtdb` points towards GTDB release 214.1 instead of GTDB release 207 for taxonomic classification with DADA2 +- [#645](https://github.com/nf-core/ampliseq/pull/645) - Updated documentation, including workflow figure + +### `Fixed` + +- [#605](https://github.com/nf-core/ampliseq/pull/605) - Make `--sbdiexport` compatible with PR2 version 5.0.0 +- [#614](https://github.com/nf-core/ampliseq/pull/614),[#620](https://github.com/nf-core/ampliseq/pull/620),[#642](https://github.com/nf-core/ampliseq/pull/642) - Template update for nf-core/tools version 2.10 +- [#617](https://github.com/nf-core/ampliseq/pull/617) - Fix database compatibility check for `--sbdiexport` +- [#628](https://github.com/nf-core/ampliseq/pull/628) - Fix edge case for sample sheet input when using specific combinations of sampleID and forwardReads or reverseReads that will forward one file too much to cutadapt +- [#630](https://github.com/nf-core/ampliseq/pull/630) - ASV rRNA (barrnap), length, and codon filter now work with ASV fasta file input +- [#633](https://github.com/nf-core/ampliseq/pull/633) - UNIFRAC in QIIME2_DIVERSITY_CORE is now prevented from using a GPU to avoid errors +- [#643](https://github.com/nf-core/ampliseq/pull/643) - Fix using `--skip_dada_addspecies` without `--dada_ref_tax_custom_sp` which was broken in 2.6.0 & 2.6.1 +- [#647](https://github.com/nf-core/ampliseq/pull/647) - Update of credits + +### `Dependencies` + +- [#646](https://github.com/nf-core/ampliseq/pull/646) - Updated 
dependencies, see below: + +| software | previously | now | +| -------- | ---------- | ------ | +| FASTQC | 0.11.9 | 0.12.1 | +| DADA2 | 1.22.0 | 1.28.0 | +| PICRUSt2 | 2.5.0 | 2.5.2 | +| QIIME2 | 2022.11 | 2023.7 | + +### `Removed` + +## nf-core/ampliseq version 2.6.1 - 2023-06-28 ### `Fixed` diff --git a/CITATIONS.md b/CITATIONS.md index e488e7bd..ee03b01c 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -18,6 +18,8 @@ - [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) + > Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. + - [Cutadapt](https://journal.embnet.org/index.php/embnetjournal/article/view/200/479) > Marcel, M. Cutadapt removes adapter sequences from high-throughput sequencing reads. EMBnet. journal 17.1 (2011): pp-10. doi: 10.14806/ej.17.1.200. @@ -29,7 +31,7 @@ - [DADA2](https://pubmed.ncbi.nlm.nih.gov/27214047/) > Callahan BJ, McMurdie PJ, Rosen MJ, Han AW, Johnson AJ, Holmes SP. DADA2: High-resolution sample inference from Illumina amplicon data. Nat Methods. 2016 Jul;13(7):581-3. doi: 10.1038/nmeth.3869. Epub 2016 May 23. PMID: 27214047; PMCID: PMC4927377. -### Taxonomic classification and database (only one database) +### Taxonomic classification and databases - Classification by [QIIME2 classifier](https://pubmed.ncbi.nlm.nih.gov/29773078/) @@ -109,6 +111,10 @@ > Jari Oksanen, F. Guillaume Blanchet, Michael Friendly, Roeland Kindt, Pierre Legendre, Dan McGlinn, Peter R. Minchin, R. B. O’Hara, Gavin L. Simpson, Peter Solymos, M. Henry H. Stevens, Eduard Szoecs, and Helene Wagner. vegan: Community Ecology Package. 2018. R package version 2.5-3. +- [Phyloseq](https://doi.org/10.1371/journal.pone.0061217) + + > McMurdie PJ, Holmes S (2013). “phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data.” PLoS ONE, 8(4), e61217. + ### Non-default tools - [ITSx](https://besjournals.onlinelibrary.wiley.com/doi/10.1111/2041-210X.12073) @@ -139,9 +145,14 @@ > Edgar RC. (2016) SINTAX: a simple non-Bayesian taxonomy classifier for 16S and ITS sequences, BioRxiv, 074161. Preprint. +- [Kraken2](https://pubmed.ncbi.nlm.nih.gov/31779668/) + + > Wood, D. E., Lu, J., & Langmead, B. (2019). Improved metagenomic analysis with Kraken 2. Genome biology, 20(1), 257. https://doi.org/10.1186/s13059-019-1891-0 + ### Summarizing software - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) + > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. ## Data @@ -165,5 +176,8 @@ - [Docker](https://dl.acm.org/doi/10.5555/2600239.2600241) + > Merkel, D. (2014). Docker: lightweight linux containers for consistent development and deployment. Linux Journal, 2014(239), 2. doi: 10.5555/2600239.2600241. + - [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/) + > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675. 
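The changelog above describes splitting `--input` into three dedicated parameters. Below is a minimal, hypothetical sketch of how the three input modes might be invoked; the file and folder names, the `docker` profile and the `results` output directory are placeholders, the primer sequences are the 515f/806r examples from the README, and the pipeline's parameter documentation remains the authoritative reference.

```bash
# (1) --input: samplesheet (.tsv/.csv/.yml/.yaml) describing the sequencing samples
nextflow run nf-core/ampliseq -profile docker \
    --input samplesheet.tsv \
    --FW_primer GTGYCAGCMGCCGCGGTAA \
    --RV_primer GGACTACNVGGGTWTCTAAT \
    --outdir results

# (2) --input_fasta: pre-computed ASV/OTU sequences provided directly as fasta
nextflow run nf-core/ampliseq -profile docker \
    --input_fasta ASV_sequences.fasta \
    --outdir results

# (3) --input_folder: folder containing compressed fastq files
nextflow run nf-core/ampliseq -profile docker \
    --input_folder fastq_folder/ \
    --FW_primer GTGYCAGCMGCCGCGGTAA \
    --RV_primer GGACTACNVGGGTWTCTAAT \
    --outdir results
```

Only one of these three input parameters applies per run, matching the table in the changelog entry.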
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index f4fd052f..c089ec78 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,18 +1,20 @@ -# Code of Conduct at nf-core (v1.0) +# Code of Conduct at nf-core (v1.4) ## Our Pledge -In the interest of fostering an open, collaborative, and welcoming environment, we as contributors and maintainers of nf-core, pledge to making participation in our projects and community a harassment-free experience for everyone, regardless of: +In the interest of fostering an open, collaborative, and welcoming environment, we as contributors and maintainers of nf-core pledge to making participation in our projects and community a harassment-free experience for everyone, regardless of: - Age +- Ability - Body size +- Caste - Familial status - Gender identity and expression - Geographical location - Level of experience - Nationality and national origins - Native language -- Physical and neurological ability +- Neurodiversity - Race or ethnicity - Religion - Sexual identity and orientation @@ -22,80 +24,133 @@ Please note that the list above is alphabetised and is therefore not ranked in a ## Preamble -> Note: This Code of Conduct (CoC) has been drafted by the nf-core Safety Officer and been edited after input from members of the nf-core team and others. "We", in this document, refers to the Safety Officer and members of the nf-core core team, both of whom are deemed to be members of the nf-core community and are therefore required to abide by this Code of Conduct. This document will amended periodically to keep it up-to-date, and in case of any dispute, the most current version will apply. +:::note +This Code of Conduct (CoC) has been drafted by Renuka Kudva, Cris Tuñí, and Michael Heuer, with input from the nf-core Core Team and Susanna Marquez from the nf-core community. "We", in this document, refers to the Safety Officers and members of the nf-core Core Team, both of whom are deemed to be members of the nf-core community and are therefore required to abide by this Code of Conduct. This document will be amended periodically to keep it up-to-date. In case of any dispute, the most current version will apply. +::: -An up-to-date list of members of the nf-core core team can be found [here](https://nf-co.re/about). Our current safety officer is Renuka Kudva. +An up-to-date list of members of the nf-core core team can be found [here](https://nf-co.re/about). + +Our Safety Officers are Saba Nafees, Cris Tuñí, and Michael Heuer. nf-core is a young and growing community that welcomes contributions from anyone with a shared vision for [Open Science Policies](https://www.fosteropenscience.eu/taxonomy/term/8). Open science policies encompass inclusive behaviours and we strive to build and maintain a safe and inclusive environment for all individuals. -We have therefore adopted this code of conduct (CoC), which we require all members of our community and attendees in nf-core events to adhere to in all our workspaces at all times. Workspaces include but are not limited to Slack, meetings on Zoom, Jitsi, YouTube live etc. +We have therefore adopted this CoC, which we require all members of our community and attendees of nf-core events to adhere to in all our workspaces at all times. Workspaces include, but are not limited to, Slack, meetings on Zoom, gather.town, YouTube live etc. -Our CoC will be strictly enforced and the nf-core team reserve the right to exclude participants who do not comply with our guidelines from our workspaces and future nf-core activities. 
+Our CoC will be strictly enforced and the nf-core team reserves the right to exclude participants who do not comply with our guidelines from our workspaces and future nf-core activities. -We ask all members of our community to help maintain a supportive and productive workspace and to avoid behaviours that can make individuals feel unsafe or unwelcome. Please help us maintain and uphold this CoC. +We ask all members of our community to help maintain supportive and productive workspaces and to avoid behaviours that can make individuals feel unsafe or unwelcome. Please help us maintain and uphold this CoC. -Questions, concerns or ideas on what we can include? Contact safety [at] nf-co [dot] re +Questions, concerns, or ideas on what we can include? Contact members of the Safety Team on Slack or email safety [at] nf-co [dot] re. ## Our Responsibilities -The safety officer is responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behaviour. +Members of the Safety Team (the Safety Officers) are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behaviour. -The safety officer in consultation with the nf-core core team have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. +The Safety Team, in consultation with the nf-core core team, have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this CoC, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. -Members of the core team or the safety officer who violate the CoC will be required to recuse themselves pending investigation. They will not have access to any reports of the violations and be subject to the same actions as others in violation of the CoC. +Members of the core team or the Safety Team who violate the CoC will be required to recuse themselves pending investigation. They will not have access to any reports of the violations and will be subject to the same actions as others in violation of the CoC. -## When are where does this Code of Conduct apply? +## When and where does this Code of Conduct apply? -Participation in the nf-core community is contingent on following these guidelines in all our workspaces and events. This includes but is not limited to the following listed alphabetically and therefore in no order of preference: +Participation in the nf-core community is contingent on following these guidelines in all our workspaces and events, such as hackathons, workshops, bytesize, and collaborative workspaces on gather.town. These guidelines include, but are not limited to, the following (listed alphabetically and therefore in no order of preference): - Communicating with an official project email address. - Communicating with community members within the nf-core Slack channel. - Participating in hackathons organised by nf-core (both online and in-person events). 
-- Participating in collaborative work on GitHub, Google Suite, community calls, mentorship meetings, email correspondence. -- Participating in workshops, training, and seminar series organised by nf-core (both online and in-person events). This applies to events hosted on web-based platforms such as Zoom, Jitsi, YouTube live etc. +- Participating in collaborative work on GitHub, Google Suite, community calls, mentorship meetings, email correspondence, and on the nf-core gather.town workspace. +- Participating in workshops, training, and seminar series organised by nf-core (both online and in-person events). This applies to events hosted on web-based platforms such as Zoom, gather.town, Jitsi, YouTube live etc. - Representing nf-core on social media. This includes both official and personal accounts. ## nf-core cares 😊 -nf-core's CoC and expectations of respectful behaviours for all participants (including organisers and the nf-core team) include but are not limited to the following (listed in alphabetical order): +nf-core's CoC and expectations of respectful behaviours for all participants (including organisers and the nf-core team) include, but are not limited to, the following (listed in alphabetical order): - Ask for consent before sharing another community member’s personal information (including photographs) on social media. - Be respectful of differing viewpoints and experiences. We are all here to learn from one another and a difference in opinion can present a good learning opportunity. -- Celebrate your accomplishments at events! (Get creative with your use of emojis 🎉 🥳 💯 🙌 !) +- Celebrate your accomplishments! (Get creative with your use of emojis 🎉 🥳 💯 🙌 !) - Demonstrate empathy towards other community members. (We don’t all have the same amount of time to dedicate to nf-core. If tasks are pending, don’t hesitate to gently remind members of your team. If you are leading a task, ask for help if you feel overwhelmed.) - Engage with and enquire after others. (This is especially important given the geographically remote nature of the nf-core community, so let’s do this the best we can) - Focus on what is best for the team and the community. (When in doubt, ask) -- Graciously accept constructive criticism, yet be unafraid to question, deliberate, and learn. +- Accept feedback, yet be unafraid to question, deliberate, and learn. - Introduce yourself to members of the community. (We’ve all been outsiders and we know that talking to strangers can be hard for some, but remember we’re interested in getting to know you and your visions for open science!) -- Show appreciation and **provide clear feedback**. (This is especially important because we don’t see each other in person and it can be harder to interpret subtleties. Also remember that not everyone understands a certain language to the same extent as you do, so **be clear in your communications to be kind.**) +- Show appreciation and **provide clear feedback**. (This is especially important because we don’t see each other in person and it can be harder to interpret subtleties. Also remember that not everyone understands a certain language to the same extent as you do, so **be clear in your communication to be kind.**) - Take breaks when you feel like you need them. -- Using welcoming and inclusive language. (Participants are encouraged to display their chosen pronouns on Zoom or in communication on Slack.) +- Use welcoming and inclusive language. 
(Participants are encouraged to display their chosen pronouns on Zoom or in communication on Slack) ## nf-core frowns on 😕 -The following behaviours from any participants within the nf-core community (including the organisers) will be considered unacceptable under this code of conduct. Engaging or advocating for any of the following could result in expulsion from nf-core workspaces. +The following behaviours from any participants within the nf-core community (including the organisers) will be considered unacceptable under this CoC. Engaging or advocating for any of the following could result in expulsion from nf-core workspaces: - Deliberate intimidation, stalking or following and sustained disruption of communication among participants of the community. This includes hijacking shared screens through actions such as using the annotate tool in conferencing software such as Zoom. - “Doxing” i.e. posting (or threatening to post) another person’s personal identifying information online. - Spamming or trolling of individuals on social media. -- Use of sexual or discriminatory imagery, comments, or jokes and unwelcome sexual attention. -- Verbal and text comments that reinforce social structures of domination related to gender, gender identity and expression, sexual orientation, ability, physical appearance, body size, race, age, religion or work experience. +- Use of sexual or discriminatory imagery, comments, jokes, or unwelcome sexual attention. +- Verbal and text comments that reinforce social structures of domination related to gender, gender identity and expression, sexual orientation, ability, physical appearance, body size, race, age, religion, or work experience. ### Online Trolling -The majority of nf-core interactions and events are held online. Unfortunately, holding events online comes with the added issue of online trolling. This is unacceptable, reports of such behaviour will be taken very seriously, and perpetrators will be excluded from activities immediately. +The majority of nf-core interactions and events are held online. Unfortunately, holding events online comes with the risk of online trolling. This is unacceptable — reports of such behaviour will be taken very seriously and perpetrators will be excluded from activities immediately. -All community members are required to ask members of the group they are working within for explicit consent prior to taking screenshots of individuals during video calls. +All community members are **required** to ask members of the group they are working with for explicit consent prior to taking screenshots of individuals during video calls. -## Procedures for Reporting CoC violations +## Procedures for reporting CoC violations If someone makes you feel uncomfortable through their behaviours or actions, report it as soon as possible. -You can reach out to members of the [nf-core core team](https://nf-co.re/about) and they will forward your concerns to the safety officer(s). +You can reach out to members of the Safety Team (Saba Nafees, Cris Tuñí, and Michael Heuer) on Slack. Alternatively, contact a member of the nf-core core team [nf-core core team](https://nf-co.re/about), and they will forward your concerns to the Safety Team. + +Issues directly concerning members of the Core Team or the Safety Team will be dealt with by other members of the core team and the safety manager — possible conflicts of interest will be taken into account. nf-core is also in discussions about having an ombudsperson and details will be shared in due course. 
+ +All reports will be handled with the utmost discretion and confidentiality. + +You can also report any CoC violations to safety [at] nf-co [dot] re. In your email report, please do your best to include: + +- Your contact information. +- Identifying information (e.g. names, nicknames, pseudonyms) of the participant who has violated the Code of Conduct. +- The behaviour that was in violation and the circumstances surrounding the incident. +- The approximate time of the behaviour (if different than the time the report was made). +- Other people involved in the incident, if applicable. +- If you believe the incident is ongoing. +- If there is a publicly available record (e.g. mailing list record, a screenshot). +- Any additional information. + +After you file a report, one or more members of our Safety Team will contact you to follow up on your report. + +## Who will read and handle reports + +All reports will be read and handled by the members of the Safety Team at nf-core. + +If members of the Safety Team are deemed to have a conflict of interest with a report, they will be required to recuse themselves as per our Code of Conduct and will not have access to any follow-ups. + +To keep this first report confidential from any of the Safety Team members, please submit your first report by direct messaging on Slack/direct email to any of the nf-core members you are comfortable disclosing the information to, and be explicit about which member(s) you do not consent to sharing the information with. + +## Reviewing reports + +After receiving the report, members of the Safety Team will review the incident report to determine whether immediate action is required, for example, whether there is immediate threat to participants’ safety. + +The Safety Team, in consultation with members of the nf-core core team, will assess the information to determine whether the report constitutes a Code of Conduct violation, for them to decide on a course of action. + +In the case of insufficient information, one or more members of the Safety Team may contact the reporter, the reportee, or any other attendees to obtain more information. -Issues directly concerning members of the core team will be dealt with by other members of the core team and the safety manager, and possible conflicts of interest will be taken into account. nf-core is also in discussions about having an ombudsperson, and details will be shared in due course. +Once additional information is gathered, the Safety Team will collectively review and decide on the best course of action to take, if any. The Safety Team reserves the right to not act on a report. -All reports will be handled with utmost discretion and confidentially. +## Confidentiality + +All reports, and any additional information included, are only shared with the team of safety officers (and possibly members of the core team, in case the safety officer is in violation of the CoC). We will respect confidentiality requests for the purpose of protecting victims of abuse. + +We will not name harassment victims, beyond discussions between the safety officer and members of the nf-core team, without the explicit consent of the individuals involved. + +## Enforcement + +Actions taken by the nf-core’s Safety Team may include, but are not limited to: + +- Asking anyone to stop a behaviour. +- Asking anyone to leave the event and online spaces either temporarily, for the remainder of the event, or permanently. +- Removing access to the gather.town and Slack, either temporarily or permanently. 
+- Communicating to all participants to reinforce our expectations for conduct and remind what is unacceptable behaviour; this may be public for practical reasons. +- Communicating to all participants that an incident has taken place and how we will act or have acted — this may be for the purpose of letting event participants know we are aware of and dealing with the incident. +- Banning anyone from participating in nf-core-managed spaces, future events, and activities, either temporarily or permanently. +- No action. ## Attribution and Acknowledgements @@ -106,6 +161,22 @@ All reports will be handled with utmost discretion and confidentially. ## Changelog -### v1.0 - March 12th, 2021 +### v1.4 - February 8th, 2022 + +- Included a new member of the Safety Team. Corrected a typographical error in the text. + +### v1.3 - December 10th, 2021 + +- Added a statement that the CoC applies to nf-core gather.town workspaces. Corrected typographical errors in the text. + +### v1.2 - November 12th, 2021 + +- Removed information specific to reporting CoC violations at the Hackathon in October 2021. + +### v1.1 - October 14th, 2021 + +- Updated with names of new Safety Officers and specific information for the hackathon in October 2021. + +### v1.0 - March 15th, 2021 - Complete rewrite from original [Contributor Covenant](http://contributor-covenant.org/) CoC. diff --git a/README.md b/README.md index 56e499a3..ec59cee5 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ ## Introduction -**nfcore/ampliseq** is a bioinformatics analysis pipeline used for amplicon sequencing, supporting denoising of any amplicon and, currently, taxonomic assignment of 16S, ITS, CO1 and 18S amplicons. Phylogenetic placement is also possible. Supported is paired-end Illumina or single-end Illumina, PacBio and IonTorrent data. Default is the analysis of 16S rRNA gene amplicons sequenced paired-end with Illumina. +**nfcore/ampliseq** is a bioinformatics analysis pipeline used for amplicon sequencing, supporting denoising of any amplicon and supports a variety of taxonomic databases for taxonomic assignment including 16S, ITS, CO1 and 18S. Phylogenetic placement is also possible. Supported is paired-end Illumina or single-end Illumina, PacBio and IonTorrent data. Default is the analysis of 16S rRNA gene amplicons sequenced paired-end with Illumina. A video about relevance, usage and output of the pipeline (version 2.1.0; 26th Oct. 2021) can also be found in [YouTube](https://youtu.be/a0VOEeAvETs) and [billibilli](https://www.bilibili.com/video/BV1B44y1e7MM), the slides are deposited at [figshare](https://doi.org/10.6084/m9.figshare.16871008.v1). 
@@ -35,19 +35,23 @@ By default, the pipeline currently performs the following: - Sequencing quality control ([FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) - Trimming of reads ([Cutadapt](https://journal.embnet.org/index.php/embnetjournal/article/view/200)) - Infer Amplicon Sequence Variants (ASVs) ([DADA2](https://doi.org/10.1038/nmeth.3869)) +- Optional post-clustering with [VSEARCH](https://github.com/torognes/vsearch) - Predict whether ASVs are ribosomal RNA sequences ([Barrnap](https://github.com/tseemann/barrnap)) - Phylogenetic placement ([EPA-NG](https://github.com/Pbdas/epa-ng)) -- Taxonomical classification using DADA2, [SINTAX](https://doi.org/10.1101/074161) or [QIIME2](https://www.nature.com/articles/s41587-019-0209-9) +- Taxonomical classification using DADA2; alternatives are [SINTAX](https://doi.org/10.1101/074161), [Kraken2](https://doi.org/10.1186/s13059-019-1891-0), and [QIIME2](https://www.nature.com/articles/s41587-019-0209-9) - Excludes unwanted taxa, produces absolute and relative feature/taxa count tables and plots, plots alpha rarefaction curves, computes alpha and beta diversity indices and plots thereof ([QIIME2](https://www.nature.com/articles/s41587-019-0209-9)) - Calls differentially abundant taxa ([ANCOM](https://www.ncbi.nlm.nih.gov/pubmed/26028277)) -- Overall pipeline run summaries ([MultiQC](https://multiqc.info/)) +- Creates phyloseq R objects ([Phyloseq](https://www.bioconductor.org/packages/release/bioc/html/phyloseq.html)) +- Pipeline QC summaries ([MultiQC](https://multiqc.info/)) +- Pipeline summary report ([R Markdown](https://github.com/rstudio/rmarkdown)) ## Usage -> **Note** -> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how -> to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) -> with `-profile test` before running the workflow on actual data. +:::note +If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how +to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) +with `-profile test` before running the workflow on actual data. +::: First, you need to know whether the sequencing files at hand are expected to contain primer sequences (usually yes) and if yes, what primer sequences. In the example below, the paired end sequencing data was produced with 515f (GTGYCAGCMGCCGCGGTAA) and 806r (GGACTACNVGGGTWTCTAAT) primers of the V4 region of the 16S rRNA gene. Please note, that those sequences should not contain any sequencing adapter sequences, only the sequence that matches the biological amplicon. @@ -64,19 +68,25 @@ nextflow run nf-core/ampliseq \ --outdir ``` -> **Note** -> Adding metadata will considerably increase the output, see [metadata documentation](https://nf-co.re/ampliseq/usage#metadata). +:::note +Adding metadata will considerably increase the output, see [metadata documentation](https://nf-co.re/ampliseq/usage#metadata). +::: -> **Warning:** -> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those -> provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; -> see [docs](https://nf-co.re/usage/configuration#custom-configuration-files). 
+:::note +By default the taxonomic assignment will be performed with DADA2 on SILVA database, but there are various tools and databases readily available, see [taxonomic classification documentation](https://nf-co.re/ampliseq/usage#taxonomic-classification). +::: -For more details, please refer to the [usage documentation](https://nf-co.re/ampliseq/usage) and the [parameter documentation](https://nf-co.re/ampliseq/parameters). +:::warning +Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those +provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; +see [docs](https://nf-co.re/usage/configuration#custom-configuration-files). +::: + +For more details and further functionality, please refer to the [usage documentation](https://nf-co.re/ampliseq/usage) and the [parameter documentation](https://nf-co.re/ampliseq/parameters). ## Pipeline output -To see the the results of a test run with a full size dataset refer to the [results](https://nf-co.re/ampliseq/results) tab on the nf-core website pipeline page. +To see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/ampliseq/results) tab on the nf-core website pipeline page. For more details about the output files and reports, please refer to the [output documentation](https://nf-co.re/ampliseq/output). @@ -86,12 +96,7 @@ nf-core/ampliseq was originally written by Daniel Straub ([@d4straub](https://gi We thank the following people for their extensive assistance in the development of this pipeline (in alphabetical order): -- [Diego Brambilla](https://github.com/DiegoBrambilla) -- [Emelie Nilsson](https://github.com/emnilsson) -- [Jeanette Tångrot](https://github.com/jtangrot) -- [Lokeshwaran Manoharan](https://github.com/lokeshbio) -- [Marissa Dubbelaar](https://github.com/marissaDubbelaar) -- [Sabrina Krakau](https://github.com/skrakau) +[Adam Bennett](https://github.com/a4000), [Diego Brambilla](https://github.com/DiegoBrambilla), [Emelie Nilsson](https://github.com/emnilsson), [Jeanette Tångrot](https://github.com/jtangrot), [Lokeshwaran Manoharan](https://github.com/lokeshbio), [Marissa Dubbelaar](https://github.com/marissaDubbelaar), [Sabrina Krakau](https://github.com/skrakau), [Sam Minot](https://github.com/sminot), [Till Englert](https://github.com/tillenglert) ## Contributions and Support diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml index cef829de..3f1b1e44 100644 --- a/assets/methods_description_template.yml +++ b/assets/methods_description_template.yml @@ -3,17 +3,22 @@ description: "Suggested text and references to use when describing pipeline usag section_name: "nf-core/ampliseq Methods Description" section_href: "https://github.com/nf-core/ampliseq" plot_type: "html" +## nf-core: Update the HTML below to your preferred methods description, e.g. add publication citation for this pipeline ## You inject any metadata in the Nextflow '${workflow}' object data: |
<h4>Methods</h4>
- <p>Data was processed using nf-core/ampliseq v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (Ewels et al., 2020).</p>
+ <p>Data was processed using nf-core/ampliseq v${workflow.manifest.version} ${doi_text} (Straub et al., 2020) of the nf-core collection of workflows (Ewels et al., 2020), utilising reproducible software environments from the Bioconda (Grüning et al., 2018) and Biocontainers (da Veiga Leprevost et al., 2017) projects.</p>
<p>The pipeline was executed with Nextflow v${workflow.nextflow.version} (Di Tommaso et al., 2017) with the following command:</p>
<pre><code>${workflow.commandLine}</code></pre>
+ <p>${tool_citations}</p>
<h4>References</h4>
<h4>Notes:</h4>
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 3452ef88..8613d28d 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,7 +1,7 @@ report_comment: > - This report has been generated by the nf-core/ampliseq + This report has been generated by the nf-core/ampliseq analysis pipeline. For information about how to interpret these results, please see the - documentation. + documentation. report_section_order: "nf-core-ampliseq-methods-description": order: -1000 diff --git a/assets/nf-core-ampliseq_logo_light.png b/assets/nf-core-ampliseq_logo_light.png index dea08d56..58f01531 100644 Binary files a/assets/nf-core-ampliseq_logo_light.png and b/assets/nf-core-ampliseq_logo_light.png differ diff --git a/assets/nf-core-ampliseq_logo_light_long.png b/assets/nf-core-ampliseq_logo_light_long.png new file mode 100644 index 00000000..8aac12e2 Binary files /dev/null and b/assets/nf-core-ampliseq_logo_light_long.png differ diff --git a/assets/nf-core_style.css b/assets/nf-core_style.css new file mode 100644 index 00000000..0195a723 --- /dev/null +++ b/assets/nf-core_style.css @@ -0,0 +1,70 @@ +body { + font-family: Calibri, helvetica, sans-serif; +} + +h1 { + color: rgb(36, 176, 100); + font-size: 200%; +} + +h2 { + color: rgb(36, 176, 100); + font-size: 150%; +} + +h3 { + font-size: 100%; + font-weight: bold; +} + +h3.subtitle { + font-size: 120%; + color: rgb(0, 0, 0); + font-weight: bold; +} + +h4 { + font-size: 100%; + font-weight: bold; + font-style: italic; +} + +.watermark { + opacity: 0.1; + position: fixed; + top: 50%; + left: 50%; + font-size: 500%; + color: #24b064; +} + +.list-group-item.active { + z-index: 2; + color: #fff; + background-color: #24b064; + border-color: #24b064; +} +.list-group-item.active:hover { + z-index: 2; + color: #fff; + background-color: #24b064; + border-color: #24b064; +} + +#TOC { + background-size: contain; + padding-top: 60px !important; + background-repeat: no-repeat; +} + +.nav-pills > li.active > a, +.nav-pills > li.active > a:hover, +.nav-pills > li.active > a:focus { + color: #fff; + background-color: #24b064; +} + +a { + color: #24b064; + text-decoration: none; +} diff --git a/assets/report_template.Rmd b/assets/report_template.Rmd new file mode 100644 index 00000000..f0b4073e --- /dev/null +++ b/assets/report_template.Rmd @@ -0,0 +1,1887 @@ +--- +output: + html_document: + toc: true # table of contents + toc_float: true # float the table of contents to the left of the main document content + toc_depth: 3 # header levels 1,2,3 + theme: default + number_sections: true # add section numbering to headers + df_print: paged # tables are printed as an html table with support for pagination over rows and columns + highlight: pygments + pdf_document: true +#bibliography: ./references.bibtex +params: + # any parameter that is by default "FALSE" is used to evaluate the inclusion of a codeblock with e.g. 
"eval=!isFALSE(params$mqc_plot)" + + # report style + css: NULL + report_logo: NULL + report_title: "Summary of analysis results" + report_abstract: FALSE + + # pipeline versions + workflow_manifest_version: NULL + workflow_scriptid: NULL + + # flags and arguments + flag_retain_untrimmed: FALSE + kraken2_confidence: "" + flag_single_end: FALSE + barplot: FALSE + abundance_tables: FALSE + alpha_rarefaction: FALSE + ancom: FALSE + trunclenf: "" + trunclenr: "" + max_ee: "" + trunc_qmin: FALSE + trunc_rmin: "" + dada_sample_inference: "" + vsearch_cluster_id: "" + filter_ssu: FALSE + min_len_asv: "" + max_len_asv: "" + cut_its: FALSE + dada2_ref_tax_title: FALSE + qiime2_ref_tax_title: FALSE + sintax_ref_tax_title: FALSE + kraken2_ref_tax_title: FALSE + dada2_ref_tax_file: "" + qiime2_ref_tax_file: "" + sintax_ref_tax_file: "" + kraken2_ref_tax_file: "" + dada2_ref_tax_citation: "" + qiime2_ref_tax_citation: "" + sintax_ref_tax_citation: "" + kraken2_ref_tax_citation: "" + exclude_taxa: "" + min_frequency: "" + min_samples: "" + qiime2_filtertaxa: "" + val_used_taxonomy: FALSE + metadata_category_barplot: FALSE + qiime_adonis_formula: FALSE + + # file paths + metadata: FALSE + input_samplesheet: FALSE + input_fasta: FALSE + input_folder: FALSE + mqc_plot: FALSE + cutadapt_summary: FALSE + dada_filtntrim_args: FALSE + dada_qc_f_path: FALSE + dada_qc_r_path: "" + dada_pp_qc_f_path: "" + dada_pp_qc_r_path: "" + dada_err_path: FALSE + dada_err_run: "" + asv_table_path: FALSE + path_asv_fa: FALSE + path_dada2_tab: FALSE + dada_stats_path: FALSE + vsearch_cluster: FALSE + path_barrnap_sum: FALSE + filter_ssu_stats: FALSE + filter_ssu_asv: "" + filter_len_asv: FALSE + filter_len_asv_len_orig: FALSE + filter_codons_fasta: FALSE + filter_codons_stats: FALSE + stop_codons: "" + itsx_cutasv_summary: "" + cut_dada_ref_taxonomy: FALSE + dada2_taxonomy: FALSE + sintax_taxonomy: FALSE + pplace_taxonomy: FALSE + pplace_heattree: "" + qiime2_taxonomy: FALSE + kraken2_taxonomy: FALSE + filter_stats_tsv: FALSE + diversity_indices_depth: "" + diversity_indices_alpha: FALSE + diversity_indices_beta: FALSE + diversity_indices_adonis: "" + picrust_pathways: FALSE + sbdi: FALSE + phyloseq: FALSE +--- + + + +```{r libraries, include=FALSE} +library("dplyr") +library("ggplot2") +library("knitr") +library("DT") +library("formattable") +library("purrr") +``` + + + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE) # echo is set in differentialabundance v1.2.0 to TRUE +``` + + + +```{r, echo=FALSE} +htmltools::includeCSS(params$css) +``` + +```{r results="asis", echo=FALSE} +cat(paste0(" + +")) +``` + + + +```{r} +if ( endsWith( params$workflow_manifest_version, "dev") ) { + ampliseq_version = paste0("version ", params$workflow_manifest_version, ", revision ", params$workflow_scriptid) +} else { + ampliseq_version = paste0("version ",params$workflow_manifest_version) +} +report_title <- params$report_title +report_subtitle <- paste0('nf-core/ampliseq workflow ', ampliseq_version) +``` + +--- +title: "`r report_title`" +subtitle: `r report_subtitle` +date: '`r format(Sys.Date(), "%B %d, %Y")`' +--- + +--- + + + +```{r, results='asis'} +if ( !isFALSE(params$report_abstract) ) { + report_abstract <- paste(readLines(params$report_abstract), collapse="\n") + cat(report_abstract) +} else { + # with tab indentation, the following will be a code block! 
+ cat(paste0(" +# Abstract + +The bioinformatics analysis pipeline [nfcore/ampliseq](https://nf-co.re/ampliseq) is used for amplicon sequencing, +supporting denoising of any amplicon and supports a variety of taxonomic databases for taxonomic assignment including 16S, ITS, CO1 and 18S. + ")) +} +``` + + + +```{r, results='asis'} +if ( !isFALSE(params$metadata) ) { + cat(paste0(" +# Data input and Metadata + +Pipeline input was saved to the [input](../input) directory. + ")) +} else { + cat(paste0(" +# Data input + +Pipeline input was saved in folder [input](../input). + ")) +} + +if ( !isFALSE(params$input_samplesheet) ) { + # samplesheet input + cat("\nSequencing data was provided in the samplesheet file `", params$input_samplesheet, "` that is displayed below:", sep="") + + samplesheet <- read.table(file = params$input_samplesheet, header = TRUE, sep = "\t") + # Display table + datatable(samplesheet, options = list( + scrollX = TRUE, + scrollY = "300px", + paging = FALSE)) +} else if ( !isFALSE(params$input_fasta) ) { + # fasta input + cat("\nASV/OTU sequences were provided in the fasta file `", params$input_fasta, "`. ", sep="") +} else if ( !isFALSE(params$input_folder) ) { + # folder input + cat("\nSequencing data was retrieved from folder `", params$input_folder, "`. ", sep="") +} +if ( !isFALSE(params$metadata) ) { + cat("\nMetadata associated with the sequencing data was provided in `", params$metadata, "` and is displayed below:", sep="") + + metadata <- read.table(file = params$metadata, header = TRUE, sep = "\t") + # Display table + datatable(metadata, options = list( + scrollX = TRUE, + scrollY = "300px", + paging = FALSE)) +} +``` + + + +```{r, eval = !isFALSE(params$mqc_plot) || !isFALSE(params$dada_filtntrim_args), results='asis'} +cat("# Preprocessing\n") +``` + + + +```{r, eval = !isFALSE(params$mqc_plot), results='asis'} +cat(paste0(" +## FastQC + +[FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. +It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), +adapter contamination and overrepresented sequences. The sequence quality was checked using FastQC and resulting data was +aggregated using the FastQC module of [MultiQC](https://multiqc.info/). For more quality controls and per sample quality checks you can check the full +MultiQC report, which can be found in [multiqc/multiqc_report.html](../multiqc/multiqc_report.html). +")) +``` + +```{r, eval = !isFALSE(params$mqc_plot), out.width='100%', dpi=1200, fig.align='center'} +knitr::include_graphics(params$mqc_plot) +``` + + + +```{r, eval = !isFALSE(params$cutadapt_summary), results='asis'} +cat(paste0(" +## Primer removal with Cutadapt + +[Cutadapt](https://journal.embnet.org/index.php/embnetjournal/article/view/200) is trimming primer sequences from sequencing reads. +Primer sequences are non-biological sequences that often introduce point mutations that do not reflect sample sequences. This is especially +true for degenerated PCR primer. If primer trimming were to be omitted, artifactual amplicon sequence variants might be computed by +the denoising tool or sequences might be lost due to being labelled as PCR chimera. 
+")) + +# import tsv +cutadapt_summary <- read.table(file = params$cutadapt_summary, header = TRUE, sep = "\t") + +cutadapt_passed_col <- as.numeric(substr( + cutadapt_summary$cutadapt_passing_filters_percent, 1, 4)) + +cutadapt_max_discarded <- round( 100 - min(cutadapt_passed_col), 1 ) +cutadapt_avg_passed <- round(mean(cutadapt_passed_col),1) + +cutadapt_text_unch <- "Primers were trimmed using cutadapt" +cutadapt_text_ch <- paste0(" and all untrimmed sequences were discarded. ", + "Sequences that did not contain primer sequences were considered artifacts. Less than ", + cutadapt_max_discarded, "% of the sequences were discarded per sample and a mean of ", + cutadapt_avg_passed, "% of the sequences per sample passed the filtering. ") + +if ( isFALSE(params$flag_retain_untrimmed) ) cutadapt_text <- paste0( + cutadapt_text_unch, cutadapt_text_ch + ) else cutadapt_text <- paste0(cutadapt_text_unch, ". ") + +cat(cutadapt_text) +cat("Cutadapt results can be found in folder [cutadapt](../cutadapt).") + +# shorten header by "cutadapt_" to optimize visualisation +colnames(cutadapt_summary) <- gsub("cutadapt_","",colnames(cutadapt_summary)) + +datatable(cutadapt_summary, options = list( + scrollX = TRUE, + scrollY = "300px", + paging = FALSE)) +``` + + + +```{r, eval = !isFALSE(params$dada_filtntrim_args), results='asis'} +cat(paste0(" +## Quality filtering using DADA2 + +Additional quality filtering can improve sequence recovery. +Often it is advised trimming the last few nucleotides to avoid less well-controlled errors that can arise there. +")) + +if (params$trunc_qmin) { + f_and_tr_args <- readLines(params$dada_filtntrim_args) + trunc_len <- strsplit(gsub(".*truncLen = c\\((.+)\\),maxN.*", "\\1", + f_and_tr_args), ", ") + tr_len_f <- trunc_len[[1]][1] + tr_len_r <- trunc_len[[1]][2] + cat("Reads were trimmed to a specific length and the length cutoff was ", + "automatically determined by the median quality of all input reads. ", + "Reads were trimmed before median quality drops ", + "below ", params$trunc_qmin, " and at least ",params$trunc_rmin*100, + "% of reads are retained, resulting in a trim of ", + "forward reads at ", tr_len_f, " bp and reverse ", + "reads at ", tr_len_r, " bp, reads shorter than this were discarded. ", sep = "") +} else if (params$trunclenf == "null" && params$trunclenr == "null") { + cat("Reads were not trimmed. ") +} else if (params$trunclenf != 0 && params$trunclenr != 0) { + cat("Forward reads were trimmed at ", params$trunclenf, + " bp and reverse reads were trimmed at ", params$trunclenr, + " bp, reads shorter than this were discarded. ", sep = "") +} else if (params$trunclenf != 0) { + cat("Forward reads were trimmed at ", params$trunclenf," bp, reads shorter than this were discarded. ", sep = "") +} else if (params$trunclenr != 0) { + cat("Reverse reads were trimmed at ", params$trunclenr," bp, reads shorter than this were discarded. 
", sep = "") +} +cat("Reads with more than", params$max_ee,"expected errors were discarded.", + "Read counts passing the filter are shown in section ['Read counts per sample'](#read-counts-per-sample)", + "column 'filtered'.", sep = " ") +``` + + + +```{r, eval = !isFALSE(params$dada_qc_f_path), results='asis'} +cat ("**Quality profiles:**\n\n") + +if (params$flag_single_end) { + cat("Read quality stats for incoming data:") +} else { + cat("Forward (left) and reverse (right) read quality stats for incoming data:") +} +``` + +```{r, eval = !isFALSE(params$dada_qc_f_path), out.width="49%", fig.show='hold', fig.align='default'} +if (params$flag_single_end) { + knitr::include_graphics(params$dada_qc_f_path) +} else { + knitr::include_graphics(c(params$dada_qc_f_path, params$dada_qc_r_path)) +} +``` + +```{r, eval = !isFALSE(params$dada_qc_f_path), results='asis'} +if (params$flag_single_end) { + cat("Read quality stats for preprocessed data:") +} else { + cat("Forward (left) and reverse (right) read quality stats for preprocessed data:") +} +``` + +```{r, eval = !isFALSE(params$dada_qc_f_path), out.width="49%", fig.show='hold', fig.align='default'} +if (params$flag_single_end) { + knitr::include_graphics(params$dada_pp_qc_f_path) +} else { + knitr::include_graphics(c(params$dada_pp_qc_f_path, params$dada_pp_qc_r_path)) +} +``` + +```{r, eval = !isFALSE(params$dada_qc_f_path), results='asis'} +cat(paste0(" +Overall read quality profiles are displayed as heat map of the frequency of each quality score at each base position. +The mean quality score at each position is shown by the green line, and the quartiles of the quality score +distribution by the orange lines. The red line shows the scaled proportion of reads that extend to at least +that position. Original plots can be found [folder dada2/QC/](../dada2/QC/) with names that end in `_qual_stats.pdf`. +")) +``` + + + +```{r, eval = !isFALSE(params$dada_err_path) || !isFALSE(params$dada_stats_path) || !isFALSE(params$asv_table_path), results='asis'} +cat(paste0(" +# ASV inference using DADA2 + +[DADA2](https://doi.org/10.1038/nmeth.3869) performs fast and accurate sample inference from amplicon data with single-nucleotide +resolution. It infers exact amplicon sequence variants (ASVs) from amplicon data with fewer false positives than many other +methods while maintaining high sensitivity. + +DADA2 reduces sequence errors and dereplicates sequences by quality filtering, denoising, +read pair merging (for paired end Illumina reads only) and PCR chimera removal. +")) +``` + + + +```{r, eval = !isFALSE(params$dada_err_path), results='asis'} +cat(paste0(" +## Error correction + +Read error correction was performed using estimated error rates, visualized below. +")) + +# check if single run or multirun +flag_multirun = length ( unlist( strsplit( params$dada_err_run,"," ) ) ) != 1 + +if ( flag_multirun && params$flag_single_end ) { + # single end multi run + cat("Error rates were estimated for each sequencing run separately. ", + "Each 4x4 figure represents one run, in the sequence ", params$dada_err_run,".") +} else if ( flag_multirun && !params$flag_single_end ) { + # paired end multi run + cat("Error rates were estimated for each sequencing run separately. 
", + "Each row represents one run, in the sequence ", params$dada_err_run,".", + "For each row, the error rates for forward reads are at the left side and reverse reads are at the right side.") +} else if ( !flag_multirun && !params$flag_single_end ) { + # paired end single run + cat("Error rates for forward reads are at the left side and reverse reads are at the right side.") +} +``` + +```{r, eval = !isFALSE(params$dada_err_path), out.width="49%", fig.show='hold', fig.align='default'} +dada_err_path <- unlist( strsplit( params$dada_err_path,"," ) ) +knitr::include_graphics(dada_err_path) +``` + +```{r, eval = !isFALSE(params$dada_err_path), results='asis'} +cat(paste0(" +Estimated error rates are displayed for each possible transition. The black line shows the estimated error rates after +convergence of the machine-learning algorithm. The red line shows the error rates expected under the nominal +definition of the Q-score. The estimated error rates (black line) should be a good fit to the observed rates +(points), and the error rates should drop with increased quality. Original plots can be found in +[folder dada2/QC/](../dada2/QC/) with names that end in `.err.pdf`. +")) +``` + + + +```{r, eval = !isFALSE(params$dada_stats_path), results='asis'} +cat(paste0(" +## Read counts per sample + +Tracking read numbers through DADA2 processing steps for each sample. The following table shows the read numbers after each processing stage. +")) + +if ( params$flag_single_end ) { + cat("Processing stages are: input - reads into DADA2, filtered - reads passed quality filtering, ", + "denoised - reads after denoising, nonchim - reads in non-chimeric sequences (final ASVs).") +} else { + cat("Processing stages are: input - read pairs into DADA2, filtered - read pairs passed quality filtering, ", + "denoisedF - forward reads after denoising, denoisedR - reverse reads after denoising, ", + "merged - successfully merged read pairs, nonchim - read pairs in non-chimeric sequences (final ASVs).") +} + +# import stats tsv +dada_stats <- read.table(file = params$dada_stats_path, header = TRUE, sep = "\t") + +# Display table +datatable(dada_stats, options = list( + scrollX = TRUE, + scrollY = "300px", + paging = FALSE)) + +cat(paste0(" +Samples with unusual low reads numbers relative to the number of expected ASVs +should be treated cautiously, because the abundance estimate will be very granular +and might vary strongly between (theoretical) replicates due to high impact of stochasticity. + +Following, the numbers of the table above are shown in stacked barcharts as percentage of DADA2 input reads. +")) + +# Stacked barchart to num of reads + +# Calc exluded asvs and transform all cols to percent + +if ( params$flag_single_end ) { + # single end + cat("Stacked barcharts of read numbers per sample and processing stage") + + dada_stats_ex <- data.frame(sample = dada_stats$sample, + input = dada_stats$DADA2_input, + filtered = dada_stats$DADA2_input-dada_stats$filtered, + denoised = dada_stats$filtered-dada_stats$denoised, + nonchim = dada_stats$denoised-dada_stats$nonchim, + analysis = dada_stats$nonchim) + dada_stats_p <- data.frame(sample = dada_stats_ex$sample, round(dada_stats_ex[2:6]/dada_stats_ex$input*100, 2)) + dada_stats_p_analysis_average <- round(sum(dada_stats_p$analysis)/length(dada_stats_p$analysis), 1) + # If more than 20 sample only display subset! 
+ if ( nrow(dada_stats_p)>=20 ) { + cat(" (display 10 samples of each lowest and highest percentage of reads analysed, of",nrow(dada_stats_p),"samples)") + dada_stats_p <- dada_stats_p[order(-dada_stats_p$analysis),] + dada_stats_p <- rbind(head(dada_stats_p,10),tail(dada_stats_p,10)) + } + # Stack columns for both stacked barcharts + n_samples <- length(dada_stats_p$sample) + samples_t <- c(rep(dada_stats_p$sample, 4)) + steps_t <- c(rep("excluded by filtering", n_samples), rep("excluded by denoised", n_samples), + rep("excluded by nonchim", n_samples), rep("reads in final ASVs", n_samples)) + # stack the column for percentage of asvs + asvs_p_t <- as.array(flatten_dbl(dada_stats_p[3:6])) + dada_stats_p_t <- data.frame(samples_t, steps_t, asvs_p_t) +} else { + # paired end + cat("Stacked barchart of read pair numbers (denoisedF & denoisedR halfed, because each pair is split) per sample and processing stage") + + dada_stats_ex <- data.frame(sample = dada_stats$sample, + DADA2_input = dada_stats$DADA2_input, + filtered = dada_stats$DADA2_input-dada_stats$filtered, + denoisedF = (dada_stats$filtered-dada_stats$denoisedF)/2, + denoisedR = (dada_stats$filtered-dada_stats$denoisedR)/2, + merged = (dada_stats$denoisedF+dada_stats$denoisedR)/2-dada_stats$merged, + nonchim = dada_stats$merged-dada_stats$nonchim, + analysis = dada_stats$nonchim) + dada_stats_p <- data.frame(sample = dada_stats_ex$sample, round(dada_stats_ex[2:8]/dada_stats_ex$DADA2_input*100, 2)) + dada_stats_p_analysis_average <- round(sum(dada_stats_p$analysis)/length(dada_stats_p$analysis), 1) + # If more than 20 sample only display subset! + if ( nrow(dada_stats_p)>=20 ) { + cat(" (display 10 samples of each lowest and highest percentage of reads analysed, of",nrow(dada_stats_p),"samples)") + dada_stats_p <- dada_stats_p[order(-dada_stats_p$analysis),] + dada_stats_p <- rbind(head(dada_stats_p,10),tail(dada_stats_p,10)) + } + # Stack columns for both stacked barcharts + n_samples <- length(dada_stats_p$sample) + samples_t <- c(rep(dada_stats_p$sample, 6)) + steps_t <- c(rep("excluded by filtering", n_samples), rep("excluded by denoisedF", n_samples), + rep("excluded by denoisedR", n_samples), rep("excluded by merged", n_samples), + rep("excluded by nonchim", n_samples), rep("reads in final ASVs", n_samples)) + # stack the column for percentage of asvs + asvs_p_t <- as.array(flatten_dbl(dada_stats_p[3:8])) + dada_stats_p_t <- data.frame(samples_t, steps_t, asvs_p_t) +} +cat(":\n\n") + +# Plot +dada_stats_p_t$steps_t <- factor(dada_stats_p_t$steps_t, levels=unique(dada_stats_p_t$steps_t)) +dada_stats_p_t$samples_t <- factor(dada_stats_p_t$samples_t, levels=dada_stats_p_t[order(dada_stats_p$analysis),"samples_t"]) + +plot_dada_stats_p_t <- ggplot(dada_stats_p_t, aes(fill = steps_t, y = asvs_p_t, x = samples_t)) + + geom_bar(position = "fill", stat = "identity") + + xlab("Samples") + + ylab("Fraction of total reads") + + coord_flip() + + scale_fill_brewer("Filtering Steps", palette = "Spectral") +plot_dada_stats_p_t + +svg("stacked_barchart_of_reads.svg") +plot_dada_stats_p_t +invisible(dev.off()) + +cat(paste0(" + +Between ",min(dada_stats_p$analysis),"% and ",max(dada_stats_p$analysis),"% reads per sample (average ",dada_stats_p_analysis_average,"%) +were retained for analysis within DADA2 steps. + +The proportion of lost reads per processing stage and sample should not be too high, totalling typically <50%. 
+Samples whose read losses (per stage) differ strongly from the majority of samples should be compared with caution, because an unusual problem
+(e.g. during nucleic acid extraction, library preparation, or sequencing) might have occurred that could bias the analysis.
+"))
+```
+
+
+
+```{r, eval = !isFALSE(params$asv_table_path), results='asis'}
+cat("## Inferred ASVs\n\n")
+
+#import asv table
+asv_table <- read.table(file = params$asv_table_path, header = TRUE, sep = "\t")
+n_asv <- length(asv_table$ASV_ID)
+n_asv_dada <- length(asv_table$ASV_ID) #this is to report the original number later in the methods section
+
+# Output text
+cat("Finally,", n_asv,
+    "amplicon sequence variants (ASVs) were obtained across all samples. ")
+cat("The ASVs can be found in [`dada2/ASV_seqs.fasta`](../dada2/), and the corresponding",
+    " quantification of the ASVs across samples is in",
+    "[`dada2/ASV_table.tsv`](../dada2/). An extensive table containing both was ",
+    "saved as [`dada2/DADA2_table.tsv`](../dada2/). ")
+if ( params$dada_sample_inference == "independent" ) {
+    cat("ASVs were inferred for each sample independently.")
+} else if ( params$dada_sample_inference == "pooled" ) {
+    cat("ASVs were inferred from pooled sample information.")
+} else {
+    cat("ASVs were initially inferred for each sample independently, but re-examined with all samples (pseudo-pooled).")
+}
+```
+
+```{r, results='asis'}
+flag_any_filtering <- !isFALSE(params$path_barrnap_sum) || !isFALSE(params$filter_len_asv) || !isFALSE(params$filter_codons_fasta) || !isFALSE(params$vsearch_cluster)
+```
+
+
+
+```{r, eval = flag_any_filtering, results='asis'}
+cat("# Post-processing of ASVs\n")
+```
+
+
+
+```{r, eval = !isFALSE(params$vsearch_cluster), results='asis'}
+vsearch_cluster <- read.table( params$vsearch_cluster, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
+n_asv_vsearch_cluster <- nrow(vsearch_cluster)
+
+cat(paste0("
+## Clustering of ASVs
+
+[VSEARCH](https://peerj.com/articles/2584/) clustered ",n_asv_dada," ASVs into ",n_asv_vsearch_cluster,"
+centroids with pairwise identity of ",params$vsearch_cluster_id,".
+Clustered ASV sequences and abundances can be found in folder [vsearch_cluster](../vsearch_cluster). 
+")) +``` + + + +```{r, eval = !isFALSE(params$path_barrnap_sum), results='asis'} +cat("## rRNA detection\n") +cat("[Barrnap](https://github.com/tseemann/barrnap) classifies the ASVs into the origin domain (including mitochondrial origin).\n\n", sep = "") + +# Read the barrnap files and count the lines +barrnap_sum = read.table( params$path_barrnap_sum, header = TRUE, sep = "\t", stringsAsFactors = FALSE) +# keep only ASV_ID & eval columns & sort +barrnap_sum <- subset(barrnap_sum, select = c(ASV_ID,mito_eval,euk_eval,arc_eval,bac_eval)) +# choose kingdom (column) with lowest evalue +barrnap_sum[is.na(barrnap_sum)] <- 1 +barrnap_sum$result = colnames(barrnap_sum[,2:5])[apply(barrnap_sum[,2:5],1,which.min)] +barrnap_sum$result = gsub("_eval", "", barrnap_sum$result) + +#import asv table +asv_table <- readLines(params$path_asv_fa) +n_asv_barrnap <- sum(grepl("^>", asv_table)) + +# calculate numbers +n_classified <- length(barrnap_sum$result) +n_bac <- sum(grepl("bac", barrnap_sum$result)) +n_arc <- sum(grepl("arc", barrnap_sum$result)) +n_mito <- sum(grepl("mito", barrnap_sum$result)) +n_euk <- sum(grepl("euk", barrnap_sum$result)) + +barrnap_df_sum <- data.frame(label=c('Bacteria','Archaea','Mitochondria','Eukaryotes','Unclassified'), + count=c(n_bac,n_arc,n_mito,n_euk,n_asv_barrnap - n_classified), + percent=c(round( (n_bac/n_asv_barrnap)*100, 2), round( (n_arc/n_asv_barrnap)*100, 2), round( (n_mito/n_asv_barrnap)*100, 2), round( (n_euk/n_asv_barrnap)*100, 2), round( ( (n_asv_barrnap - n_classified) /n_asv_barrnap)*100, 2) ) ) + +# Build outputtext +cat( "Barrnap classified ") +cat( barrnap_df_sum$count[1], "(", barrnap_df_sum$percent[1],"%) ASVs as most similar to Bacteria, " ) +cat( barrnap_df_sum$count[2], "(", barrnap_df_sum$percent[2],"%) ASVs to Archea, " ) +cat( barrnap_df_sum$count[3], "(", barrnap_df_sum$percent[3],"%) ASVs to Mitochondria, " ) +cat( barrnap_df_sum$count[4], "(", barrnap_df_sum$percent[4],"%) ASVs to Eukaryotes, and " ) +cat( barrnap_df_sum$count[5], "(", barrnap_df_sum$percent[5],"%) were below similarity threshold to any kingdom." ) + +# Barplot +plot_barrnap_df_sum <- ggplot(barrnap_df_sum, + aes(x = reorder(label, desc(label)), y = percent)) + + geom_bar(stat = "identity", fill = rgb(0.1, 0.4, 0.75), width = 0.5) + + ylab("% Classification") + + xlab("rRNA origins") + + coord_flip() + + theme_bw() + + ylim(0, 100) +plot_barrnap_df_sum + +svg("rrna_detection_with_barrnap.svg") +plot_barrnap_df_sum +invisible(dev.off()) + +cat("\n\nrRNA classification results can be found in folder [barrnap](../barrnap).") +``` + + + +```{r, eval = !isFALSE(params$path_barrnap_sum) && !isFALSE(params$filter_ssu), results='asis'} +# Read the barrnap asv file +filter_ssu_asv <- read.table( params$filter_ssu_asv, header = FALSE, sep = "\t", stringsAsFactors = FALSE) +filter_ssu_asv_filtered <- nrow(filter_ssu_asv)/2 + + # "n_asv_barrnap" is taken from the barrnap block above +cat(paste0(" +ASVs were filtered for `",params$filter_ssu,"` (`bac`: Bacteria, `arc`: Archaea, `mito`: Mitochondria, `euk`: Eukaryotes) using the above classification. +The number of ASVs was reduced by ",n_asv_barrnap-filter_ssu_asv_filtered, +" (",100-round( filter_ssu_asv_filtered/n_asv_barrnap*100 ,2),"%), from ",n_asv_barrnap," to ",filter_ssu_asv_filtered," ASVs. 
+"))
+```
+
+```{r, eval = !isFALSE(params$path_barrnap_sum) && !isFALSE(params$filter_ssu) && !isFALSE(params$filter_ssu_stats), results='asis'}
+cat("The following table shows read counts for each sample before and after filtering:\n\n", sep = "")
+
+# Read the barrnap stats file
+filter_ssu_stats <- read.table( params$filter_ssu_stats, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
+# shorten header by "ssufilter_" to optimize visualisation
+colnames(filter_ssu_stats) <- gsub("ssufilter_","",colnames(filter_ssu_stats))
+filter_ssu_stats <- subset(filter_ssu_stats, select = c(sample,input,output))
+filter_ssu_stats$'retained%' <- round( filter_ssu_stats$output / filter_ssu_stats$input *100, 2)
+filter_ssu_stats_avg_removed <- 100-sum(filter_ssu_stats$'retained%')/length(filter_ssu_stats$'retained%')
+filter_ssu_stats_max_removed <- 100-min(filter_ssu_stats$'retained%')
+
+# Display table
+datatable(filter_ssu_stats, options = list(
+    scrollX = TRUE,
+    scrollY = "300px",
+    paging = FALSE))
+
+cat("On average", round(filter_ssu_stats_avg_removed,2), "% reads were removed, but at most",filter_ssu_stats_max_removed,"% reads per sample. ")
+```
+
+
+
+```{r, eval = !isFALSE(params$filter_len_asv_len_orig), results='asis'}
+cat(paste0("
+## Sequence length
+
+A length filter was used to reduce potential contamination.
+Before filtering, ASVs had the following length profile (count of 1 was transformed to 1.5 to allow plotting on log10 scale):
+
+"))
+
+# ASV length profile
+
+# import length profile tsv
+filter_len_profile <- read.table(file = params$filter_len_asv_len_orig, header = TRUE, sep = "\t")
+
+# find number of ASVs filtered
+filter_len_asv_filtered <- filter_len_profile
+if ( params$min_len_asv != 0 ) {
+    filter_len_asv_filtered <- subset(filter_len_asv_filtered, Length >= params$min_len_asv)
+}
+if ( params$max_len_asv != 0 ) {
+    filter_len_asv_filtered <- subset(filter_len_asv_filtered, Length <= params$max_len_asv)
+}
+
+# replace 1 with 1.5 to display on log scale
+filter_len_profile$Counts[filter_len_profile$Counts == 1] <- 1.5
+
+plot_filter_len_profile <- ggplot(filter_len_profile,
+        aes(x = Length, y = Counts)) +
+    geom_bar(stat = "identity", fill = rgb(0.1, 0.4, 0.75), width = 0.5) +
+    ylab("Number of ASVs") +
+    xlab("Length") +
+    scale_y_continuous(trans = "log10") +
+    theme_bw()
+plot_filter_len_profile
+
+svg("asv_length_profile_before_length_filter.svg")
+plot_filter_len_profile
+invisible(dev.off())
+
+cat("\n\n")
+if ( params$min_len_asv != 0 && params$max_len_asv != 0 ) {
+    cat("Filtering omitted all ASVs with length lower than",params$min_len_asv,"or above",params$max_len_asv,"bp. ")
+} else if ( params$min_len_asv != 0 ) {
+    cat("Filtering omitted all ASVs with length lower than",params$min_len_asv,"bp. ")
+} else if ( params$max_len_asv != 0 ) {
+    cat("Filtering omitted all ASVs with length above",params$max_len_asv,"bp. ")
+}
+```
+
+```{r, eval = !isFALSE(params$filter_len_asv), results='asis'}
+# import stats tsv
+filter_len_stats <- read.table(file = params$filter_len_asv, header = TRUE, sep = "\t")
+# only if file not empty continue with reporting below
+flag_filter_len_stats <- nrow(filter_len_stats) > 0
+```
+
+```{r, eval = !isFALSE(params$filter_len_asv) && flag_filter_len_stats, results='asis'}
+# Reads removed
+
+# re-name & re-order columns
+colnames(filter_len_stats) <- gsub("lenfilter_","",colnames(filter_len_stats))
+filter_len_stats <- filter_len_stats[, c("sample", "input", "output")]
+filter_len_stats$'retained%' <- round( filter_len_stats$output / filter_len_stats$input * 100 , 2)
+filter_len_stats_avg_removed <- 100-sum(filter_len_stats$'retained%')/length(filter_len_stats$'retained%')
+filter_len_stats_max_removed <- 100-min(filter_len_stats$'retained%')
+
+cat("The following table shows read counts for each sample before and after filtering:")
+
+# Display table
+datatable(filter_len_stats, options = list(
+    scrollX = TRUE,
+    scrollY = "300px",
+    paging = FALSE))
+
+cat("On average", round(filter_len_stats_avg_removed,2), "% reads were removed, but at most",filter_len_stats_max_removed,"% reads per sample.")
+```
+
+```{r, eval = !isFALSE(params$filter_len_asv_len_orig), results='asis'}
+cat("The number of ASVs was reduced by",sum(filter_len_profile$Counts)-sum(filter_len_asv_filtered$Counts),"(",100-round( sum(filter_len_asv_filtered$Counts)/sum(filter_len_profile$Counts)*100 ,2),"%), from",sum(filter_len_profile$Counts),"to",sum(filter_len_asv_filtered$Counts)," ASVs.")
+cat("\n\nLength filter results can be found in folder [asv_length_filter](../asv_length_filter).")
+```
+
+
+
+```{r, eval = !isFALSE(params$filter_codons_fasta), results='asis'}
+filter_codons_fasta <- read.table(file = params$filter_codons_fasta, header = FALSE, sep = "\t")
+filter_codons_fasta_passed <- nrow(filter_codons_fasta)/2
+
+cat(paste0("
+## Codon usage
+
+Amplicons of coding regions are expected to be free of stop codons and to consist of codon triplets.
+ASVs were filtered against the presence of stop codons (",params$stop_codons,") in the specified open reading frame of the ASV.
+Additionally, ASVs whose length is not a multiple of 3 were omitted.
+",filter_codons_fasta_passed," ASVs passed the filtering.
+
+Codon usage filter results can be found in folder [codon_filter](../codon_filter).
+"))
+```
+
+```{r, eval = !isFALSE(params$filter_codons_stats), results='asis'}
+# import stats tsv
+filter_codons_stats <- read.table(file = params$filter_codons_stats, header = TRUE, sep = "\t")
+
+cat("The following table shows read counts for each sample after filtering:")
+
+# Display table
+datatable(filter_codons_stats, options = list(
+    scrollX = TRUE,
+    scrollY = "300px",
+    paging = FALSE))
+```
+
+
+
+```{r, results='asis'}
+# Check if any taxonomic classification is available
+any_taxonomy <- !isFALSE(params$dada2_taxonomy) || !isFALSE(params$kraken2_taxonomy) || !isFALSE(params$qiime2_taxonomy) || !isFALSE(params$sintax_taxonomy) || !isFALSE(params$pplace_taxonomy)
+```
+
+```{r, eval = any_taxonomy, results='asis'}
+# Header if any taxonomic classification is available
+cat("# Taxonomic Classification\n")
+```
+
+
+
+```{r, eval = !isFALSE(params$cut_its), results='asis'}
+cat(paste0("
+## ITS regions
+
+The ",params$cut_its," region was extracted from each ASV sequence using [ITSx](https://besjournals.onlinelibrary.wiley.com/doi/10.1111/2041-210X.12073). 
+Taxonomic classification should have improved performance based on extracted ITS sequence. ITSx results can be found in folder [itsx](../itsx). + +Taxonomies per extracted region was then transferred back to the full ASV sequence. No filtering was done based on whether the region was found or not. +Those taxonomic classifications per ASV can be found in files `ASV_tax.tsv` and `ASV_tax_species.tsv` in folder [dada2/](../dada2/). + +However, the files `ASV_ITS_tax.tsv` and `ASV_ITS_tax_species.tsv` in folder [dada2/](../dada2/) contain only the chosen ITS part of just the ASVs where the region was found. +Of course, different ASVs may contain identical ",params$cut_its," regions, leading to identical taxonomy assignments, +but the full ASVs were recorded as separate entries anyway to retain maximum resolution at this stage. +")) + +# Read ITSX summary +itsx_summary <- readLines(params$itsx_cutasv_summary) + +origins = FALSE +itsx_origins <- data.frame(origin=character(), count=numeric(), stringsAsFactors=FALSE) +for (line in itsx_summary){ + # get basic statistic + if (grepl("Number of sequences in input file:", line)) { + itsx_summary_nasv <- as.numeric( sub("Number of sequences in input file: *\t*", "", line) ) + } + if (grepl("Sequences detected as ITS by ITSx:", line)) { + itsx_summary_its <- as.numeric( sub("Sequences detected as ITS by ITSx: *\t*", "", line) ) + } + # get preliminar origins + if (grepl("----------------------------", line)) { + origins = FALSE + } + if (isTRUE(origins)) { + add <- data.frame(origin=sub(":.*", "", line), count=as.numeric( sub(".*: *\t*", "", line) ) ) + itsx_origins <- rbind(itsx_origins, add) + } + if (grepl("ITS sequences by preliminary origin:", line)) { + origins = TRUE + } +} +itsx_origins$percent <- round( itsx_origins$count / itsx_summary_nasv * 100, 2) + +cat(itsx_summary_its, "of",itsx_summary_nasv,"(",round( itsx_summary_its/itsx_summary_nasv*100 ,2),"%) ASVs were identified as ITS.", + "The following plot shows ITS sequences by preliminary origin:") + +plot_itsx_origins <- ggplot(itsx_origins, + aes(x = origin, y = percent)) + + geom_bar(stat = "identity", fill = rgb(0.1, 0.4, 0.75), width = 0.5) + + ylab("%") + + xlab("ITS sequences by preliminary origin") + + coord_flip() + + theme_bw() +plot_itsx_origins + +svg("itsx_preliminary_origin.svg") +plot_itsx_origins +invisible(dev.off()) +``` + + + +```{r, eval = !isFALSE(params$dada2_taxonomy), results='asis'} +cat("## DADA2\n") + +# indicate reference taxonomy +if ( !isFALSE(params$dada2_ref_tax_title) ) { + cat("The taxonomic classification was performed by [DADA2](https://pubmed.ncbi.nlm.nih.gov/27214047/) + using the database: `", params$dada2_ref_tax_title, "`. 
+ More details about the reference taxonomy database can be found in the ['Methods section'](#methods).\n\n", sep = "") +} else { + cat("The taxonomic classification was performed by DADA2 using a custom database ", + "provided by the user.\n\n", sep = "") +} + +# mention if taxonomy was cut by cutadapt +if ( !isFALSE(params$cut_dada_ref_taxonomy) ) { + cut_dada_ref_taxonomy <- readLines(params$cut_dada_ref_taxonomy) + for (line in cut_dada_ref_taxonomy){ + if (grepl("Total reads processed:", line)) { + cut_dada_ref_taxonomy_orig <- sub("Total reads processed: *\t*", "", line) + } + if (grepl("Reads written \\(passing filters\\):", line)) { + cut_dada_ref_taxonomy_filt <- sub("Reads written .passing filters.: *\t*", "", line) + } + if (grepl("Total basepairs processed:", line)) { + cut_dada_ref_taxonomy_orig_bp <- sub("Total basepairs processed: *\t*", "", line) + } + if (grepl("Total written \\(filtered\\):", line)) { + cut_dada_ref_taxonomy_filt_bp <- sub("Total written \\(filtered\\): *\t*", "", line) + } + } + + cat("The taxonomic reference database was cut by primer sequences to improve matching. + The original database had ",cut_dada_ref_taxonomy_orig," sequences with ",cut_dada_ref_taxonomy_orig_bp, + ", retained were ",cut_dada_ref_taxonomy_filt," sequences that represented ",cut_dada_ref_taxonomy_filt_bp,".\n\n", + sep = "") +} + +# make statistics of taxonomic classification +asv_tax <- read.table(params$dada2_taxonomy, header = TRUE, sep = "\t") + +# Calculate the classified numbers/percent of asv +level <- subset(asv_tax, select = -c(ASV_ID,confidence,sequence)) +level <- colnames(level) + +n_asv_tax = nrow(asv_tax) + +asv_tax_subset <- subset(asv_tax, select = level) +n_asv_classified <- colSums( asv_tax_subset != "" & !is.na(asv_tax_subset) ) + +n_asv_unclassified <- n_asv_tax - n_asv_classified +p_asv_classified <- round(n_asv_classified / n_asv_tax * 100, 2) + +asv_classi_df <- data.frame(level, n_asv_classified, p_asv_classified) + +# Build output string +outputstr <- "DADA2 classified " +for (row in seq_len(nrow(asv_classi_df))) { + outputstr <- paste0(outputstr, asv_classi_df[row, ]$p_asv_classified, + " % ASVs at ", asv_classi_df[row, ]$level, " level, ") +} +outputstr <- substr(outputstr, 1, nchar(outputstr)-2) +outputstr <- paste0(outputstr, ".\n\n") + +# Output Text Classifications +cat(outputstr) + +# Barplot +# Plot +asv_classi_df$level <- factor(asv_classi_df$level, levels = asv_classi_df$level) +plot_asv_classi_df <- ggplot(asv_classi_df, + aes(x = reorder(level, desc(level)), y = p_asv_classified)) + + geom_bar(stat = "identity", fill = rgb(0.1, 0.4, 0.75), width = 0.5) + + ylab("% Classification") + + xlab("Levels") + + coord_flip() + + theme_bw() +plot_asv_classi_df + +svg("dada2_taxonomic_classification_per_taxonomy_level.svg") +plot_asv_classi_df +invisible(dev.off()) + +cat("\n\nDADA2 taxonomy assignments can be found in folder [dada2](../dada2) in files `ASV_tax_*.tsv`.") +``` + + + +```{r, eval = !isFALSE(params$qiime2_taxonomy), results='asis'} +# Header +cat("## QIIME2\n") + +cat("The taxonomic classification was performed by [QIIME2](https://www.nature.com/articles/s41587-019-0209-9) + using the database: `", params$qiime2_ref_tax_title, "`. 
+ More details about the reference taxonomy database can be found in the ['Methods section'](#methods).\n\n", sep = "") + +# Read file and prepare table +asv_tax <- read.table(params$qiime2_taxonomy, header = TRUE, sep = "\t") +#asv_tax <- data.frame(do.call('rbind', strsplit(as.character(asv_tax$Taxon),'; ',fixed=TRUE))) +asv_tax <- subset(asv_tax, select = Taxon) + +# Remove greengenes85 ".__" placeholders +df = as.data.frame(lapply(asv_tax, function(x) gsub(".__", "", x))) +# remove all last, empty ; +df = as.data.frame(lapply(df, function(x) gsub(" ;","",x))) +# remove last remaining, empty ; +df = as.data.frame(lapply(df, function(x) gsub("; $","",x))) + +# get maximum amount of taxa levels per ASV +max_taxa <- lengths(regmatches(df$Taxon, gregexpr("; ", df$Taxon)))+1 + +# Currently, all QIIME2 databases seem to have the same levels! +level <- c("Kingdom","Phylum","Class","Order","Family","Genus","Species") + +# Calculate the classified numbers/percent of asv +n_asv_tax = nrow(asv_tax) + +n_asv_classified <- length(which(max_taxa>=1)) +for (x in 2:length(level)) { + n_asv_classified <- c(n_asv_classified, length(which(max_taxa>=x)) ) +} +p_asv_classified <- round(n_asv_classified / n_asv_tax * 100, 2) + +asv_classi_df <- data.frame(level, n_asv_classified, p_asv_classified) + +# Build output string +outputstr <- "QIIME2 classified " +for (row in seq_len(nrow(asv_classi_df))) { + outputstr <- paste0(outputstr, asv_classi_df[row, ]$p_asv_classified, + " % ASVs at ", asv_classi_df[row, ]$level, " level, ") +} +outputstr <- substr(outputstr, 1, nchar(outputstr)-2) +outputstr <- paste0(outputstr, ".\n\n") + +# Output Text Classifications +cat(outputstr) + +# Barplot +# Plot +asv_classi_df$level <- factor(asv_classi_df$level, levels = asv_classi_df$level) +plot_asv_classi_df <- ggplot(asv_classi_df, + aes(x = reorder(level, desc(level)), y = p_asv_classified)) + + geom_bar(stat = "identity", fill = rgb(0.1, 0.4, 0.75), width = 0.5) + + ylab("% Classification") + + xlab("Levels") + + coord_flip() + + theme_bw() +plot_asv_classi_df + +svg("qiime2_taxonomic_classification_per_taxonomy_level.svg") +plot_asv_classi_df +invisible(dev.off()) + +cat("\n\nQIIME2 taxonomy assignments can be found in folder [qiime2/taxonomy](../qiime2/taxonomy).") +``` + + + +```{r, eval = !isFALSE(params$sintax_taxonomy), results='asis'} +# Header +cat("## SINTAX\n") + +cat("The taxonomic classification was performed by [SINTAX](https://doi.org/10.1101/074161) + using the database: `", params$sintax_ref_tax_title, "`. 
+ More details about the reference taxonomy database can be found in the ['Methods section'](#methods).\n\n", sep = "") + +asv_tax <- read.table(params$sintax_taxonomy, header = TRUE, sep = "\t") + +# Calculate the classified numbers/percent of asv +level <- subset(asv_tax, select = -c(ASV_ID,confidence,sequence)) +level <- colnames(level) + +n_asv_tax = nrow(asv_tax) + +asv_tax_subset <- subset(asv_tax, select = level) +n_asv_classified <- colSums( asv_tax_subset != "" & !is.na(asv_tax_subset) ) + +n_asv_unclassified <- n_asv_tax - n_asv_classified +p_asv_classified <- round(n_asv_classified / n_asv_tax * 100, 2) + +asv_classi_df <- data.frame(level, n_asv_classified, p_asv_classified) + +# Build output string +outputstr <- "SINTAX classified " +for (row in seq_len(nrow(asv_classi_df))) { + outputstr <- paste0(outputstr, asv_classi_df[row, ]$p_asv_classified, + " % ASVs at ", asv_classi_df[row, ]$level, " level, ") +} +outputstr <- substr(outputstr, 1, nchar(outputstr)-2) +outputstr <- paste0(outputstr, ".\n\n") + +# Output Text Classifications +cat(outputstr) + +# Barplot +# Plot +asv_classi_df$level <- factor(asv_classi_df$level, levels = asv_classi_df$level) +plot_asv_classi_df <- ggplot(asv_classi_df, + aes(x = reorder(level, desc(level)), y = p_asv_classified)) + + geom_bar(stat = "identity", fill = rgb(0.1, 0.4, 0.75), width = 0.5) + + ylab("% Classification") + + xlab("Levels") + + coord_flip() + + theme_bw() +plot_asv_classi_df + +svg("sintax_taxonomic_classification_per_taxonomy_level.svg") +plot_asv_classi_df +invisible(dev.off()) + +cat("\n\nSINTAX taxonomy assignments can be found in folder [sintax](../sintax).") +``` + + + +```{r, eval = !isFALSE(params$kraken2_taxonomy), results='asis'} +cat("## Kraken2\n") + +# indicate reference taxonomy +if ( !isFALSE(params$kraken2_ref_tax_title) ) { + cat("The taxonomic classification was performed by [Kraken2](https://doi.org/10.1186/s13059-019-1891-0) + using the database: `", params$kraken2_ref_tax_title, "`. 
+ More details about the reference taxonomy database can be found in the ['Methods section'](#methods).\n\n", sep = "") +} else { + cat("The taxonomic classification was performed by [Kraken2](https://doi.org/10.1186/s13059-019-1891-0) using a custom database provided by the user.\n\n", sep = "") +} + +if ( params$kraken2_confidence != "0" ) { + cat("A confidence score threshold of",params$kraken2_confidence,"was applied for taxonomic classifications.\n\n") +} + +asv_tax <- read.table(params$kraken2_taxonomy, header = TRUE, sep = "\t") + +# Calculate the classified numbers/percent of asv +level <- subset(asv_tax, select = -c(ASV_ID,lowest_match)) +level <- colnames(level) + +n_asv_tax = nrow(asv_tax) + +asv_tax_subset <- subset(asv_tax, select = level) +n_asv_classified <- colSums( asv_tax_subset != "" & !is.na(asv_tax_subset) ) + +n_asv_unclassified <- n_asv_tax - n_asv_classified +p_asv_classified <- round(n_asv_classified / n_asv_tax * 100, 2) + +asv_classi_df <- data.frame(level, n_asv_classified, p_asv_classified) + +# Build output string +outputstr <- "Kraken2 classified " +for (row in seq_len(nrow(asv_classi_df))) { + outputstr <- paste0(outputstr, asv_classi_df[row, ]$p_asv_classified, + " % ASVs at ", asv_classi_df[row, ]$level, " level, ") +} +outputstr <- substr(outputstr, 1, nchar(outputstr)-2) +outputstr <- paste0(outputstr, ".\n\n") + +# Output Text Classifications +cat(outputstr) + +# Barplot +asv_classi_df$level <- factor(asv_classi_df$level, levels = asv_classi_df$level) +plot_asv_classi_df <- ggplot(asv_classi_df, + aes(x = reorder(level, desc(level)), y = p_asv_classified)) + + geom_bar(stat = "identity", fill = rgb(0.1, 0.4, 0.75), width = 0.5) + + ylab("% Classification") + + xlab("Levels") + + coord_flip() + + theme_bw() +plot_asv_classi_df + +svg("kraken2_taxonomic_classification_per_taxonomy_level.svg") +plot_asv_classi_df +invisible(dev.off()) + +cat("\n\nKraken2 taxonomy assignments can be found in folder [kraken2](../kraken2).") +``` + + + +```{r, eval = !isFALSE(params$pplace_taxonomy), results='asis'} +cat(paste0(" +## Phylogenetic Placement + +Phylogenetic placement grafts sequences onto a phylogenetic reference tree and optionally outputs taxonomic annotations. +The reference tree is ideally made from full-length high-quality sequences containing better evolutionary signal than short amplicons. +It is hence superior to estimating de-novo phylogenetic trees from short amplicon sequences. +Extraction of taxonomic classification was performed with [EPA-NG](https://github.com/Pbdas/epa-ng) and [Gappa](https://pubmed.ncbi.nlm.nih.gov/32016344/). 
+")) + +# Read file and prepare table +asv_tax <- read.table(params$pplace_taxonomy, header = TRUE, sep = "\t") + +# get maximum amount of taxa levels per ASV +max_taxa <- lengths(regmatches(asv_tax$taxonomy, gregexpr(";", asv_tax$taxonomy)))+1 + +# labels for levels +level <- rep(1:max(max_taxa)) + +# Calculate the classified numbers/percent of asv +n_asv_tax = nrow(asv_tax) + +n_asv_classified <- length(which(max_taxa>=1)) +for (x in 2:length(level)) { + n_asv_classified <- c(n_asv_classified, length(which(max_taxa>=x)) ) +} +p_asv_classified <- round(n_asv_classified / n_asv_tax * 100, 2) + +asv_classi_df <- data.frame(level, n_asv_classified, p_asv_classified) + +# Build output string +outputstr <- "Phylogenetic Placement classified " +for (row in seq_len(nrow(asv_classi_df))) { + outputstr <- paste0(outputstr, asv_classi_df[row, ]$p_asv_classified, + " % ASVs at taxonomic level ", asv_classi_df[row, ]$level, ", ") +} +outputstr <- substr(outputstr, 1, nchar(outputstr)-2) +outputstr <- paste0(outputstr, ".\n\n") + +# Output Text Classifications +cat(outputstr) + +# Barplot +# Plot +asv_classi_df$level <- factor(asv_classi_df$level, levels = asv_classi_df$level) +plot_asv_classi_df <- ggplot(asv_classi_df, + aes(x = reorder(level, desc(level)), y = p_asv_classified)) + + geom_bar(stat = "identity", fill = rgb(0.1, 0.4, 0.75), width = 0.5) + + ylab("% Classification") + + xlab("Taxonomic levels") + + coord_flip() + + theme_bw() +plot_asv_classi_df + +svg("phylogenetic_placement_taxonomic_classification_per_taxonomy_level.svg") +plot_asv_classi_df +invisible(dev.off()) + +cat("\n\nHeattree of the phylogenetic placement:") +``` + +```{r, eval = !isFALSE(params$pplace_taxonomy), out.width="100%", fig.show='hold', fig.align='default'} +knitr::include_graphics(c(params$pplace_heattree)) +``` + +```{r, eval = !isFALSE(params$pplace_taxonomy), results='asis'} +cat("\n\nPhylogenetic placement taxonomy assignments can be found in folder [pplace](../pplace) in file `*.taxonomy.per_query_unique.tsv`.") +``` + + + +```{r, eval = !isFALSE(params$val_used_taxonomy), results='asis'} +# Header +cat("# Downstream analysis with QIIME2\n", + "Files that were input to [QIIME2](https://www.nature.com/articles/s41587-019-0209-9) can be found in folder [qiime2/input/](../qiime2/input/).", + "Results of taxonomic classification of",params$val_used_taxonomy,"was used in all following analysis, see in the above sections.") +``` + + + +```{r, eval = !isFALSE(params$filter_stats_tsv), results='asis'} +cat(paste0(" +## ASV filtering + +Unwanted taxa are often off-targets generated in PCR with primers that are not perfectly specific for the target DNA. +For 16S rRNA sequencing mitrochondria and chloroplast sequences are typically removed because these are frequent unwanted non-bacteria PCR products. +")) + +if ( params$exclude_taxa != "none" ) { + cat("ASVs were removed when the taxonomic string contained any of `", params$exclude_taxa, "` (comma separated)", sep="") +} +if ( params$min_frequency != 1 ) { + cat(", had fewer than", params$min_frequency ,"total read counts over all samples") +} +if ( params$min_samples != 1 ) { + cat(", or that were present in fewer than", params$min_samples ,"samples") +} +cat(". 
") + +qiime2_filtertaxa <- unlist( strsplit( params$qiime2_filtertaxa, "," ) ) +qiime2_filtertaxa_orig <- as.numeric( qiime2_filtertaxa[1] ) -1 +qiime2_filtertaxa_filt <- as.numeric( qiime2_filtertaxa[2] ) -2 +qiime2_filtertaxa_rm <- qiime2_filtertaxa_orig-qiime2_filtertaxa_filt +qiime2_filtertaxa_rm_percent <- round( qiime2_filtertaxa_rm/qiime2_filtertaxa_orig*100 ,2) + +cat("Consequently,",qiime2_filtertaxa_orig,"ASVs were reduced by",qiime2_filtertaxa_rm,"(",qiime2_filtertaxa_rm_percent,"%) to",qiime2_filtertaxa_filt,".", + "The following table shows read counts for each sample before and after filtering:") + +# import stats tsv +filter_stats_tsv <- read.table(file = params$filter_stats_tsv, header = TRUE, sep = "\t") +colnames(filter_stats_tsv) <- gsub("_tax_filter","",colnames(filter_stats_tsv)) +filter_stats_tsv$retained_percent <- round( filter_stats_tsv$retained_percent, 2) +filter_stats_tsv$lost_percent <- round( filter_stats_tsv$lost_percent, 2) +colnames(filter_stats_tsv) <- gsub("_percent","%",colnames(filter_stats_tsv)) + +# Display table +datatable(filter_stats_tsv, options = list( + scrollX = TRUE, + scrollY = "300px", + paging = FALSE)) + +cat("\n\nTables with read count numbers and filtered abundance tables are in folder [qiime2/abundance_tables](../qiime2/abundance_tables).") +``` + + + +```{r, eval = !isFALSE(params$abundance_tables), results='asis'} +cat(paste0(" +## Abundance tables + +The abundance tables are the final data for further downstream analysis and visualisations. +The tables are based on the computed ASVs and taxonomic classification, but after removal of unwanted taxa. +Folder [qiime2/abundance_tables](../qiime2/abundance_tables) contains tap-separated files (.tsv) +that can be opened by any spreadsheet software. + +## Relative abundance tables + +Absolute abundance tables produced by the previous steps contain count data, but the compositional +nature of 16S rRNA amplicon sequencing requires sequencing depth normalisation. This step computes +relative abundance tables using TSS (Total Sum Scaling normalisation) for various taxonomic levels +and detailed tables for all ASVs with taxonomic classification, sequence and relative abundance for +each sample. Typically used for in depth investigation of taxa abundances. +Folder [qiime2/rel_abundance_tables](../qiime2/rel_abundance_tables) contains tap-separated files (.tsv) +that can be opened by any spreadsheet software. +")) +``` + + + +```{r, eval = !isFALSE(params$barplot), results='asis'} +cat(paste0(" +## Barplot + +Interactive abundance plot that aids exploratory browsing the discovered taxa and their abundance +in samples and allows sorting for associated meta data. Folder [qiime2/barplot](../qiime2/barplot) +contains barplots, click [qiime2/barplot/index.html](../qiime2/barplot/index.html) to open it in +your web browser. 
+")) +``` + +```{r, eval = !isFALSE(params$metadata_category_barplot), results='asis'} +cat(paste0(" +Additionally, barplots with average relative abundance values were produced +for `",params$metadata_category_barplot,"` (comma separated if several) in [qiime2/barplot_average](../qiime2/barplot_average) +in separate folders following the scheme `barplot_{treatment}`: +")) + +metadata_category_barplot <- sort( unlist( strsplit( params$metadata_category_barplot,"," ) ) ) +for (category in metadata_category_barplot) { + barplot_folder_path <- paste0("qiime2/barplot_average/barplot_",category) + cat("\n- [",barplot_folder_path,"/index.html](../",barplot_folder_path,"/index.html)\n", sep="") +} +``` + + + +```{r, eval = !isFALSE(params$alpha_rarefaction), results='asis'} +cat(paste0(" +## Alpha diversity rarefaction curves + +Produces rarefaction plots for several alpha diversity indices, and is primarily used to determine if the +richness of the samples has been fully observed or sequenced. If the slope of the curves does not level +out and the lines do not become horizontal, this might be because the sequencing depth was too low to observe +all diversity or that sequencing error artificially increases sequence diversity and causes false discoveries. + +Folder [qiime2/alpha-rarefaction](../qiime2/alpha-rarefaction) contains the data, click +[qiime2/alpha-rarefaction/index.html](../qiime2/alpha-rarefaction/index.html) to open it in your web browser. +")) +``` + + + +```{r, eval = !isFALSE(params$diversity_indices_alpha) || !isFALSE(params$diversity_indices_beta), results='asis'} +diversity_indices_depth <- readLines(params$diversity_indices_depth) + +cat(paste0(" +## Diversity analysis + +Diversity measures summarize important sample features (alpha diversity) or differences between samples (beta diversity). +Diversity calculations are based on sub-sampled data rarefied to ",diversity_indices_depth, " counts. +")) +``` + +```{r, eval = !isFALSE(params$diversity_indices_alpha), results='asis'} +cat(paste0(" +### Alpha diversity indices + +Alpha diversity measures the species diversity within samples. +")) + +if ( params$dada_sample_inference == "independent") { + cat("Please note that ASVs were inferred for each sample independently, that can make alpha diversity indices a poor estimate of true diversity. ") +} + +cat(paste0(" +This step calculates alpha diversity using various methods and performs pairwise comparisons of groups of samples. It is based on a phylogenetic tree of all ASV sequences. 
+Folder [qiime2/diversity/alpha_diversity](../qiime2/diversity/alpha_diversity) contains the alpha-diversity data: + +- Shannon’s diversity index (quantitative): [qiime2/diversity/alpha_diversity/shannon_vector/index.html](../qiime2/diversity/alpha_diversity/shannon_vector/index.html) +- Pielou’s Evenness: [qiime2/diversity/alpha_diversity/evenness_vector/index.html](../qiime2/diversity/alpha_diversity/evenness_vector/index.html) +- Faith’s Phylogenetic Diversity (qualitiative, phylogenetic) [qiime2/diversity/alpha_diversity/faith_pd_vector/index.html](../qiime2/diversity/alpha_diversity/faith_pd_vector/index.html) +- Observed OTUs (qualitative): [qiime2/diversity/alpha_diversity/observed_features_vector/index.html](../qiime2/diversity/alpha_diversity/observed_features_vector/index.html) +")) +``` + +```{r, eval = !isFALSE(params$diversity_indices_alpha) && !isFALSE(params$abundance_tables), results='asis'} +count <- read.table("./abundance_tables/feature-table.tsv", sep = '\t', header = TRUE, na.strings = c("NA", "-", "?"), comment="", skip=1) +count <- as.data.frame( colSums(count[2:ncol(count)]) ) +colnames(count)[1] <- "count" + +res_alphadiversity <- data.frame( + diversity=character(), + rho=integer(), + p=integer(), + stringsAsFactors=FALSE) +plots_alphadiversity_counts_spearman <- c() +for (alphadiversity_folder in c("shannon_vector","evenness_vector", "faith_pd_vector", "observed_features_vector")) { + alpha <- read.table(paste0("./alpha_diversity/",alphadiversity_folder,"/metadata.tsv"), sep = '\t', header = TRUE, na.strings = c("NA", "-", "?")) + df <- merge(alpha,count, by.x='id', by.y='row.names') + + df_subset <- df[,(ncol(df)-1):ncol(df)] + colnames(df_subset) <- c("alpha","count") + + spearman <- cor.test(df_subset$alpha, df_subset$count, method="spearman") + pearson <- cor.test(df_subset$alpha, df_subset$count, method="pearson") + + plot_alpha_count_spearman <- ggplot(df_subset, aes(x=count, y=alpha)) + + geom_point() + + ggtitle(paste0("Spearman's rank correlation\ncoefficient rho ",round(spearman$estimate,2)," and p=",round(spearman$p.value, 3))) + + xlab("Total counts per sample") + + ylab(paste0(alphadiversity_folder," alpha diversity")) + + geom_smooth(method=lm) + + theme_bw() + + outfile <- paste0("./",alphadiversity_folder,"_spearman.svg") + svg(outfile, height = 3.6, width = 3.6) + plot(plot_alpha_count_spearman) + invisible(dev.off()) + + plots_alphadiversity_counts_spearman <- c(plots_alphadiversity_counts_spearman,outfile) + res_alphadiversity <- rbind(res_alphadiversity, + data.frame( + diversity=alphadiversity_folder, + rho=spearman$estimate, + p=spearman$p.value, + stringsAsFactors=FALSE)) +} +sign_alphadiversity <- res_alphadiversity[res_alphadiversity$rho > 0 & res_alphadiversity$p < 0.05,] + +cat(paste0(" +Alpha diversity is considered not trustworthy when it correlates positively with sequencing depth. +Spearman's rank correlation was calculated for total counts per sample after all filtering steps +(in folder [qiime2/abundance_tables](../qiime2/abundance_tables)) with alpha diversity measures. 
+"))
+if ( nrow(sign_alphadiversity) > 0 ) {
+    cat(paste0("Significant positive correlation was found for ", paste(sign_alphadiversity$diversity, collapse=', ' ),":\n\n"))
+} else {
+    cat("No significant positive correlation was found between alpha diversity and sample counts:\n\n")
+}
+```
+
+```{r, eval = !isFALSE(params$diversity_indices_alpha) && !isFALSE(params$abundance_tables), out.width="25%", fig.show='hold', fig.align='default'}
+knitr::include_graphics(plots_alphadiversity_counts_spearman)
+```
+
+```{r, eval = !isFALSE(params$diversity_indices_alpha) && !isFALSE(params$abundance_tables), results='asis'}
+cat("Scatter plots with a linear regression line (blue) and its 95% confidence interval (gray shaded area).")
+```
+
+```{r, eval = !isFALSE(params$diversity_indices_beta), results='asis'}
+cat(paste0("
+### Beta diversity indices
+
+Beta diversity measures the species community differences between samples. This step calculates beta diversity distances using
+various methods and performs pairwise comparisons of groups of samples. Additionally, principal coordinate analysis (PCoA)
+plots are produced that can be visualized with Emperor in your default browser without the need for installation.
+These calculations are based on a phylogenetic tree of all ASV sequences.
+Folder [qiime2/diversity/beta_diversity](../qiime2/diversity/beta_diversity) contains the beta diversity data:
+
+#### PCoA plots for four different beta diversity distances are accessible via:
+
+- Bray-Curtis distance (quantitative): [qiime2/diversity/beta_diversity/bray_curtis_pcoa_results-PCoA/index.html](../qiime2/diversity/beta_diversity/bray_curtis_pcoa_results-PCoA/index.html)
+- Jaccard distance (qualitative): [qiime2/diversity/beta_diversity/jaccard_pcoa_results-PCoA/index.html](../qiime2/diversity/beta_diversity/jaccard_pcoa_results-PCoA/index.html)
+- unweighted UniFrac distance (qualitative, phylogenetic): [qiime2/diversity/beta_diversity/unweighted_unifrac_pcoa_results-PCoA/index.html](../qiime2/diversity/beta_diversity/unweighted_unifrac_pcoa_results-PCoA/index.html)
+- weighted UniFrac distance (quantitative, phylogenetic): [qiime2/diversity/beta_diversity/weighted_unifrac_pcoa_results-PCoA/index.html](../qiime2/diversity/beta_diversity/weighted_unifrac_pcoa_results-PCoA/index.html)
+
+#### Pairwise comparisons between groups of samples
+
+Statistics on differences between specific metadata groups can be found in folder
+[qiime2/diversity/beta_diversity/](../qiime2/diversity/beta_diversity/). Each significance test
+result is in its own folder following the scheme `{method}_distance_matrix-{treatment}`:
+"))
+
+diversity_indices_beta <- sort( unlist( strsplit( params$diversity_indices_beta,"," ) ) )
+for (folder in diversity_indices_beta) {
+    beta_folder_path <- paste0("qiime2/diversity/",folder) #"beta_diversity/" is defined in input section with "stageAs: 'beta_diversity/*'"
+    cat("\n- [",beta_folder_path,"/index.html](../",beta_folder_path,"/index.html)\n", sep="")
+}
+```
+
+```{r, eval = !isFALSE(params$qiime_adonis_formula), results='asis'}
+cat(paste0("
+#### ADONIS test
+
+Permutational multivariate analysis of variance using distance matrices
+[adonis](https://doi.org/10.1111/j.1442-9993.2001.01070.pp.x) (in [VEGAN](https://CRAN.R-project.org/package=vegan))
+determines whether groups of samples are significantly different from one another.
+The formula was `",params$qiime_adonis_formula,"` (multiple formulas are comma separated). 
+adonis computes an R2 value (effect size), which shows the percentage of variation explained
+by a condition, as well as a p-value to determine the statistical significance.
+The order of conditions in the formula matters: the variance of factors is removed
+(statistically controlled for) from the beginning to the end of the formula.
+
+Test results are in separate folders following the scheme `{method}_distance_matrix-{adonis formula}`:
+"))
+
+diversity_indices_adonis <- sort( unlist( strsplit( params$diversity_indices_adonis,"," ) ) )
+for (folder in diversity_indices_adonis) {
+    adonis_index_path <- paste0("qiime2/diversity/",folder) #"beta_diversity/" is defined in input section with "stageAs: 'beta_diversity/adonis/*'"
+    cat("\n- [",adonis_index_path,"/index.html](../",adonis_index_path,"/index.html)\n", sep="")
+}
+```
+
+
+
+```{r, eval = !isFALSE(params$ancom), results='asis'}
+cat(paste0("
+## ANCOM
+
+[Analysis of Composition of Microbiomes (ANCOM)](https://www.ncbi.nlm.nih.gov/pubmed/26028277)
+is applied to identify features that are differentially
+abundant across sample groups. A key assumption made by ANCOM is that few taxa (less than about 25%)
+will be differentially abundant between groups, otherwise the method will be inaccurate.
+Comparisons between groups of samples are performed for specific metadata; results can be found in folder
+[qiime2/ancom/](../qiime2/ancom/).
+
+Test results are in separate folders following the scheme `Category-{treatment}-{taxonomic level}`:
+"))
+
+ancom <- sort( unlist( strsplit( params$ancom,"," ) ) )
+for (folder in ancom) {
+    ancom_path <- paste0("qiime2/ancom/",folder)
+    cat("\n- [",ancom_path,"/index.html](../",ancom_path,"/index.html)\n", sep="")
+}
+```
+
+
+
+```{r, eval = !isFALSE(params$picrust_pathways), results='asis'}
+cat(paste0("
+# PICRUSt2
+
+[PICRUSt2](https://pubmed.ncbi.nlm.nih.gov/32483366/) (Phylogenetic Investigation of Communities by Reconstruction of Unobserved States)
+is software for predicting functional abundances based only on marker gene sequences.
+Enzyme Classification numbers (EC), KEGG orthologs (KO) and MetaCyc ontology predictions were made for each sample.
+Folder [PICRUSt2/](../PICRUSt2/) contains the predicted quantifications for Enzyme Classification numbers (EC), see
+`EC_pred_metagenome_unstrat_descrip.tsv`, KEGG orthologs (KO), see `KO_pred_metagenome_unstrat_descrip.tsv`, and MetaCyc ontology,
+see `METACYC_path_abun_unstrat_descrip.tsv`. Quantifications are not yet normalized; they can be normalized, e.g., by the total sum per sample.
+"))
+```
+
+
+
+```{r, eval = !isFALSE(params$sbdi), results='asis'}
+cat(paste0("
+# SBDI
+
+The [Swedish Biodiversity Infrastructure (SBDI)](https://biodiversitydata.se/) provides a cost-effective, cutting-edge
+infrastructure that supports Swedish and international biodiversity and ecosystems research.
+Files in preparation for submission to SBDI can be found in folder [SBDI](../SBDI/).
+Tables are generated from the DADA2 denoising and taxonomy assignment steps.
+Each table, except `annotation.tsv`, corresponds to one tab in the [SBDI submission template](https://asv-portal.biodiversitydata.se/submit).
+Most of the fields in the template will not be populated,
+but if you run nf-core/ampliseq with a sample metadata table (`--metadata`), any fields corresponding to a field in the template will be used. 
+")) +``` + + + +```{r, eval = !isFALSE(params$phyloseq), results='asis'} +cat(paste0(" +# Phyloseq + +[Phyloseq](https://doi.org/10.1371/journal.pone.0061217) +is a popular R package to analyse and visualize microbiom data. +The produced RDS files contain phyloseq objects and can be loaded directely into R and phyloseq. +The objects contain an ASV abundance table and a taxonomy table. +If available, metadata and phylogenetic tree will also be included in the phyloseq object. +The files can be found in folder [phyloseq](../phyloseq/). +")) +``` + + + +# Methods + + + +## Proposed methods section + +Data was processed using nf-core/ampliseq `r ampliseq_version` (doi: [10.5281/zenodo.1493841](https://zenodo.org/badge/latestdoi/150448201)) +([Straub et al., 2020](https://doi.org/10.3389/fmicb.2020.550420)) of the nf-core collection of workflows +([Ewels et al., 2020](https://dx.doi.org/10.1038/s41587-020-0439-x)), utilising reproducible software environments +from the Bioconda ([Grüning et al., 2018](https://pubmed.ncbi.nlm.nih.gov/29967506/)) and Biocontainers +([da Veiga Leprevost et al., 2017](https://pubmed.ncbi.nlm.nih.gov/28379341/)) projects. + +```{r, eval = !isFALSE(params$mqc_plot), results='asis'} +cat(paste0(" +Data quality was evaluated with FastQC ([Andrews, 2010](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) +and summarized with MultiQC ([Ewels et al., 2016](https://pubmed.ncbi.nlm.nih.gov/27312411/)). +")) +``` +```{r, eval = !isFALSE(params$cutadapt_summary), results='asis'} +cutadapt_intro = "Cutadapt ([Marcel et al., 2011](https://doi.org/10.14806/ej.17.1.200)) trimmed primers" +if ( isFALSE(params$flag_retain_untrimmed) ) { + cat(paste0(cutadapt_intro,cutadapt_text_ch)) +} else { + cat(paste0(cutadapt_intro, ".")) +} +``` +```{r, eval = !isFALSE(params$dada_filtntrim_args), results='asis'} +if ( !isFALSE(params$cutadapt_summary) && isFALSE(params$flag_retain_untrimmed) ) { + cat("Adapter and primer-free sequences were processed ") +} else { + cat("Sequences were processed ") +} + +if ( params$dada_sample_inference == "independent" ) { + cat("sample-wise (independent) ") +} else if ( params$dada_sample_inference == "pooled" ) { + cat("as one pool (pooled)") +} else { + cat("initally independently, but re-examined as one pool (pseudo-pooled) ") +} + +cat("with DADA2 ([Callahan et al., 2016](https://pubmed.ncbi.nlm.nih.gov/27214047/)) to eliminate PhiX contamination, ") + +if (params$trunc_qmin) { + cat("trim reads (before median quality drops below ", params$trunc_qmin, + " and at least ",params$trunc_rmin*100, "% of reads are retained; ", + "forward reads at ", tr_len_f, " bp and reverse reads at ", tr_len_r, + " bp, reads shorter than this were discarded), ", sep = "") +} else if (params$trunclenf == "null" && params$trunclenr == "null") { + cat("") +} else if (params$trunclenf != 0 && params$trunclenr != 0) { + cat("trim reads (forward reads at ", params$trunclenf, + " bp and reverse reads at ", params$trunclenr, + " bp, reads shorter than this were discarded), ", sep = "") +} else if (params$trunclenf != 0) { + cat("trim reads (forward reads at ", params$trunclenf," bp, reads shorter than this were discarded), ", sep = "") +} else if (params$trunclenr != 0) { + cat("trim reads (reverse reads at ", params$trunclenr," bp, reads shorter than this were discarded), ", sep = "") +} +if ( as.integer(params$max_ee) > 0 ) { + cat("discard reads with >", params$max_ee,"expected errors, ") +} +cat("correct errors, ") +if ( isFALSE(params$flag_single_end) ) { + 
cat("merge read pairs, ") +} +cat(paste0(" +and remove polymerase chain reaction (PCR) chimeras; +ultimately, ",n_asv_dada," amplicon sequencing variants (ASVs) were obtained across all samples. +Between ",min(dada_stats_p$analysis),"% and ",max(dada_stats_p$analysis),"% reads per sample +(average ",dada_stats_p_analysis_average,"%) were retained. +The ASV count table contained in total ",sum(dada_stats_ex$analysis)," counts, +at least ",min(dada_stats_ex$analysis)," and at most ",max(dada_stats_ex$analysis)," per sample +(average ",round(sum(dada_stats_ex$analysis)/length(dada_stats_ex$analysis),0),"). +")) +``` + +```{r, eval = !isFALSE(params$vsearch_cluster), results='asis'} +cat(paste0(" +VSEARCH ([Rognes et al., 2016](https://peerj.com/articles/2584/)) clustered ",n_asv_dada," ASVs into ",n_asv_vsearch_cluster," +centroids with pairwise identity of ",params$vsearch_cluster_id,". +")) +``` +```{r, eval = !isFALSE(params$path_barrnap_sum) && !isFALSE(params$filter_ssu), results='asis'} +cat(paste0("Barrnap ([Seemann, 2013](https://github.com/tseemann/barrnap)) filtered ASVs for `",params$filter_ssu,"` (`bac`: Bacteria, `arc`: Archaea, `mito`: Mitochondria, `euk`: Eukaryotes), +",n_asv_barrnap-filter_ssu_asv_filtered," ASVs were removed ")) +if ( !isFALSE(params$filter_ssu_stats) ) { + cat(paste0("with less than ",filter_ssu_stats_max_removed,"% counts per sample ")) +} +cat(paste0("(",filter_ssu_asv_filtered," ASVs passed).")) +``` +```{r, eval = !isFALSE(params$filter_len_asv_len_orig), results='asis'} +cat(sum(filter_len_profile$Counts)-sum(filter_len_asv_filtered$Counts)) +if ( params$min_len_asv != 0 && params$max_len_asv != 0 ) { + cat(" ASVs with length lower than",params$min_len_asv,"or above",params$max_len_asv,"bp ") +} else if ( params$min_len_asv != 0 ) { + cat(" ASVs with length lower than",params$min_len_asv,"bp ") +} else if ( params$max_len_asv != 0 ) { + cat(" ASVs with length above",params$max_len_asv,"bp ") +} +if ( !isFALSE(params$filter_len_asv) && flag_filter_len_stats ) { + cat(paste0("were removed with less than ",round(filter_len_stats_max_removed,2),"% counts per sample (",sum(filter_len_asv_filtered$Counts)," ASVs passed).")) +} else { + cat(paste0("were removed (",sum(filter_len_asv_filtered$Counts)," ASVs passed).")) +} +``` +```{r, eval = !isFALSE(params$filter_codons_fasta), results='asis'} +cat(filter_codons_fasta_passed,"ASVs had no stop codons (",params$stop_codons,") and a length of a multiple of 3 (tripletts).") +``` + +```{r, eval = any_taxonomy, results='asis'} +methods_taxonomic_classification <- c("Taxonomic classification was performed by ") +if ( !isFALSE(params$dada2_taxonomy) ) { + if ( !isFALSE(params$dada2_ref_tax_title) ) { + methods_taxonomic_classification_dada <- paste("DADA2 and the database '",params$dada2_ref_tax_title,"' (`", params$dada2_ref_tax_citation,"`)", sep="") + } else { + methods_taxonomic_classification_dada <- paste("DADA2 with a user provided database", sep="") + } + if ( !isFALSE(params$cut_dada_ref_taxonomy) ) { + methods_taxonomic_classification_dada <- paste(methods_taxonomic_classification_dada, + "that had",cut_dada_ref_taxonomy_filt,"sequences extracted by PCR primers to improve assignments") + } + methods_taxonomic_classification <- c(methods_taxonomic_classification, methods_taxonomic_classification_dada) +} +if ( !isFALSE(params$qiime2_ref_tax_title) ) { + methods_taxonomic_classification <- c(methods_taxonomic_classification, + paste("QIIME2 and the database '",params$qiime2_ref_tax_title,"' (`", 
params$qiime2_ref_tax_citation,"`)", sep=""))
+} else if (!isFALSE(params$qiime2_taxonomy)) {
+    methods_taxonomic_classification <- c(methods_taxonomic_classification,
+        paste("QIIME2 with a user-provided database", sep=""))
+}
+if ( !isFALSE(params$kraken2_ref_tax_title) ) {
+    methods_taxonomic_classification <- c(methods_taxonomic_classification,
+        paste("Kraken2 and the database '",params$kraken2_ref_tax_title,"' (`", params$kraken2_ref_tax_citation,"`)", sep=""))
+} else if (!isFALSE(params$kraken2_taxonomy)) {
+    methods_taxonomic_classification <- c(methods_taxonomic_classification,
+        paste("Kraken2 with a user-provided database", sep=""))
+}
+if ( !isFALSE(params$sintax_ref_tax_title) ) {
+    methods_taxonomic_classification <- c(methods_taxonomic_classification,
+        paste("SINTAX and the database '",params$sintax_ref_tax_title,"' (`", params$sintax_ref_tax_citation,"`)", sep=""))
+} else if (!isFALSE(params$sintax_taxonomy)) {
+    methods_taxonomic_classification <- c(methods_taxonomic_classification,
+        paste("SINTAX with a user-provided database", sep=""))
+}
+
+cat(paste0(methods_taxonomic_classification[1],methods_taxonomic_classification[2]))
+if (length(methods_taxonomic_classification) >= 3) {
+    for (x in 3:length(methods_taxonomic_classification)) {
+        cat(paste(",",methods_taxonomic_classification[x]))
+    }
+}
+cat(".")
+```
+
+```{r, eval = !isFALSE(params$val_used_taxonomy), results='asis'}
+cat("ASV sequences, abundance and ",params$val_used_taxonomy," taxonomic assignments were loaded into QIIME2 ([Bolyen et al., 2019](https://www.nature.com/articles/s41587-019-0209-9)).")
+```
+```{r, eval = !isFALSE(params$filter_stats_tsv), results='asis'}
+if ( as.integer(qiime2_filtertaxa_rm) > 0 ) {
+    qiime_filter <- c(paste("Of",qiime2_filtertaxa_orig,"ASVs,",qiime2_filtertaxa_rm,"were removed because"))
+    if ( params$exclude_taxa != "none" ) {
+        qiime_filter <- c(qiime_filter,
+            paste("the taxonomic string contained any of (", params$exclude_taxa,")",sep=""))
+    }
+    if ( params$min_frequency != 1 ) {
+        qiime_filter <- c(qiime_filter,
+            paste("had fewer than", params$min_frequency ,"total read counts over all samples"))
+    }
+    if ( params$min_samples != 1 ) {
+        qiime_filter <- c(qiime_filter,
+            paste("were present in fewer than", params$min_samples ,"samples"))
+    }
+    cat(paste(qiime_filter[1],qiime_filter[2]))
+    if (length(qiime_filter) >= 3) {
+        for (x in 3:length(qiime_filter)) {
+            cat(paste(", ",qiime_filter[x]))
+        }
+    }
+    cat(" (",qiime2_filtertaxa_filt," ASVs passed). ",sep="")
+}
+```
+```{r, eval = !isFALSE(params$val_used_taxonomy), results='asis'}
+if (!isFALSE(params$barplot) || !isFALSE(params$alpha_rarefaction) || !isFALSE(params$diversity_indices_beta) || !isFALSE(params$ancom)) {
+    qiime_final <- c("Within QIIME2, the final microbial community data was")
+    if (!isFALSE(params$barplot)) {
+        qiime_final <- c(qiime_final,"visualized in a barplot")
+    }
+    if (!isFALSE(params$alpha_rarefaction)) {
+        qiime_final <- c(qiime_final,"evaluated for sufficient sequencing depth with alpha rarefaction curves")
+    }
+    if (!isFALSE(params$diversity_indices_beta)) {
+        qiime_final <- c(qiime_final,
+            paste("investigated for alpha (within-sample) and beta (between-sample) diversity after rarefaction to",diversity_indices_depth,"counts"))
+    }
+    if (!isFALSE(params$ancom)) {
+        qiime_final <- c(qiime_final,"used to find differentially abundant taxa with ANCOM ([Mandal et al., 2015](https://pubmed.ncbi.nlm.nih.gov/26028277/))")
+    }
+    cat(paste(qiime_final[1],qiime_final[2]))
+    if (length(qiime_final) >= 3) {
+        for (x in 3:length(qiime_final)) {
+            cat(paste(", ",qiime_final[x]))
+        }
+    }
+    cat(".")
+}
+```
+
+```{r, results='asis'}
+cat(paste0("
+> **WARNING**
+> This methods section is lacking software versions; these can be found
+"))
+if ( !isFALSE(params$mqc_plot) ) {
+    cat("in [MultiQC's report section Software Versions](../multiqc/multiqc_report.html#software_versions) or ")
+}
+cat("in file `software_versions.yml` in folder [pipeline_info](../pipeline_info).")
+```
+
+
+
+```{r, eval = any_taxonomy, results='asis'}
+cat("## Reference databases\n\n")
+
+if ( !isFALSE(params$dada2_ref_tax_title) ) {
+    cat("Taxonomic classification by DADA2:\n\n",
+        "- database: `", params$dada2_ref_tax_title, "`\n\n",
+        "- files: `", params$dada2_ref_tax_file, "`\n\n",
+        "- citation: `", params$dada2_ref_tax_citation, "`\n\n", sep = "")
+} else if (!isFALSE(params$dada2_taxonomy)) {
+    cat("Taxonomic classification by DADA2:\n\n",
+        "- database: unknown - user provided\n\n", sep = "")
+}
+
+if ( !isFALSE(params$sintax_ref_tax_title) ) {
+    cat("Taxonomic classification by SINTAX:\n\n",
+        "- database: `", params$sintax_ref_tax_title, "`\n\n",
+        "- files: `", params$sintax_ref_tax_file, "`\n\n",
+        "- citation: `", params$sintax_ref_tax_citation, "`\n\n", sep = "")
+} else if (!isFALSE(params$sintax_taxonomy)) {
+    cat("Taxonomic classification by SINTAX:\n\n",
+        "- database: unknown - user provided\n\n", sep = "")
+}
+
+if ( !isFALSE(params$kraken2_ref_tax_title) ) {
+    cat("Taxonomic classification by Kraken2:\n\n",
+        "- database: `", params$kraken2_ref_tax_title, "`\n\n",
+        "- files: `", params$kraken2_ref_tax_file, "`\n\n",
+        "- citation: `", params$kraken2_ref_tax_citation, "`\n\n", sep = "")
+} else if (!isFALSE(params$kraken2_taxonomy)) {
+    cat("Taxonomic classification by Kraken2:\n\n",
+        "- database: unknown - user provided\n\n", sep = "")
+}
+
+if ( !isFALSE(params$qiime2_ref_tax_title) ) {
+    cat("Taxonomic classification by QIIME2:\n\n",
+        "- database: `", params$qiime2_ref_tax_title, "`\n\n",
+        "- files: `", params$qiime2_ref_tax_file, "`\n\n",
+        "- citation: `", params$qiime2_ref_tax_citation, "`\n\n", sep = "")
+} else if (!isFALSE(params$qiime2_taxonomy)) {
+    cat("Taxonomic classification by QIIME2:\n\n",
+        "- database: unknown - user provided\n\n", sep = "")
+}
+```
+
+
+
+```{r, eval = !isFALSE(params$mqc_plot), results='asis'}
+cat(paste0("
+## MultiQC methods summary
+
+[MultiQC](https://multiqc.info/) summarized computational methods in [multiqc/multiqc_report.html](../multiqc/multiqc_report.html).
+The proposed short methods description can be found in [MultiQC's Methods Description](../multiqc/multiqc_report.html#nf-core-ampliseq-methods-description),
+versions of software collected at runtime in [MultiQC's Software Versions](../multiqc/multiqc_report.html#software_versions),
+and a summary of non-default parameters in [MultiQC's Workflow Summary](../multiqc/multiqc_report.html#nf-core-ampliseq-summary).
+"))
+```
+
+
+
+```{r, results='asis'}
+cat(paste0("
+## Nextflow and pipeline information
+
+Technical information about the pipeline run is collected in folder [pipeline_info](../pipeline_info),
+including software versions collected at runtime in file `software_versions.yml` (can be viewed with a text editor),
+all parameter settings in file `params_{date}_{time}.json` (can be viewed with a text editor),
+execution report in file `execution_report_{date}_{time}.html`,
+execution trace in file `execution_trace_{date}_{time}.txt`,
+execution timeline in file `execution_timeline_{date}_{time}.html`, and
+pipeline directed acyclic graph (DAG) in file `pipeline_dag_{date}_{time}.html`.
+"))
+```
+
+
+
+# Final notes
+
+This report (file `summary_report.html`) is located in folder [summary_report](.) of the original pipeline results folder.
+In this file, all links to files and folders are relative; therefore, hyperlinks will only work when the report is at its original place in the pipeline results folder.
+Plots specifically produced for this report (if any) can also be found in folder [summary_report](.).
+
+A comprehensive read count report throughout the pipeline can be found in the [base results folder](../) in file `overall_summary.tsv`.
+
+Please cite the [pipeline publication](https://doi.org/10.3389/fmicb.2020.550420) and any software tools used by the pipeline (see [citations](https://nf-co.re/ampliseq#citations)) when you use any of the pipeline results in your study.
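The phyloseq section above states that the RDS files can be loaded directly into R. A minimal sketch of doing so, in which the file path is a hypothetical example (the actual name depends on the taxonomy classifier used; check the `phyloseq/` results folder):

```r
# Minimal sketch: load a phyloseq object produced by the pipeline.
# "results/phyloseq/dada2_phyloseq.rds" is a hypothetical example path.
library(phyloseq)

ps <- readRDS("results/phyloseq/dada2_phyloseq.rds")

ps                        # overview of the contained components
otu_table(ps)[1:5, 1:5]   # ASV abundance table (assumes >= 5 ASVs and samples)
tax_table(ps)[1:5, ]      # taxonomy table
sample_data(ps)           # metadata; errors if no metadata was supplied to the run
```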
diff --git a/assets/schema_input.json b/assets/schema_input.json index 5b22676f..8a016da6 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -7,30 +7,35 @@ "items": { "type": "object", "properties": { - "sample": { + "sampleID": { "type": "string", - "pattern": "^\\S+$", - "errorMessage": "Sample name must be provided and cannot contain spaces" + "pattern": "^[a-zA-Z][a-zA-Z0-9_]+$", + "unique": true, + "errorMessage": "Unique sample ID must be provided: Must start with a letter, and can only contain letters, numbers or underscores; Regex: '^[a-zA-Z][a-zA-Z0-9_]+$'", + "meta": ["id"] }, - "fastq_1": { + "forwardReads": { "type": "string", + "format": "file-path", + "exists": true, "pattern": "^\\S+\\.f(ast)?q\\.gz$", "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" }, - "fastq_2": { - "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'", - "anyOf": [ - { - "type": "string", - "pattern": "^\\S+\\.f(ast)?q\\.gz$" - }, - { - "type": "string", - "maxLength": 0 - } - ] + "reverseReads": { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "^\\S+\\.f(ast)?q\\.gz$", + "errorMessage": "FastQ file for reads 2 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + }, + "run": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Run name cannot contain spaces", + "meta": ["run"], + "default": "1" } }, - "required": ["sample", "fastq_1"] + "required": ["sampleID", "forwardReads"] } } diff --git a/assets/slackreport.json b/assets/slackreport.json index 043d02f2..b170caab 100644 --- a/assets/slackreport.json +++ b/assets/slackreport.json @@ -3,7 +3,7 @@ { "fallback": "Plain-text summary of the attachment.", "color": "<% if (success) { %>good<% } else { %>danger<%} %>", - "author_name": "sanger-tol/readmapping v${version} - ${runName}", + "author_name": "nf-core/ampliseq v${version} - ${runName}", "author_icon": "https://www.nextflow.io/docs/latest/_static/favicon.ico", "text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors<% } %>", "fields": [ diff --git a/bin/filt_clusters.py b/bin/filt_clusters.py new file mode 100755 index 00000000..53681f03 --- /dev/null +++ b/bin/filt_clusters.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 + +import argparse +import gzip +import pandas as pd + +usage = """This program filters ASVs that aren't centroids after post-clustering.""" + +parser = argparse.ArgumentParser(description=usage) + +parser.add_argument( + "-t", + "--count-table", + dest="count", + type=argparse.FileType("r"), + help="Count table file of the ASVs from the AmpliSeq pipeline", + required=True, +) + +parser.add_argument( + "-p", + "--prefix", + dest="prefix", + type=str, + help="Prefix of the output files", + required=True, +) + +parser.add_argument( + "-c", + "--cluster-fastas", + dest="cluster_fastas", + type=str, + help="Space separated list of fasta files of the clusters. 
First read of the cluster should be the centroid of that cluster.", + required=True, +) + +args = parser.parse_args() + +# This dictionary will store the centroid ASVs as keys, and the values will be the ASVs clustered to that centroid +cluster_dict = {} + +# Loop though list of cluster fasta files to populate cluster_dict and to create centroid fasta file +cluster_fastas = args.cluster_fastas.split(" ") +for cluster_fasta in cluster_fastas: + read_num = 0 + + # Loop through each line of current fasta file and open output fasta file in append mode + with gzip.open(cluster_fasta, "rt") as in_fasta, open(args.prefix + "_filtered.fna", "a") as out_fasta: + for line in in_fasta: + line = line.rstrip("\n") + + # If the line is not a sequence + if line.startswith(">"): + read_num += 1 + asv_name = line[1:] + + # If the read is the centroid + if read_num == 1: + centroid_name = asv_name + cluster_dict[centroid_name] = [] + out_fasta.write(f"{line}\n") + + # If the read is not the centroid + else: + cluster_dict[centroid_name].append(asv_name) + + # If the line is a sequence + else: + # If the read is the centroid + if read_num == 1: + out_fasta.write(f"{line}\n") + +# This dictionary will store the samples as keys, and the values will be the number of ASVs in that sample +sam_asv_counts = {} + +# This count_df will have ASVs as the index, and samples as the header +count_df = pd.read_table(args.count, delimiter="\t", index_col=0, header=0) + +# Get the number of ASVs per sample before clustering +for sample in count_df.columns: + sam_asv_counts[sample] = (count_df[sample] != 0).sum() +stats_df = pd.DataFrame(list(sam_asv_counts.items()), columns=["sample", "ASVs_before_clustering"]) + +# Loop through centroids +for centroid in cluster_dict.keys(): + # If the current centroid has ASVs clustered to it + if cluster_dict[centroid] != []: + # Get a list of all ASVs in the cluster (including the centroid) + cluster_list = cluster_dict[centroid].copy() + cluster_list.append(centroid) + + # Sum all rows in the cluster + summed_row = count_df.loc[cluster_list].sum() + + # Assign summed row to centroid row and drop non-centroid rows + count_df.loc[centroid] = summed_row + count_df.drop(cluster_dict[centroid], inplace=True) + +# Get the number of ASVs per sample after clustering +for sample in count_df.columns: + sam_asv_counts[sample] = (count_df[sample] != 0).sum() +stats_df["ASVs_after_clustering"] = list(sam_asv_counts.values()) + +# Output filtered count tsv and stats tsv +count_df.to_csv(args.prefix + "_filtered.table.tsv", sep="\t") +stats_df.to_csv(args.prefix + "_filtered.stats.tsv", sep="\t", index=False) diff --git a/bin/filt_codons.py b/bin/filt_codons.py index 09754b57..4df16892 100755 --- a/bin/filt_codons.py +++ b/bin/filt_codons.py @@ -43,7 +43,7 @@ dest="count", type=argparse.FileType("r"), help="Count table file of the ASVs from the AmpliSeq pipeline", - required=True, + required=False, ) parser.add_argument( @@ -115,21 +115,25 @@ def check_asv(seq, start, stop): Out_Seq = open(args.prefix + "_filtered.fna", "w") Out_list = open(args.prefix + "_filtered.list", "w") -Out_table = open(args.prefix + "_filtered.table.tsv", "w") +if args.count is not None: + Out_table = open(args.prefix + "_filtered.table.tsv", "w") +else: + Out_table = open("empty_" + args.prefix + "_filtered.table.tsv", "w") count_dict = {} p1 = re.compile("\t") p2 = re.compile(">") -count = 0 -for line in args.count: - line = line.rstrip("\n") - if count == 0: - print(line, file=Out_table) - count += 1 - else: - tmp_list = 
re.split(p1, line) - count_dict[tmp_list[0]] = line +if args.count is not None: + count = 0 + for line in args.count: + line = line.rstrip("\n") + if count == 0: + print(line, file=Out_table) + count += 1 + else: + tmp_list = re.split(p1, line) + count_dict[tmp_list[0]] = line for line in args.fasta: line = line.rstrip("\n") @@ -143,9 +147,11 @@ def check_asv(seq, start, stop): if check_asv(line, begin, end): print(">", bin_head, "\n", line, file=Out_Seq, sep="") print(bin_head, file=Out_list) - print(count_dict[bin_head], file=Out_table) + if args.count is not None: + print(count_dict[bin_head], file=Out_table) -args.count.close() +if args.count is not None: + args.count.close() args.fasta.close() Out_Seq.close() Out_list.close() diff --git a/bin/reformat_tax_for_phyloseq.py b/bin/reformat_tax_for_phyloseq.py new file mode 100755 index 00000000..f35aaf03 --- /dev/null +++ b/bin/reformat_tax_for_phyloseq.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 + +import pandas as pd +import sys + +tax_file = sys.argv[1] +out_file = sys.argv[2] + +# Import tsv file +tax_df = pd.read_csv(tax_file, sep="\t") + +# The second column should hold the taxonomy information +tax_col = tax_df.columns[1] + +# Split the values in the tax column +split_tax = tax_df[tax_col].str.split(";", expand=True) + +# Assign names to the new columns with an auto incrementing integer +new_col_names = [f"{tax_col}_{i+1}" for i in range(split_tax.shape[1])] +split_tax.columns = new_col_names + +# Strip whitespace from the tax names +split_tax = split_tax.applymap(lambda x: x.strip() if isinstance(x, str) else x) + +# Drop the original tax column +tax_df = tax_df.drop(columns=[tax_col]) + +# Add the new tax columns to the df +result = pd.concat([tax_df, split_tax], axis=1) + +# Create new tsv file +result.to_csv(out_file, sep="\t", index=False) diff --git a/bin/sbdiexport.R b/bin/sbdiexport.R index e0a3575d..8885424b 100755 --- a/bin/sbdiexport.R +++ b/bin/sbdiexport.R @@ -44,8 +44,8 @@ n_samples <- length(colnames(asvs)) - 1 # Read taxonomy table and make sure all expected columns are there taxonomy <- read.delim(taxtable, sep = '\t', stringsAsFactors = FALSE) %>% mutate(Domain = if("Domain" %in% colnames(.)) Domain else '') %>% - mutate(Kingdom = if("Kingdom" %in% colnames(.)) Kingdom else '') %>% - mutate(Phylum = if("Phylum" %in% colnames(.)) Phylum else '') %>% + mutate(Kingdom = if("Kingdom" %in% colnames(.)) Kingdom else if ("Supergroup" %in% colnames(.)) Supergroup else '') %>% + mutate(Phylum = if("Phylum" %in% colnames(.)) Phylum else if ("Division" %in% colnames(.)) Division else '') %>% mutate(Class = if("Class" %in% colnames(.)) Class else '') %>% mutate(Order = if("Order" %in% colnames(.)) Order else '') %>% mutate(Family = if("Family" %in% colnames(.)) Family else '') %>% @@ -150,7 +150,7 @@ asvtax <- asvs %>% mutate( domain = str_remove(domain, 'Reversed:_'), associatedSequences = '', - kingdom = ifelse(is.na(kingdom), 'Unassigned', kingdom), + kingdom = ifelse(is.na(kingdom) | kingdom == '', 'Unassigned', kingdom), specificEpithet = ifelse(!(is.na(Species_exact) | Species_exact == ''), Species_exact, specificEpithet), specificEpithet = ifelse( (!(is.na(genus) | genus == '')), str_replace(specificEpithet, paste('^',genus, '[_[:space:]]' ,sep=''), ''), specificEpithet), specificEpithet = ifelse( str_detect(specificEpithet, '^[sS]p{1,2}.?$'), '', specificEpithet), @@ -160,5 +160,5 @@ asvtax <- asvs %>% ) %>% relocate(otu, .after = infraspecificEpithet) %>% relocate(associatedSequences, .before = domain) %>% - 
select_if(!names(.) %in% c('confidence','domain', 'Species_exact', 'SH', 'BOLD_bin')) %>% + select_if(!names(.) %in% c('confidence','domain', 'Species_exact', 'SH', 'BOLD_bin', 'Supergroup', 'Division', 'Subdivision')) %>% write_tsv("asv-table.tsv", na = '') diff --git a/bin/sbdiexportreannotate.R b/bin/sbdiexportreannotate.R index ccbc958a..19d5e3ae 100755 --- a/bin/sbdiexportreannotate.R +++ b/bin/sbdiexportreannotate.R @@ -47,8 +47,8 @@ predictions <- data.frame( taxtable <- taxonomy %>% inner_join(predictions, by = 'ASV_ID') %>% mutate(Domain = if("Domain" %in% colnames(.)) Domain else '') %>% - mutate(Kingdom = if("Kingdom" %in% colnames(.)) Kingdom else '') %>% - mutate(Phylum = if("Phylum" %in% colnames(.)) Phylum else '') %>% + mutate(Kingdom = if("Kingdom" %in% colnames(.)) Kingdom else if ("Supergroup" %in% colnames(.)) Supergroup else '') %>% + mutate(Phylum = if("Phylum" %in% colnames(.)) Phylum else if ("Division" %in% colnames(.)) Division else '') %>% mutate(Class = if("Class" %in% colnames(.)) Class else '') %>% mutate(Order = if("Order" %in% colnames(.)) Order else '') %>% mutate(Family = if("Family" %in% colnames(.)) Family else '') %>% @@ -115,12 +115,12 @@ taxtable <- taxonomy %>% ), identification_references = 'https://docs.biodiversitydata.se/analyse-data/molecular-tools/#taxonomy-annotation', taxon_remarks = ifelse(!(is.na(domain) | domain == ''), paste('Domain = \'',domain,'\'',sep=''),''), - kingdom = ifelse(is.na(kingdom), 'Unassigned', kingdom) + kingdom = ifelse(is.na(kingdom) | kingdom == '', 'Unassigned', kingdom) ) %>% relocate(asv_sequence, .after = asv_id_alias) %>% relocate(scientificName:taxonRank, .after = asv_sequence) %>% relocate(infraspecificEpithet, .after = specificEpithet) %>% relocate(annotation_confidence, .after = otu) %>% relocate(date_identified:taxon_remarks, .after = annotation_confidence) %>% - select_if(!names(.) %in% c('domain', 'species_exact', 'SH', 'BOLD_bin')) %>% + select_if(!names(.) %in% c('domain', 'species_exact', 'SH', 'BOLD_bin', 'Supergroup', 'Division', 'Subdivision')) %>% write_tsv("annotation.tsv", na = '') diff --git a/conf/modules.config b/conf/modules.config index 95d8569a..68794ab7 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -421,6 +421,24 @@ process { ] } + withName: KRAKEN2_KRAKEN2 { + // "--use-names" is required for downstream processes! 
+ ext.args = "--use-names --confidence ${params.kraken2_confidence}" + publishDir = [ + path: { "${params.outdir}/kraken2" }, + mode: params.publish_dir_mode, + pattern: "{*.txt,*classified*}" + ] + } + + withName: 'FORMAT_TAXRESULTS_KRAKEN2' { + publishDir = [ + path: { "${params.outdir}/kraken2" }, + mode: params.publish_dir_mode, + pattern: "*.tsv" + ] + } + withName: VSEARCH_USEARCHGLOBAL { ext.args = '--top_hits_only --output_no_hits --maxaccepts 50 --query_cov 0.9' publishDir = [ @@ -432,6 +450,22 @@ process { ] } + withName: VSEARCH_CLUSTER { + ext.args = "--id ${params.vsearch_cluster_id} --usersort" + ext.args2 = '--cluster_smallmem' + ext.args3 = '--clusters' + } + + withName: FILTER_CLUSTERS { + publishDir = [ + [ + path: { "${params.outdir}/vsearch_cluster" }, + mode: params.publish_dir_mode, + pattern: "*{.tsv,.fna}" + ] + ] + } + withName: ASSIGNSH { publishDir = [ [ @@ -785,6 +819,14 @@ process { ] } + withName: PHYLOSEQ { + publishDir = [ + path: { "${params.outdir}/phyloseq" }, + mode: params.publish_dir_mode, + pattern: "*.rds" + ] + } + withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, @@ -796,9 +838,16 @@ process { withName: MULTIQC { ext.args = params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' publishDir = [ - path: "${params.outdir}/multiqc", + path: { "${params.outdir}/multiqc" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + + withName: SUMMARY_REPORT { + publishDir = [ + path: { "${params.outdir}/summary_report" }, + mode: params.publish_dir_mode + ] + } } diff --git a/conf/ref_databases.config b/conf/ref_databases.config index dd06c820..c80820ec 100644 --- a/conf/ref_databases.config +++ b/conf/ref_databases.config @@ -42,11 +42,18 @@ params { taxlevels = "Phylum,Class,Order,Family,Genus,Species" } 'gtdb' { - title = "GTDB - Genome Taxonomy Database - Release R07-RS207" - file = [ "https://data.ace.uq.edu.au/public/gtdb/data/releases/release207/207.0/genomic_files_reps/bac120_ssu_reps_r207.tar.gz", "https://data.ace.uq.edu.au/public/gtdb/data/releases/release207/207.0/genomic_files_reps/ar53_ssu_reps_r207.tar.gz" ] + title = "GTDB - Genome Taxonomy Database - Release R08-RS214.1" + file = [ "https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/genomic_files_reps/bac120_ssu_reps_r214.tar.gz", "https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/genomic_files_reps/ar53_ssu_reps_r214.tar.gz" ] citation = "Parks DH, Chuvochina M, Waite DW, Rinke C, Skarshewski A, Chaumeil PA, Hugenholtz P. A standardized bacterial taxonomy based on genome phylogeny substantially revises the tree of life. Nat Biotechnol. 2018 Nov;36(10):996-1004. doi: 10.1038/nbt.4229. Epub 2018 Aug 27. PMID: 30148503." fmtscript = "taxref_reformat_gtdb.sh" - dbversion = "GTDB R07-RS207 (https://data.ace.uq.edu.au/public/gtdb/data/releases/release207/207.0)" + dbversion = "GTDB R08-RS214.1 (https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1)" + } + 'gtdb=R08-RS214' { + title = "GTDB - Genome Taxonomy Database - Release R08-RS214.1" + file = [ "https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/genomic_files_reps/bac120_ssu_reps_r214.tar.gz", "https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/genomic_files_reps/ar53_ssu_reps_r214.tar.gz" ] + citation = "Parks DH, Chuvochina M, Waite DW, Rinke C, Skarshewski A, Chaumeil PA, Hugenholtz P. 
A standardized bacterial taxonomy based on genome phylogeny substantially revises the tree of life. Nat Biotechnol. 2018 Nov;36(10):996-1004. doi: 10.1038/nbt.4229. Epub 2018 Aug 27. PMID: 30148503." + fmtscript = "taxref_reformat_gtdb.sh" + dbversion = "GTDB R08-RS214.1 (https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1)" } 'gtdb=R07-RS207' { title = "GTDB - Genome Taxonomy Database - Release R07-RS207" @@ -235,17 +242,17 @@ params { } //QIIME2 taxonomic reference databases qiime_ref_databases { - //SILVA for QIIME2 v2021.2, see https://docs.qiime2.org/2021.2/data-resources/#silva-16s-18s-rrna + //SILVA for QIIME2 v2023.7, see https://docs.qiime2.org/2023.7/data-resources/#silva-16s-18s-rrna 'silva=138' { title = "QIIME2 pre-formatted SILVA dereplicated at 99% similarity - Version 138" - file = [ "https://data.qiime2.org/2022.11/common/silva-138-99-seqs.qza", "https://data.qiime2.org/2022.11/common/silva-138-99-tax.qza" ] + file = [ "https://data.qiime2.org/2023.7/common/silva-138-99-seqs.qza", "https://data.qiime2.org/2023.7/common/silva-138-99-tax.qza" ] citation = "https://www.arb-silva.de/; Bokulich, N.A., Robeson, M., Dillon, M.R. bokulich-lab/RESCRIPt. Zenodo. http://doi.org/10.5281/zenodo.3891931" license = "https://www.arb-silva.de/silva-license-information/" fmtscript = "taxref_reformat_qiime_silva138.sh" } 'silva' { title = "QIIME2 pre-formatted SILVA dereplicated at 99% similarity - Version 138" - file = [ "https://data.qiime2.org/2022.11/common/silva-138-99-seqs.qza", "https://data.qiime2.org/2022.11/common/silva-138-99-tax.qza" ] + file = [ "https://data.qiime2.org/2023.7/common/silva-138-99-seqs.qza", "https://data.qiime2.org/2023.7/common/silva-138-99-tax.qza" ] citation = "https://www.arb-silva.de/; Bokulich, N.A., Robeson, M., Dillon, M.R. bokulich-lab/RESCRIPt. Zenodo. http://doi.org/10.5281/zenodo.3891931" license = "https://www.arb-silva.de/silva-license-information/" fmtscript = "taxref_reformat_qiime_silva138.sh" @@ -295,7 +302,7 @@ params { } 'greengenes85' { title = "Greengenes 16S - Version 13_8 - clustered at 85% similarity - for testing purposes only" - file = [ "https://data.qiime2.org/2022.11/tutorials/training-feature-classifiers/85_otus.fasta", "https://data.qiime2.org/2022.11/tutorials/training-feature-classifiers/85_otu_taxonomy.txt" ] + file = [ "https://data.qiime2.org/2023.7/tutorials/training-feature-classifiers/85_otus.fasta", "https://data.qiime2.org/2023.7/tutorials/training-feature-classifiers/85_otu_taxonomy.txt" ] citation = "McDonald, D., Price, M., Goodrich, J. et al. An improved Greengenes taxonomy with explicit ranks for ecological and evolutionary analyses of bacteria and archaea. ISME J 6, 610–618 (2012). https://doi.org/10.1038/ismej.2011.139" fmtscript = "taxref_reformat_qiime_greengenes85.sh" } @@ -375,4 +382,74 @@ params { dbversion = "UNITE-alleuk v8.2 (https://dx.doi.org/10.15156/BIO/786376)" } } + // Kraken2 reference databases + kraken2_ref_databases { + // Corresponding S3 URLs can be obtained by removing https://genome-idx.s3.amazonaws.com from the beginning of the URLs linked to above and replacing with s3://genome-idx. + 'silva' { + title = "Kraken2 pre-formatted SILVA - Version 138" + file = [ "https://genome-idx.s3.amazonaws.com/kraken/16S_Silva138_20200326.tgz" ] + citation = "https://www.arb-silva.de/; Bokulich, N.A., Robeson, M., Dillon, M.R. bokulich-lab/RESCRIPt. Zenodo. 
http://doi.org/10.5281/zenodo.3891931" + license = "https://www.arb-silva.de/silva-license-information/" + fmtscript = "" + taxlevels = "D,P,C,O,F,G" + } + 'silva=138' { + title = "Kraken2 pre-formatted SILVA - Version 138" + file = [ "https://genome-idx.s3.amazonaws.com/kraken/16S_Silva138_20200326.tgz" ] + citation = "https://www.arb-silva.de/; Bokulich, N.A., Robeson, M., Dillon, M.R. bokulich-lab/RESCRIPt. Zenodo. http://doi.org/10.5281/zenodo.3891931" + license = "https://www.arb-silva.de/silva-license-information/" + fmtscript = "" + taxlevels = "D,P,C,O,F,G" + } + 'silva=132' { + title = "Kraken2 pre-formatted SILVA - Version 132" + file = [ "https://genome-idx.s3.amazonaws.com/kraken/16S_Silva132_20200326.tgz" ] + citation = "https://www.arb-silva.de/; Bokulich, N.A., Robeson, M., Dillon, M.R. bokulich-lab/RESCRIPt. Zenodo. http://doi.org/10.5281/zenodo.3891931" + license = "https://www.arb-silva.de/silva-license-information/" + fmtscript = "" + taxlevels = "D,P,C,O,F,G" + } + 'rdp' { + title = "RDP - Ribosomal Database Project - RDP trainset 18/release 11.5" + file = [ "https://genome-idx.s3.amazonaws.com/kraken/16S_RDP11.5_20200326.tgz" ] + citation = "Cole JR, Wang Q, Fish JA, Chai B, McGarrell DM, Sun Y, Brown CT, Porras-Alfaro A, Kuske CR, Tiedje JM. Ribosomal Database Project: data and tools for high throughput rRNA analysis. Nucleic Acids Res. 2014 Jan;42(Database issue):D633-42. doi: 10.1093/nar/gkt1244. Epub 2013 Nov 27. PMID: 24288368; PMCID: PMC3965039." + fmtscript = "" + taxlevels = "D,P,C,O,F,G" + } + 'rdp=18' { + title = "RDP - Ribosomal Database Project - RDP trainset 18/release 11.5" + file = [ "https://genome-idx.s3.amazonaws.com/kraken/16S_RDP11.5_20200326.tgz" ] + citation = "Cole JR, Wang Q, Fish JA, Chai B, McGarrell DM, Sun Y, Brown CT, Porras-Alfaro A, Kuske CR, Tiedje JM. Ribosomal Database Project: data and tools for high throughput rRNA analysis. Nucleic Acids Res. 2014 Jan;42(Database issue):D633-42. doi: 10.1093/nar/gkt1244. Epub 2013 Nov 27. PMID: 24288368; PMCID: PMC3965039." + fmtscript = "" + taxlevels = "D,P,C,O,F,G" + } + 'greengenes' { + title = "Kraken2 pre-formatted Greengenes - Version 13.5" + file = [ "https://genome-idx.s3.amazonaws.com/kraken/16S_Greengenes13.5_20200326.tgz" ] + citation = "McDonald, D., Price, M., Goodrich, J. et al. An improved Greengenes taxonomy with explicit ranks for ecological and evolutionary analyses of bacteria and archaea. ISME J 6, 610–618 (2012). https://doi.org/10.1038/ismej.2011.139" + fmtscript = "" + taxlevels = "D,P,C,O,F,G,S" + } + 'greengenes=13.5' { + title = "Kraken2 pre-formatted Greengenes - Version 13.5" + file = [ "https://genome-idx.s3.amazonaws.com/kraken/16S_Greengenes13.5_20200326.tgz" ] + citation = "McDonald, D., Price, M., Goodrich, J. et al. An improved Greengenes taxonomy with explicit ranks for ecological and evolutionary analyses of bacteria and archaea. ISME J 6, 610–618 (2012). https://doi.org/10.1038/ismej.2011.139" + fmtscript = "" + taxlevels = "D,P,C,O,F,G,S" + } + 'standard' { + title = "Standard database - Version 20230605" + file = [ "https://genome-idx.s3.amazonaws.com/kraken/k2_standard_20230605.tar.gz" ] + citation = "Wood, D. E., Lu, J., & Langmead, B. (2019). Improved metagenomic analysis with Kraken 2. Genome biology, 20(1), 257. 
https://doi.org/10.1186/s13059-019-1891-0" + fmtscript = "" + taxlevels = "D,P,C,O,F,G,S" + } + 'standard=20230605' { + title = "Standard database - Version 20230605" + file = [ "https://genome-idx.s3.amazonaws.com/kraken/k2_standard_20230605.tar.gz" ] + citation = "Wood, D. E., Lu, J., & Langmead, B. (2019). Improved metagenomic analysis with Kraken 2. Genome biology, 20(1), 257. https://doi.org/10.1186/s13059-019-1891-0" + fmtscript = "" + taxlevels = "D,P,C,O,F,G,S" + } + } } diff --git a/conf/test.config b/conf/test.config index e46c93a8..afd370e4 100644 --- a/conf/test.config +++ b/conf/test.config @@ -24,7 +24,7 @@ params { RV_primer = "GGACTACNVGGGTWTCTAAT" input = "https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/samplesheets/Samplesheet.tsv" metadata = "https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/samplesheets/Metadata.tsv" - dada_ref_taxonomy = "gtdb" + dada_ref_taxonomy = "gtdb=R07-RS207" cut_dada_ref_taxonomy = true qiime_ref_taxonomy = "greengenes85" max_len_asv = 255 @@ -45,4 +45,6 @@ params { qiime_adonis_formula = "treatment1,mix8" diversity_rarefaction_depth = 500 + + vsearch_cluster = true } diff --git a/conf/test_doubleprimers.config b/conf/test_doubleprimers.config index 6b275dc8..730393db 100644 --- a/conf/test_doubleprimers.config +++ b/conf/test_doubleprimers.config @@ -23,8 +23,11 @@ params { FW_primer = "NNNNCCTAHGGGRBGCAGCAG" RV_primer = "GACTACHVGGGTATCTAATCC" double_primer = true - dada_ref_taxonomy = false input = "https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/samplesheets/Samplesheet_double_primer.tsv" trunc_qmin = 30 + kraken2_ref_taxonomy = "greengenes" + + // skipping skip_fastqc = true + skip_dada_taxonomy = true } diff --git a/conf/test_failed.config b/conf/test_failed.config new file mode 100644 index 00000000..12509a25 --- /dev/null +++ b/conf/test_failed.config @@ -0,0 +1,42 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/ampliseq -profile test_failed, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile - failed sample' + config_profile_description = 'Minimal test dataset to check pipeline function for failed samples' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + FW_primer = "GTGYCAGCMGCCGCGGTAA" + RV_primer = "GGACTACNVGGGTWTCTAAT" + input = "https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/samplesheets/Samplesheet_failed_sample.tsv" + metadata = "https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/samplesheets/Metadata_failed_sample.tsv" + dada_ref_tax_custom = "https://zenodo.org/record/4310151/files/rdp_train_set_18.fa.gz" + skip_dada_addspecies = true + cut_dada_ref_taxonomy = true + max_len_asv = 255 + filter_ssu = "bac" + ignore_failed_trimming = true + ignore_empty_input_files = true + ignore_failed_filtering = true + + //this is to remove low abundance ASVs to reduce runtime of downstream processes + min_samples = 2 + min_frequency = 10 + + // Skipping steps + skip_fastqc = true +} diff --git a/conf/test_fasta.config b/conf/test_fasta.config index 78babb74..fbb60f87 100644 --- a/conf/test_fasta.config +++ b/conf/test_fasta.config @@ -20,9 +20,14 @@ params { max_time = '6.h' // Input data - input = "https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/testdata/ASV_seqs.fasta" + input_fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/testdata/ASV_seqs.fasta" dada_ref_taxonomy = "rdp=18" dada_assign_taxlevels = "K,P,C,O,F,Genus" + filter_codons = true + orf_end = 30 + max_len_asv = 265 + filter_ssu = "bac" + skip_qiime = true } diff --git a/conf/test_pplace.config b/conf/test_pplace.config index b6eaff1d..ecd5424d 100644 --- a/conf/test_pplace.config +++ b/conf/test_pplace.config @@ -24,7 +24,7 @@ params { RV_primer = "GGACTACNVGGGTWTCTAAT" input = "https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/samplesheets/Samplesheet.tsv" metadata = "https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/samplesheets/Metadata.tsv" - dada_ref_taxonomy = false + skip_dada_taxonomy = true qiime_ref_taxonomy = "greengenes85" filter_ssu = "bac" diff --git a/conf/test_reftaxcustom.config b/conf/test_reftaxcustom.config index b1bb76ae..4233d1ea 100644 --- a/conf/test_reftaxcustom.config +++ b/conf/test_reftaxcustom.config @@ -28,6 +28,8 @@ params { dada_ref_tax_custom = "https://zenodo.org/record/4310151/files/rdp_train_set_18.fa.gz" dada_ref_tax_custom_sp = "https://zenodo.org/record/4310151/files/rdp_species_assignment_18.fa.gz" dada_assign_taxlevels = "Kingdom,Phylum,Class,Order,Family,Genus" + kraken2_ref_tax_custom = "https://genome-idx.s3.amazonaws.com/kraken/16S_Greengenes13.5_20200326.tgz" + kraken2_assign_taxlevels = "D,P,C,O" // Skip downstream analysis with QIIME2 skip_qiime = true diff --git a/conf/test_single.config b/conf/test_single.config index 4050ad67..b24e852b 100644 --- a/conf/test_single.config +++ b/conf/test_single.config @@ -22,7 +22,8 @@ params { // Input data FW_primer = "GTGYCAGCMGCCGCGGTAA" RV_primer = "GGACTACNVGGGTWTCTAAT" - input = "https://github.com/nf-core/test-datasets/raw/ampliseq/samplesheets/Samplesheet_single_end.tsv" + input = "https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/samplesheets/Samplesheet.tsv" + single_end = true dada_ref_taxonomy 
= "rdp=18" cut_dada_ref_taxonomy = true diff --git a/docs/images/ampliseq_workflow.png b/docs/images/ampliseq_workflow.png index 4b1e7adb..80aa6d3e 100644 Binary files a/docs/images/ampliseq_workflow.png and b/docs/images/ampliseq_workflow.png differ diff --git a/docs/images/ampliseq_workflow.svg b/docs/images/ampliseq_workflow.svg index 6180d78b..be64d887 100644 --- a/docs/images/ampliseq_workflow.svg +++ b/docs/images/ampliseq_workflow.svg @@ -108,7 +108,7 @@ id="linearGradient5670-9" xlink:href="#linearGradient5668" inkscape:collect="always" - gradientTransform="matrix(0.26458333,0,0,0.28227707,12.561448,88.655713)" /> + gradientTransform="matrix(0.26458333,0,0,0.28227707,12.561448,86.009878)" /> + gradientTransform="matrix(0,0.18484422,-0.08144904,0,113.64058,61.805265)" /> + gradientTransform="matrix(0.26458333,0,0,0.2796764,103.97164,86.468423)" /> + gradientTransform="matrix(0.26458333,0,0,0.26670362,191.46837,87.565812)" /> + gradientTransform="matrix(-0.26458333,0,0,-0.10679224,158.30358,175.15067)" /> + - - - biom - - - - nwk - + transform="translate(-20.49968,1.918233)" + id="g128964"> + + biom + + + + nwk Primer trimming @@ -531,49 +535,35 @@ inkscape:export-ydpi="305.81" inkscape:export-xdpi="305.81" id="text4732-3" - y="139.7159" + y="137.0701" x="-2.5661235" - style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:3.52778px;line-height:0.25;font-family:'Maven Pro';-inkscape-font-specification:'Maven Pro Bold';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583" + style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:3.52778006px;line-height:0.25;font-family:'Maven Pro';-inkscape-font-specification:'Maven Pro Bold';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458299" xml:space="preserve" inkscape:export-filename="C:\Users\fagernaes\Pictures\eager2\eager2_workflow_review.png">Quality filtering - Legend Evaluation + transform="translate(-33.108802,-10.583334)"> cutadapt + style="font-size:3.17499995px;stroke-width:0.26458299">cutadapt + transform="translate(0,5.4632945)"> cutadapt + style="font-size:3.17499995px;stroke-width:0.26458299">cutadapt + transform="translate(-33.108802,32.279181)"> filterAndTrim + style="font-size:3.17499995px;stroke-width:0.26458299">filterAndTrim - default - on demand - - + transform="translate(-33.108802,20.212078)"> FastQC + style="font-size:3.17499995px;stroke-width:0.26458299">FastQC + transform="translate(-33.108802,26.733707)"> plotQualityProfile + style="font-size:3.17499995px;stroke-width:0.26458299">plotQualityProfile - Infer ASVs @@ -775,23 +717,23 @@ inkscape:export-ydpi="305.81" inkscape:export-xdpi="305.81" id="text4732-0-4" - y="102.46249" + y="99.816643" x="-2.4670768" - style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:3.52778px;line-height:1;font-family:'Maven Pro';-inkscape-font-specification:'Maven Pro Bold';text-align:center;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583" + style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:3.52778006px;line-height:1;font-family:'Maven Pro';-inkscape-font-specification:'Maven Pro Bold';text-align:center;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458299" xml:space="preserve" 
inkscape:export-filename="C:\Users\fagernaes\Pictures\eager2\eager2_workflow_review.png">Remove multipleor read-throughprimers @@ -800,16 +742,16 @@ inkscape:export-xdpi="305.81" inkscape:connector-curvature="0" id="path5662-9-2" - d="m 123.78452,111.16816 7e-5,-28.067503 0.22614,1.847862 c 2.01924,16.499541 3.07454,20.544671 6.67589,25.589701 l 0.92328,1.29336 -0.88175,1.46622 c -1.65672,2.75483 -2.41296,4.31183 -3.26592,6.72415 -1.10577,3.12731 -2.04838,7.68782 -3.03195,14.66901 -0.21285,1.51074 -0.44524,3.15133 -0.51642,3.64577 -0.11127,0.77289 -0.1294,-3.03666 -0.12934,-27.16857 z" + d="m 123.78452,108.52231 7e-5,-28.067488 0.22614,1.847862 c 2.01924,16.499535 3.07454,20.544656 6.67589,25.589686 l 0.92328,1.29336 -0.88175,1.46622 c -1.65672,2.75483 -2.41296,4.31183 -3.26592,6.72415 -1.10577,3.12731 -2.04838,7.68782 -3.03195,14.66901 -0.21285,1.51074 -0.44524,3.15133 -0.51642,3.64577 -0.11127,0.77289 -0.1294,-3.03666 -0.12934,-27.16857 z" style="opacity:1;fill:url(#linearGradient5670-9-95);fill-opacity:1;stroke:#000000;stroke-width:0.05;stroke-linecap:square;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:normal" inkscape:export-filename="C:\Users\fagernaes\Pictures\eager2\eager2_workflow_review.png" /> Taxonomicclassification @@ -838,13 +780,13 @@ inkscape:export-ydpi="305.81" inkscape:export-xdpi="305.81" id="text4732-0-6-9" - y="83.788673" + y="81.142838" x="14.292807" - style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:3.52778px;line-height:1;font-family:'Maven Pro';-inkscape-font-specification:'Maven Pro Bold';text-align:center;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583" + style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:3.52778006px;line-height:1;font-family:'Maven Pro';-inkscape-font-specification:'Maven Pro Bold';text-align:center;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458299" xml:space="preserve" inkscape:export-filename="C:\Users\fagernaes\Pictures\eager2\eager2_workflow_review.png">Quality control @@ -852,27 +794,27 @@ inkscape:export-ydpi="305.81" inkscape:export-xdpi="305.81" id="text4732-0-6-7" - y="92.799553" - x="96.14991" - style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:3.52778px;line-height:1;font-family:'Maven Pro';-inkscape-font-specification:'Maven Pro Bold';text-align:center;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583" + y="90.153717" + x="96.285126" + style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:3.52778006px;line-height:1;font-family:'Maven Pro';-inkscape-font-specification:'Maven Pro Bold';text-align:center;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458299" xml:space="preserve" inkscape:export-filename="C:\Users\fagernaes\Pictures\eager2\eager2_workflow_review.png">DADA2 DADA2 @@ -880,19 +822,19 @@ inkscape:export-ydpi="305.81" inkscape:export-xdpi="305.81" id="text4732-0-6-7-2" - y="116.78242" - x="96.411736" + y="114.13659" + x="96.546951" style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:3.52778006px;line-height:1;font-family:'Maven Pro';-inkscape-font-specification:'Maven Pro 
Bold';text-align:center;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458299" xml:space="preserve" inkscape:export-filename="C:\Users\fagernaes\Pictures\eager2\eager2_workflow_review.png">QIIME2 + transform="translate(-4.9145208,-5.8208336)"> + transform="translate(-4.9145208,-7.4083337)"> + transform="translate(0,-12.170835)"> + transform="translate(0,-13.229169)"> + transform="translate(0,-12.700002)"> Reference taxonomy + transform="translate(55.409533,-8.7168798)"> + transform="translate(88.479685,-14.295412)"> + transform="translate(82.758013,-24.621866)"> + transform="translate(55.700763,-27.964738)"> SILVA + transform="translate(93.656771,-30.176216)"> Visualisation @@ -1202,13 +1144,13 @@ inkscape:export-ydpi="305.81" inkscape:export-xdpi="305.81" id="text4732-0-6-5-6-84" - y="139.88019" + y="137.23439" x="193.07434" - style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:3.52778px;line-height:1;font-family:'Maven Pro';-inkscape-font-specification:'Maven Pro Bold';text-align:center;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583" + style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:3.52778006px;line-height:1;font-family:'Maven Pro';-inkscape-font-specification:'Maven Pro Bold';text-align:center;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458299" xml:space="preserve" inkscape:export-filename="C:\Users\fagernaes\Pictures\eager2\eager2_workflow_review.png">Predict function @@ -1216,92 +1158,92 @@ inkscape:export-ydpi="305.81" inkscape:export-xdpi="305.81" id="text4732-0-6-5-6-6" - y="94.241638" + y="91.595802" x="193.00026" - style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:3.52778px;line-height:1;font-family:'Maven Pro';-inkscape-font-specification:'Maven Pro Bold';text-align:center;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583" + style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:3.52778006px;line-height:1;font-family:'Maven Pro';-inkscape-font-specification:'Maven Pro Bold';text-align:center;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458299" xml:space="preserve" inkscape:export-filename="C:\Users\fagernaes\Pictures\eager2\eager2_workflow_review.png">Differentialabundance ANCOM + y="99.512726" + style="font-size:3.17499995px;stroke-width:0.26458299">ANCOM barplot + y="85.542328" + style="font-size:3.17499995px;stroke-width:0.26458299">barplot PICRUSTt2 + x="185.64171" + y="142.01181" + style="font-size:3.17499995px;stroke-width:0.26458299">PICRUSt2 Quality control @@ -1309,10 +1251,10 @@ inkscape:export-ydpi="305.81" inkscape:export-xdpi="305.81" id="g4902-7" - transform="translate(50.928406,-33.158376)" + transform="translate(50.928406,-35.804211)" inkscape:export-filename="C:\Users\fagernaes\Pictures\eager2\eager2_workflow_review.png"> alpha-rarefaction + style="font-size:3.17499995px;stroke-width:0.26458299">alpha-rarefaction Alpha- & beta-diversity @@ -1375,20 +1317,20 @@ inkscape:export-ydpi="305.81" inkscape:export-xdpi="305.81" id="text4732-0-6-5-6-7" - y="83.788673" + y="81.142838" x="238.43135" - 
style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:3.52778px;line-height:1;font-family:'Maven Pro';-inkscape-font-specification:'Maven Pro Bold';text-align:center;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583" + style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:3.52778006px;line-height:1;font-family:'Maven Pro';-inkscape-font-specification:'Maven Pro Bold';text-align:center;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458299" xml:space="preserve" inkscape:export-filename="C:\Users\fagernaes\Pictures\eager2\eager2_workflow_review.png">Reporting VisualisationsVisualisations,& tables + id="tspan4263">tables, formats - - - fasta - - - - tsv - - - - html - + transform="translate(241.63088,7.9526664)" + id="g1942-6"> + + fasta + + + + tsv + + + + html + id="g17795" + transform="translate(0,-2.6458334)"> qiime diversity adonis @@ -1607,7 +1547,7 @@ inkscape:export-ydpi="305.81" inkscape:export-xdpi="305.81" ry="3.8374951" - y="79.754517" + y="77.108681" x="131.5838" height="68.205917" width="35.044548" @@ -1619,18 +1559,18 @@ inkscape:export-ydpi="305.81" inkscape:export-xdpi="305.81" id="text4732-0-6-5-6-84-8" - y="83.788673" + y="81.142838" x="149.21545" - style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:3.52778px;line-height:1;font-family:'Maven Pro';-inkscape-font-specification:'Maven Pro Bold';text-align:center;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583" + style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:3.52778006px;line-height:1;font-family:'Maven Pro';-inkscape-font-specification:'Maven Pro Bold';text-align:center;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458299" xml:space="preserve" inkscape:export-filename="C:\Users\fagernaes\Pictures\eager2\eager2_workflow_review.png">Taxonomicfiltering @@ -1638,31 +1578,31 @@ inkscape:export-ydpi="305.81" inkscape:export-xdpi="305.81" id="text4732-0-6-5-6-6-5" - y="106.41251" + y="103.76669" x="149.71155" - style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:3.52778px;line-height:1;font-family:'Maven Pro';-inkscape-font-specification:'Maven Pro Bold';text-align:center;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583" + style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:3.52778006px;line-height:1;font-family:'Maven Pro';-inkscape-font-specification:'Maven Pro Bold';text-align:center;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458299" xml:space="preserve" inkscape:export-filename="C:\Users\fagernaes\Pictures\eager2\eager2_workflow_review.png">Abundance & prevalencefiltering + transform="translate(5.3253841,71.267981)"> feature-table filter-features + style="font-size:2.46943998px;stroke-width:0.26458299">feature-table filter-features + transform="translate(4.9834499,68.09298)"> taxa filter-table + style="font-size:3.17499995px;stroke-width:0.26458299">taxa filter-table + transform="translate(6.5650501,76.030481)"> taxa filter-seqs + style="font-size:3.17499995px;stroke-width:0.26458299">taxa 
filter-seqs Abundancetables + transform="translate(5.0271053,77.617983)"> + transform="translate(8.4984789,76.514199)"> + transform="translate(-34.925004,-29.37259)"> + transform="translate(-34.925004,-6.6221022)"> + transform="translate(-34.925004,-32.026023)"> + transform="translate(0,-16.404171)"> + transform="translate(0,-10.318754)"> - - @@ -2049,7 +1967,7 @@ id="flowPara4555-5-1-0" style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:20.7634px;line-height:1.25;font-family:'Maven Pro';-inkscape-font-specification:'Maven Pro';fill:#000000;stroke-width:1.00309px" /> + transform="translate(-34.925004,-30.69168)"> + transform="translate(-34.925004,8.525433)"> (metadata) + transform="translate(58.192293,-8.8838973)"> cutadapt + style="font-size:3.17499995px;stroke-width:0.26458299">cutadapt Extract ITS region @@ -2208,13 +2126,13 @@ inkscape:export-ydpi="305.81" inkscape:export-xdpi="305.81" id="text4732-3-3-2" - y="128.14722" + y="125.5014" x="41.395306" style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:3.52778006px;line-height:0.25;font-family:'Maven Pro';-inkscape-font-specification:'Maven Pro Bold';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458299" xml:space="preserve" inkscape:export-filename="C:\Users\fagernaes\Pictures\eager2\eager2_workflow_review.png">Filter ASVs @@ -2249,31 +2167,111 @@ transform="matrix(0.26458333,0,0,0.26458333,-7.5591109,37.685768)" id="text69165" style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:11.25px;line-height:125%;font-family:'Abyssinica SIL';-inkscape-font-specification:'Abyssinica SIL, Normal';text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;white-space:pre;shape-inside:url(#rect69167);display:inline;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" /> - + Legend + default + on demand + + + + + optional - optional + mandatory + style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:3.17499995px;line-height:125%;font-family:'Abyssinica SIL';-inkscape-font-specification:'Abyssinica SIL, Normal';text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458299px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + xml:space="preserve">mandatory + + transform="translate(66.443358,-30.17621)"> + transform="translate(-0.04483596,-4.1501987)"> SINTAX + transform="translate(0,10.383895)"> Phylogeneticplacement + transform="translate(0,24.810739)"> + transform="translate(0,23.752406)"> + transform="translate(0,24.281573)"> EPA-NG + + + + RMarkdown + + + + + RefSeq + + + + Greengenes + + Kraken2 + + + VSEARCH + + Cluster OTUs + + + + phyloseq + + + + + rds + diff --git a/docs/output.md b/docs/output.md index 2d479272..f12fc41f 100644 --- a/docs/output.md +++ b/docs/output.md @@ -17,12 +17,14 @@ The directories listed below will be created in the results directory after the The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: - [Input](#input) - Input files +- [Pipeline summary report](#pipeline-summary-report) - Overview of pipeline output - [Preprocessing](#preprocessing) - [FastQC](#fastqc) - Read quality control - [Cutadapt](#cutadapt) - Primer trimming - [MultiQC](#multiqc) - Aggregate report describing 
results - [ASV inferrence with DADA2](#asv-inferrence-with-dada2) - Infer Amplicon Sequence Variants (ASVs) - [Optional ASV filtering](#optional-asv-filtering) - Filter ASVs to optimize downstream analysis + - [VSEARCH cluster](#vsearch-cluster) - Centroid fasta file, filtered asv table, and stats - [Barrnap](#barrnap) - Predict ribosomal RNA sequences and optional filtering - [Length filter](#length-filter) - Optionally, ASV can be filtered by length thresholds - [ITSx](#itsx) - Optionally, the ITS region can be extracted @@ -31,7 +33,8 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [DADA2](#dada2) - Taxonomic classification with DADA2 - [assignSH](#assignsh) - Optionally, a UNITE species hypothesis (SH) can be added to the DADA2 taxonomy - [SINTAX](#sintax) - Taxonomic classification with SINTAX - - [Taxonomic classification with QIIME2](#taxonomic-classification-with-qiime2) - Taxonomic classification with QIIME2 + - [Kraken2](#kraken2) - Taxonomic classification with Kraken2 + - [QIIME2](#qiime2) - Taxonomic classification with QIIME2 - [Phlogenetic placement and taxonomic classification](#phylogenetic-placement-and-taxonomic-classification) - Placing ASVs into a phyloenetic tree - [QIIME2](#qiime2) - Secondary analysis - [Abundance tables](#abundance-tables) - Exported abundance tables @@ -41,6 +44,8 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [Diversity analysis](#diversity-analysis) - High level overview with different diversity indices - [ANCOM](#ancom) - Differential abundance analysis - [PICRUSt2](#picrust2) - Predict the functional potential of a bacterial community +- [SBDI export](#sbdi-export) - Swedish Biodiversity Infrastructure (SBDI) submission file +- [Phyloseq](#phyloseq) - Phyloseq R objects - [Read count report](#read-count-report) - Report of read counts during various steps of the pipeline - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution @@ -52,9 +57,23 @@ Samplesheet, ASV fasta, and metadata file are copied into the results folder. Output files - `input/` - - `*.tsv`: Samplesheet input if specified with `--input`. + - `*`: Samplesheet input if specified with `--input`. - `*.tsv`: Metadata input if specified with `--metadata`. - - `*.fasta|.fna|.fa`: ASV sequence input if specified with `--input`. + - `*`: ASV sequence input if specified with `--input_fasta`. + + + +### Pipeline summary report + +A summary report for most pipeline results in html format produced by [R Markdown](https://rmarkdown.rstudio.com/). The report gives a general overview of the analysis, includes many tables and visualizations, and links to interactive downstream analysis results, if available. + +
+Output files + +- `summary_report/` + - `summary_report.html`: pipeline summary report as standalone HTML file that can be viewed in your web browser. + - `*.svg*`: plots that were produced for (and are included in) the report. + - `versions.yml`: software versions used to produce this report.
@@ -101,7 +120,9 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ -> **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. +:::note +The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. +::: ### ASV inferrence with DADA2 @@ -143,6 +164,21 @@ For binned quality scores in NovaSeq data, monotonicity in the fitted error mode ### Optional ASV filtering +#### VSEARCH cluster + +Optionally, VSEARCH can be used to cluster the denoised ASVs. This will be performed before other filtering steps. +This directory will hold the centroid fasta file, the filtered asv count table (after merging non-centroid counts with their respective centroid counts), and a stats table. + +
+Output files + +- `vsearch_cluster/` + - `ASV_post_clustering_filtered.fna`: Centroid fasta file. + - `ASV_post_clustering_filtered.stats.tsv`: Stats table. + - `ASV_post_clustering_filtered.table.tsv`: ASV table. + +
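Clustering is switched off by default. The sketch below shows how such a run might be launched; the flags `--vsearch_cluster` and `--vsearch_cluster_id` are assumptions based on the tool name and should be checked against the [parameter documentation](https://nf-co.re/ampliseq/parameters) before use.

```bash
# Hedged sketch: enable post-denoising clustering of ASVs with VSEARCH at 97% identity.
# --vsearch_cluster and --vsearch_cluster_id are assumed parameter names; verify them
# in the pipeline parameter documentation.
nextflow run nf-core/ampliseq \
  -r 2.7.0 \
  -profile docker \
  --input "samplesheet.tsv" \
  --FW_primer GTGYCAGCMGCCGCGGTAA \
  --RV_primer GGACTACNVGGGTWTCTAAT \
  --vsearch_cluster \
  --vsearch_cluster_id 0.97 \
  --outdir "./results"
```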
+ #### Barrnap Barrnap predicts the location of ribosomal RNA genes in genomes, here it can be used to discriminate rRNA sequences from potential contamination. It supports bacteria (5S,23S,16S), archaea (5S,5.8S,23S,16S), metazoan mitochondria (12S,16S) and eukaryotes (5S,5.8S,28S,18S). @@ -215,7 +251,7 @@ Codon filtering can be activated by `--filter_codons`. By default, the codons ar ### Taxonomic classification -DADA2 and/or SINTAX can be used to taxonomically classify the ASVs using a choice of supplied databases (specified with `--dada_ref_taxonomy` and/or `--sintax_ref_taxonomy`). By default, DADA2 is used for the classification. The taxonomic classification will be done based on filtered ASV sequences (see above). +Taxonomic classification of ASVs can be performed with a choice of DADA2, SINTAX, Kraken2 or QIIME2 using supplied databases or user supplied databases (see parameter documentation). By default, DADA2 is used for the classification. The taxonomic classification will be done based on filtered ASV sequences (see above). #### DADA2 @@ -288,9 +324,26 @@ Files when using ITSx: -#### Taxonomic classification with QIIME2 +#### Kraken2 -Taxonomic classification with QIIME2 is typically similar to DADA2 classifications. However, both options are available. When taxonomic classification with DADA2 and QIIME2 is performed, DADA2 classification takes precedence over QIIME2 classifications for all downstream analysis. Taxonomic classification by SINTAX or phylogenetic placement superseeds DADA2 and QIIME2 classification. +Kraken2 taxonomically classifies ASVs using exact k-mer matches. Kraken2 matches each k-mer within a query sequence to the lowest common ancestor (LCA) of all genomes/sequences containing the given k-mer. + +
+Output files + +- `kraken2` + - `ASV_tax.*.kraken2.report.txt`: Kraken2 report file, i.e. taxonomic classification of ASVs (shows number of ASVs matched at any given taxonomic level) + - `ASV_tax.*.kraken2.keys.tsv`: Tab-separated table with extracted information from the report file + - `ASV_tax.*.kraken2.classifiedreads.txt`: Classified sequence file, i.e. taxonomic classification (leaf) per ASV + - `ASV_tax.*.kraken2.complete.tsv`: Tab-separated table with all extracted and parsed information from report and classified sequence file for each ASV + - `ASV_tax.*.kraken2.tsv`: Tab-separated table with chosen taxonomic ranks per ASV + - `ASV_tax.*.kraken2.into-qiime2.tsv`: Table with two tab-separated columns, `ASV_ID` and aggregated `taxonomy` (semicolon separated string), input to QIIME2 + +
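As a rough sketch of how this classifier might be selected: `--kraken2_ref_tax_custom` and `--kraken2_assign_taxlevels` are pipeline parameters (a custom database requires the rank list), while the key-based `--kraken2_ref_taxonomy` flag, the `silva` key, and the example rank string are assumptions taken from the database table in the usage documentation; check the parameter documentation for the exact names and values.

```bash
# Hedged sketch: classify ASVs with Kraken2 against a pre-configured database key
# (--kraken2_ref_taxonomy and the key "silva" are assumptions; see the parameter docs).
nextflow run nf-core/ampliseq -r 2.7.0 -profile docker \
  --input "samplesheet.tsv" \
  --FW_primer GTGYCAGCMGCCGCGGTAA \
  --RV_primer GGACTACNVGGGTWTCTAAT \
  --kraken2_ref_taxonomy silva \
  --outdir "./results"

# Hedged sketch: with a user-supplied database, taxonomic ranks must be given explicitly.
# The rank string below is illustrative only.
nextflow run nf-core/ampliseq -r 2.7.0 -profile docker \
  --input "samplesheet.tsv" \
  --FW_primer GTGYCAGCMGCCGCGGTAA \
  --RV_primer GGACTACNVGGGTWTCTAAT \
  --kraken2_ref_tax_custom "/path/to/kraken2_db/" \
  --kraken2_assign_taxlevels "Domain,Phylum,Class,Order,Family,Genus,Species" \
  --outdir "./results"
```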
+ +#### QIIME2 + +Taxonomic classification with QIIME2 is based on a classifier trained on sequences extracted with the primers.
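Because the reference is trimmed to the amplicon region, the primer sequences must be supplied for this classifier. A minimal sketch, assuming the pre-configured `silva` key is valid for `--qiime_ref_taxonomy`; a precomputed classifier can alternatively be passed with `--classifier`.

```bash
# Hedged sketch: QIIME2-based classification; primers are required so the reference
# database can be cut to the amplicon region. The "silva" key is an assumption,
# check the parameter documentation for the available --qiime_ref_taxonomy keys.
nextflow run nf-core/ampliseq -r 2.7.0 -profile docker \
  --input "samplesheet.tsv" \
  --FW_primer GTGYCAGCMGCCGCGGTAA \
  --RV_primer GGACTACNVGGGTWTCTAAT \
  --qiime_ref_taxonomy silva \
  --outdir "./results"
```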
Output files @@ -495,7 +548,9 @@ PICRUSt2 is preferentially applied to filtered data by QIIME2 but will use DADA2
-> **NB:** Quantifications are not normalized yet, they can be normalized e.g. by the total sum per sample. +:::note +Quantifications are not normalized yet, they can be normalized e.g. by the total sum per sample. +::: ### SBDI export @@ -510,7 +565,7 @@ Most of the fields in the template will not be populated by the export process, Output files - `SBDI/` - - `annotation.tsv`: SBDI specific output for taxonomi reannotation, not used in submission to SBDI. + - `annotation.tsv`: SBDI specific output for taxonomic reannotation, not used in submission to SBDI. - `asv-table.tsv`: asv-table tab of template. - `emof.tsv`: emof tab of template. - `event.tsv`: event tab of template. @@ -518,9 +573,21 @@ Most of the fields in the template will not be populated by the export process, +### Phyloseq + +This directory will hold phyloseq objects for each taxonomy table produced by this pipeline. The objects will contain an ASV abundance table and a taxonomy table. If the pipeline is provided with metadata, that metadata will also be included in the phyloseq object. A phylogenetic tree will also be included if the pipeline produces a tree. + +
+Output files + +- `phyloseq/` + - `_phyloseq.rds`: Phyloseq R object. + +
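For downstream work in R, such an object can simply be read back in. A minimal sketch, assuming the phyloseq package is installed locally and that a DADA2-derived object named `dada2_phyloseq.rds` exists; the actual file name prefix depends on which taxonomy tables were produced.

```bash
# Hedged sketch: inspect a phyloseq object produced by the pipeline from the command line.
# The file name "dada2_phyloseq.rds" is an assumption; use whatever *_phyloseq.rds is present.
Rscript -e 'library(phyloseq); ps <- readRDS("results/phyloseq/dada2_phyloseq.rds"); print(ps)'
```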
+ ## Read count report -This report includes information on how many reads per sample passed each pipeline step in which a loss can occur. Specifically, how many read pairs entered cutadapt, were reverse complemented, passed trimming; how many read pairs entered DADA2, were denoised, merged and non-chimeric; and how many counts were lost during excluding unwanted tax and removing low abundance/prevalence sequences in QIIME2. +This report includes information on how many reads per sample passed each pipeline step in which a loss can occur. Specifically, how many read pairs entered cutadapt, were reverse complemented, passed trimming; how many read pairs entered DADA2, were denoised, merged and non-chimeric; and how many counts were lost during excluding unwanted taxa and removing low abundance/prevalence sequences in QIIME2.
Output files @@ -537,5 +604,7 @@ This report includes information on how many reads per sample passed each pipeli - `pipeline_info/` - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameter's are used when running the pipeline. + - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`. + - Parameters used by the pipeline run: `params.json`.
diff --git a/docs/usage.md b/docs/usage.md index 73930402..74e2dcfc 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -10,9 +10,10 @@ - [Quick start](#quick-start) - [Setting parameters in a file](#setting-parameters-in-a-file) - [Input specifications](#input-specifications) - - [Direct FASTQ input](#direct-fastq-input) - [Samplesheet input](#samplesheet-input) - [ASV/OTU fasta input](#asvotu-fasta-input) + - [Direct FASTQ input](#direct-fastq-input) + - [Taxonomic classification](#taxonomic-classification) - [Metadata](#metadata) - [Updating the pipeline](#updating-the-pipeline) - [Reproducibility](#reproducibility) @@ -35,16 +36,16 @@ The typical command for running the pipeline is as follows: ```bash nextflow run nf-core/ampliseq \ - -r 2.3.2 \ + -r 2.7.0 \ -profile singularity \ - --input "data" \ + --input "samplesheet.tsv" \ --FW_primer GTGYCAGCMGCCGCGGTAA \ --RV_primer GGACTACNVGGGTWTCTAAT \ - --metadata "data/Metadata.tsv" + --metadata "data/Metadata.tsv" \ --outdir "./results" ``` -In this example, `--input` is the [Direct FASTQ input](#direct-fastq-input), other options are [Samplesheet input](#samplesheet-input) and [ASV/OTU fasta input](#asvotu-fasta-input). For more details on metadata, see [Metadata](#metadata). For [Reproducibility](#reproducibility), specify the version to run using `-r` (= release, here: 2.3.2). See the [nf-core/ampliseq website documentation](https://nf-co.re/ampliseq/parameters) for more information about pipeline specific parameters. +In this example, `--input` is the [Samplesheet input](#samplesheet-input), other options are [Direct FASTQ input](#direct-fastq-input) and [ASV/OTU fasta input](#asvotu-fasta-input). For more details on metadata, see [Metadata](#metadata). For [Reproducibility](#reproducibility), specify the version to run using `-r` (= release, e.g. 2.7.0, please use the most recent release). See the [nf-core/ampliseq website documentation](https://nf-co.re/ampliseq/parameters) for more information about pipeline specific parameters. It is possible to not provide primer sequences (`--FW_primer` & `--RV_primer`) and skip primer trimming using `--skip_cutadapt`, but this is only for data that indeed does not contain any PCR primers in their sequences. Also, metadata (`--metadata`) isnt required, but aids downstream analysis. @@ -59,7 +60,9 @@ work # Directory containing the nextflow working files # Other nextflow hidden files, eg. history of pipeline runs and old logs. ``` -> **NB:** If the data originates from multiple sequencing runs, the error profile of each of those sequencing runs needs to be considered separately. Using the `run` column in the samplesheet input or adding `--multiple_sequencing_runs` for Direct FASTQ input will separate certain processes by the sequencing run. Please see the following example: +:::note +If the data originates from multiple sequencing runs, the error profile of each of those sequencing runs needs to be considered separately. Using the `run` column in the samplesheet input or adding `--multiple_sequencing_runs` for direct FASTQ input will separate certain processes by the sequencing run. Please see the following example: +:::

nf-core/ampliseq workflow overview with --multiple_sequencing_runs @@ -71,8 +74,11 @@ If you wish to repeatedly use the same parameters for multiple runs, rather than Pipeline settings can be provided in a `yaml` or `json` file via `-params-file `. -> ⚠️ Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args). -> The above pipeline run specified with a params file in yaml format: +:::warning +Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args). +::: + +The above pipeline run specified with a params file in yaml format: ```bash nextflow run nf-core/ampliseq -profile docker -params-file params.yaml @@ -81,27 +87,83 @@ nextflow run nf-core/ampliseq -profile docker -params-file params.yaml with `params.yaml` containing: ```yaml -input: "data" +input: "samplesheet.tsv" FW_primer: "GTGYCAGCMGCCGCGGTAA" RV_primer: "GGACTACNVGGGTWTCTAAT" metadata: "data/Metadata.tsv" outdir: "./results" +<...> ``` You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). ### Input specifications -The input data can be passed to nf-core/ampliseq in three possible ways using the `--input` parameter, either a folder containing zipped FastQ files, a tab-separated samplesheet, or a fasta file to be taxonomically classified. +The input data can be passed to nf-core/ampliseq in three possible ways using the parameters `--input`, `--input_fasta`, or `--input_folder`. +The three parameters and input types are mutually exclusive. + +- [Samplesheet input](#samplesheet-input) using `--input`: Samplesheet tab-separated, comma-separated, or in YAML format +- [ASV/OTU fasta input](#asvotu-fasta-input) using `--input_fasta`: Fasta file with sequences to be taxonomically classified +- [Direct FASTQ input](#direct-fastq-input) using `--input_folder`: Folder containing zipped FastQ files. Optionally, a metadata sheet can be specified for downstream analysis. 
+#### Samplesheet input + +The sample sheet file can be tab-separated (.tsv), comma-separated (.csv), or in YAML format (.yml/.yaml) and can have two to four columns/entries with the following headers: + +| Column | Necessity | Description | +| ------------ | --------- | ----------------------------------------------------------------------------- | +| sampleID | required | Unique sample identifiers | +| forwardReads | required | Paths to (forward) reads zipped FastQ files | +| reverseReads | optional | Paths to reverse reads zipped FastQ files, required if the data is paired-end | +| run | optional | If the data was produced by multiple sequencing runs, any string | + +```bash +--input 'path/to/samplesheet.tsv' +``` + +For example, the tab-separated samplesheet may contain: + +| sampleID | forwardReads | reverseReads | run | +| -------- | ------------------------- | ------------------------- | --- | +| sample1 | ./data/S1_R1_001.fastq.gz | ./data/S1_R2_001.fastq.gz | A | +| sample2 | ./data/S2_fw.fastq.gz | ./data/S2_rv.fastq.gz | A | +| sample3 | ./S4x.fastq.gz | ./S4y.fastq.gz | B | +| sample4 | ./a.fastq.gz | ./b.fastq.gz | B | + +Please note the following requirements: + +- 2 to 4 columns/entries +- File extensions `.tsv`,`.csv`,`.yml`,`.yaml` specify the file type, otherwise file type will be derived from content, if possible +- Must contain the header `sampleID` and `forwardReads` +- May contain the header `reverseReads` and `run` +- Sample IDs must be unique +- Sample IDs must start with a letter +- Sample IDs can only contain letters, numbers or underscores +- FastQ files must be compressed (`.fastq.gz`, `.fq.gz`) +- Within one samplesheet, only one type of raw data should be specified (same amplicon & sequencing method) + +An [example samplesheet](../assets/samplesheet.tsv) has been provided with the pipeline. + +To avoid producing a sample sheet, [Direct FASTQ input](#direct-fastq-input) may be used instead. + +#### ASV/OTU fasta input + +To taxonomically classify pre-computed sequence files, a fasta format file with sequences may be provided. +Most of the steps of the pipeline will be skipped, but ITSx & Barrnap & length filtering can be applied before taxonomic classification. +The sequence header line may contain a description, that will be kept as part of the sequence name. However, tabs will be changed into spaces. + +```bash +--input_fasta 'path/to/amplicon_sequences.fasta' +``` + #### Direct FASTQ input -The easiest way is to specify directly the path to the folder that contains your input FASTQ files. For example: +An easy way to input sequencing data to the pipeline is to specify directly the path to the folder that contains your input FASTQ files. For example: ```bash ---input 'path/to/data/' +--input_folder 'path/to/data/' ``` File names must follow a specific pattern, default is `/*_R{1,2}_001.fastq.gz`, but this can be adjusted with `--extension`. @@ -146,59 +208,40 @@ Please note the following additional requirements: - Sample identifiers are extracted from file names, i.e. the string before the first underscore `_`, these must be unique (also across sequencing runs) - If your data is scattered, produce a sample sheet -#### Samplesheet input +### Taxonomic classification -The sample sheet file is an alternative way to provide input reads, it must be a tab-separated file ending with `.tsv` that must have two to four columns with the following headers: +Taxonomic classification of ASVs can be performed with tools DADA2, SINTAX, Kraken2 or QIIME2. 
Multiple taxonomic reference databases are pre-configured for those tools, but user supplied databases are also supported for some tools. Alternatively (or in addition), phylogenetic placement can be used to extract taxonomic classifications. -| Column | Necessity | Description | -| ------------ | --------- | ----------------------------------------------------------------------------- | -| sampleID | required | Unique sample identifiers | -| forwardReads | required | Paths to (forward) reads zipped FastQ files | -| reverseReads | optional | Paths to reverse reads zipped FastQ files, required if the data is paired-end | -| run | optional | If the data was produced by multiple sequencing runs, any string | +In case multiple tools for taxonomic classification are executed in one pipeline run, only the taxonomic classification result of one tool is forwarded to downstream analysis with QIIME2. The priority is `phylogenetic placement` > `DADA2` > `SINTAX` > `Kraken2` > `QIIME2`. -```bash ---input 'path/to/samplesheet.tsv' -``` +Default setting for taxonomic classification is DADA2 with the SILVA reference taxonomy database. -For example, the samplesheet may contain: +Pre-configured reference taxonomy databases are: -| sampleID | forwardReads | reverseReads | run | -| -------- | ------------------------- | ------------------------- | --- | -| sample1 | ./data/S1_R1_001.fastq.gz | ./data/S1_R2_001.fastq.gz | A | -| sample2 | ./data/S2_fw.fastq.gz | ./data/S2_rv.fastq.gz | A | -| sample3 | ./S4x.fastq.gz | ./S4y.fastq.gz | B | -| sample4 | ./a.fastq.gz | ./b.fastq.gz | B | - -Please note the following requirements: - -- 2 to 4 tab-separated columns -- Valid file extension: `.tsv` -- Must contain the header `sampleID` and `forwardReads` -- May contain the header `reverseReads` and `run` -- Sample IDs must be unique -- Sample IDs must not contain a dot `.` -- Sample IDs may not start with a number -- FastQ files must be compressed (`.fastq.gz`, `.fq.gz`) -- Within one samplesheet, only one type of raw data should be specified (same amplicon & sequencing method) +| Database key | DADA2 | SINTAX | Kraken2 | QIIME2 | Target genes | +| ------------ | ----- | ------ | ------- | ------ | --------------------------------------------- | +| silva | + | - | + | + | 16S rRNA | +| gtdb | + | - | - | - | 16S rRNA | +| sbdi-gtdb | + | - | - | - | 16S rRNA | +| rdp | + | - | + | - | 16S rRNA | +| greengenes | - | - | + | (+)¹ | 16S rRNA | +| pr2 | + | - | - | - | 18S rRNA | +| unite-fungi | + | + | - | + | eukaryotic nuclear ribosomal ITS region | +| unite-alleuk | + | + | - | + | eukaryotic nuclear ribosomal ITS region | +| coidb | + | + | - | - | eukaryotic Cytochrome Oxidase I (COI) | +| midori2-co1 | + | - | - | - | eukaryotic Cytochrome Oxidase I (COI) | +| standard | - | - | + | - | any in genomes of archaea, bacteria, viruses² | -An [example samplesheet](../assets/samplesheet.tsv) has been provided with the pipeline. +¹: de-replicated at 85%, only for testing purposes; ²: quality of results might vary -> **Please note:** All characters other than letters, numbers and underline in Sample IDs will be converted to dots `.`. Avoid those conversions, because they might make summary files not merging correctly and will fail to match to metadata (which can be adjusted though). +Special features of taxonomic classification tools: -#### ASV/OTU fasta input - -When pointing at a file ending with `.fasta`, `.fna` or `.fa`, the containing ASV/OTU sequences will be taxonomically classified. 
-Most of the steps of the pipeline will be skipped, but ITSx & Barrnap & length filtering can be applied before taxonomic classification. -The sequence header line may contain a description, that will be kept as part of the sequence name. However, tabs will be changed into spaces. - -```bash ---input 'path/to/amplicon_sequences.fasta' -``` - -Please note the following requirements: +- DADA2's reference taxonomy databases **can** have regions matching the amplicon extracted with primer sequences. +- Kraken2 is very fast and can use large databases containing complete genomes. +- QIIME2's reference taxonomy databases will have regions matching the amplicon extracted with primer sequences. +- DADA2, Kraken2, and QIIME2 have specific parameters to accept custom databases (but theoretically possible with all classifiers) -- Valid file extensions: `.fasta`, `.fna` or `.fa` +Parameter guidance is given in [nf-core/ampliseq website parameter documentation](https://nf-co.re/ampliseq/parameters/#taxonomic-database). ### Metadata @@ -246,11 +289,15 @@ This version number will be logged in reports when you run the pipeline, so that To further assist in reproducbility, you can use share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter. -> 💡 If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles. +:::tip +If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles. +::: ## Core Nextflow arguments -> **NB:** These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). +:::note +These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). +::: ### `-profile` @@ -258,7 +305,9 @@ Use this parameter to choose a configuration profile. Profiles can give configur Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below. -> We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. +:::info +We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. +::: The pipeline also dynamically loads configurations from [https://github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time. For more information and to see if your system is available in these configs please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation). diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy deleted file mode 100755 index 9b34804d..00000000 --- a/lib/NfcoreSchema.groovy +++ /dev/null @@ -1,530 +0,0 @@ -// -// This file holds several functions used to perform JSON parameter validation, help and summary rendering for the nf-core pipeline template. 
-// - -import nextflow.Nextflow -import org.everit.json.schema.Schema -import org.everit.json.schema.loader.SchemaLoader -import org.everit.json.schema.ValidationException -import org.json.JSONObject -import org.json.JSONTokener -import org.json.JSONArray -import groovy.json.JsonSlurper -import groovy.json.JsonBuilder - -class NfcoreSchema { - - // - // Resolve Schema path relative to main workflow directory - // - public static String getSchemaPath(workflow, schema_filename='nextflow_schema.json') { - return "${workflow.projectDir}/${schema_filename}" - } - - // - // Function to loop over all parameters defined in schema and check - // whether the given parameters adhere to the specifications - // - /* groovylint-disable-next-line UnusedPrivateMethodParameter */ - public static void validateParameters(workflow, params, log, schema_filename='nextflow_schema.json') { - def has_error = false - //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - // Check for nextflow core params and unexpected params - def json = new File(getSchemaPath(workflow, schema_filename=schema_filename)).text - def Map schemaParams = (Map) new JsonSlurper().parseText(json).get('definitions') - def nf_params = [ - // Options for base `nextflow` command - 'bg', - 'c', - 'C', - 'config', - 'd', - 'D', - 'dockerize', - 'h', - 'log', - 'q', - 'quiet', - 'syslog', - 'v', - - // Options for `nextflow run` command - 'ansi', - 'ansi-log', - 'bg', - 'bucket-dir', - 'c', - 'cache', - 'config', - 'dsl2', - 'dump-channels', - 'dump-hashes', - 'E', - 'entry', - 'latest', - 'lib', - 'main-script', - 'N', - 'name', - 'offline', - 'params-file', - 'pi', - 'plugins', - 'poll-interval', - 'pool-size', - 'profile', - 'ps', - 'qs', - 'queue-size', - 'r', - 'resume', - 'revision', - 'stdin', - 'stub', - 'stub-run', - 'test', - 'w', - 'with-apptainer', - 'with-charliecloud', - 'with-conda', - 'with-dag', - 'with-docker', - 'with-mpi', - 'with-notification', - 'with-podman', - 'with-report', - 'with-singularity', - 'with-timeline', - 'with-tower', - 'with-trace', - 'with-weblog', - 'without-docker', - 'without-podman', - 'work-dir' - ] - def unexpectedParams = [] - - // Collect expected parameters from the schema - def expectedParams = [] - def enums = [:] - for (group in schemaParams) { - for (p in group.value['properties']) { - expectedParams.push(p.key) - if (group.value['properties'][p.key].containsKey('enum')) { - enums[p.key] = group.value['properties'][p.key]['enum'] - } - } - } - - for (specifiedParam in params.keySet()) { - // nextflow params - if (nf_params.contains(specifiedParam)) { - log.error "ERROR: You used a core Nextflow option with two hyphens: '--${specifiedParam}'. 
Please resubmit with '-${specifiedParam}'" - has_error = true - } - // unexpected params - def params_ignore = params.schema_ignore_params.split(',') + 'schema_ignore_params' - def expectedParamsLowerCase = expectedParams.collect{ it.replace("-", "").toLowerCase() } - def specifiedParamLowerCase = specifiedParam.replace("-", "").toLowerCase() - def isCamelCaseBug = (specifiedParam.contains("-") && !expectedParams.contains(specifiedParam) && expectedParamsLowerCase.contains(specifiedParamLowerCase)) - if (!expectedParams.contains(specifiedParam) && !params_ignore.contains(specifiedParam) && !isCamelCaseBug) { - // Temporarily remove camelCase/camel-case params #1035 - def unexpectedParamsLowerCase = unexpectedParams.collect{ it.replace("-", "").toLowerCase()} - if (!unexpectedParamsLowerCase.contains(specifiedParamLowerCase)){ - unexpectedParams.push(specifiedParam) - } - } - } - - //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - // Validate parameters against the schema - InputStream input_stream = new File(getSchemaPath(workflow, schema_filename=schema_filename)).newInputStream() - JSONObject raw_schema = new JSONObject(new JSONTokener(input_stream)) - - // Remove anything that's in params.schema_ignore_params - raw_schema = removeIgnoredParams(raw_schema, params) - - Schema schema = SchemaLoader.load(raw_schema) - - // Clean the parameters - def cleanedParams = cleanParameters(params) - - // Convert to JSONObject - def jsonParams = new JsonBuilder(cleanedParams) - JSONObject params_json = new JSONObject(jsonParams.toString()) - - // Validate - try { - schema.validate(params_json) - } catch (ValidationException e) { - println '' - log.error 'ERROR: Validation of pipeline parameters failed!' - JSONObject exceptionJSON = e.toJSON() - printExceptions(exceptionJSON, params_json, log, enums) - println '' - has_error = true - } - - // Check for unexpected parameters - if (unexpectedParams.size() > 0) { - Map colors = NfcoreTemplate.logColours(params.monochrome_logs) - println '' - def warn_msg = 'Found unexpected parameters:' - for (unexpectedParam in unexpectedParams) { - warn_msg = warn_msg + "\n* --${unexpectedParam}: ${params[unexpectedParam].toString()}" - } - log.warn warn_msg - log.info "- ${colors.dim}Ignore this warning: params.schema_ignore_params = \"${unexpectedParams.join(',')}\" ${colors.reset}" - println '' - } - - if (has_error) { - Nextflow.error('Exiting!') - } - } - - // - // Beautify parameters for --help - // - public static String paramsHelp(workflow, params, command, schema_filename='nextflow_schema.json') { - Map colors = NfcoreTemplate.logColours(params.monochrome_logs) - Integer num_hidden = 0 - String output = '' - output += 'Typical pipeline command:\n\n' - output += " ${colors.cyan}${command}${colors.reset}\n\n" - Map params_map = paramsLoad(getSchemaPath(workflow, schema_filename=schema_filename)) - Integer max_chars = paramsMaxChars(params_map) + 1 - Integer desc_indent = max_chars + 14 - Integer dec_linewidth = 160 - desc_indent - for (group in params_map.keySet()) { - Integer num_params = 0 - String group_output = colors.underlined + colors.bold + group + colors.reset + '\n' - def group_params = params_map.get(group) // This gets the parameters of that particular group - for (param in group_params.keySet()) { - if (group_params.get(param).hidden && !params.show_hidden_params) { - num_hidden += 1 - continue; - } - def type = '[' + group_params.get(param).type + ']' - def description = group_params.get(param).description - def 
defaultValue = group_params.get(param).default != null ? " [default: " + group_params.get(param).default.toString() + "]" : '' - def description_default = description + colors.dim + defaultValue + colors.reset - // Wrap long description texts - // Loosely based on https://dzone.com/articles/groovy-plain-text-word-wrap - if (description_default.length() > dec_linewidth){ - List olines = [] - String oline = "" // " " * indent - description_default.split(" ").each() { wrd -> - if ((oline.size() + wrd.size()) <= dec_linewidth) { - oline += wrd + " " - } else { - olines += oline - oline = wrd + " " - } - } - olines += oline - description_default = olines.join("\n" + " " * desc_indent) - } - group_output += " --" + param.padRight(max_chars) + colors.dim + type.padRight(10) + colors.reset + description_default + '\n' - num_params += 1 - } - group_output += '\n' - if (num_params > 0){ - output += group_output - } - } - if (num_hidden > 0){ - output += colors.dim + "!! Hiding $num_hidden params, use --show_hidden_params to show them !!\n" + colors.reset - } - output += NfcoreTemplate.dashedLine(params.monochrome_logs) - return output - } - - // - // Groovy Map summarising parameters/workflow options used by the pipeline - // - public static LinkedHashMap paramsSummaryMap(workflow, params, schema_filename='nextflow_schema.json') { - // Get a selection of core Nextflow workflow options - def Map workflow_summary = [:] - if (workflow.revision) { - workflow_summary['revision'] = workflow.revision - } - workflow_summary['runName'] = workflow.runName - if (workflow.containerEngine) { - workflow_summary['containerEngine'] = workflow.containerEngine - } - if (workflow.container) { - workflow_summary['container'] = workflow.container - } - workflow_summary['launchDir'] = workflow.launchDir - workflow_summary['workDir'] = workflow.workDir - workflow_summary['projectDir'] = workflow.projectDir - workflow_summary['userName'] = workflow.userName - workflow_summary['profile'] = workflow.profile - workflow_summary['configFiles'] = workflow.configFiles.join(', ') - - // Get pipeline parameters defined in JSON Schema - def Map params_summary = [:] - def params_map = paramsLoad(getSchemaPath(workflow, schema_filename=schema_filename)) - for (group in params_map.keySet()) { - def sub_params = new LinkedHashMap() - def group_params = params_map.get(group) // This gets the parameters of that particular group - for (param in group_params.keySet()) { - if (params.containsKey(param)) { - def params_value = params.get(param) - def schema_value = group_params.get(param).default - def param_type = group_params.get(param).type - if (schema_value != null) { - if (param_type == 'string') { - if (schema_value.contains('$projectDir') || schema_value.contains('${projectDir}')) { - def sub_string = schema_value.replace('\$projectDir', '') - sub_string = sub_string.replace('\${projectDir}', '') - if (params_value.contains(sub_string)) { - schema_value = params_value - } - } - if (schema_value.contains('$params.outdir') || schema_value.contains('${params.outdir}')) { - def sub_string = schema_value.replace('\$params.outdir', '') - sub_string = sub_string.replace('\${params.outdir}', '') - if ("${params.outdir}${sub_string}" == params_value) { - schema_value = params_value - } - } - } - } - - // We have a default in the schema, and this isn't it - if (schema_value != null && params_value != schema_value) { - sub_params.put(param, params_value) - } - // No default in the schema, and this isn't empty - else if (schema_value == null && 
params_value != "" && params_value != null && params_value != false) { - sub_params.put(param, params_value) - } - } - } - params_summary.put(group, sub_params) - } - return [ 'Core Nextflow options' : workflow_summary ] << params_summary - } - - // - // Beautify parameters for summary and return as string - // - public static String paramsSummaryLog(workflow, params) { - Map colors = NfcoreTemplate.logColours(params.monochrome_logs) - String output = '' - def params_map = paramsSummaryMap(workflow, params) - def max_chars = paramsMaxChars(params_map) - for (group in params_map.keySet()) { - def group_params = params_map.get(group) // This gets the parameters of that particular group - if (group_params) { - output += colors.bold + group + colors.reset + '\n' - for (param in group_params.keySet()) { - output += " " + colors.blue + param.padRight(max_chars) + ": " + colors.green + group_params.get(param) + colors.reset + '\n' - } - output += '\n' - } - } - output += "!! Only displaying parameters that differ from the pipeline defaults !!\n" - output += NfcoreTemplate.dashedLine(params.monochrome_logs) - return output - } - - // - // Loop over nested exceptions and print the causingException - // - private static void printExceptions(ex_json, params_json, log, enums, limit=5) { - def causingExceptions = ex_json['causingExceptions'] - if (causingExceptions.length() == 0) { - def m = ex_json['message'] =~ /required key \[([^\]]+)\] not found/ - // Missing required param - if (m.matches()) { - log.error "* Missing required parameter: --${m[0][1]}" - } - // Other base-level error - else if (ex_json['pointerToViolation'] == '#') { - log.error "* ${ex_json['message']}" - } - // Error with specific param - else { - def param = ex_json['pointerToViolation'] - ~/^#\// - def param_val = params_json[param].toString() - if (enums.containsKey(param)) { - def error_msg = "* --${param}: '${param_val}' is not a valid choice (Available choices" - if (enums[param].size() > limit) { - log.error "${error_msg} (${limit} of ${enums[param].size()}): ${enums[param][0..limit-1].join(', ')}, ... 
)" - } else { - log.error "${error_msg}: ${enums[param].join(', ')})" - } - } else { - log.error "* --${param}: ${ex_json['message']} (${param_val})" - } - } - } - for (ex in causingExceptions) { - printExceptions(ex, params_json, log, enums) - } - } - - // - // Remove an element from a JSONArray - // - private static JSONArray removeElement(json_array, element) { - def list = [] - int len = json_array.length() - for (int i=0;i - if(raw_schema.keySet().contains('definitions')){ - raw_schema.definitions.each { definition -> - for (key in definition.keySet()){ - if (definition[key].get("properties").keySet().contains(ignore_param)){ - // Remove the param to ignore - definition[key].get("properties").remove(ignore_param) - // If the param was required, change this - if (definition[key].has("required")) { - def cleaned_required = removeElement(definition[key].required, ignore_param) - definition[key].put("required", cleaned_required) - } - } - } - } - } - if(raw_schema.keySet().contains('properties') && raw_schema.get('properties').keySet().contains(ignore_param)) { - raw_schema.get("properties").remove(ignore_param) - } - if(raw_schema.keySet().contains('required') && raw_schema.required.contains(ignore_param)) { - def cleaned_required = removeElement(raw_schema.required, ignore_param) - raw_schema.put("required", cleaned_required) - } - } - return raw_schema - } - - // - // Clean and check parameters relative to Nextflow native classes - // - private static Map cleanParameters(params) { - def new_params = params.getClass().newInstance(params) - for (p in params) { - // remove anything evaluating to false - if (!p['value']) { - new_params.remove(p.key) - } - // Cast MemoryUnit to String - if (p['value'].getClass() == nextflow.util.MemoryUnit) { - new_params.replace(p.key, p['value'].toString()) - } - // Cast Duration to String - if (p['value'].getClass() == nextflow.util.Duration) { - new_params.replace(p.key, p['value'].toString().replaceFirst(/d(?!\S)/, "day")) - } - // Cast LinkedHashMap to String - if (p['value'].getClass() == LinkedHashMap) { - new_params.replace(p.key, p['value'].toString()) - } - } - return new_params - } - - // - // This function tries to read a JSON params file - // - private static LinkedHashMap paramsLoad(String json_schema) { - def params_map = new LinkedHashMap() - try { - params_map = paramsRead(json_schema) - } catch (Exception e) { - println "Could not read parameters settings from JSON. $e" - params_map = new LinkedHashMap() - } - return params_map - } - - // - // Method to actually read in JSON file using Groovy. - // Group (as Key), values are all parameters - // - Parameter1 as Key, Description as Value - // - Parameter2 as Key, Description as Value - // .... 
- // Group - // - - private static LinkedHashMap paramsRead(String json_schema) throws Exception { - def json = new File(json_schema).text - def Map schema_definitions = (Map) new JsonSlurper().parseText(json).get('definitions') - def Map schema_properties = (Map) new JsonSlurper().parseText(json).get('properties') - /* Tree looks like this in nf-core schema - * definitions <- this is what the first get('definitions') gets us - group 1 - title - description - properties - parameter 1 - type - description - parameter 2 - type - description - group 2 - title - description - properties - parameter 1 - type - description - * properties <- parameters can also be ungrouped, outside of definitions - parameter 1 - type - description - */ - - // Grouped params - def params_map = new LinkedHashMap() - schema_definitions.each { key, val -> - def Map group = schema_definitions."$key".properties // Gets the property object of the group - def title = schema_definitions."$key".title - def sub_params = new LinkedHashMap() - group.each { innerkey, value -> - sub_params.put(innerkey, value) - } - params_map.put(title, sub_params) - } - - // Ungrouped params - def ungrouped_params = new LinkedHashMap() - schema_properties.each { innerkey, value -> - ungrouped_params.put(innerkey, value) - } - params_map.put("Other parameters", ungrouped_params) - - return params_map - } - - // - // Get maximum number of characters across all parameter names - // - private static Integer paramsMaxChars(params_map) { - Integer max_chars = 0 - for (group in params_map.keySet()) { - def group_params = params_map.get(group) // This gets the parameters of that particular group - for (param in group_params.keySet()) { - if (param.size() > max_chars) { - max_chars = param.size() - } - } - } - return max_chars - } -} diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy index 25a0a74a..01b8653d 100755 --- a/lib/NfcoreTemplate.groovy +++ b/lib/NfcoreTemplate.groovy @@ -3,6 +3,7 @@ // import org.yaml.snakeyaml.Yaml +import groovy.json.JsonOutput class NfcoreTemplate { @@ -128,7 +129,7 @@ class NfcoreTemplate { def email_html = html_template.toString() // Render the sendmail template - def max_multiqc_email_size = params.max_multiqc_email_size as nextflow.util.MemoryUnit + def max_multiqc_email_size = (params.containsKey('max_multiqc_email_size') ? 
params.max_multiqc_email_size : 0) as nextflow.util.MemoryUnit def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: max_multiqc_email_size.toBytes() ] def sf = new File("$projectDir/assets/sendmail_template.txt") def sendmail_template = engine.createTemplate(sf).make(smail_fields) @@ -222,6 +223,21 @@ class NfcoreTemplate { } } + // + // Dump pipeline parameters in a json file + // + public static void dump_parameters(workflow, params) { + def output_d = new File("${params.outdir}/pipeline_info/") + if (!output_d.exists()) { + output_d.mkdirs() + } + + def timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') + def output_pf = new File(output_d, "params_${timestamp}.json") + def jsonStr = JsonOutput.toJson(params) + output_pf.text = JsonOutput.prettyPrint(jsonStr) + } + // // Print pipeline summary on completion // diff --git a/lib/WorkflowAmpliseq.groovy b/lib/WorkflowAmpliseq.groovy index 47454fd0..5e103911 100755 --- a/lib/WorkflowAmpliseq.groovy +++ b/lib/WorkflowAmpliseq.groovy @@ -11,6 +11,14 @@ class WorkflowAmpliseq { // Check and validate parameters // public static void initialise(params, log) { + if ( !params.input && !params.input_fasta && !params.input_folder ) { + Nextflow.error("Missing input declaration: One of `--input`, `--input_fasta`, `--input_folder` is required.") + } + + if ( !params.input_fasta && (!params.FW_primer || !params.RV_primer) && !params.skip_cutadapt ) { + Nextflow.error("Incompatible parameters: `--FW_primer` and `--RV_primer` are required for primer trimming. If primer trimming is not needed, use `--skip_cutadapt`.") + } + if ( params.pacbio || params.iontorrent || params.single_end ) { if (params.trunclenr) { log.warn "Unused parameter: `--trunclenr` is ignored because the data is single end." } } else if (params.trunclenf && !params.trunclenr) { @@ -60,18 +68,20 @@ class WorkflowAmpliseq { } } - if (params.dada_assign_taxlevels && params.sbdiexport) { + if (params.dada_assign_taxlevels && params.sbdiexport && !params.sintax_ref_taxonomy) { Nextflow.error("Incompatible parameters: `--sbdiexport` expects specific taxonomics ranks (default) and therefore excludes modifying those using `--dada_assign_taxlevels`.") } - if (params.skip_dada_addspecies && params.sbdiexport) { - Nextflow.error("Incompatible parameters: `--sbdiexport` expects species annotation and therefore excludes `--skip_dada_addspecies`.") - } - if (params.skip_taxonomy && params.sbdiexport) { Nextflow.error("Incompatible parameters: `--sbdiexport` expects taxa annotation and therefore excludes `--skip_taxonomy`.") } + if (params.skip_dada_taxonomy && params.sbdiexport) { + if (!params.sintax_ref_taxonomy && (params.skip_qiime || !params.qiime_ref_taxonomy)) { + Nextflow.error("Incompatible parameters: `--sbdiexport` expects taxa annotation and therefore annotation with either DADA2, SINTAX, or QIIME2 is needed.") + } + } + if ( (!params.FW_primer || !params.RV_primer) && params.qiime_ref_taxonomy && !params.skip_qiime && !params.skip_taxonomy ) { Nextflow.error("Incompatible parameters: `--FW_primer` and `--RV_primer` are required for cutting the QIIME2 reference database to the amplicon sequences. 
Please specify primers or do not use `--qiime_ref_taxonomy`.") } @@ -84,13 +94,23 @@ class WorkflowAmpliseq { Nextflow.error("Incompatible parameters: `--qiime_ref_taxonomy` will produce a classifier but `--classifier` points to a precomputed classifier, therefore, only use one of those.") } + if (params.kraken2_ref_tax_custom && !params.kraken2_assign_taxlevels ) { + Nextflow.error("Missing parameter: Taxonomic classification with a user provided database via `--kraken2_ref_tax_custom` requires `--kraken2_assign_taxlevels`") + } + if (params.filter_ssu && params.skip_barrnap) { Nextflow.error("Incompatible parameters: `--filter_ssu` cannot be used with `--skip_barrnap` because filtering for SSU's depends on barrnap.") } - String[] sbdi_compatible_databases = ["coidb","coidb=221216","gtdb","gtdb=R07-RS207","gtdb=R06-RS202","gtdb=R05-RS95","midori2-co1","midori2-co1=gb250","pr2=4.14.0","pr2=4.13.0","rdp","rdp=18","sbdi-gtdb","sbdi-gtdb=R07-RS207-1","silva","silva=138","silva=132","unite-fungi","unite-fungi=9.0","unite-fungi=8.3","unite-fungi=8.2","unite-alleuk","unite-alleuk=9.0","unite-alleuk=8.3","unite-alleuk=8.2"] - if ( params.sbdiexport && !Arrays.stream(sbdi_compatible_databases).anyMatch(entry -> params.dada_ref_taxonomy.toString().equals(entry)) ) { - Nextflow.error("Incompatible parameters: `--sbdiexport` does not work with the chosen database of `--dada_ref_taxonomy`, because the expected taxonomic levels do not match.") + String[] sbdi_compatible_databases = ["coidb","coidb=221216","gtdb","gtdb=R08-RS214","gtdb=R07-RS207","gtdb=R06-RS202","gtdb=R05-RS95","midori2-co1","midori2-co1=gb250","pr2","pr2=5.0.0","pr2=4.14.0","pr2=4.13.0","rdp","rdp=18","sbdi-gtdb","sbdi-gtdb=R07-RS207-1","silva","silva=138","silva=132","unite-fungi","unite-fungi=9.0","unite-fungi=8.3","unite-fungi=8.2","unite-alleuk","unite-alleuk=9.0","unite-alleuk=8.3","unite-alleuk=8.2"] + if (params.sbdiexport){ + if (params.sintax_ref_taxonomy ) { + if (!Arrays.stream(sbdi_compatible_databases).anyMatch(entry -> params.sintax_ref_taxonomy.toString().equals(entry)) ) { + Nextflow.error("Incompatible parameters: `--sbdiexport` does not work with the chosen database of `--sintax_ref_taxonomy` because the expected taxonomic levels do not match.") + } + } else if (!Arrays.stream(sbdi_compatible_databases).anyMatch(entry -> params.dada_ref_taxonomy.toString().equals(entry)) ) { + Nextflow.error("Incompatible parameters: `--sbdiexport` does not work with the chosen database of `--dada_ref_taxonomy` because the expected taxonomic levels do not match.") + } } if (params.addsh && !params.dada_ref_databases[params.dada_ref_taxonomy]["shfile"]) { @@ -113,13 +133,6 @@ class WorkflowAmpliseq { } } - // - // Check string (String s) ends with one entry of an array of strings ("String[] extn") - // - public static boolean checkIfFileHasExtension(String s, String[] extn) { - return Arrays.stream(extn).anyMatch(entry -> s.endsWith(entry)); - } - // // Get workflow summary for MultiQC // @@ -147,15 +160,57 @@ class WorkflowAmpliseq { return yaml_file_text } - public static String methodsDescriptionText(run_workflow, mqc_methods_yaml) { + // + // Generate methods description for MultiQC + // + + public static String toolCitationText(params) { + + // TODO nf-core: Optionally add in-text citation tools to this list. + // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 
2023)" : "", + // Uncomment function in methodsDescriptionText to render in MultiQC report + def citation_text = [ + "Tools used in the workflow included:", + "FastQC (Andrews 2010),", + "MultiQC (Ewels et al. 2016)", + "." + ].join(' ').trim() + + return citation_text + } + + public static String toolBibliographyText(params) { + + // TODO Optionally add bibliographic entries to this list. + // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "

<li>Author (2023) Pub name, Journal, DOI</li>" : "", + // Uncomment function in methodsDescriptionText to render in MultiQC report + def reference_text = [ + "<li>Andrews S, (2010) FastQC, URL: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/).</li>", + "<li>Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. doi: /10.1093/bioinformatics/btw354</li>" + ].join(' ').trim() + + return reference_text + } + + public static String methodsDescriptionText(run_workflow, mqc_methods_yaml, params) { // Convert to a named map so can be used as with familar NXF ${workflow} variable syntax in the MultiQC YML file def meta = [:] meta.workflow = run_workflow.toMap() meta["manifest_map"] = run_workflow.manifest.toMap() + // Pipeline DOI meta["doi_text"] = meta.manifest_map.doi ? "(doi: ${meta.manifest_map.doi})" : "" meta["nodoi_text"] = meta.manifest_map.doi ? "": "
  • If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used.
  • " + // Tool references + meta["tool_citations"] = "" + meta["tool_bibliography"] = "" + + // TODO Only uncomment below if logic in toolCitationText/toolBibliographyText has been filled! + //meta["tool_citations"] = toolCitationText(params).replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".") + //meta["tool_bibliography"] = toolBibliographyText(params) + + def methods_text = mqc_methods_yaml.text def engine = new SimpleTemplateEngine() diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 3cd7c5fd..7f49735e 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -21,45 +21,19 @@ class WorkflowMain { " https://github.com/${workflow.manifest.name}/blob/master/CITATIONS.md" } - // - // Generate help string - // - public static String help(workflow, params) { - def command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv -profile docker" - def help_string = '' - help_string += NfcoreTemplate.logo(workflow, params.monochrome_logs) - help_string += NfcoreSchema.paramsHelp(workflow, params, command) - help_string += '\n' + citation(workflow) + '\n' - help_string += NfcoreTemplate.dashedLine(params.monochrome_logs) - return help_string - } - - // - // Generate parameter summary log string - // - public static String paramsSummaryLog(workflow, params) { - def summary_log = '' - summary_log += NfcoreTemplate.logo(workflow, params.monochrome_logs) - summary_log += NfcoreSchema.paramsSummaryLog(workflow, params) - summary_log += '\n' + citation(workflow) + '\n' - summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) - return summary_log - } // // Validate parameters and print summary to screen // public static void initialise(workflow, params, log) { - // Print help to screen if required - if (params.help) { - log.info help(workflow, params) - System.exit(0) - } // Check that keys for reference databases are valid - if (params.dada_ref_taxonomy && !params.skip_taxonomy) { + if (params.dada_ref_taxonomy && !params.skip_taxonomy && !params.skip_dada_taxonomy) { dadareftaxonomyExistsError(params, log) } + if (params.sintax_ref_taxonomy && !params.skip_taxonomy) { + sintaxreftaxonomyExistsError(params, log) + } if (params.qiime_ref_taxonomy && !params.skip_taxonomy && !params.classifier) { qiimereftaxonomyExistsError(params, log) } @@ -71,14 +45,6 @@ class WorkflowMain { System.exit(0) } - // Print parameter summary log to screen - log.info paramsSummaryLog(workflow, params) - - // Validate workflow parameters via the JSON schema - if (params.validate_params) { - NfcoreSchema.validateParameters(workflow, params, log) - } - // Check that a -profile or Nextflow config has been provided to run the pipeline NfcoreTemplate.checkConfigProvided(workflow, log) @@ -97,7 +63,7 @@ class WorkflowMain { private static void dadareftaxonomyExistsError(params, log) { if (params.dada_ref_databases && params.dada_ref_taxonomy && !params.dada_ref_databases.containsKey(params.dada_ref_taxonomy)) { def error_string = "=============================================================================\n" + - " DADA2 reference database '${params.dada_ref_taxonomy}' not found in any config files provided to the pipeline.\n" + + " DADA2 reference database '${params.dada_ref_taxonomy}' not found in any config file provided to the pipeline.\n" + " Currently, the available reference taxonomy keys for `--dada_ref_taxonomy` are:\n" + " ${params.dada_ref_databases.keySet().join(", ")}\n" + 
"===================================================================================" @@ -105,12 +71,25 @@ class WorkflowMain { } } // + // Exit pipeline if incorrect --sintax_ref_taxonomy key provided + // + private static void sintaxreftaxonomyExistsError(params, log) { + if (params.sintax_ref_databases && params.sintax_ref_taxonomy && !params.sintax_ref_databases.containsKey(params.sintax_ref_taxonomy)) { + def error_string = "=============================================================================\n" + + " SINTAX reference database '${params.sintax_ref_taxonomy}' not found in any config file provided to the pipeline.\n" + + " Currently, the available reference taxonomy keys for `--sintax_ref_taxonomy` are:\n" + + " ${params.sintax_ref_databases.keySet().join(", ")}\n" + + "===================================================================================" + Nextflow.error(error_string) + } + } + // // Exit pipeline if incorrect --qiime_ref_taxonomy key provided // private static void qiimereftaxonomyExistsError(params, log) { if (params.qiime_ref_databases && params.qiime_ref_taxonomy && !params.qiime_ref_databases.containsKey(params.qiime_ref_taxonomy)) { def error_string = "=============================================================================\n" + - " QIIME2 reference database '${params.qiime_ref_taxonomy}' not found in any config files provided to the pipeline.\n" + + " QIIME2 reference database '${params.qiime_ref_taxonomy}' not found in any config file provided to the pipeline.\n" + " Currently, the available reference taxonomy keys for `--qiime_ref_taxonomy` are:\n" + " ${params.qiime_ref_databases.keySet().join(", ")}\n" + "===================================================================================" diff --git a/main.nf b/main.nf index b47bfe7f..4c484604 100644 --- a/main.nf +++ b/main.nf @@ -12,11 +12,27 @@ nextflow.enable.dsl = 2 /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ VALIDATE & PRINT PARAMETER SUMMARY ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +include { validateParameters; paramsHelp } from 'plugin/nf-validation' + +// Print help message if needed +if (params.help) { + def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) + def citation = '\n' + WorkflowMain.citation(workflow) + '\n' + def String command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --genome GRCh37 -profile docker" + log.info logo + paramsHelp(command) + citation + NfcoreTemplate.dashedLine(params.monochrome_logs) + System.exit(0) +} + +// Validate input parameters +if (params.validate_params) { + validateParameters() +} + WorkflowMain.initialise(workflow, params, log) /* diff --git a/modules.json b/modules.json index 622f8797..b85f77f5 100644 --- a/modules.json +++ b/modules.json @@ -7,7 +7,7 @@ "nf-core": { "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "c7494026693ba1a7db683e1520816709db3f05a0", "installed_by": ["modules"] }, "cutadapt": { @@ -27,7 +27,7 @@ }, "fastqc": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "c7494026693ba1a7db683e1520816709db3f05a0", "installed_by": ["modules"] }, "gappa/examineassign": { @@ -65,14 +65,30 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["fasta_newick_epang_gappa"] }, + 
"kraken2/kraken2": { + "branch": "master", + "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", + "installed_by": ["modules"], + "patch": "modules/nf-core/kraken2/kraken2/kraken2-kraken2.diff" + }, "mafft": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "feb29be775d9e41750180539e9a3bdce801d0609", "installed_by": ["fasta_newick_epang_gappa"] }, "multiqc": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "a6e11ac655e744f7ebc724be669dd568ffdc0e80", + "installed_by": ["modules"] + }, + "untar": { + "branch": "master", + "git_sha": "d0b4fc03af52a1cc8c6fb4493b921b57352b1dd8", + "installed_by": ["modules"] + }, + "vsearch/cluster": { + "branch": "master", + "git_sha": "89945ea085b94d3413013b6c82e2633b5184f24d", "installed_by": ["modules"] }, "vsearch/sintax": { @@ -91,7 +107,7 @@ "nf-core": { "fasta_newick_epang_gappa": { "branch": "master", - "git_sha": "a9784afdd5dcda23b84e64db75dc591065d64653", + "git_sha": "dedc0e31087f3306101c38835d051bf49789445a", "installed_by": ["subworkflows"] } } diff --git a/modules/local/cutadapt_summary_merge.nf b/modules/local/cutadapt_summary_merge.nf index 46b8ef2d..2b25fb42 100644 --- a/modules/local/cutadapt_summary_merge.nf +++ b/modules/local/cutadapt_summary_merge.nf @@ -2,10 +2,10 @@ process CUTADAPT_SUMMARY_MERGE { tag "${files}" label 'process_low' - conda "bioconda::bioconductor-dada2=1.22.0 conda-forge::r-digest=0.6.30" + conda "bioconda::bioconductor-dada2=1.28.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.22.0--r41h399db7b_0' : - 'biocontainers/bioconductor-dada2:1.22.0--r41h399db7b_0' }" + 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.28.0--r43hf17093f_0' : + 'biocontainers/bioconductor-dada2:1.28.0--r43hf17093f_0' }" input: val(action) diff --git a/modules/local/dada2_addspecies.nf b/modules/local/dada2_addspecies.nf index 4c83cca2..6f528264 100644 --- a/modules/local/dada2_addspecies.nf +++ b/modules/local/dada2_addspecies.nf @@ -3,10 +3,10 @@ process DADA2_ADDSPECIES { label 'process_high' label 'single_cpu' - conda "bioconda::bioconductor-dada2=1.22.0 conda-forge::r-digest=0.6.30" + conda "bioconda::bioconductor-dada2=1.28.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.22.0--r41h399db7b_0' : - 'biocontainers/bioconductor-dada2:1.22.0--r41h399db7b_0' }" + 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.28.0--r43hf17093f_0' : + 'biocontainers/bioconductor-dada2:1.28.0--r43hf17093f_0' }" input: path(taxtable) diff --git a/modules/local/dada2_denoising.nf b/modules/local/dada2_denoising.nf index 29d48186..ea07eadf 100644 --- a/modules/local/dada2_denoising.nf +++ b/modules/local/dada2_denoising.nf @@ -3,10 +3,10 @@ process DADA2_DENOISING { label 'process_medium' label 'process_long' - conda "bioconda::bioconductor-dada2=1.22.0 conda-forge::r-digest=0.6.30" + conda "bioconda::bioconductor-dada2=1.28.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.22.0--r41h399db7b_0' : - 'biocontainers/bioconductor-dada2:1.22.0--r41h399db7b_0' }" + 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.28.0--r43hf17093f_0' : + 'biocontainers/bioconductor-dada2:1.28.0--r43hf17093f_0' }" input: tuple val(meta), path("filtered/*"), path(errormodel) diff --git a/modules/local/dada2_err.nf b/modules/local/dada2_err.nf index d4add8e8..cecca72f 100644 --- a/modules/local/dada2_err.nf +++ b/modules/local/dada2_err.nf @@ -2,10 +2,10 @@ process DADA2_ERR { tag "$meta.run" label 'process_medium' - conda "bioconda::bioconductor-dada2=1.22.0 conda-forge::r-digest=0.6.30" + conda "bioconda::bioconductor-dada2=1.28.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.22.0--r41h399db7b_0' : - 'biocontainers/bioconductor-dada2:1.22.0--r41h399db7b_0' }" + 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.28.0--r43hf17093f_0' : + 'biocontainers/bioconductor-dada2:1.28.0--r43hf17093f_0' }" input: tuple val(meta), path(reads) diff --git a/modules/local/dada2_filtntrim.nf b/modules/local/dada2_filtntrim.nf index 1a38189f..9aca5912 100644 --- a/modules/local/dada2_filtntrim.nf +++ b/modules/local/dada2_filtntrim.nf @@ -2,19 +2,17 @@ process DADA2_FILTNTRIM { tag "$meta.id" label 'process_medium' - conda "bioconda::bioconductor-dada2=1.22.0 conda-forge::r-digest=0.6.30" + conda "bioconda::bioconductor-dada2=1.28.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.22.0--r41h399db7b_0' : - 'biocontainers/bioconductor-dada2:1.22.0--r41h399db7b_0' }" + 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.28.0--r43hf17093f_0' : + 'biocontainers/bioconductor-dada2:1.28.0--r43hf17093f_0' }" input: tuple val(meta), path(reads), val(trunclenf), val(trunclenr) output: - tuple val(meta), path("*.filter_stats.tsv"), emit: log - tuple val(meta), path("*.filt.fastq.gz") , emit: reads + tuple val(meta), path("*.filt.fastq.gz"), path("*.filter_stats.tsv"), path("*.args.txt"), emit: reads_logs_args path "versions.yml" , emit: versions - path "*.args.txt" , emit: args when: task.ext.when == null || task.ext.when @@ -22,6 +20,7 @@ process DADA2_FILTNTRIM { script: def args = task.ext.args ?: '' def in_and_out = meta.single_end ? "\"${reads}\", \"${meta.id}.filt.fastq.gz\"" : "\"${reads[0]}\", \"${meta.id}_1.filt.fastq.gz\", \"${reads[1]}\", \"${meta.id}_2.filt.fastq.gz\"" + def outfiles = meta.single_end ? "\"${meta.id}.filt.fastq.gz\"" : "\"${meta.id}_1.filt.fastq.gz\", \"${meta.id}_2.filt.fastq.gz\"" def trunclenf = trunclenf[1].toInteger() def trunclenr = trunclenr[1].toInteger() def trunc_args = meta.single_end ? 
"truncLen = $trunclenf" : "truncLen = c($trunclenf, $trunclenr)" @@ -37,6 +36,16 @@ process DADA2_FILTNTRIM { verbose = TRUE) out <- cbind(out, ID = row.names(out)) + # If no reads passed the filter, write an empty GZ file + if(out[2] == '0'){ + for(fp in c($outfiles)){ + print(paste("Writing out an empty file:", fp)) + handle <- gzfile(fp, "w") + write("", handle) + close(handle) + } + } + write.table( out, file = "${meta.id}.filter_stats.tsv", sep = "\\t", row.names = FALSE, quote = FALSE, na = '') write.table(paste('filterAndTrim\t$trunc_args','$args',sep=","), file = "filterAndTrim.args.txt", row.names = FALSE, col.names = FALSE, quote = FALSE, na = '') writeLines(c("\\"${task.process}\\":", paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")),paste0(" dada2: ", packageVersion("dada2")) ), "versions.yml") diff --git a/modules/local/dada2_merge.nf b/modules/local/dada2_merge.nf index a910c6d4..8577dd4a 100644 --- a/modules/local/dada2_merge.nf +++ b/modules/local/dada2_merge.nf @@ -1,6 +1,7 @@ process DADA2_MERGE { label 'process_low' + // https://depot.galaxyproject.org/singularity/bioconductor-dada2=1.28.0--r43hf17093f_0 doesnt contain 'digest', so keep here v1.22.0 conda "bioconda::bioconductor-dada2=1.22.0 conda-forge::r-digest=0.6.30" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.22.0--r41h399db7b_0' : diff --git a/modules/local/dada2_quality.nf b/modules/local/dada2_quality.nf index 3211fc6a..ff91c995 100644 --- a/modules/local/dada2_quality.nf +++ b/modules/local/dada2_quality.nf @@ -2,10 +2,10 @@ process DADA2_QUALITY { tag "$meta" label 'process_low' - conda "bioconda::bioconductor-dada2=1.22.0 conda-forge::r-digest=0.6.30" + conda "bioconda::bioconductor-dada2=1.28.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.22.0--r41h399db7b_0' : - 'biocontainers/bioconductor-dada2:1.22.0--r41h399db7b_0' }" + 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.28.0--r43hf17093f_0' : + 'biocontainers/bioconductor-dada2:1.28.0--r43hf17093f_0' }" input: tuple val(meta), path(reads) diff --git a/modules/local/dada2_rmchimera.nf b/modules/local/dada2_rmchimera.nf index b25973c9..0f25444f 100644 --- a/modules/local/dada2_rmchimera.nf +++ b/modules/local/dada2_rmchimera.nf @@ -2,10 +2,10 @@ process DADA2_RMCHIMERA { tag "$meta.run" label 'process_medium' - conda "bioconda::bioconductor-dada2=1.22.0 conda-forge::r-digest=0.6.30" + conda "bioconda::bioconductor-dada2=1.28.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.22.0--r41h399db7b_0' : - 'biocontainers/bioconductor-dada2:1.22.0--r41h399db7b_0' }" + 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.28.0--r43hf17093f_0' : + 'biocontainers/bioconductor-dada2:1.28.0--r43hf17093f_0' }" input: tuple val(meta), path(seqtab) diff --git a/modules/local/dada2_stats.nf b/modules/local/dada2_stats.nf index f829a99a..2fe792e5 100644 --- a/modules/local/dada2_stats.nf +++ b/modules/local/dada2_stats.nf @@ -2,10 +2,10 @@ process DADA2_STATS { tag "$meta.run" label 'process_low' - conda "bioconda::bioconductor-dada2=1.22.0 conda-forge::r-digest=0.6.30" + conda "bioconda::bioconductor-dada2=1.28.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.22.0--r41h399db7b_0' : - 'biocontainers/bioconductor-dada2:1.22.0--r41h399db7b_0' }" + 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.28.0--r43hf17093f_0' : + 'biocontainers/bioconductor-dada2:1.28.0--r43hf17093f_0' }" input: tuple val(meta), path("filter_and_trim_files/*"), path(denoised), path(mergers), path(seqtab_nochim) diff --git a/modules/local/dada2_taxonomy.nf b/modules/local/dada2_taxonomy.nf index b7919185..4be292d2 100644 --- a/modules/local/dada2_taxonomy.nf +++ b/modules/local/dada2_taxonomy.nf @@ -2,10 +2,10 @@ process DADA2_TAXONOMY { tag "${fasta},${database}" label 'process_high' - conda "bioconda::bioconductor-dada2=1.22.0 conda-forge::r-digest=0.6.30" + conda "bioconda::bioconductor-dada2=1.28.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.22.0--r41h399db7b_0' : - 'biocontainers/bioconductor-dada2:1.22.0--r41h399db7b_0' }" + 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.28.0--r43hf17093f_0' : + 'biocontainers/bioconductor-dada2:1.28.0--r43hf17093f_0' }" input: path(fasta) diff --git a/modules/local/filter_clusters.nf b/modules/local/filter_clusters.nf new file mode 100644 index 00000000..18b02fcb --- /dev/null +++ b/modules/local/filter_clusters.nf @@ -0,0 +1,35 @@ +process FILTER_CLUSTERS { + tag "${meta.id}" + label 'process_low' + + conda "conda-forge::pandas=1.1.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pandas:1.1.5': + 'biocontainers/pandas:1.1.5' }" + + input: + tuple val(meta), path(clusters) + path(asv) + + output: + path( "ASV_post_clustering_filtered.table.tsv") , emit: asv + path( "ASV_post_clustering_filtered.fna" ) , emit: fasta + path( "ASV_post_clustering_filtered.stats.tsv") , emit: stats + path( "versions.yml" ) , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "'$meta.id'" + def clusters = "'$clusters'" + """ + filt_clusters.py -t ${asv} -p ${prefix} -c ${clusters} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version 2>&1 | sed 's/Python //g') + pandas: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('pandas').version)") + END_VERSIONS + """ +} diff --git a/modules/local/filter_codons.nf b/modules/local/filter_codons.nf index 972f5e94..265a29c9 100644 --- a/modules/local/filter_codons.nf +++ b/modules/local/filter_codons.nf @@ -10,13 +10,12 @@ process FILTER_CODONS { input: path(fasta) path(asv) - path(dada2stats) output: - path( "ASV_codon_filtered.table.tsv" ) , emit: asv + path( "ASV_codon_filtered.table.tsv" ) , emit: asv, optional: true path( "ASV_codon_filtered.fna" ) , emit: fasta path( "ASV_codon_filtered.list" ) , emit: list - path( "codon.filtered.stats.tsv" ) , emit: stats + path( "codon.filtered.stats.tsv" ) , emit: stats, optional: true path( "versions.yml" ) , emit: versions when: @@ -24,9 +23,11 @@ process FILTER_CODONS { script: def args = task.ext.args ?: '' + def count_table = asv ? "-t ${asv}" : "" + def make_stats_cmd = asv ? "filt_codon_stats.py ASV_codon_filtered.table.tsv" : "" """ - filt_codons.py -f ${fasta} -t ${asv} -p ASV_codon ${args} - filt_codon_stats.py ASV_codon_filtered.table.tsv + filt_codons.py -f ${fasta} ${count_table} -p ASV_codon ${args} + $make_stats_cmd cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/filter_len_asv.nf b/modules/local/filter_len_asv.nf index 8e9306f1..3cf7d59d 100644 --- a/modules/local/filter_len_asv.nf +++ b/modules/local/filter_len_asv.nf @@ -13,7 +13,7 @@ process FILTER_LEN_ASV { output: path( "stats.len.tsv" ) , emit: stats - path( "ASV_table.len.tsv" ) , emit: asv + path( "ASV_table.len.tsv" ) , emit: asv, optional: true path( "ASV_seqs.len.fasta" ) , emit: fasta path( "ASV_len_orig.tsv" ) , emit: len_orig path( "ASV_len_filt.tsv" ) , emit: len_filt @@ -27,6 +27,7 @@ process FILTER_LEN_ASV { def max_len_asv = params.max_len_asv ?: '1000000' def read_table = table ? "table <- read.table(file = '$table', sep = '\t', comment.char = '', header=TRUE)" : "table <- data.frame(matrix(ncol = 1, nrow = 0))" + def asv_table_filtered = table ? 
"ASV_table.len.tsv" : "empty_ASV_table.len.tsv" """ #!/usr/bin/env Rscript @@ -53,7 +54,7 @@ process FILTER_LEN_ASV { distribution_after <- data.frame(Length=names(distribution_after),Counts=as.vector(distribution_after)) #write - write.table(filtered_table, file = "ASV_table.len.tsv", row.names=FALSE, sep="\t", col.names = TRUE, quote = FALSE, na = '') + write.table(filtered_table, file = "$asv_table_filtered", row.names=FALSE, sep="\t", col.names = TRUE, quote = FALSE, na = '') write.table(data.frame(s = sprintf(">%s\n%s", filtered_seq\$ID, filtered_seq\$sequence)), 'ASV_seqs.len.fasta', col.names = FALSE, row.names = FALSE, quote = FALSE, na = '') write.table(distribution_before, file = "ASV_len_orig.tsv", row.names=FALSE, sep="\t", col.names = TRUE, quote = FALSE, na = '') write.table(distribution_after, file = "ASV_len_filt.tsv", row.names=FALSE, sep="\t", col.names = TRUE, quote = FALSE, na = '') diff --git a/modules/local/filter_ssu.nf b/modules/local/filter_ssu.nf index ac423f45..5b3c623c 100644 --- a/modules/local/filter_ssu.nf +++ b/modules/local/filter_ssu.nf @@ -13,8 +13,8 @@ process FILTER_SSU { path(barrnap_summary) output: - path( "stats.ssu.tsv" ) , emit: stats - path( "ASV_table.ssu.tsv" ) , emit: asv + path( "stats.ssu.tsv" ) , emit: stats, optional: true + path( "ASV_table.ssu.tsv" ) , emit: asv, optional: true path( "ASV_seqs.ssu.fasta" ) , emit: fasta path "versions.yml" , emit: versions @@ -23,6 +23,9 @@ process FILTER_SSU { script: def kingdom = params.filter_ssu ?: "bac,arc,mito,euk" + def read_table = table ? "table <- read.table(file = '$table', sep = '\t', comment.char = '', header=TRUE)" : "table <- data.frame(matrix(ncol = 1, nrow = 0))" + def asv_table_filtered = table ? "ASV_table.ssu.tsv" : "empty_ASV_table.ssu.tsv" + def stats_file = table ? "stats.ssu.tsv" : "empty_stats.ssu.tsv" """ #!/usr/bin/env Rscript @@ -48,30 +51,32 @@ process FILTER_SSU { if ( nrow(df_filtered) == 0 ) stop("Chosen kingdom(s) by --filter_ssu has no matches. 
Please choose a different kingdom (domain) or omit filtering.") #read abundance file, first column is ASV_ID - table <- read.table(file = "$table", sep = '\t', comment.char = "", header=TRUE) + $read_table colnames(table)[1] <- "ASV_ID" #read fasta file of ASV sequences seq <- readDNAStringSet("$fasta") seq <- data.frame(ID=names(seq), sequence=paste(seq)) - #check if all ids match - if(!all(id_filtered\$ID %in% seq\$ID)) {stop(paste(paste(files,sep=","),"and","$fasta","dont share all IDs, exit."), call.=FALSE)} - if(!all(id_filtered\$ID %in% table\$ASV_ID)) {stop(paste(paste(files,sep=","),"and","$table","dont share all IDs, exit"), call.=FALSE)} + #make sure that IDs match, this is only relevant when the fasta is from --input_fasta + if(!all(id_filtered\$ASV_ID %in% seq\$ID)) { + seq\$ID <- sub("[[:space:]].*", "",seq\$ID) + if(!all(id_filtered\$ASV_ID %in% seq\$ID)) { stop(paste("ERROR: Some ASV_IDs are not being merged with sequences, please check\n",paste(setdiff(id_filtered\$ASV_ID, seq\$ID),collapse="\n"))) } + } #merge filtered_table <- merge(table, id_filtered, by.x="ASV_ID", by.y="ASV_ID", all.x=FALSE, all.y=TRUE) filtered_seq <- merge(seq, id_filtered, by.x="ID", by.y="ASV_ID", all.x=FALSE, all.y=TRUE) #write - write.table(filtered_table, file = "ASV_table.ssu.tsv", row.names=FALSE, sep="\t", col.names = TRUE, quote = FALSE, na = '') + write.table(filtered_table, file = "$asv_table_filtered", row.names=FALSE, sep="\t", col.names = TRUE, quote = FALSE, na = '') write.table(data.frame(s = sprintf(">%s\n%s", filtered_seq\$ID, filtered_seq\$sequence)), 'ASV_seqs.ssu.fasta', col.names = FALSE, row.names = FALSE, quote = FALSE, na = '') #stats stats <- as.data.frame( t( rbind( colSums(table[-1]), colSums(filtered_table[-1]) ) ) ) stats\$ID <- rownames(stats) colnames(stats) <- c("ssufilter_input","ssufilter_output", "sample") - write.table(stats, file = "stats.ssu.tsv", row.names=FALSE, sep="\t") + write.table(stats, file = "$stats_file", row.names=FALSE, sep="\t") writeLines(c("\\"${task.process}\\":", paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")),paste0(" Biostrings: ", packageVersion("Biostrings")) ), "versions.yml") """ diff --git a/modules/local/format_pplacetax.nf b/modules/local/format_pplacetax.nf index d159627c..1861a365 100644 --- a/modules/local/format_pplacetax.nf +++ b/modules/local/format_pplacetax.nf @@ -2,10 +2,10 @@ process FORMAT_PPLACETAX { tag "${tax.baseName}" label 'process_high' - conda "bioconda::bioconductor-dada2=1.22.0 conda-forge::r-digest=0.6.30" + conda "bioconda::bioconductor-dada2=1.28.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.22.0--r41h399db7b_0' : - 'biocontainers/bioconductor-dada2:1.22.0--r41h399db7b_0' }" + 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.28.0--r43hf17093f_0' : + 'biocontainers/bioconductor-dada2:1.28.0--r43hf17093f_0' }" input: tuple val(meta), path(tax) diff --git a/modules/local/format_taxresults_kraken2.nf b/modules/local/format_taxresults_kraken2.nf new file mode 100644 index 00000000..dd7c7ce2 --- /dev/null +++ b/modules/local/format_taxresults_kraken2.nf @@ -0,0 +1,107 @@ +process FORMAT_TAXRESULTS_KRAKEN2 { + label 'process_low' + + conda "conda-forge::r-base=4.2.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/r-base:4.2.1' : + 'biocontainers/r-base:4.2.1' }" + + input: + tuple val(meta), path(report) + tuple val(meta), path(classified_reads_assignment) + val(taxlevels_input) + + output: + path("*.kraken2.keys.tsv") , emit: keys_tsv + path("*.kraken2.complete.tsv") , emit: complete_tsv + path("*.kraken2.tsv") , emit: tsv + path("*.kraken2.into-qiime2.tsv"), emit: qiime2_tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def taxlevels = taxlevels_input ? taxlevels_input : "D,P,C,O,F,G,S" + def prefix = task.ext.prefix ?: "${meta.id}" + """ + #!/usr/bin/env Rscript + + ## report + report <- read.table(file = "$report", header = FALSE, sep = "\t") + colnames(report) <- c("percent","fragments_total","fragments","rank","taxid","taxon") + + taxonomy_standard_labels <- unlist(strsplit("$taxlevels",",")) + + # prepare empty result data frame + headers <- c("rank","taxid","taxon","taxonomy_rank","taxonomy_standard") + headers <- c(headers, taxonomy_standard_labels) + df = data.frame(matrix(nrow = 0, ncol = length(headers))) + colnames(df) = headers + + # go through each line of report and aggregate information + taxonomy_rank <- c("root") + for (i in 1:nrow(report)) { + tax <- report\$taxon[i] + taxnospaces <- gsub("^ *", "", tax) + + # (1) extract full taxonomy with taxa ranks + nspaces = nchar(tax)- nchar(taxnospaces) + indent = nspaces/2 +1 #+1 is to catch also entries without any indent, e.g. "root" + # if this is a lower taxonomic rank, add it, otherwise reset and add + if ( indent > length(taxonomy_rank) ) { + taxonomy_rank <- c(taxonomy_rank,paste0(report\$rank[i],"__",taxnospaces)) + } else { + taxonomy_rank <- taxonomy_rank[1:indent-1] + taxonomy_rank <- c(taxonomy_rank,paste0(report\$rank[i],"__",taxnospaces)) + } + taxonomy_rank_string <- paste(taxonomy_rank,collapse=";") + + # (2) filter taxonomy_rank to only contain entries with taxonomy_standard_labels+"__" + taxonomy_standard = list() + taxonomy_ranks = gsub( "__.*", "", taxonomy_rank ) + for (x in taxonomy_standard_labels) { + taxonomy_ranks_match <- match(x, taxonomy_ranks) + if ( !is.na(taxonomy_ranks_match) ) { + taxonomy_clean <- gsub( ".*__", "", taxonomy_rank[taxonomy_ranks_match] ) + taxonomy_standard <- c(taxonomy_standard,taxonomy_clean) + } else { + taxonomy_standard <- c(taxonomy_standard,"") + } + } + taxonomy_standard_string <- paste(taxonomy_standard,collapse=";") + names(taxonomy_standard) <- taxonomy_standard_labels + + # (3) propagate all in results data frame + results <- c( + rank=report\$rank[i], + taxid=report\$taxid[i], + taxon=taxnospaces, + taxonomy_rank=taxonomy_rank_string, + taxonomy_standard=taxonomy_standard_string, + taxonomy_standard) + df <- rbind(df, results) + } + df\$taxon_taxid <- paste0(df\$taxon," (taxid ",df\$taxid,")") + write.table(df, file = "${prefix}.kraken2.keys.tsv", row.names=FALSE, quote=FALSE, sep="\t") + + # merge with reads + creads <- read.table(file = "$classified_reads_assignment", header = FALSE, sep = "\t") + colnames(creads) <- c("classified","ASV_ID","taxonomy","ASV_length","kmer_LCA") + if ( !all( creads\$taxonomy %in% df\$taxon_taxid ) ) { stop(paste("$classified_reads_assignment","and","$report","dont share all IDs, exit"), call.=FALSE) } + merged <- merge(creads, df, by.x="taxonomy", by.y="taxon_taxid", all.x=TRUE, all.y=FALSE) + write.table(merged, file = "${prefix}.kraken2.complete.tsv", row.names=FALSE, quote=FALSE, sep="\t") + + # get downstream compatible table + 
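# keep only ASV_ID, the standard rank columns, and the lowest taxonomic match ("lowest_match") used by downstream steps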
merged_reduced <- subset(merged, select = c("ASV_ID",taxonomy_standard_labels,"taxonomy")) + colnames(merged_reduced) <- c("ASV_ID",taxonomy_standard_labels,"lowest_match") + write.table(merged_reduced, file = "${prefix}.kraken2.tsv", row.names=FALSE, quote=FALSE, sep="\t") + + # get QIIME2 downstream compatible table + qiime <- merged[c("ASV_ID", "taxonomy_standard")] + colnames(qiime) <- c("ASV_ID", "taxonomy") + write.table(qiime, file = "${prefix}.kraken2.into-qiime2.tsv", row.names=FALSE, quote=FALSE, sep="\t") + + writeLines(c("\\"${task.process}\\":", paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")) ), "versions.yml") + """ +} diff --git a/modules/local/merge_stats.nf b/modules/local/merge_stats.nf index 55b2cf83..b2e2cb8c 100644 --- a/modules/local/merge_stats.nf +++ b/modules/local/merge_stats.nf @@ -1,10 +1,10 @@ process MERGE_STATS { label 'process_low' - conda "bioconda::bioconductor-dada2=1.22.0 conda-forge::r-digest=0.6.30" + conda "bioconda::bioconductor-dada2=1.28.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.22.0--r41h399db7b_0' : - 'biocontainers/bioconductor-dada2:1.22.0--r41h399db7b_0' }" + 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.28.0--r43hf17093f_0' : + 'biocontainers/bioconductor-dada2:1.28.0--r43hf17093f_0' }" input: path('file1.tsv') diff --git a/modules/local/metadata_all.nf b/modules/local/metadata_all.nf index 67d148e9..8993d1ca 100644 --- a/modules/local/metadata_all.nf +++ b/modules/local/metadata_all.nf @@ -2,10 +2,10 @@ process METADATA_ALL { tag "$metadata" label 'process_low' - conda "bioconda::bioconductor-dada2=1.22.0 conda-forge::r-digest=0.6.30" + conda "bioconda::bioconductor-dada2=1.28.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.22.0--r41h399db7b_0' : - 'biocontainers/bioconductor-dada2:1.22.0--r41h399db7b_0' }" + 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.28.0--r43hf17093f_0' : + 'biocontainers/bioconductor-dada2:1.28.0--r43hf17093f_0' }" input: path(metadata) diff --git a/modules/local/metadata_pairwise.nf b/modules/local/metadata_pairwise.nf index cfa16a8c..6dd9ab11 100644 --- a/modules/local/metadata_pairwise.nf +++ b/modules/local/metadata_pairwise.nf @@ -2,10 +2,10 @@ process METADATA_PAIRWISE { tag "$metadata" label 'process_low' - conda "bioconda::bioconductor-dada2=1.22.0 conda-forge::r-digest=0.6.30" + conda "bioconda::bioconductor-dada2=1.28.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.22.0--r41h399db7b_0' : - 'biocontainers/bioconductor-dada2:1.22.0--r41h399db7b_0' }" + 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.28.0--r43hf17093f_0' : + 'biocontainers/bioconductor-dada2:1.28.0--r43hf17093f_0' }" input: path(metadata) diff --git a/modules/local/novaseq_err.nf b/modules/local/novaseq_err.nf index 4a9fbe8e..991fa803 100644 --- a/modules/local/novaseq_err.nf +++ b/modules/local/novaseq_err.nf @@ -2,10 +2,10 @@ process NOVASEQ_ERR { tag "$meta.run" label 'process_medium' - conda "bioconda::bioconductor-dada2=1.22.0 conda-forge::r-digest=0.6.30" + conda "bioconda::bioconductor-dada2=1.28.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.22.0--r41h399db7b_0' : - 'biocontainers/bioconductor-dada2:1.22.0--r41h399db7b_0' }" + 'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.28.0--r43hf17093f_0' : + 'biocontainers/bioconductor-dada2:1.28.0--r43hf17093f_0' }" input: tuple val(meta), path(errormodel) diff --git a/modules/local/phyloseq.nf b/modules/local/phyloseq.nf new file mode 100644 index 00000000..bbc6218b --- /dev/null +++ b/modules/local/phyloseq.nf @@ -0,0 +1,63 @@ +process PHYLOSEQ { + tag "$prefix" + label 'process_low' + + conda "bioconda::bioconductor-phyloseq=1.44.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bioconductor-phyloseq:1.44.0--r43hdfd78af_0' : + 'biocontainers/bioconductor-phyloseq:1.44.0--r43hdfd78af_0' }" + + input: + tuple val(prefix), path(tax_tsv) + path otu_tsv + path sam_tsv + path tree + + output: + tuple val(prefix), path("*phyloseq.rds"), emit: rds + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def sam_tsv = "\"${sam_tsv}\"" + def otu_tsv = "\"${otu_tsv}\"" + def tax_tsv = "\"${tax_tsv}\"" + def tree = "\"${tree}\"" + def prefix = "\"${prefix}\"" + """ + #!/usr/bin/env Rscript + + suppressPackageStartupMessages(library(phyloseq)) + + otu_df <- read.table($otu_tsv, sep="\\t", header=TRUE, row.names=1) + tax_df <- read.table($tax_tsv, sep="\\t", header=TRUE, row.names=1) + otu_mat <- as.matrix(otu_df) + tax_mat <- as.matrix(tax_df) + + OTU <- otu_table(otu_mat, taxa_are_rows=TRUE) + TAX <- tax_table(tax_mat) + phy_obj <- phyloseq(OTU, TAX) + + if (file.exists($sam_tsv)) { + sam_df <- read.table($sam_tsv, sep="\\t", header=TRUE, row.names=1) + SAM <- sample_data(sam_df) + phy_obj <- merge_phyloseq(phy_obj, SAM) + } + + if (file.exists($tree)) { + TREE <- read_tree($tree) + phy_obj <- merge_phyloseq(phy_obj, TREE) + } + + saveRDS(phy_obj, file = paste0($prefix, "_phyloseq.rds")) + + # Version information + writeLines(c("\\"${task.process}\\":", + paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")), + paste0(" phyloseq: ", packageVersion("phyloseq"))), + "versions.yml" + ) + """ +} diff --git a/modules/local/phyloseq_inasv.nf b/modules/local/phyloseq_inasv.nf new file mode 100644 index 00000000..f66d1669 --- /dev/null +++ b/modules/local/phyloseq_inasv.nf @@ -0,0 +1,28 @@ +process PHYLOSEQ_INASV { + label 'process_low' + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + path(biom_file) + + output: + path( "*.tsv" ) , emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + tail $biom_file -n +2 | sed '1s/#OTU ID/ASV_ID/' > reformat_$biom_file + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bash: \$(bash --version | sed -n 1p | sed 's/GNU bash, version //g') + END_VERSIONS + """ +} diff --git a/modules/local/phyloseq_intax.nf b/modules/local/phyloseq_intax.nf new file mode 100644 index 00000000..6dbd8487 --- /dev/null +++ b/modules/local/phyloseq_intax.nf @@ -0,0 +1,29 @@ +process PHYLOSEQ_INTAX { + label 'process_low' + + conda "conda-forge::pandas=1.1.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/pandas:1.1.5': + 'biocontainers/pandas:1.1.5' }" + + input: + path(tax_tsv) + + output: + path( "*.tsv" ) , emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + reformat_tax_for_phyloseq.py $tax_tsv reformat_$tax_tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version 2>&1 | sed 's/Python //g') + pandas: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('pandas').version)") + END_VERSIONS + """ +} diff --git a/modules/local/picrust.nf b/modules/local/picrust.nf index 0a0d6d2d..0f09a899 100644 --- a/modules/local/picrust.nf +++ b/modules/local/picrust.nf @@ -2,10 +2,10 @@ process PICRUST { tag "${seq},${abund}" label 'process_medium' - conda "bioconda::picrust2=2.5.0" + conda "bioconda::picrust2=2.5.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/picrust2:2.5.0--pyhdfd78af_0' : - 'biocontainers/picrust2:2.5.0--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/picrust2:2.5.2--pyhdfd78af_0' : + 'biocontainers/picrust2:2.5.2--pyhdfd78af_0' }" input: path(seq) diff --git a/modules/local/qiime2_alphararefaction.nf b/modules/local/qiime2_alphararefaction.nf index 9d656840..273749ba 100644 --- a/modules/local/qiime2_alphararefaction.nf +++ b/modules/local/qiime2_alphararefaction.nf @@ -1,7 +1,7 @@ process QIIME2_ALPHARAREFACTION { label 'process_low' - container "quay.io/qiime2/core:2022.11" + container "qiime2/core:2023.7" // Exit if running this module with -profile conda / -profile mamba if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { diff --git a/modules/local/qiime2_ancom_asv.nf b/modules/local/qiime2_ancom_asv.nf index 322b414e..52c599c4 100644 --- a/modules/local/qiime2_ancom_asv.nf +++ b/modules/local/qiime2_ancom_asv.nf @@ -5,7 +5,7 @@ process QIIME2_ANCOM_ASV { label 'process_long' label 'error_ignore' - container "quay.io/qiime2/core:2022.11" + container "qiime2/core:2023.7" // Exit if running this module with -profile conda / -profile mamba if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { diff --git a/modules/local/qiime2_ancom_tax.nf b/modules/local/qiime2_ancom_tax.nf index 9f5392ef..3c065bb3 100644 --- a/modules/local/qiime2_ancom_tax.nf +++ b/modules/local/qiime2_ancom_tax.nf @@ -3,7 +3,7 @@ process QIIME2_ANCOM_TAX { label 'process_medium' label 'single_cpu' - container "quay.io/qiime2/core:2022.11" + container "qiime2/core:2023.7" // Exit if running this module with -profile conda / -profile mamba if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { diff --git a/modules/local/qiime2_barplot.nf b/modules/local/qiime2_barplot.nf index 3e83ab02..a06100d4 100644 --- a/modules/local/qiime2_barplot.nf +++ b/modules/local/qiime2_barplot.nf @@ -1,7 +1,7 @@ process QIIME2_BARPLOT { label 'process_low' - container "quay.io/qiime2/core:2022.11" + container "qiime2/core:2023.7" // Exit if running this module with -profile conda / -profile mamba if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { diff --git a/modules/local/qiime2_classify.nf b/modules/local/qiime2_classify.nf index f5a4824d..96910c9a 100644 --- a/modules/local/qiime2_classify.nf +++ b/modules/local/qiime2_classify.nf @@ -2,7 +2,7 @@ process QIIME2_CLASSIFY { tag "${repseq},${trained_classifier}" label 'process_high' - container "quay.io/qiime2/core:2022.11" + container "qiime2/core:2023.7" // Exit if running this module with -profile conda / -profile mamba if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { diff --git a/modules/local/qiime2_diversity_adonis.nf b/modules/local/qiime2_diversity_adonis.nf index 25bc95f8..6b81f00e 100644 --- a/modules/local/qiime2_diversity_adonis.nf +++ b/modules/local/qiime2_diversity_adonis.nf @@ -2,7 +2,7 @@ process QIIME2_DIVERSITY_ADONIS { tag "${core.baseName} - ${formula}" label 'process_low' - container "quay.io/qiime2/core:2022.11" + container "qiime2/core:2023.7" // Exit if running this module with -profile conda / -profile mamba if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { diff --git a/modules/local/qiime2_diversity_alpha.nf b/modules/local/qiime2_diversity_alpha.nf index dff59e3e..baebc03e 100644 --- a/modules/local/qiime2_diversity_alpha.nf +++ b/modules/local/qiime2_diversity_alpha.nf 
@@ -2,7 +2,7 @@ process QIIME2_DIVERSITY_ALPHA { tag "${core.baseName}" label 'process_low' - container "quay.io/qiime2/core:2022.11" + container "qiime2/core:2023.7" // Exit if running this module with -profile conda / -profile mamba if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { diff --git a/modules/local/qiime2_diversity_beta.nf b/modules/local/qiime2_diversity_beta.nf index f6fc5ee7..44df25f9 100644 --- a/modules/local/qiime2_diversity_beta.nf +++ b/modules/local/qiime2_diversity_beta.nf @@ -2,7 +2,7 @@ process QIIME2_DIVERSITY_BETA { tag "${core.baseName} - ${category}" label 'process_low' - container "quay.io/qiime2/core:2022.11" + container "qiime2/core:2023.7" // Exit if running this module with -profile conda / -profile mamba if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { diff --git a/modules/local/qiime2_diversity_betaord.nf b/modules/local/qiime2_diversity_betaord.nf index 7b2699a4..b29bd99e 100644 --- a/modules/local/qiime2_diversity_betaord.nf +++ b/modules/local/qiime2_diversity_betaord.nf @@ -2,7 +2,7 @@ process QIIME2_DIVERSITY_BETAORD { tag "${core.baseName}" label 'process_low' - container "quay.io/qiime2/core:2022.11" + container "qiime2/core:2023.7" // Exit if running this module with -profile conda / -profile mamba if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { diff --git a/modules/local/qiime2_diversity_core.nf b/modules/local/qiime2_diversity_core.nf index 99fe9280..22235dfb 100644 --- a/modules/local/qiime2_diversity_core.nf +++ b/modules/local/qiime2_diversity_core.nf @@ -1,7 +1,7 @@ process QIIME2_DIVERSITY_CORE { label 'process_low' - container "quay.io/qiime2/core:2022.11" + container "qiime2/core:2023.7" // Exit if running this module with -profile conda / -profile mamba if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { @@ -27,6 +27,10 @@ process QIIME2_DIVERSITY_CORE { script: """ + # FIX: detecting a viable GPU on your system, but the GPU is unavailable for compute, causing UniFrac to fail. 
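+ # Exporting UNIFRAC_USE_GPU=N below disables GPU use so UniFrac falls back to its CPU implementation and the diversity computation still runs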
+ # COMMENT: might be fixed in version after QIIME2 2023.5 + export UNIFRAC_USE_GPU=N + export XDG_CONFIG_HOME="\${PWD}/HOME" mindepth=\$(count_table_minmax_reads.py $stats minimum 2>&1) diff --git a/modules/local/qiime2_export_absolute.nf b/modules/local/qiime2_export_absolute.nf index 624547d5..57c03a5e 100644 --- a/modules/local/qiime2_export_absolute.nf +++ b/modules/local/qiime2_export_absolute.nf @@ -1,7 +1,7 @@ process QIIME2_EXPORT_ABSOLUTE { label 'process_low' - container "quay.io/qiime2/core:2022.11" + container "qiime2/core:2023.7" // Exit if running this module with -profile conda / -profile mamba if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { diff --git a/modules/local/qiime2_export_relasv.nf b/modules/local/qiime2_export_relasv.nf index a5b81388..58e05b58 100644 --- a/modules/local/qiime2_export_relasv.nf +++ b/modules/local/qiime2_export_relasv.nf @@ -1,7 +1,7 @@ process QIIME2_EXPORT_RELASV { label 'process_low' - container "quay.io/qiime2/core:2022.11" + container "qiime2/core:2023.7" // Exit if running this module with -profile conda / -profile mamba if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { diff --git a/modules/local/qiime2_export_reltax.nf b/modules/local/qiime2_export_reltax.nf index 8f090b07..c5b93d7c 100644 --- a/modules/local/qiime2_export_reltax.nf +++ b/modules/local/qiime2_export_reltax.nf @@ -1,7 +1,7 @@ process QIIME2_EXPORT_RELTAX { label 'process_low' - container "quay.io/qiime2/core:2022.11" + container "qiime2/core:2023.7" // Exit if running this module with -profile conda / -profile mamba if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { diff --git a/modules/local/qiime2_extract.nf b/modules/local/qiime2_extract.nf index 6f686906..6889befb 100644 --- a/modules/local/qiime2_extract.nf +++ b/modules/local/qiime2_extract.nf @@ -3,7 +3,7 @@ process QIIME2_EXTRACT { label 'process_low' label 'single_cpu' - container "quay.io/qiime2/core:2022.11" + container "qiime2/core:2023.7" // Exit if running this module with -profile conda / -profile mamba if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { diff --git a/modules/local/qiime2_featuretable_group.nf b/modules/local/qiime2_featuretable_group.nf index 71e9a9b2..d47ee14f 100644 --- a/modules/local/qiime2_featuretable_group.nf +++ b/modules/local/qiime2_featuretable_group.nf @@ -2,7 +2,7 @@ process QIIME2_FEATURETABLE_GROUP { tag "${category}" label 'process_low' - container "quay.io/qiime2/core:2022.11" + container "qiime2/core:2023.7" // Exit if running this module with -profile conda / -profile mamba if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { diff --git a/modules/local/qiime2_filtersamples.nf b/modules/local/qiime2_filtersamples.nf index 6a4a7310..7f64e7e2 100644 --- a/modules/local/qiime2_filtersamples.nf +++ b/modules/local/qiime2_filtersamples.nf @@ -2,7 +2,7 @@ process QIIME2_FILTERSAMPLES { tag "${filter}" label 'process_low' - container "quay.io/qiime2/core:2022.11" + container "qiime2/core:2023.7" // Exit if running this module with -profile conda / -profile mamba if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { diff --git a/modules/local/qiime2_filtertaxa.nf b/modules/local/qiime2_filtertaxa.nf index 0a25803e..39cbb82c 100644 --- a/modules/local/qiime2_filtertaxa.nf +++ b/modules/local/qiime2_filtertaxa.nf @@ -2,7 +2,7 @@ process QIIME2_FILTERTAXA { tag 
"taxa:${exclude_taxa};min-freq:${min_frequency};min-samples:${min_samples}" label 'process_low' - container "quay.io/qiime2/core:2022.11" + container "qiime2/core:2023.7" // Exit if running this module with -profile conda / -profile mamba if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { diff --git a/modules/local/qiime2_inasv.nf b/modules/local/qiime2_inasv.nf index 348aea87..d4848ba7 100644 --- a/modules/local/qiime2_inasv.nf +++ b/modules/local/qiime2_inasv.nf @@ -2,7 +2,7 @@ process QIIME2_INASV { tag "${asv}" label 'process_low' - container "quay.io/qiime2/core:2022.11" + container "qiime2/core:2023.7" // Exit if running this module with -profile conda / -profile mamba if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { diff --git a/modules/local/qiime2_inseq.nf b/modules/local/qiime2_inseq.nf index a0504053..7c7c2007 100644 --- a/modules/local/qiime2_inseq.nf +++ b/modules/local/qiime2_inseq.nf @@ -2,7 +2,7 @@ process QIIME2_INSEQ { tag "${seq}" label 'process_low' - container "quay.io/qiime2/core:2022.11" + container "qiime2/core:2023.7" // Exit if running this module with -profile conda / -profile mamba if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { diff --git a/modules/local/qiime2_intax.nf b/modules/local/qiime2_intax.nf index 0e6c69e1..b9c3b039 100644 --- a/modules/local/qiime2_intax.nf +++ b/modules/local/qiime2_intax.nf @@ -2,7 +2,7 @@ process QIIME2_INTAX { tag "${tax}" label 'process_low' - container "quay.io/qiime2/core:2022.11" + container "qiime2/core:2023.7" // Exit if running this module with -profile conda / -profile mamba if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { @@ -11,6 +11,7 @@ process QIIME2_INTAX { input: path(tax) //ASV_tax_species.tsv + val(script) output: path("taxonomy.qza") , emit: qza @@ -20,8 +21,9 @@ process QIIME2_INTAX { task.ext.when == null || task.ext.when script: + def script_cmd = script ? 
"$script $tax" : "cp $tax tax.tsv" """ - parse_dada2_taxonomy.r $tax + $script_cmd qiime tools import \\ --type 'FeatureData[Taxonomy]' \\ diff --git a/modules/local/qiime2_intree.nf b/modules/local/qiime2_intree.nf index f9f35b97..a7a3ff3b 100644 --- a/modules/local/qiime2_intree.nf +++ b/modules/local/qiime2_intree.nf @@ -2,7 +2,7 @@ process QIIME2_INTREE { tag "${meta.id}:${meta.model}" label 'process_low' - container "quay.io/qiime2/core:2022.11" + container "qiime2/core:2023.7" // Exit if running this module with -profile conda / -profile mamba if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { diff --git a/modules/local/qiime2_train.nf b/modules/local/qiime2_train.nf index 254118f8..4c4d69b2 100644 --- a/modules/local/qiime2_train.nf +++ b/modules/local/qiime2_train.nf @@ -3,7 +3,7 @@ process QIIME2_TRAIN { label 'process_high' label 'single_cpu' - container "quay.io/qiime2/core:2022.11" + container "qiime2/core:2023.7" // Exit if running this module with -profile conda / -profile mamba if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { diff --git a/modules/local/qiime2_tree.nf b/modules/local/qiime2_tree.nf index 5fc32fed..6e7a89fe 100644 --- a/modules/local/qiime2_tree.nf +++ b/modules/local/qiime2_tree.nf @@ -1,7 +1,7 @@ process QIIME2_TREE { label 'process_medium' - container "quay.io/qiime2/core:2022.11" + container "qiime2/core:2023.7" // Exit if running this module with -profile conda / -profile mamba if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { diff --git a/modules/local/summary_report.nf b/modules/local/summary_report.nf new file mode 100644 index 00000000..d886f19b --- /dev/null +++ b/modules/local/summary_report.nf @@ -0,0 +1,155 @@ +process SUMMARY_REPORT { + label 'process_low' + + conda "conda-forge::r-base=4.2.3 conda-forge::r-rmarkdown=2.22 conda-forge::r-tidyverse=2.0.0 conda-forge::r-knitr=1.43 conda-forge::r-dt=0.28 conda-forge::r-dtplyr=1.3.1 conda-forge::r-formattable=0.2.1 conda-forge::r-purrr=1.0.1 conda-forge::r-vegan=2.6_4 conda-forge::r-optparse=1.7.3 conda-forge::r-ggplot2=3.4.2 conda-forge::r-dplyr=1.1.2 conda-forge::r-data.table=1.14.8 conda-forge::r-patchwork=1.1.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' : + 'biocontainers/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' }" + + input: + path(report_template) + path(report_styles) + path(report_logo) + path(report_abstract) + path(metadata) + path(input_samplesheet) + path(input_fasta) + path(mqc_plots) + path(cutadapt_summary) + val(find_truncation_values) + path(dada_filtntrim_args) + path(dada_qual_stats) + path(dada_pp_qual_stats) + tuple val(meta), path(dada_err_svgs) + path(dada_asv_table) + path(dada_asv_fa) + path(dada_tab) + path(dada_stats) + path(vsearch_cluster) + path(barrnap_summary) + path(filter_ssu_stats) + path(filter_ssu_asv) + path(filter_len_asv_stats) + path(filter_len_asv_len_orig) + path(filter_codons_fasta) + path(filter_codons_stats) + path(itsx_cutasv_summary) + path(dada2_tax) + tuple val(meta_ref), path(cut_dada_ref_taxonomy) // cutadapt log when params.cut_dada_ref_taxonomy + path(sintax_tax) + path(kraken2_tax) + path(pplace_tax) + tuple val(meta_pplace), path(pplace_heattree) + path(qiime2_tax) + val(run_qiime2) + val(val_used_taxonomy) + val(qiime2_filtertaxa) // , + path(filter_stats_tsv) + path(barplot) + path(abundance_tables, stageAs: 'abundance_tables/*') + val(alpha_rarefaction) + path(diversity_indices) + path(diversity_indices_alpha, stageAs: 'alpha_diversity/*') // prevent folder name collisons + path(diversity_indices_beta, stageAs: 'beta_diversity/*') // prevent folder name collisons + path(diversity_indices_adonis, stageAs: 'beta_diversity/adonis/*') // prevent folder name collisons + path(ancom) + path(picrust_pathways) + path(sbdi, stageAs: 'sbdi/*') + path(phyloseq, stageAs: 'phyloseq/*') + + output: + path "*.svg" , emit: svg, optional: true + path "summary_report.html" , emit: report + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // make named R list (comma separated) + // all non-boolean or non-numeric values must be encumbered by single quotes (')! + // all elements must have a value, i.e. booleans also need to be set to TRUE + def params_list_named = [ + "css='$report_styles'", + "report_logo='$report_logo'", + "workflow_manifest_version='${workflow.manifest.version}'", + "workflow_scriptid='${workflow.scriptId.substring(0,10)}'", + params.report_title ? "report_title='$params.report_title'" : "", + report_abstract ? "report_abstract='$params.report_abstract'" : "", + meta.single_end ? "flag_single_end=TRUE" : "", + metadata ? "metadata='$metadata'" : "", + input_samplesheet ? "input_samplesheet='$input_samplesheet'" : "", + input_fasta ? "input_fasta='$input_fasta'" : "", + !input_fasta && !input_samplesheet ? "input_folder='$params.input_folder'" : "", + mqc_plots ? "mqc_plot='${mqc_plots}/svg/mqc_fastqc_per_sequence_quality_scores_plot_1.svg'" : "", + cutadapt_summary ? + params.retain_untrimmed ? "flag_retain_untrimmed=TRUE,cutadapt_summary='$cutadapt_summary'" : + "cutadapt_summary='$cutadapt_summary'" : "", + find_truncation_values ? "trunc_qmin=$params.trunc_qmin,trunc_rmin=$params.trunc_rmin" : "", + "trunclenf='$params.trunclenf'", + "trunclenr='$params.trunclenr'", + "max_ee=$params.max_ee", + dada_qual_stats && meta.single_end ? "dada_qc_f_path='$dada_qual_stats',dada_pp_qc_f_path='$dada_pp_qual_stats'" : + dada_qual_stats ? 
"dada_qc_f_path='FW_qual_stats.svg',dada_qc_r_path='RV_qual_stats.svg',dada_pp_qc_f_path='FW_preprocessed_qual_stats.svg',dada_pp_qc_r_path='RV_preprocessed_qual_stats.svg'" : "", + dada_filtntrim_args ? "dada_filtntrim_args='$dada_filtntrim_args'" : "", + "dada_sample_inference='$params.sample_inference'", + dada_err_svgs && meta.run.size() == 1 && meta.single_end ? + "dada_err_path='$dada_err_svgs',dada_err_run='"+meta.run+"'" : + dada_err_svgs ? "dada_err_path='"+dada_err_svgs.join(',')+"',dada_err_run='"+meta.run.join(',')+"'" : "", + dada_asv_table ? "asv_table_path='$dada_asv_table'" : "", + dada_asv_fa ? "path_asv_fa='$dada_asv_fa'": "", + dada_tab ? "path_dada2_tab='$dada_tab'" : "", + dada_stats ? "dada_stats_path='$dada_stats'" : "", + vsearch_cluster ? "vsearch_cluster='$vsearch_cluster',vsearch_cluster_id='$params.vsearch_cluster_id'" : "", + params.skip_barrnap ? "" : "path_barrnap_sum='$barrnap_summary'", + filter_ssu_stats ? "filter_ssu_stats='$filter_ssu_stats'" : "", + filter_ssu_asv ? "filter_ssu_asv='$filter_ssu_asv',filter_ssu='$params.filter_ssu'" : "", + filter_len_asv_stats ? "filter_len_asv='$filter_len_asv_stats'" : "", + filter_len_asv_len_orig ? "filter_len_asv_len_orig='$filter_len_asv_len_orig'" : "", + params.min_len_asv ? "min_len_asv=$params.min_len_asv" : "min_len_asv=0", + params.max_len_asv ? "max_len_asv=$params.max_len_asv" : "max_len_asv=0", + filter_codons_fasta ? "filter_codons_fasta='$filter_codons_fasta',stop_codons='$params.stop_codons'" : "", + filter_codons_stats ? "filter_codons_stats='$filter_codons_stats'" : "", + itsx_cutasv_summary ? "itsx_cutasv_summary='$itsx_cutasv_summary',cut_its='$params.cut_its'" : "", + dada2_tax ? "dada2_taxonomy='$dada2_tax'" : "", + dada2_tax && !params.dada_ref_tax_custom ? "dada2_ref_tax_title='${params.dada_ref_databases[params.dada_ref_taxonomy]["title"]}',dada2_ref_tax_file='${params.dada_ref_databases[params.dada_ref_taxonomy]["file"]}',dada2_ref_tax_citation='${params.dada_ref_databases[params.dada_ref_taxonomy]["citation"]}'" : "", + cut_dada_ref_taxonomy ? "cut_dada_ref_taxonomy='$cut_dada_ref_taxonomy'" : "", + sintax_tax ? "sintax_taxonomy='$sintax_tax',sintax_ref_tax_title='${params.sintax_ref_databases[params.sintax_ref_taxonomy]["title"]}',sintax_ref_tax_file='${params.sintax_ref_databases[params.sintax_ref_taxonomy]["file"]}',sintax_ref_tax_citation='${params.sintax_ref_databases[params.sintax_ref_taxonomy]["citation"]}'" : "", + kraken2_tax ? "kraken2_taxonomy='$kraken2_tax',kraken2_confidence='$params.kraken2_confidence'" : "", + kraken2_tax && !params.kraken2_ref_tax_custom ? "kraken2_ref_tax_title='${params.kraken2_ref_databases[params.kraken2_ref_taxonomy]["title"]}',kraken2_ref_tax_file='${params.kraken2_ref_databases[params.kraken2_ref_taxonomy]["file"]}',kraken2_ref_tax_citation='${params.kraken2_ref_databases[params.kraken2_ref_taxonomy]["citation"]}'" : "", + pplace_tax ? "pplace_taxonomy='$pplace_tax',pplace_heattree='$pplace_heattree'" : "", + qiime2_tax ? "qiime2_taxonomy='$qiime2_tax',qiime2_ref_tax_title='${params.qiime_ref_databases[params.qiime_ref_taxonomy]["title"]}',qiime2_ref_tax_file='${params.qiime_ref_databases[params.qiime_ref_taxonomy]["file"]}',qiime2_ref_tax_citation='${params.qiime_ref_databases[params.qiime_ref_taxonomy]["citation"]}'" : "", + run_qiime2 ? "val_used_taxonomy='$val_used_taxonomy'" : "", + filter_stats_tsv ? 
"filter_stats_tsv='$filter_stats_tsv',qiime2_filtertaxa='$qiime2_filtertaxa',exclude_taxa='$params.exclude_taxa',min_frequency='$params.min_frequency',min_samples='$params.min_samples'" : "", + barplot ? "barplot=TRUE" : "", + barplot && params.metadata_category_barplot ? "metadata_category_barplot='$params.metadata_category_barplot'" : "", + abundance_tables ? "abundance_tables=TRUE" : "", + alpha_rarefaction ? "alpha_rarefaction=TRUE" : "", + diversity_indices ? "diversity_indices_depth='$diversity_indices'": "", + diversity_indices_alpha ? "diversity_indices_alpha=TRUE" : "", + diversity_indices_beta ? "diversity_indices_beta='"+ diversity_indices_beta.join(",") +"'" : "", + diversity_indices_adonis ? "diversity_indices_adonis='"+ diversity_indices_adonis.join(",") +"',qiime_adonis_formula='$params.qiime_adonis_formula'" : "", + ancom ? "ancom='"+ ancom.join(",") +"'" : "", + sbdi ? "sbdi='"+ sbdi.join(",") +"'" : "", + phyloseq ? "phyloseq='"+ phyloseq.join(",") +"'" : "", + ] + // groovy list to R named list string; findAll removes empty entries + params_list_named_string = params_list_named.findAll().join(',').trim() + """ + #!/usr/bin/env Rscript + library(rmarkdown) + + # Work around https://github.com/rstudio/rmarkdown/issues/1508 + # If the symbolic link is not replaced by a physical file + # output- and temporary files will be written to the original directory. + file.copy("./${report_template}", "./template.Rmd", overwrite = TRUE) + + rmarkdown::render("template.Rmd", output_file = "summary_report.html", params = list($params_list_named_string), envir = new.env()) + + writeLines(c("\\"${task.process}\\":", + paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")), + paste0(" rmarkdown: ", packageVersion("rmarkdown")), + paste0(" knitr: ", packageVersion("knitr")) ), + "versions.yml") + """ +} diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index ebc87273..c9d014b1 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -2,10 +2,10 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda "bioconda::multiqc=1.14" + conda "bioconda::multiqc=1.15" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : - 'biocontainers/multiqc:1.14--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.15--pyhdfd78af_0' : + 'biocontainers/multiqc:1.15--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test new file mode 100644 index 00000000..eec1db10 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test @@ -0,0 +1,38 @@ +nextflow_process { + + name "Test Process CUSTOM_DUMPSOFTWAREVERSIONS" + script "../main.nf" + process "CUSTOM_DUMPSOFTWAREVERSIONS" + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "dumpsoftwareversions" + tag "custom/dumpsoftwareversions" + + test("Should run without failures") { + when { + process { + """ + def tool1_version = ''' + TOOL1: + tool1: 0.11.9 + '''.stripIndent() + + def tool2_version = ''' + TOOL2: + tool2: 1.9 + '''.stripIndent() + + input[0] = Channel.of(tool1_version, tool2_version).collectFile() + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap new file mode 100644 index 00000000..8713b921 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap @@ -0,0 +1,27 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + "software_versions.yml:md5,a027f820f30b8191a20ca16465daaf37" + ], + "1": [ + "software_versions_mqc.yml:md5,ee4a1d028ad29987f9ac511f4668f17c" + ], + "2": [ + "versions.yml:md5,f47ebd22aba1dd987b7e5d5247b766c3" + ], + "mqc_yml": [ + "software_versions_mqc.yml:md5,ee4a1d028ad29987f9ac511f4668f17c" + ], + "versions": [ + "versions.yml:md5,f47ebd22aba1dd987b7e5d5247b766c3" + ], + "yml": [ + "software_versions.yml:md5,a027f820f30b8191a20ca16465daaf37" + ] + } + ], + "timestamp": "2023-10-11T17:10:02.930699" + } +} diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf index 07d5e433..67209f79 100644 --- a/modules/nf-core/fastqc/main.nf +++ b/modules/nf-core/fastqc/main.nf @@ -2,10 +2,10 @@ process FASTQC { tag "$meta.id" label 'process_medium' - conda "bioconda::fastqc=0.11.9" + conda "bioconda::fastqc=0.12.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0' : - 'biocontainers/fastqc:0.11.9--0' }" + 'https://depot.galaxyproject.org/singularity/fastqc:0.12.1--hdfd78af_0' : + 'biocontainers/fastqc:0.12.1--hdfd78af_0' }" input: tuple val(meta), path(reads) @@ -29,7 +29,11 @@ process FASTQC { printf "%s %s\\n" $rename_to | while read old_name new_name; do [ -f "\${new_name}" ] || ln -s \$old_name \$new_name done - fastqc $args --threads $task.cpus $renamed_files + + fastqc \\ + $args \\ + --threads $task.cpus \\ + $renamed_files cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/fastqc/tests/main.nf.test b/modules/nf-core/fastqc/tests/main.nf.test new file mode 100644 index 00000000..6437a144 --- /dev/null +++ b/modules/nf-core/fastqc/tests/main.nf.test @@ -0,0 +1,41 @@ +nextflow_process { + + name "Test Process FASTQC" + script "../main.nf" + process "FASTQC" + tag "modules" + tag "modules_nfcore" + tag "fastqc" + + test("Single-Read") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id: 'test', single_end:true ], + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + // NOTE The report contains the date inside it, which means that the md5sum is stable per day, but not longer than that. So you can't md5sum it. + // looks like this:
+     //     Mon 2 Oct 2023
+     //     test.gz
    + // https://github.com/nf-core/modules/pull/3903#issuecomment-1743620039 + { assert process.out.html.get(0).get(1) ==~ ".*/test_fastqc.html" }, + { assert path(process.out.html.get(0).get(1)).getText().contains("File typeConventional base calls") }, + { assert snapshot(process.out.versions).match("versions") }, + { assert process.out.zip.get(0).get(1) ==~ ".*/test_fastqc.zip" } + ) + } + } +} diff --git a/modules/nf-core/fastqc/tests/main.nf.test.snap b/modules/nf-core/fastqc/tests/main.nf.test.snap new file mode 100644 index 00000000..636a32ce --- /dev/null +++ b/modules/nf-core/fastqc/tests/main.nf.test.snap @@ -0,0 +1,10 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "timestamp": "2023-10-09T23:40:54+0000" + } +} \ No newline at end of file diff --git a/modules/nf-core/kraken2/kraken2/kraken2-kraken2.diff b/modules/nf-core/kraken2/kraken2/kraken2-kraken2.diff new file mode 100644 index 00000000..4ad9339e --- /dev/null +++ b/modules/nf-core/kraken2/kraken2/kraken2-kraken2.diff @@ -0,0 +1,13 @@ +Changes in module 'nf-core/kraken2/kraken2' +--- modules/nf-core/kraken2/kraken2/main.nf ++++ modules/nf-core/kraken2/kraken2/main.nf +@@ -39,7 +39,6 @@ + --db $db \\ + --threads $task.cpus \\ + --report ${prefix}.kraken2.report.txt \\ +- --gzip-compressed \\ + $unclassified_option \\ + $classified_option \\ + $readclassification_option \\ + +************************************************************ diff --git a/modules/nf-core/kraken2/kraken2/main.nf b/modules/nf-core/kraken2/kraken2/main.nf new file mode 100644 index 00000000..1afdcfbe --- /dev/null +++ b/modules/nf-core/kraken2/kraken2/main.nf @@ -0,0 +1,57 @@ +process KRAKEN2_KRAKEN2 { + tag "$meta.id" + label 'process_high' + + conda "bioconda::kraken2=2.1.2 conda-forge::pigz=2.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' : + 'biocontainers/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' }" + + input: + tuple val(meta), path(reads) + path db + val save_output_fastqs + val save_reads_assignment + + output: + tuple val(meta), path('*.classified{.,_}*') , optional:true, emit: classified_reads_fastq + tuple val(meta), path('*.unclassified{.,_}*') , optional:true, emit: unclassified_reads_fastq + tuple val(meta), path('*classifiedreads.txt') , optional:true, emit: classified_reads_assignment + tuple val(meta), path('*report.txt') , emit: report + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def paired = meta.single_end ? "" : "--paired" + def classified = meta.single_end ? "${prefix}.classified.fastq" : "${prefix}.classified#.fastq" + def unclassified = meta.single_end ? "${prefix}.unclassified.fastq" : "${prefix}.unclassified#.fastq" + def classified_option = save_output_fastqs ? "--classified-out ${classified}" : "" + def unclassified_option = save_output_fastqs ? "--unclassified-out ${unclassified}" : "" + def readclassification_option = save_reads_assignment ? "--output ${prefix}.kraken2.classifiedreads.txt" : "--output /dev/null" + def compress_reads_command = save_output_fastqs ? 
"pigz -p $task.cpus *.fastq" : "" + + """ + kraken2 \\ + --db $db \\ + --threads $task.cpus \\ + --report ${prefix}.kraken2.report.txt \\ + $unclassified_option \\ + $classified_option \\ + $readclassification_option \\ + $paired \\ + $args \\ + $reads + + $compress_reads_command + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //; s/ .*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/kraken2/kraken2/meta.yml b/modules/nf-core/kraken2/kraken2/meta.yml new file mode 100644 index 00000000..4721f45b --- /dev/null +++ b/modules/nf-core/kraken2/kraken2/meta.yml @@ -0,0 +1,75 @@ +name: kraken2_kraken2 +description: Classifies metagenomic sequence data +keywords: + - classify + - metagenomics + - fastq + - db +tools: + - kraken2: + description: | + Kraken2 is a taxonomic sequence classifier that assigns taxonomic labels to sequence reads + homepage: https://ccb.jhu.edu/software/kraken2/ + documentation: https://github.com/DerrickWood/kraken2/wiki/Manual + doi: 10.1186/s13059-019-1891-0 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - db: + type: directory + description: Kraken2 database + - save_output_fastqs: + type: string + description: | + If true, optional commands are added to save classified and unclassified reads + as fastq files + - save_reads_assignment: + type: string + description: | + If true, an optional command is added to save a file reporting the taxonomic + classification of each input read +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - classified_reads_fastq: + type: file + description: | + Reads classified as belonging to any of the taxa + on the Kraken2 database. + pattern: "*{fastq.gz}" + - unclassified_reads_fastq: + type: file + description: | + Reads not classified to any of the taxa + on the Kraken2 database. + pattern: "*{fastq.gz}" + - classified_reads_assignment: + type: file + description: | + Kraken2 output file indicating the taxonomic assignment of + each input read + - report: + type: file + description: | + Kraken2 report containing stats about classified + and not classifed reads. + pattern: "*.{report.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/mafft/main.nf b/modules/nf-core/mafft/main.nf index 420b6484..9b7d27c5 100644 --- a/modules/nf-core/mafft/main.nf +++ b/modules/nf-core/mafft/main.nf @@ -2,10 +2,10 @@ process MAFFT { tag "$meta.id" label 'process_high' - conda "bioconda::mafft=7.508" + conda "bioconda::mafft=7.520" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/mafft:7.508--hec16e2b_0': - 'biocontainers/mafft:7.508--hec16e2b_0' }" + 'https://depot.galaxyproject.org/singularity/mafft:7.520--hec16e2b_1': + 'biocontainers/mafft:7.520--hec16e2b_1' }" input: tuple val(meta), path(fasta) diff --git a/modules/nf-core/mafft/meta.yml b/modules/nf-core/mafft/meta.yml index a366a4b8..7cbf1087 100644 --- a/modules/nf-core/mafft/meta.yml +++ b/modules/nf-core/mafft/meta.yml @@ -1,6 +1,7 @@ name: mafft description: Multiple sequence alignment using MAFFT keywords: + - fasta - msa - multiple sequence alignment tools: diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf index 1fc387be..65d7dd0d 100644 --- a/modules/nf-core/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -1,10 +1,10 @@ process MULTIQC { label 'process_single' - conda "bioconda::multiqc=1.14" + conda "bioconda::multiqc=1.15" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : - 'biocontainers/multiqc:1.14--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.15--pyhdfd78af_0' : + 'biocontainers/multiqc:1.15--pyhdfd78af_0' }" input: path multiqc_files, stageAs: "?/*" diff --git a/modules/nf-core/untar/main.nf b/modules/nf-core/untar/main.nf new file mode 100644 index 00000000..61461c39 --- /dev/null +++ b/modules/nf-core/untar/main.nf @@ -0,0 +1,63 @@ +process UNTAR { + tag "$archive" + label 'process_single' + + conda "conda-forge::sed=4.7 conda-forge::grep=3.11 conda-forge::tar=1.34" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("$prefix"), emit: untar + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + prefix = task.ext.prefix ?: ( meta.id ? "${meta.id}" : archive.baseName.toString().replaceFirst(/\.tar$/, "")) + + """ + mkdir $prefix + + ## Ensures --strip-components only applied when top level of tar contents is a directory + ## If just files or multiple directories, place all in prefix + if [[ \$(tar -taf ${archive} | grep -o -P "^.*?\\/" | uniq | wc -l) -eq 1 ]]; then + tar \\ + -C $prefix --strip-components 1 \\ + -xavf \\ + $args \\ + $archive \\ + $args2 + else + tar \\ + -C $prefix \\ + -xavf \\ + $args \\ + $archive \\ + $args2 + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: ( meta.id ? "${meta.id}" : archive.toString().replaceFirst(/\.[^\.]+(.gz)?$/, "")) + """ + mkdir $prefix + touch ${prefix}/file.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/untar/meta.yml b/modules/nf-core/untar/meta.yml new file mode 100644 index 00000000..db241a6e --- /dev/null +++ b/modules/nf-core/untar/meta.yml @@ -0,0 +1,41 @@ +name: untar +description: Extract files. +keywords: + - untar + - uncompress + - extract +tools: + - untar: + description: | + Extract tar.gz files. 
+ documentation: https://www.gnu.org/software/tar/manual/ + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - archive: + type: file + description: File to be untar + pattern: "*.{tar}.{gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - untar: + type: directory + description: Directory containing contents of archive + pattern: "*/" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@matthdsm" + - "@jfy133" diff --git a/modules/nf-core/vsearch/cluster/main.nf b/modules/nf-core/vsearch/cluster/main.nf new file mode 100644 index 00000000..922207aa --- /dev/null +++ b/modules/nf-core/vsearch/cluster/main.nf @@ -0,0 +1,76 @@ +process VSEARCH_CLUSTER { + tag "$meta.id" + label 'process_low' + + conda "bioconda::vsearch=2.21.1 bioconda::samtools=1.16.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-53dae514294fca7b44842b784ed85a5303ac2d80:7b3365d778c690ca79bc85aaaeb86bb39a2dec69-0': + 'biocontainers/mulled-v2-53dae514294fca7b44842b784ed85a5303ac2d80:7b3365d778c690ca79bc85aaaeb86bb39a2dec69-0' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path('*.aln.gz') , optional: true, emit: aln + tuple val(meta), path('*.biom.gz') , optional: true, emit: biom + tuple val(meta), path('*.mothur.tsv.gz') , optional: true, emit: mothur + tuple val(meta), path('*.otu.tsv.gz') , optional: true, emit: otu + tuple val(meta), path('*.bam') , optional: true, emit: bam + tuple val(meta), path('*.out.tsv.gz') , optional: true, emit: out + tuple val(meta), path('*.blast.tsv.gz') , optional: true, emit: blast + tuple val(meta), path('*.uc.tsv.gz') , optional: true, emit: uc + tuple val(meta), path('*.centroids.fasta.gz') , optional: true, emit: centroids + tuple val(meta), path('*.clusters.fasta*.gz') , optional: true, emit: clusters + tuple val(meta), path('*.profile.txt.gz') , optional: true, emit: profile + tuple val(meta), path('*.msa.fasta.gz') , optional: true, emit: msa + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def args3 = task.ext.args3 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + if (!args2.contains("--cluster_fast") && !args2.contains("--cluster_size") && !args2.contains("--cluster_smallmem") && !args2.contains("--cluster_unoise") ) { + error "Unknown clustering option provided (${args2})" + } + def out_ext = args3.contains("--alnout") ? "aln" : + args3.contains("--biomout") ? "biom" : + args3.contains("--blast6out") ? "blast.tsv" : + args3.contains("--centroids") ? "centroids.fasta" : + args3.contains("--clusters") ? "clusters.fasta" : + args3.contains("--mothur_shared_out") ? "mothur.tsv" : + args3.contains("--msaout") ? "msa.fasta" : + args3.contains("--otutabout") ? "otu.tsv" : + args3.contains("--profile") ? "profile.txt" : + args3.contains("--samout") ? "sam" : + args3.contains("--uc") ? "uc.tsv" : + args3.contains("--userout") ? 
"out.tsv" : + "" + if (out_ext == "") { error "Unknown output file format provided (${args3})" } + """ + vsearch \\ + $args2 $fasta \\ + $args3 ${prefix}.${out_ext} \\ + --threads $task.cpus \\ + $args + + if [[ $args3 == "--clusters" ]] + then + gzip -n ${prefix}.${out_ext}* + elif [[ $args3 != "--samout" ]] + then + gzip -n ${prefix}.${out_ext} + else + samtools view -T $fasta -S -b ${prefix}.${out_ext} > ${prefix}.bam + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + vsearch: \$(vsearch --version 2>&1 | head -n 1 | sed 's/vsearch //g' | sed 's/,.*//g' | sed 's/^v//' | sed 's/_.*//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/vsearch/cluster/meta.yml b/modules/nf-core/vsearch/cluster/meta.yml new file mode 100644 index 00000000..469eb8cd --- /dev/null +++ b/modules/nf-core/vsearch/cluster/meta.yml @@ -0,0 +1,69 @@ +name: "vsearch_cluster" +description: Cluster sequences using a single-pass, greedy centroid-based clustering algorithm. +keywords: + - vsearch + - clustering + - microbiome +tools: + - vsearch: + description: VSEARCH is a versatile open-source tool for microbiome analysis, including chimera detection, clustering, dereplication and rereplication, extraction, FASTA/FASTQ/SFF file processing, masking, orienting, pair-wise alignment, restriction site cutting, searching, shuffling, sorting, subsampling, and taxonomic classification of amplicon sequences for metagenomics, genomics, and population genetics. (USEARCH alternative) + homepage: https://github.com/torognes/vsearch + documentation: https://github.com/torognes/vsearch/releases/download/v2.21.1/vsearch_manual.pdf + tool_dev_url: https://github.com/torognes/vsearch + doi: 10.7717/peerj.2584 + licence: ["GPL v3-or-later OR BSD-2-clause"] + +input: + - meta: + type: map + description: Groovy Map containing sample information e.g. [ id:'test' ] + - fasta: + type: file + description: Sequences to cluster in FASTA format + pattern: "*.{fasta,fa,fasta.gz,fa.gz}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test' ] + - aln: + type: file + description: Results in pairwise alignment format + pattern: "*.aln.gz" + - biom: + type: file + description: Results in an OTU table in the biom version 1.0 file format + pattern: "*.biom.gz" + - mothur: + type: file + description: Results in an OTU table in the mothur ’shared’ tab-separated plain text file format + pattern: "*.mothur.tsv.gz" + - otu: + type: file + description: Results in an OTU table in the classic tab-separated plain text format + pattern: "*.otu.tsv.gz" + - bam: + type: file + description: Results written in bam format + pattern: "*.bam" + - out: + type: file + description: Results in tab-separated output, columns defined by user + pattern: "*.out.tsv.gz" + - blast: + type: file + description: Tab delimited results in blast-like tabular format + pattern: "*.blast.tsv.gz" + - uc: + type: file + description: Tab delimited results in a uclust-like format with 10 columns + pattern: "*.uc.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@mirpedrol" diff --git a/nextflow.config b/nextflow.config index a29f2d8b..7b90f100 100644 --- a/nextflow.config +++ b/nextflow.config @@ -11,6 +11,8 @@ params { // Input options input = null + input_fasta = null + input_folder = null extension = "/*_R{1,2}_001.fastq.gz" pacbio = false iontorrent = false @@ -26,6 +28,7 @@ params { trunclenr = null max_ee = 2 max_len = null + ignore_failed_filtering = false min_len = 50 metadata_category = null metadata_category_barplot = null @@ -69,6 +72,15 @@ params { pplace_name = null diversity_rarefaction_depth = 500 ancom_sample_min_count = 1 + vsearch_cluster = null + vsearch_cluster_id= 0.97 + + // Report options + report_template = "${projectDir}/assets/report_template.Rmd" + report_css = "${projectDir}/assets/nf-core_style.css" + report_logo = "${projectDir}/assets/nf-core-ampliseq_logo_light_long.png" + report_title = "Summary of analysis results" + report_abstract = null // Skipping options skip_cutadapt = false @@ -86,15 +98,20 @@ params { skip_diversity_indices = false skip_ancom = false skip_multiqc = false + skip_report = false // Database options - dada_ref_taxonomy = "silva=138" - dada_assign_taxlevels = null - dada_ref_tax_custom = null - dada_ref_tax_custom_sp = null - cut_dada_ref_taxonomy = false - sintax_ref_taxonomy = null - qiime_ref_taxonomy = null + dada_ref_taxonomy = "silva=138" + dada_assign_taxlevels = null + dada_ref_tax_custom = null + dada_ref_tax_custom_sp = null + cut_dada_ref_taxonomy = false + sintax_ref_taxonomy = null + qiime_ref_taxonomy = null + kraken2_ref_taxonomy = null + kraken2_assign_taxlevels = null + kraken2_ref_tax_custom = null + kraken2_confidence = 0 // MultiQC options multiqc_config = null @@ -105,7 +122,6 @@ params { // Boilerplate options outdir = null - tracedir = "${params.outdir}/pipeline_info" publish_dir_mode = 'copy' email = null email_on_fail = null @@ -114,18 +130,14 @@ params { hook_url = null help = false version = false - validate_params = true - show_hidden_params = false - schema_ignore_params = 'dada_ref_databases,qiime_ref_databases,sintax_ref_databases,igenomes_base' - // Config options + config_profile_name = null + config_profile_description = null custom_config_version = 'master' custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" - config_profile_description = null config_profile_contact = null config_profile_url = null - config_profile_name = null // Max resource 
options @@ -134,6 +146,13 @@ params { max_cpus = 16 max_time = '240.h' + // Schema validation default options + validationFailUnrecognisedParams = false + validationLenientMode = false + validationSchemaIgnoreParams = 'dada_ref_databases,qiime_ref_databases,sintax_ref_databases,kraken2_ref_databases,genomes,igenomes_base' + validationShowHiddenParams = false + validate_params = true + } // Load base.config by default for all pipelines @@ -153,13 +172,11 @@ try { // } catch (Exception e) { // System.err.println("WARNING: Could not load nf-core/config/ampliseq profiles: ${params.custom_config_base}/pipeline/ampliseq.config") // } - - profiles { debug { dumpHashes = true process.beforeScript = 'echo $HOSTNAME' - cleanup = false + cleanup = false } conda { conda.enabled = true @@ -232,6 +249,7 @@ profiles { } apptainer { apptainer.enabled = true + apptainer.autoMounts = true conda.enabled = false docker.enabled = false singularity.enabled = false @@ -241,8 +259,8 @@ profiles { } gitpod { executor.name = 'local' - executor.cpus = 16 - executor.memory = 60.GB + executor.cpus = 4 + executor.memory = 8.GB } test { includeConfig 'conf/test.config' } test_single { includeConfig 'conf/test_single.config' } @@ -251,6 +269,7 @@ profiles { test_pacbio_its { includeConfig 'conf/test_pacbio_its.config' } test_iontorrent { includeConfig 'conf/test_iontorrent.config' } test_fasta { includeConfig 'conf/test_fasta.config' } + test_failed { includeConfig 'conf/test_failed.config' } test_full { includeConfig 'conf/test_full.config' } test_reftaxcustom { includeConfig 'conf/test_reftaxcustom.config' } test_novaseq { includeConfig 'conf/test_novaseq.config' } @@ -258,6 +277,18 @@ profiles { test_sintax { includeConfig 'conf/test_sintax.config' } } +// Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile +// Will not be used unless Apptainer / Docker / Podman / Singularity are enabled +// Set to your registry if you have a mirror of containers +apptainer.registry = 'quay.io' +docker.registry = 'quay.io' +podman.registry = 'quay.io' +singularity.registry = 'quay.io' + +// Nextflow plugins +plugins { + id 'nf-validation' // Validation of pipeline parameters and creation of an input channel from a sample sheet +} // Export these variables to prevent local Python/R libraries from conflicting with those in the container // The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container. 
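Editorial note: the nextflow.config hunks above wire in the nf-validation plugin, new schema-validation toggles, and quay.io as the default container registry. As a minimal, hedged usage sketch (the sample sheet path is a placeholder and the primer, trimming, and reference-database options a real run needs are omitted), the new validation flags added in this changeset can be exercised directly on the command line:

```bash
# Illustrative invocation against the 2.7.0 release configured above.
# --validationFailUnrecognisedParams and --validationLenientMode are the
# nf-validation toggles introduced in this changeset; samplesheet.tsv is a placeholder.
nextflow run nf-core/ampliseq -r 2.7.0 \
    -profile docker \
    --input samplesheet.tsv \
    --outdir ./results \
    --validationFailUnrecognisedParams \
    --validationLenientMode
```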
@@ -273,29 +304,22 @@ env { // Capture exit codes from upstream processes when piping process.shell = ['/bin/bash', '-euo', 'pipefail'] -// Set default registry for Docker, Singularity and Podman independent of -profile -// Will not be used unless Docker, Singularity and Podman are enabled -// Set to your registry if you have a mirror of containers -docker.registry = 'quay.io' -podman.registry = 'quay.io' -singularity.registry = 'quay.io' - def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { enabled = true - file = "${params.tracedir}/execution_timeline_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/execution_timeline_${trace_timestamp}.html" } report { enabled = true - file = "${params.tracedir}/execution_report_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/execution_report_${trace_timestamp}.html" } trace { enabled = true - file = "${params.tracedir}/execution_trace_${trace_timestamp}.txt" + file = "${params.outdir}/pipeline_info/execution_trace_${trace_timestamp}.txt" } dag { enabled = true - file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/pipeline_dag_${trace_timestamp}.html" } manifest { @@ -305,8 +329,8 @@ manifest { description = """Amplicon sequencing analysis workflow using DADA2 and QIIME2""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '2.6.1' - doi = '10.3389/fmicb.2020.550420' + version = '2.7.0' + doi = '10.5281/zenodo.1493841' } // Load modules.config for DSL2 module specific options diff --git a/nextflow_schema.json b/nextflow_schema.json index 7d733cf8..9652e68b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -15,8 +15,23 @@ "type": "string", "mimetype": "text/tsv", "fa_icon": "fas fa-dna", - "description": "Either a tab-separated sample sheet, a fasta file, or a folder containing zipped FastQ files", - "help_text": "Points to the main pipeline input, one of the following:\n- folder containing compressed fastq files\n- sample sheet ending with `.tsv` that points towards compressed fastq files\n- fasta file ending with `.fasta`, `.fna` or `.fa` that will be taxonomically classified\n\nRelated parameters are:\n- `--pacbio` and `--iontorrent` if the sequencing data is PacBio data or IonTorrent data (default expected: paired-end Illumina data)\n- `--single_end` if the sequencing data is single-ended Illumina data (default expected: paired-end Illumina data)\n- `--multiple_sequencing_runs` (folder input only) if the sequencing data originates from multiple sequencing runs\n- `--extension` (folder input only) if the sequencing file names do not follow the default (`\"/*_R{1,2}_001.fastq.gz\"`)\n- `--dada_ref_taxonomy`, `--qiime_ref_taxonomy`, and/or `--sintax_ref_taxonomy` to choose an appropriate reference taxonomy for the type of amplicon (16S/18S/ITS/CO1). If the `--sintax_ref_taxonomy` is given, taxonomic assignment is performed using the USEARCH sintax method in addition to DADA2 assignTaxonomy (default: DADA2 assignTaxonomy and 16S rRNA sequence database)\n\n##### Folder containing zipped FastQ files\n\nFor example:\n\n```bash\n--input 'path/to/data'\n```\n\nExample for input data organization from one sequencing run with two samples, paired-end data:\n\n```bash\ndata\n \u251c\u2500sample1_1_L001_R1_001.fastq.gz\n \u251c\u2500sample1_1_L001_R2_001.fastq.gz\n \u251c\u2500sample2_1_L001_R1_001.fastq.gz\n \u2514\u2500sample2_1_L001_R2_001.fastq.gz\n```\n\nPlease note the following requirements:\n\n1. 
The path must be enclosed in quotes\n2. The folder must contain gzip compressed demultiplexed fastq files. If the file names do not follow the default (`\"/*_R{1,2}_001.fastq.gz\"`), please check `--extension`.\n3. Sample identifiers are extracted from file names, i.e. the string before the first underscore `_`, these must be unique\n4. If your data is scattered, produce a sample sheet\n5. All sequencing data should originate from one sequencing run, because processing relies on run-specific error models that are unreliable when data from several sequencing runs are mixed. Sequencing data originating from multiple sequencing runs requires additionally the parameter `--multiple_sequencing_runs` and a specific folder structure.\n\n##### Sample sheet\n\nThe sample sheet file is an alternative way to provide input reads, it must be a tab-separated file ending with `.tsv` that must have two to four columns with the following headers: \n- `sampleID` (required): Unique sample identifiers, any unique string (may not contain dots `.`, must not start with a number)\n- `forwardReads` (required): Paths to (forward) reads zipped FastQ files\n- `reverseReads` (optional): Paths to reverse reads zipped FastQ files, required if the data is paired-end\n- `run` (optional): If the data was produced by multiple sequencing runs, any string\n\nFor example:\n\n```bash\n--input 'path/to/samplesheet.tsv'\n```\n\n##### Fasta file\n\nWhen pointing at a file ending with `.fasta`, `.fna` or `.fa`, the containing sequences will be taxonomically classified. All other pipeline steps will be skipped.\n\nThe sequence header line may contain a description, that will be kept as part of the sequence name. However, tabs will be changed into spaces.\n\nThe fasta file input option can be used to taxonomically classify previously produced ASV/OTU sequences.\n\nFor example:\n\n```bash\n--input 'path/to/amplicon_sequences.fasta'\n```" + "description": "Path to tab-separated sample sheet", + "help_text": "Path to sample sheet, either tab-separated (.tsv), comma-separated (.csv), or in YAML format (.yml/.yaml), that points to compressed fastq files.\n\nThe sample sheet must have two to four tab-separated columns/entries with the following headers: \n- `sampleID` (required): Unique sample IDs, must start with a letter, and can only contain letters, numbers or underscores\n- `forwardReads` (required): Paths to (forward) reads zipped FastQ files\n- `reverseReads` (optional): Paths to reverse reads zipped FastQ files, required if the data is paired-end\n- `run` (optional): If the data was produced by multiple sequencing runs, any string\n\nRelated parameters are:\n- `--pacbio` and `--iontorrent` if the sequencing data is PacBio data or IonTorrent data (default expected: paired-end Illumina data)\n- `--single_end` if the sequencing data is single-ended Illumina data (default expected: paired-end Illumina data)\n- `--dada_ref_taxonomy`, `--qiime_ref_taxonomy`, and/or `--sintax_ref_taxonomy` to choose an appropriate reference taxonomy for the type of amplicon (16S/18S/ITS/CO1) (default: DADA2 assignTaxonomy and 16S rRNA sequence database)", + "schema": "assets/schema_input.json" + }, + "input_fasta": { + "type": "string", + "mimetype": "text/tsv", + "fa_icon": "fas fa-dna", + "description": "Path to ASV/OTU fasta file", + "help_text": "Path to fasta format file with sequences that will be taxonomically classified. 
The fasta file input option can be used to taxonomically classify previously produced ASV/OTU sequences.\n\nThe fasta sequence header line may contain a description, that will be kept as part of the sequence name. However, tabs will be changed into spaces.\n\nRelated parameters are:\n- `--dada_ref_taxonomy`, `--qiime_ref_taxonomy`, and/or `--sintax_ref_taxonomy` to choose an appropriate reference taxonomy for the type of amplicon (16S/18S/ITS/CO1) (default: DADA2 assignTaxonomy and 16S rRNA sequence database)" + }, + "input_folder": { + "type": "string", + "mimetype": "text/tsv", + "fa_icon": "fas fa-dna", + "description": "Path to folder containing zipped FastQ files", + "help_text": "Path to folder containing compressed fastq files.\n\nExample for input data organization from one sequencing run with two samples, paired-end data:\n\n```bash\ndata\n \u251c\u2500sample1_1_L001_R1_001.fastq.gz\n \u251c\u2500sample1_1_L001_R2_001.fastq.gz\n \u251c\u2500sample2_1_L001_R1_001.fastq.gz\n \u2514\u2500sample2_1_L001_R2_001.fastq.gz\n```\n\nPlease note the following requirements:\n\n1. The path must be enclosed in quotes\n2. The folder must contain gzip compressed demultiplexed fastq files. If the file names do not follow the default (`\"/*_R{1,2}_001.fastq.gz\"`), please check `--extension`.\n3. Sample identifiers are extracted from file names, i.e. the string before the first underscore `_`, these must be unique\n4. If your data is scattered, produce a sample sheet\n5. All sequencing data should originate from one sequencing run, because processing relies on run-specific error models that are unreliable when data from several sequencing runs are mixed. Sequencing data originating from multiple sequencing runs requires additionally the parameter `--multiple_sequencing_runs` and a specific folder structure.\n\nRelated parameters are:\n- `--pacbio` and `--iontorrent` if the sequencing data is PacBio data or IonTorrent data (default expected: paired-end Illumina data)\n- `--single_end` if the sequencing data is single-ended Illumina data (default expected: paired-end Illumina data)\n- `--multiple_sequencing_runs` if the sequencing data originates from multiple sequencing runs\n- `--extension` if the sequencing file names do not follow the default (`\"/*_R{1,2}_001.fastq.gz\"`)\n- `--dada_ref_taxonomy`, `--qiime_ref_taxonomy`, and/or `--sintax_ref_taxonomy` to choose an appropriate reference taxonomy for the type of amplicon (16S/18S/ITS/CO1) (default: DADA2 assignTaxonomy and 16S rRNA sequence database)" }, "FW_primer": { "type": "string", @@ -41,9 +56,16 @@ "format": "directory-path", "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", "fa_icon": "fas fa-folder-open" + }, + "email": { + "type": "string", + "description": "Email address for completion summary.", + "fa_icon": "fas fa-envelope", + "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. 
If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" } }, - "required": ["input", "outdir"], + "required": ["outdir"], "fa_icon": "fas fa-terminal" }, "sequencing_input": { @@ -66,7 +88,8 @@ }, "single_end": { "type": "boolean", - "description": "If data is single-ended Illumina reads instead of paired-end" + "description": "If data is single-ended Illumina reads instead of paired-end", + "help_text": "When using a sample sheet with `--input` containing forward and reverse reads, specifying `--single_end` will only extract forward reads and treat the data as single ended instead of extracting forward and reverse reads." }, "illumina_pe_its": { "type": "boolean", @@ -186,6 +209,11 @@ "description": "DADA2 read filtering option", "fa_icon": "fas fa-less-than-equal", "help_text": "Remove reads with length greater than `max_len` after trimming and truncation. Must be a positive integer." + }, + "ignore_failed_filtering": { + "type": "boolean", + "description": "Ignore files with too few reads after quality filtering.", + "help_text": "Ignore files with fewer reads than specified by `--min_read_counts` after trimming and continue the pipeline without those samples. Please review all quality trimming and filtering options before using this parameter. For example, one sample with shorter sequences than other samples might loose all sequences due to minimum length requirements by read truncation (see --trunclenf)." } } }, @@ -217,7 +245,7 @@ "properties": { "dada_ref_taxonomy": { "type": "string", - "help_text": "Choose any of the supported databases, and optionally also specify the version. Database and version are separated by an equal sign (`=`, e.g. `silva=138`) . This will download the desired database, format it to produce a file that is compatible with DADA2's assignTaxonomy and another file that is compatible with DADA2's addSpecies.\n\nThe following databases are supported:\n- GTDB - Genome Taxonomy Database - 16S rRNA\n- PR2 - Protist Reference Ribosomal Database - 18S rRNA\n- RDP - Ribosomal Database Project - 16S rRNA\n- SILVA ribosomal RNA gene database project - 16S rRNA\n- UNITE - eukaryotic nuclear ribosomal ITS region - ITS\n- COIDB - eukaryotic Cytochrome Oxidase I (COI) from The Barcode of Life Data System (BOLD) - COI\n\nGenerally, using `gtdb`, `pr2`, `rdp`, `sbdi-gtdb`, `silva`, `coidb`, `unite-fungi`, or `unite-alleuk` will select the most recent supported version. For details on what values are valid, please either use an invalid value such as `x` (causing the pipeline to send an error message with a list of all valid values) or see `conf/ref_databases.config`.\n\nPlease note that commercial/non-academic entities [require licensing](https://www.arb-silva.de/silva-license-information) for SILVA v132 database (non-default) but not from v138 on (default).", + "help_text": "Choose any of the supported databases, and optionally also specify the version. Database and version are separated by an equal sign (`=`, e.g. `silva=138`) . 
This will download the desired database, format it to produce a file that is compatible with DADA2's assignTaxonomy and another file that is compatible with DADA2's addSpecies.\n\nThe following databases are supported:\n- GTDB - Genome Taxonomy Database - 16S rRNA\n- SBDI-GTDB, a Sativa-vetted version of the GTDB 16S rRNA\n- PR2 - Protist Reference Ribosomal Database - 18S rRNA\n- RDP - Ribosomal Database Project - 16S rRNA\n- SILVA ribosomal RNA gene database project - 16S rRNA\n- UNITE - eukaryotic nuclear ribosomal ITS region - ITS\n- COIDB - eukaryotic Cytochrome Oxidase I (COI) from The Barcode of Life Data System (BOLD) - COI\n\nGenerally, using `gtdb`, `pr2`, `rdp`, `sbdi-gtdb`, `silva`, `coidb`, `unite-fungi`, or `unite-alleuk` will select the most recent supported version.\n\nPlease note that commercial/non-academic entities [require licensing](https://www.arb-silva.de/silva-license-information) for SILVA v132 database (non-default) but not from v138 on (default).", "description": "Name of supported database, and optionally also version number", "default": "silva=138", "enum": [ @@ -226,6 +254,7 @@ "gtdb=R05-RS95", "gtdb=R06-RS202", "gtdb=R07-RS207", + "gtdb=R08-RS214", "gtdb", "coidb", "coidb=221216", @@ -322,9 +351,43 @@ "description": "Path to QIIME2 trained classifier file (typically *-classifier.qza)", "help_text": "If you have trained a compatible classifier before, from sources such as SILVA (https://www.arb-silva.de/), Greengenes (http://greengenes.secondgenome.com/downloads) or RDP (https://rdp.cme.msu.edu/). \n\nFor example:\n\n```bash\n--classifier \"FW_primer-RV_primer-classifier.qza\"\n```\n\nPlease note the following requirements:\n\n1. The path must be enclosed in quotes\n2. The classifier is a Naive Bayes classifier produced by `qiime feature-classifier fit-classifier-naive-bayes` (e.g. by this pipeline)\n3. The primer pair for the amplicon PCR and the computing of the classifier are exactly the same (or full-length, potentially lower performance)\n4. The classifier has to be trained by the same version of scikit-learn as this version of the pipeline uses" }, + "kraken2_ref_taxonomy": { + "type": "string", + "help_text": "Choose any of the supported databases, and optionally also specify the version. Database and version are separated by an equal sign (`=`, e.g. `silve=138`) . This will download the desired database and initiate taxonomic classification with Kraken2 and the chosen database.\n\nConsider using `--kraken2_confidence` to set a confidence score threshold.\n\nThe following databases are supported:\n- RDP - Ribosomal Database Project - 16S rRNA\n- SILVA ribosomal RNA gene database project - 16S rRNA\n- Greengenes - 16S rRNA\n- Standard Kraken2 database (RefSeq archaea, bacteria, viral, plasmid, human, UniVec_Core) - any amplicon\n\nGenerally, using `rdp`, `silva`, `greengenes`, `standard` will select the most recent supported version.\n\nPlease note that commercial/non-academic entities [require licensing](https://www.arb-silva.de/silva-license-information) for SILVA v132 database (non-default) but not from v138 on.", + "description": "Name of supported database, and optionally also version number", + "enum": [ + "silva", + "silva=138", + "silva=132", + "rdp", + "rdp=18", + "greengenes", + "greengenes=13.5", + "standard", + "standard=20230605" + ] + }, + "kraken2_ref_tax_custom": { + "type": "string", + "help_text": "Is preferred over `--kraken2_ref_taxonomy`. Consider also setting `--kraken2_assign_taxlevels`. 
Can be compressed tar archive (.tar.gz|.tgz) or folder containing the database. See also https://benlangmead.github.io/aws-indexes/k2.", + "description": "Path to a custom Kraken2 reference taxonomy database (*.tar.gz|*.tgz archive or folder)" + }, + "kraken2_assign_taxlevels": { + "type": "string", + "help_text": "Typically useful when providing a custom Kraken2 reference taxonomy database with `--kraken2_ref_tax_custom`. In case a database is given with `--kraken2_ref_taxonomy`, the default taxonomic levels will be overwritten with `--kraken2_assign_taxlevels`.", + "description": "Comma separated list of taxonomic levels used in Kraken2. Will overwrite default values." + }, + "kraken2_confidence": { + "type": "number", + "default": 0, + "help_text": "Increasing the threshold will require more k-mers to match at a taxonomic levels and reduce the taxonomic levels shown until the threshold is met.", + "description": "Confidence score threshold for taxonomic classification.", + "minimum": 0, + "maximum": 1 + }, "sintax_ref_taxonomy": { "type": "string", - "help_text": "Choose any of the supported databases, and optionally also specify the version. Database and version are separated by an equal sign (`=`, e.g. `coidb=221216`) . This will download the desired database and initiate taxonomic classification with USEARCH sintax and the chosen database, which if needed is formatted to produce a file that is compatible with USEARCH sintax.\n\nThe following databases are supported:\n- COIDB - eukaryotic Cytochrome Oxidase I (COI) from The Barcode of Life Data System (BOLD) - COI\n- UNITE - eukaryotic nuclear ribosomal ITS region - ITS\n\nGenerally, using `coidb`, `unite-fungi`, or `unite-alleuk` will select the most recent supported version. For details on what values are valid, please either use an invalid value such as `x` (causing the pipeline to send an error message with a list of all valid values) or see `conf/ref_databases.config`.", + "help_text": "Choose any of the supported databases, and optionally also specify the version. Database and version are separated by an equal sign (`=`, e.g. `coidb=221216`) . This will download the desired database and initiate taxonomic classification with VSEARCH sintax and the chosen database, which if needed is formatted to produce a file that is compatible with VSEARCH sintax.\n\nThe following databases are supported:\n- COIDB - eukaryotic Cytochrome Oxidase I (COI) from The Barcode of Life Data System (BOLD) - COI\n- UNITE - eukaryotic nuclear ribosomal ITS region - ITS\n\nGenerally, using `coidb`, `unite-fungi`, or `unite-alleuk` will select the most recent supported version.", "description": "Name of supported database, and optionally also version number", "enum": [ "coidb", @@ -365,6 +428,19 @@ "description": "", "default": "", "properties": { + "vsearch_cluster": { + "type": "boolean", + "description": "Post-cluster ASVs with VSEARCH", + "help_text": "ASVs will be clustered with VSEARCH using the id value found in `--vsearch_cluster_id`." + }, + "vsearch_cluster_id": { + "type": "number", + "default": 0.97, + "minimum": 0.0, + "maximum": 1.0, + "description": "Pairwise Identity value used when post-clustering ASVs if `--vsearch_cluster` option is used (default: 0.97).", + "help_text": "Lowering or increasing this value can change the number ASVs left over after clustering." + }, "filter_ssu": { "type": "string", "description": "Enable SSU filtering. 
Comma separated list of kingdoms (domains) in Barrnap, a combination (or one) of \"bac\", \"arc\", \"mito\", and \"euk\". ASVs that have their lowest evalue in that kingdoms are kept.", @@ -496,6 +572,39 @@ } } }, + "pipeline_report": { + "title": "Pipeline summary report", + "type": "object", + "description": "", + "default": "", + "properties": { + "report_template": { + "type": "string", + "default": "${projectDir}/assets/report_template.Rmd", + "description": "Path to Markdown file (Rmd)" + }, + "report_css": { + "type": "string", + "default": "${projectDir}/assets/nf-core_style.css", + "description": "Path to style file (css)" + }, + "report_logo": { + "type": "string", + "default": "${projectDir}/assets/nf-core-ampliseq_logo_light_long.png", + "description": "Path to logo file (png)" + }, + "report_title": { + "type": "string", + "default": "Summary of analysis results", + "description": "String used as report title" + }, + "report_abstract": { + "type": "string", + "default": null, + "description": "Path to Markdown file (md) that replaces the 'Abstract' section" + } + } + }, "skipping_specific_steps": { "title": "Skipping specific steps", "type": "object", @@ -557,6 +666,10 @@ "skip_multiqc": { "type": "boolean", "description": "Skip MultiQC reporting" + }, + "skip_report": { + "type": "boolean", + "description": "Skip Markdown summary report" } } }, @@ -572,19 +685,6 @@ "default": 100, "description": "Specifies the random seed." }, - "email": { - "type": "string", - "description": "Email address for completion summary.", - "fa_icon": "fas fa-envelope", - "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" - }, - "show_hidden_params": { - "type": "boolean", - "fa_icon": "far fa-eye-slash", - "description": "Show all params when using `--help`", - "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." - }, "help": { "type": "boolean", "description": "Display help text.", @@ -643,6 +743,7 @@ }, "multiqc_config": { "type": "string", + "format": "file-path", "description": "Custom config file to supply to MultiQC.", "fa_icon": "fas fa-cog", "hidden": true @@ -658,19 +759,33 @@ "description": "Custom MultiQC yaml file containing HTML including a methods description.", "fa_icon": "fas fa-cog" }, - "tracedir": { - "type": "string", - "description": "Directory to keep pipeline Nextflow logs and reports.", - "default": "${params.outdir}/pipeline_info", - "fa_icon": "fas fa-cogs", - "hidden": true - }, "validate_params": { "type": "boolean", "description": "Boolean whether to validate parameters against the schema at runtime", "default": true, "fa_icon": "fas fa-check-square", "hidden": true + }, + "validationShowHiddenParams": { + "type": "boolean", + "fa_icon": "far fa-eye-slash", + "description": "Show all params when using `--help`", + "hidden": true, + "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." 
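Editorial note: the pipeline_report parameter group added above drives the new Markdown summary report (template, CSS, logo, title, and abstract), with --skip_report as the opt-out. A small hedged sketch of overriding the report defaults from the command line (file paths are placeholders; other options a real run requires are omitted):

```bash
# Illustrative only: customise the new summary report added in this changeset.
# my_abstract.md and samplesheet.tsv are placeholders.
nextflow run nf-core/ampliseq -r 2.7.0 \
    -profile docker \
    --input samplesheet.tsv \
    --outdir ./results \
    --report_title "16S amplicon survey, project X" \
    --report_abstract ./my_abstract.md
```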
+ }, + "validationFailUnrecognisedParams": { + "type": "boolean", + "fa_icon": "far fa-check-circle", + "description": "Validation of parameters fails when an unrecognised parameter is found.", + "hidden": true, + "help_text": "By default, when an unrecognised parameter is found, it returns a warning." + }, + "validationLenientMode": { + "type": "boolean", + "fa_icon": "far fa-check-circle", + "description": "Validation of parameters in lenient more.", + "hidden": true, + "help_text": "Allows string values that are parseable as numbers or booleans. For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode)." } } }, @@ -683,7 +798,7 @@ "properties": { "max_cpus": { "type": "integer", - "description": "Maximum number of CPUs that can be requested for any single job.", + "description": "Maximum number of CPUs that can be requested for any single job.", "default": 16, "fa_icon": "fas fa-microchip", "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" @@ -701,7 +816,7 @@ "description": "Maximum amount of time that can be requested for any single job.", "default": "240.h", "fa_icon": "far fa-clock", - "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", + "pattern": "^(\\d+\\.?\\s*(s|m|h|d|day)\\s*)+$", "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" } } @@ -786,6 +901,9 @@ { "$ref": "#/definitions/downstream_analysis" }, + { + "$ref": "#/definitions/pipeline_report" + }, { "$ref": "#/definitions/skipping_specific_steps" }, diff --git a/subworkflows/local/dada2_preprocessing.nf b/subworkflows/local/dada2_preprocessing.nf index fb2b44f3..b79b7035 100644 --- a/subworkflows/local/dada2_preprocessing.nf +++ b/subworkflows/local/dada2_preprocessing.nf @@ -41,10 +41,12 @@ workflow DADA2_PREPROCESSING { .set { ch_all_trimmed_reads } } + ch_DADA2_QUALITY1_SVG = Channel.empty() if ( !params.skip_dada_quality ) { DADA2_QUALITY1 ( ch_all_trimmed_reads.dump(tag: 'into_dada2_quality') ) ch_versions_dada2_preprocessing = ch_versions_dada2_preprocessing.mix(DADA2_QUALITY1.out.versions) DADA2_QUALITY1.out.warning.subscribe { if ( it.baseName.toString().startsWith("WARNING") ) log.warn it.baseName.toString().replace("WARNING ","DADA2_QUALITY1: ") } + ch_DADA2_QUALITY1_SVG = DADA2_QUALITY1.out.svg } //find truncation values in case they are not supplied @@ -72,20 +74,52 @@ workflow DADA2_PREPROCESSING { DADA2_FILTNTRIM ( ch_trimmed_reads.dump(tag: 'into_filtntrim') ) ch_versions_dada2_preprocessing = ch_versions_dada2_preprocessing.mix(DADA2_FILTNTRIM.out.versions.first()) + //Filter empty files + DADA2_FILTNTRIM.out.reads_logs_args + .branch { + failed: it[0].single_end ? 
it[1].countFastq() < params.min_read_counts : it[1][0].countFastq() < params.min_read_counts || it[1][1].countFastq() < params.min_read_counts + passed: true + } + .set { ch_dada2_filtntrim_results } + ch_dada2_filtntrim_results.passed.set { ch_dada2_filtntrim_results_passed } + ch_dada2_filtntrim_results.failed + .map { meta, reads, logs, args -> [ meta.id ] } + .collect() + .subscribe { + samples = it.join("\n") + if (params.ignore_failed_filtering) { + log.warn "The following samples had too few reads (<$params.min_read_counts) after quality filtering with DADA2:\n$samples\nIgnoring failed samples and continue!\n" + } else { + error("The following samples had too few reads (<$params.min_read_counts) after quality filtering with DADA2:\n$samples\nPlease check whether the correct primer sequences for trimming were supplied. Ignore that samples using `--ignore_failed_filtering` or adjust the threshold with `--min_read_counts`.") + } + } + + // Break apart the reads and logs so that only the samples + // which pass filtering are retained + ch_dada2_filtntrim_results_passed + .map{ meta, reads, logs, args -> [meta, reads] } + .set{ ch_dada2_filtntrim_reads_passed } + ch_dada2_filtntrim_results_passed + .map{ meta, reads, logs, args -> [meta, logs] } + .set{ ch_dada2_filtntrim_logs_passed } + ch_dada2_filtntrim_results_passed + .map{ meta, reads, logs, args -> args } + .set{ ch_dada2_filtntrim_args_passed } + //plot post-processing, aggregated quality profile for forward and reverse reads separately if (single_end) { - DADA2_FILTNTRIM.out.reads + ch_dada2_filtntrim_reads_passed .map { meta, reads -> [ reads ] } .collect() .map { reads -> [ "single_end", reads ] } .set { ch_all_preprocessed_reads } } else { - DADA2_FILTNTRIM.out.reads + ch_dada2_filtntrim_reads_passed .map { meta, reads -> [ reads[0] ] } .collect() .map { reads -> [ "FW", reads ] } .set { ch_all_preprocessed_fw } - DADA2_FILTNTRIM.out.reads + ch_dada2_filtntrim_reads_passed .map { meta, reads -> [ reads[1] ] } .collect() .map { reads -> [ "RV", reads ] } @@ -94,13 +128,17 @@ workflow DADA2_PREPROCESSING { .mix ( ch_all_preprocessed_rv ) .set { ch_all_preprocessed_reads } } + + ch_DADA2_QUALITY2_SVG = Channel.empty() if ( !params.skip_dada_quality ) { DADA2_QUALITY2 ( ch_all_preprocessed_reads.dump(tag: 'into_dada2_quality2') ) DADA2_QUALITY2.out.warning.subscribe { if ( it.baseName.toString().startsWith("WARNING") ) log.warn it.baseName.toString().replace("WARNING ","DADA2_QUALITY2: ") } + ch_DADA2_QUALITY2_SVG = DADA2_QUALITY2.out.svg } - //group by sequencing run - DADA2_FILTNTRIM.out.reads + //group reads by sequencing run + // 'groupTuple', 'size' or 'groupKey' should be used but to produce it we need to know how many elements to group but some can be lost here, so no way knowing before + ch_dada2_filtntrim_reads_passed .map { info, reads -> def meta = [:] @@ -117,8 +155,30 @@ workflow DADA2_PREPROCESSING { [ meta, reads.flatten().sort() ] } .set { ch_filt_reads } + //group logs by sequencing run + //for 'groupTuple', 'size' or 'groupKey' should be used but to produce it we need to know how many elements to group but some can be lost here, so no way knowing before + ch_dada2_filtntrim_logs_passed + .map { + info, reads -> + def meta = [:] + meta.run = info.run + meta.single_end = info.single_end + [ meta, reads, info.id ] } + .groupTuple(by: 0 ) + .map { + info, reads, ids -> + def meta = [:] + meta.run = info.run + meta.single_end = info.single_end + meta.id = ids.flatten().sort() + [ meta, reads.flatten().sort() ] } + 
.set { ch_filt_logs } + emit: - reads = ch_filt_reads - logs = DADA2_FILTNTRIM.out.log - versions = ch_versions_dada2_preprocessing + reads = ch_filt_reads + logs = ch_filt_logs + args = ch_dada2_filtntrim_args_passed + qc_svg = ch_DADA2_QUALITY1_SVG.collect() + qc_svg_preprocessed = ch_DADA2_QUALITY2_SVG.collect() + versions = ch_versions_dada2_preprocessing } diff --git a/subworkflows/local/dada2_taxonomy_wf.nf b/subworkflows/local/dada2_taxonomy_wf.nf index c5259e6c..9673b45e 100644 --- a/subworkflows/local/dada2_taxonomy_wf.nf +++ b/subworkflows/local/dada2_taxonomy_wf.nf @@ -104,6 +104,7 @@ workflow DADA2_TAXONOMY_WF { } emit: + cut_tax = params.cut_dada_ref_taxonomy ? CUTADAPT_TAXONOMY.out.log : [[],[]] tax = ch_dada2_tax versions = ch_versions_dada_taxonomy } diff --git a/subworkflows/local/kraken2_taxonomy_wf.nf b/subworkflows/local/kraken2_taxonomy_wf.nf new file mode 100644 index 00000000..f7d2ac48 --- /dev/null +++ b/subworkflows/local/kraken2_taxonomy_wf.nf @@ -0,0 +1,57 @@ +/* + * Taxonomic classification with Kraken2 + */ + +include { UNTAR } from '../../modules/nf-core/untar/main' +include { KRAKEN2_KRAKEN2 } from '../../modules/nf-core/kraken2/kraken2/main' +include { FORMAT_TAXRESULTS_KRAKEN2 } from '../../modules/local/format_taxresults_kraken2' + +workflow KRAKEN2_TAXONOMY_WF { + take: + ch_kraken2_ref_taxonomy + val_kraken2_ref_taxonomy + ch_fasta + kraken2_taxlevels + + main: + ch_versions_kraken2_taxonomy = Channel.empty() + + // format taxonomy file + ch_kraken2_ref_taxonomy + .branch { + tar: it.isFile() && ( it.getName().endsWith(".tar.gz") || it.getName().endsWith (".tgz") ) + dir: it.isDirectory() + failed: true + }.set { ch_kraken2_ref_taxonomy } + ch_kraken2_ref_taxonomy.failed.subscribe { error "$it is neither a directory nor a file that ends in '.tar.gz' or '.tgz'. Please review input." 
} + + UNTAR ( + ch_kraken2_ref_taxonomy.tar + .map { + db -> + def meta = [:] + meta.id = val_kraken2_ref_taxonomy + [ meta, db ] } ) + ch_kraken2db = UNTAR.out.untar.map{ it[1] } + ch_kraken2db = ch_kraken2db.mix(ch_kraken2_ref_taxonomy.dir) + + // search taxonomy database with kraken2 + ch_fasta + .map { + fasta -> + def meta = [:] + meta.id = "ASV_tax.${val_kraken2_ref_taxonomy}" + meta.single_end = true + [ meta, fasta ] } + .set { ch_fasta_kraken2 } + KRAKEN2_KRAKEN2( ch_fasta_kraken2, ch_kraken2db, false, true ) + ch_versions_kraken2_taxonomy = ch_versions_kraken2_taxonomy.mix(KRAKEN2_KRAKEN2.out.versions) + + // convert kraken2 output to ASV taxonomy table + FORMAT_TAXRESULTS_KRAKEN2( KRAKEN2_KRAKEN2.out.report, KRAKEN2_KRAKEN2.out.classified_reads_assignment, kraken2_taxlevels ) + + emit: + qiime2_tsv = FORMAT_TAXRESULTS_KRAKEN2.out.qiime2_tsv + tax_tsv = FORMAT_TAXRESULTS_KRAKEN2.out.tsv + versions = ch_versions_kraken2_taxonomy +} diff --git a/subworkflows/local/parse_input.nf b/subworkflows/local/parse_input.nf index 33e09978..ba8aa484 100644 --- a/subworkflows/local/parse_input.nf +++ b/subworkflows/local/parse_input.nf @@ -1,167 +1,85 @@ -// -// Check input samplesheet or folder and get read channels -// - -// Function to get list of [ meta, [ fastq_1, fastq_2 ] ] -def parse_samplesheet(LinkedHashMap row, single_end) { - //Check if samplesheet contains column sampleID & forwardReads - if (row.sampleID == null || row.forwardReads == null) { - error("ERROR: Please check input samplesheet -> Column 'sampleID' and 'forwardReads' are required but not detected.") - } - //Check if samplesheet contains a column for reverse reads - if (row.reverseReads == null && !single_end) { - error("ERROR: Please check input samplesheet -> Column 'reverseReads' is missing. In case you do have only single ended reads, please specify '--single_end', '--pacbio', or '--iontorrent'.") - } - //Check if samplesheet contains a column run and empty fields - if (row.run != null && row.run == "") { - error("ERROR: Please check input samplesheet -> Column 'run' contains an empty field. Either remove column 'run' or fill each field with a value.") - } - //read meta info - def meta = [:] - meta.id = row.sampleID - meta.single_end = single_end.toBoolean() - meta.run = row.run == null ? 
"1" : row.run - //read data info - def array = [] - if (!file(row.forwardReads).exists()) { - error("ERROR: Please check input samplesheet -> Forward read FastQ file does not exist!\n${row.forwardReads}") - } - if (meta.single_end) { - array = [ meta, file(row.forwardReads) ] - } else { - if (!file(row.reverseReads).exists()) { - error("ERROR: Please check input samplesheet -> Reverse read FastQ file does not exist!\n${row.reverseReads}") - } - array = [ meta, [ file(row.forwardReads), file(row.reverseReads) ] ] - } - return array -} - workflow PARSE_INPUT { take: - input // file.tsv or folder - is_fasta_input + input // folder single_end multiple_sequencing_runs extension main: - if ( is_fasta_input ) { - // Fasta input directely for classification - ch_fasta = Channel.fromPath(input, checkIfExists: true) - ch_reads_passed = Channel.empty() - } else { - ch_fasta = Channel.empty() - - if ( input.toString().toLowerCase().endsWith("tsv") ) { - // Sample sheet input + // Folder input - tsvFile = file(input).getName() - // extracts read files from TSV and distribute into channels - Channel - .fromPath(input) - .ifEmpty { error("Cannot find path file ${tsvFile}") } - .splitCsv(header:true, sep:'\t') - .map { parse_samplesheet(it, single_end) } - .set { ch_reads } - } else { - // Folder input - - //Check folders in folder when multiple_sequencing_runs - folders = multiple_sequencing_runs ? "/*" : "" - error_message = "\nCannot find any reads matching: \"${input}${folders}${extension}\"\n" - error_message += "Please revise the input folder (\"--input\"): \"${input}\"\n" - error_message += "and the input file pattern (\"--extension\"): \"${extension}\"\n" - error_message += "*Please note: Path needs to be enclosed in quotes!*\n" - error_message += multiple_sequencing_runs ? "If you do not have multiple sequencing runs, please do not use \"--multiple_sequencing_runs\"!\n" : "If you have multiple sequencing runs, please add \"--multiple_sequencing_runs\"!\n" - error_message += "In any case, please consult the pipeline documentation.\n" - if ( single_end ) { - //Get files - single end - Channel - .fromPath( input + folders + extension ) - .ifEmpty { error("${error_message}") } - .map { read -> - def meta = [:] - meta.id = read.baseName.toString().indexOf("_") != -1 ? read.baseName.toString().take(read.baseName.toString().indexOf("_")) : read.baseName - meta.single_end = single_end.toBoolean() - meta.run = multiple_sequencing_runs ? read.take(read.findLastIndexOf{"/"})[-1] : "1" - [ meta, read ] } - .set { ch_reads } - } else { - //Get files - paired end - Channel - .fromFilePairs( input + folders + extension, size: 2 ) - .ifEmpty { error("${error_message}") } - .map { name, reads -> - def meta = [:] - meta.id = name.toString().indexOf("_") != -1 ? name.toString().take(name.toString().indexOf("_")) : name - meta.single_end = single_end.toBoolean() - meta.run = multiple_sequencing_runs ? 
reads[0].take(reads[0].findLastIndexOf{"/"})[-1] : "1" - [ meta, reads ] } - .set { ch_reads } - } - if (multiple_sequencing_runs) { - //Get folder information - ch_reads - .flatMap { meta, reads -> [ meta.run ] } - .unique() - .set { ch_folders } - //Report folders with sequencing files - ch_folders - .collect() - .subscribe { - String folders = it.toString().replace("[", "").replace("]","") - log.info "\nFound the folder(s) \"$folders\" containing sequencing read files matching \"${extension}\" in \"${input}\".\n" } - //Stop if folder count is 1 and multiple_sequencing_runs - ch_folders - .count() - .subscribe { if ( it == 1 ) error("Found only one folder with read data but \"--multiple_sequencing_runs\" was specified. Please review data input.") } - } - } - - //Check whether all sampleID = meta.id are unique + //Check folders in folder when multiple_sequencing_runs + folders = multiple_sequencing_runs ? "/*" : "" + error_message = "\nCannot find any reads matching: \"${input}${folders}${extension}\"\n" + error_message += "Please revise the input folder (\"--input\"): \"${input}\"\n" + error_message += "and the input file pattern (\"--extension\"): \"${extension}\"\n" + error_message += "*Please note: Path needs to be enclosed in quotes!*\n" + error_message += multiple_sequencing_runs ? "If you do not have multiple sequencing runs, please do not use \"--multiple_sequencing_runs\"!\n" : "If you have multiple sequencing runs, please add \"--multiple_sequencing_runs\"!\n" + error_message += "In any case, please consult the pipeline documentation.\n" + if ( single_end ) { + //Get files - single end + Channel + .fromPath( input + folders + extension ) + .ifEmpty { error("${error_message}") } + .map { read -> + def meta = [:] + meta.id = read.baseName.toString().indexOf("_") != -1 ? read.baseName.toString().take(read.baseName.toString().indexOf("_")) : read.baseName + meta.single_end = single_end.toBoolean() + meta.run = multiple_sequencing_runs ? read.take(read.findLastIndexOf{"/"})[-1] : "1" + [ meta, read ] } + .set { ch_reads } + } else { + //Get files - paired end + Channel + .fromFilePairs( input + folders + extension, size: 2 ) + .ifEmpty { error("${error_message}") } + .map { name, reads -> + def meta = [:] + meta.id = name.toString().indexOf("_") != -1 ? name.toString().take(name.toString().indexOf("_")) : name + meta.single_end = single_end.toBoolean() + meta.run = multiple_sequencing_runs ? reads[0].take(reads[0].findLastIndexOf{"/"})[-1] : "1" + [ meta, reads ] } + .set { ch_reads } + } + if (multiple_sequencing_runs) { + //Get folder information ch_reads - .map { meta, reads -> [ meta.id ] } - .toList() + .flatMap { meta, reads -> [ meta.run ] } + .unique() + .set { ch_folders } + //Report folders with sequencing files + ch_folders + .collect() .subscribe { - if( it.size() != it.unique().size() ) { - ids = it.take(10); - error("Please review data input, sample IDs are not unique! First IDs are $ids") - } - } + String folders = it.toString().replace("[", "").replace("]","") + log.info "\nFound the folder(s) \"$folders\" containing sequencing read files matching \"${extension}\" in \"${input}\".\n" } + //Stop if folder count is 1 and multiple_sequencing_runs + ch_folders + .count() + .subscribe { if ( it == 1 ) error("Found only one folder with read data but \"--multiple_sequencing_runs\" was specified. Please review data input.") } + } - //Check that no dots "." 
are in sampleID - ch_reads - .map { meta, reads -> meta.id } - .subscribe { if ( "$it".contains(".") ) error("Please review data input, sampleIDs may not contain dots, but \"$it\" does.") } + //Check whether all sampleID = meta.id are unique + ch_reads + .map { meta, reads -> [ meta.id ] } + .toList() + .subscribe { + if( it.size() != it.unique().size() ) { + ids = it.take(10); + error("Please review data input, sample IDs are not unique! First IDs are $ids") + } + } - //Check that sampleIDs do not start with a number when using metadata (sampleID gets X prepended by R and metadata wont match any more!) - ch_reads - .map { meta, reads -> meta.id } - .subscribe { if ( params.metadata && "$it"[0].isNumber() ) error("Please review data input, sampleIDs may not start with a number, but \"$it\" does. The pipeline unintentionally modifies such strings and the metadata will not match any more.") } + //Check that no dots "." are in sampleID + ch_reads + .map { meta, reads -> meta.id } + .subscribe { if ( "$it".contains(".") ) error("Please review data input, sampleIDs may not contain dots, but \"$it\" does.") } - //Filter empty files - ch_reads.dump(tag:'parse_input.nf: ch_reads') - .branch { - failed: it[0].single_end ? it[1].countFastq() < params.min_read_counts : it[1][0].countFastq() < params.min_read_counts || it[1][1].countFastq() < params.min_read_counts - passed: true - } - .set { ch_reads_result } - ch_reads_result.passed.set { ch_reads_passed } - ch_reads_result.failed - .map { meta, reads -> [ meta.id ] } - .collect() - .subscribe { - samples = it.join("\n") - if (params.ignore_empty_input_files) { - log.warn "At least one input file for the following sample(s) had too few reads (<$params.min_read_counts):\n$samples\nThe threshold can be adjusted with `--min_read_counts`. Ignoring failed samples and continue!\n" - } else { - error("At least one input file for the following sample(s) had too few reads (<$params.min_read_counts):\n$samples\nEither remove those samples, adjust the threshold with `--min_read_counts`, or ignore that samples using `--ignore_empty_input_files`.") - } - } - } + //Check that sampleIDs do not start with a number when using metadata (sampleID gets X prepended by R and metadata wont match any more!) + ch_reads + .map { meta, reads -> meta.id } + .subscribe { if ( params.metadata && "$it"[0].isNumber() ) error("Please review data input, sampleIDs may not start with a number, but \"$it\" does. 
The pipeline unintentionally modifies such strings and the metadata will not match any more.") } emit: - reads = ch_reads_passed - fasta = ch_fasta + reads = ch_reads } diff --git a/subworkflows/local/phyloseq_workflow.nf b/subworkflows/local/phyloseq_workflow.nf new file mode 100644 index 00000000..adf208b7 --- /dev/null +++ b/subworkflows/local/phyloseq_workflow.nf @@ -0,0 +1,44 @@ +/* + * Create phyloseq objects + */ + +include { PHYLOSEQ } from '../../modules/local/phyloseq' +include { PHYLOSEQ_INASV } from '../../modules/local/phyloseq_inasv' + +workflow PHYLOSEQ_WORKFLOW { + take: + ch_tax + ch_tsv + ch_meta + ch_tree + run_qiime2 + + main: + if ( params.metadata ) { + ch_phyloseq_inmeta = ch_meta.first() // The .first() is to make sure it's a value channel + } else { + ch_phyloseq_inmeta = [] + } + + if ( params.pplace_tree ) { + ch_phyloseq_intree = ch_tree.map { it = it[1] }.first() + } else { + ch_phyloseq_intree = [] + } + + if ( run_qiime2 ) { + if ( params.exclude_taxa != "none" || params.min_frequency != 1 || params.min_samples != 1 ) { + ch_phyloseq_inasv = PHYLOSEQ_INASV ( ch_tsv ).tsv + } else { + ch_phyloseq_inasv = ch_tsv + } + } else { + ch_phyloseq_inasv = ch_tsv + } + + PHYLOSEQ ( ch_tax, ch_phyloseq_inasv, ch_phyloseq_inmeta, ch_phyloseq_intree ) + + emit: + rds = PHYLOSEQ.out.rds + versions= PHYLOSEQ.out.versions +} diff --git a/subworkflows/local/qiime2_ancom.nf b/subworkflows/local/qiime2_ancom.nf index af83733d..ce308d78 100644 --- a/subworkflows/local/qiime2_ancom.nf +++ b/subworkflows/local/qiime2_ancom.nf @@ -34,4 +34,7 @@ workflow QIIME2_ANCOM { QIIME2_ANCOM_TAX.out.ancom.subscribe { if ( it.baseName[0].toString().startsWith("WARNING") ) log.warn it.baseName[0].toString().replace("WARNING ","QIIME2_ANCOM_TAX: ") } QIIME2_ANCOM_ASV ( ch_metadata.combine( QIIME2_FILTERSAMPLES_ANCOM.out.qza.flatten() ) ) + + emit: + ancom = QIIME2_ANCOM_ASV.out.ancom.mix(QIIME2_ANCOM_TAX.out.ancom) } diff --git a/subworkflows/local/qiime2_diversity.nf b/subworkflows/local/qiime2_diversity.nf index b3d7f64b..02f0d91e 100644 --- a/subworkflows/local/qiime2_diversity.nf +++ b/subworkflows/local/qiime2_diversity.nf @@ -71,4 +71,11 @@ workflow QIIME2_DIVERSITY { .set{ ch_to_diversity_betaord } QIIME2_DIVERSITY_BETAORD ( ch_to_diversity_betaord ) } + + emit: + depth = !skip_diversity_indices ? QIIME2_DIVERSITY_CORE.out.depth : [] + alpha = !skip_diversity_indices ? QIIME2_DIVERSITY_ALPHA.out.alpha : [] + beta = !skip_diversity_indices ? QIIME2_DIVERSITY_BETA.out.beta : [] + betaord = !skip_diversity_indices ? QIIME2_DIVERSITY_BETAORD.out.beta : [] + adonis = !skip_diversity_indices && params.qiime_adonis_formula ? 
QIIME2_DIVERSITY_ADONIS.out.html : [] } diff --git a/subworkflows/nf-core/fasta_newick_epang_gappa/meta.yml b/subworkflows/nf-core/fasta_newick_epang_gappa/meta.yml index 2337aa58..60002c82 100644 --- a/subworkflows/nf-core/fasta_newick_epang_gappa/meta.yml +++ b/subworkflows/nf-core/fasta_newick_epang_gappa/meta.yml @@ -7,7 +7,7 @@ keywords: - alignment - fasta - newick -modules: +components: - hmmer/hmmbuild - hmmer/hmmalign - hmmer/eslalimask diff --git a/tests/pipeline/doubleprimers.nf.test b/tests/pipeline/doubleprimers.nf.test index cd810025..3925c988 100644 --- a/tests/pipeline/doubleprimers.nf.test +++ b/tests/pipeline/doubleprimers.nf.test @@ -29,11 +29,15 @@ nextflow_pipeline { path("$outputDir/dada2/DADA2_stats.tsv"), path("$outputDir/dada2/DADA2_table.rds"), path("$outputDir/dada2/DADA2_table.tsv")).match("dada2") }, - { assert new File("$outputDir/qiime2/input/rep-seqs.qza").exists() }, - { assert new File("$outputDir/qiime2/input/table.qza").exists() }, { assert snapshot(path("$outputDir/input/Samplesheet_double_primer.tsv")).match("input") }, + { assert new File("$outputDir/qiime2/abundance_tables/feature-table.tsv").exists() }, + { assert new File("$outputDir/phyloseq/kraken2_phyloseq.rds").exists() }, + { assert snapshot(path("$outputDir/kraken2/ASV_tax.greengenes.kraken2.classifiedreads.txt"), + path("$outputDir/kraken2/ASV_tax.greengenes.kraken2.complete.tsv"), + path("$outputDir/kraken2/ASV_tax.greengenes.kraken2.tsv")).match("kraken2") }, { assert snapshot(path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt"), - path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") } + path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") }, + { assert new File("$outputDir/summary_report/summary_report.html").exists() } ) } } diff --git a/tests/pipeline/doubleprimers.nf.test.snap b/tests/pipeline/doubleprimers.nf.test.snap index ff20b72e..7c59fc97 100644 --- a/tests/pipeline/doubleprimers.nf.test.snap +++ b/tests/pipeline/doubleprimers.nf.test.snap @@ -13,13 +13,13 @@ }, "software_versions": { "content": [ - "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_QUALITY1={R=4.1.1, ShortRead=1.52.0, dada2=1.22.0}, QIIME2_INSEQ={qiime2=2022.11.1}, RENAME_RAW_DATA_FILES={sed=4.7}, TRUNCLEN={pandas=1.1.5, python=3.9.1}, Workflow={nf-core/ampliseq=2.6.1}}" + "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.4, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.3.1, dada2=1.28.0}, DADA2_FILTNTRIM={R=4.3.1, dada2=1.28.0}, DADA2_QUALITY1={R=4.3.1, ShortRead=1.58.0, dada2=1.28.0}, FILTER_STATS={pandas=1.1.5, python=3.9.1}, KRAKEN2_KRAKEN2={kraken2=2.1.2, pigz=2.6}, PHYLOSEQ={R=4.3.0, phyloseq=1.44.0}, QIIME2_INSEQ={qiime2=2023.7.0}, RENAME_RAW_DATA_FILES={sed=4.7}, TRUNCLEN={pandas=1.1.5, python=3.9.1}, Workflow={nf-core/ampliseq=2.7.0}}" ], - "timestamp": "2023-05-28T21:08:54+0000" + "timestamp": "2023-07-27T13:49:03+0000" }, "overall_summary_tsv": { "content": [ - "overall_summary.tsv:md5,f0dd8c3dc3f45b84a1e32f88aabe7d08" + "overall_summary.tsv:md5,19d8b19b8f406d238011c2b3d384b7d3" ], "timestamp": "2023-05-28T21:08:54+0000" }, @@ -42,6 +42,14 @@ ], "timestamp": "2023-05-28T21:08:54+0000" }, + "kraken2": { + "content": [ + "ASV_tax.greengenes.kraken2.classifiedreads.txt:md5,9cbbe5e45608f14fe4d8ca32c3ad6518", + 
"ASV_tax.greengenes.kraken2.complete.tsv:md5,cbdd1db05ba897ac9972b106a8138956", + "ASV_tax.greengenes.kraken2.tsv:md5,87f748440bf5b3ce1c73029c495ff6e9" + ], + "timestamp": "2023-09-15T21:16:26+0000" + }, "multiqc": { "content": [ "multiqc_general_stats.txt:md5,8429be0a16adf09b6634bf31b430bfac", diff --git a/tests/pipeline/failed.nf.test b/tests/pipeline/failed.nf.test new file mode 100644 index 00000000..4c02b944 --- /dev/null +++ b/tests/pipeline/failed.nf.test @@ -0,0 +1,36 @@ +nextflow_pipeline { + + name "Test Workflow main.nf" + script "main.nf" + tag "test_failed" + tag "pipeline" + + test("Failing samples") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + assertAll( + { assert workflow.success }, + { assert new File("$outputDir/overall_summary.tsv").exists() }, + { assert new File("$outputDir/barrnap/summary.tsv").exists() }, + { assert new File("$outputDir/cutadapt/cutadapt_summary.tsv").exists() }, + { assert new File("$outputDir/dada2/DADA2_table.tsv").exists() }, + { assert new File("$outputDir/dada2/ASV_tax.user.tsv").exists() }, + { assert new File("$outputDir/qiime2/abundance_tables/count_table_filter_stats.tsv").exists() }, + { assert new File("$outputDir/qiime2/abundance_tables/feature-table.tsv").exists() }, + { assert new File("$outputDir/qiime2/ancom/Category-treatment1-ASV/ancom.tsv").exists() }, + { assert new File("$outputDir/qiime2/barplot/index.html").exists() }, + { assert new File("$outputDir/qiime2/alpha-rarefaction/index.html").exists() }, + { assert new File("$outputDir/qiime2/diversity/alpha_diversity/shannon_vector/kruskal-wallis-pairwise-treatment1.csv").exists() }, + { assert new File("$outputDir/qiime2/diversity/beta_diversity/bray_curtis_pcoa_results-PCoA/index.html").exists() }, + { assert new File("$outputDir/summary_report/summary_report.html").exists() }, + { assert new File("$outputDir/phyloseq/dada2_phyloseq.rds").exists() } + ) + } + } +} diff --git a/tests/pipeline/fasta.nf.test b/tests/pipeline/fasta.nf.test index 9daca857..35408ce6 100644 --- a/tests/pipeline/fasta.nf.test +++ b/tests/pipeline/fasta.nf.test @@ -17,15 +17,22 @@ nextflow_pipeline { assertAll( { assert workflow.success }, { assert snapshot(UTILS.removeNextflowVersion("$outputDir")).match("software_versions") }, + { assert snapshot(path("$outputDir/asv_length_filter/ASV_len_filt.tsv"), + path("$outputDir/asv_length_filter/ASV_len_orig.tsv"), + path("$outputDir/asv_length_filter/ASV_seqs.len.fasta"), + path("$outputDir/asv_length_filter/stats.len.tsv")).match("asv_length_filter") }, { assert snapshot(path("$outputDir/barrnap/rrna.arc.gff"), path("$outputDir/barrnap/rrna.bac.gff"), path("$outputDir/barrnap/rrna.euk.gff"), path("$outputDir/barrnap/rrna.mito.gff")).match("barrnap") }, + { assert snapshot(path("$outputDir/codon_filter/ASV_codon_filtered.fna"), + path("$outputDir/codon_filter/ASV_codon_filtered.list")).match("codon_filter") }, { assert new File("$outputDir/barrnap/summary.tsv").exists() }, { assert snapshot(path("$outputDir/dada2/ref_taxonomy.rdp_18.txt")).match("dada2") }, { assert new File("$outputDir/dada2/ASV_tax_species.rdp_18.tsv").exists() }, { assert new File("$outputDir/dada2/ASV_tax.rdp_18.tsv").exists() }, - { assert snapshot(path("$outputDir/input/ASV_seqs.fasta")).match("input") } + { assert snapshot(path("$outputDir/input/ASV_seqs.fasta")).match("input") }, + { assert new File("$outputDir/summary_report/summary_report.html").exists() } ) } } diff --git a/tests/pipeline/fasta.nf.test.snap b/tests/pipeline/fasta.nf.test.snap index 
93c68816..b8894c92 100644 --- a/tests/pipeline/fasta.nf.test.snap +++ b/tests/pipeline/fasta.nf.test.snap @@ -7,7 +7,7 @@ }, "software_versions": { "content": [ - "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, DADA2_TAXONOMY={R=4.1.1, dada2=1.22.0}, Workflow={nf-core/ampliseq=2.6.1}}" + "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.4, yaml=6.0}, DADA2_TAXONOMY={R=4.3.1, dada2=1.28.0}, FILTER_CODONS={pandas=1.1.5, python=3.9.1}, FILTER_LEN_ASV={Biostrings=2.58.0, R=4.0.3}, Workflow={nf-core/ampliseq=2.7.0}}" ], "timestamp": "2023-05-28T21:06:17+0000" }, @@ -17,6 +17,15 @@ ], "timestamp": "2023-05-28T21:06:17+0000" }, + "asv_length_filter": { + "content": [ + "ASV_len_filt.tsv:md5,0e5bd974219951f56db71843053ac796", + "ASV_len_orig.tsv:md5,09a300b31acf0eee817367dad5083f48", + "ASV_seqs.len.fasta:md5,9479b62fcff165963b33821e03fe7d7e", + "stats.len.tsv:md5,6cc39bcbd889634364499f3a5a82af35" + ], + "timestamp": "2023-08-31T13:06:17+0000" + }, "barrnap": { "content": [ "rrna.arc.gff:md5,6dae470aace9293d5eb8c318584852dd", @@ -25,5 +34,12 @@ "rrna.mito.gff:md5,df19e1b84ba6f691d20c72b397c88abf" ], "timestamp": "2023-05-28T21:06:17+0000" + }, + "codon_filter": { + "content": [ + "ASV_codon_filtered.fna:md5,9eac2d17e026ac8b1cba876ff3cac619", + "ASV_codon_filtered.list:md5,6cdde55829be1ebc5d73a17602e7c881" + ], + "timestamp": "2023-08-31T13:06:17+0000" } } diff --git a/tests/pipeline/iontorrent.nf.test b/tests/pipeline/iontorrent.nf.test index 9b73af86..200a9825 100644 --- a/tests/pipeline/iontorrent.nf.test +++ b/tests/pipeline/iontorrent.nf.test @@ -38,7 +38,9 @@ nextflow_pipeline { { assert snapshot(path("$outputDir/input/Samplesheet_it_SE_ITS.tsv")).match("input") }, { assert snapshot(path("$outputDir/multiqc/multiqc_data/multiqc_fastqc.txt"), path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt"), - path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") } + path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") }, + { assert new File("$outputDir/summary_report/summary_report.html").exists() }, + { assert new File("$outputDir/phyloseq/dada2_phyloseq.rds").exists() } ) } } diff --git a/tests/pipeline/iontorrent.nf.test.snap b/tests/pipeline/iontorrent.nf.test.snap index 3fe42493..6d36ce3b 100644 --- a/tests/pipeline/iontorrent.nf.test.snap +++ b/tests/pipeline/iontorrent.nf.test.snap @@ -13,7 +13,7 @@ }, "software_versions": { "content": [ - "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_QUALITY1={R=4.1.1, ShortRead=1.52.0, dada2=1.22.0}, DADA2_TAXONOMY={R=4.1.1, dada2=1.22.0}, FASTQC={fastqc=0.11.9}, RENAME_RAW_DATA_FILES={sed=4.7}, Workflow={nf-core/ampliseq=2.6.1}}" + "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.4, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.3.1, dada2=1.28.0}, DADA2_FILTNTRIM={R=4.3.1, dada2=1.28.0}, DADA2_QUALITY1={R=4.3.1, ShortRead=1.58.0, dada2=1.28.0}, DADA2_TAXONOMY={R=4.3.1, dada2=1.28.0}, FASTQC={fastqc=0.12.1}, PHYLOSEQ={R=4.3.0, phyloseq=1.44.0}, RENAME_RAW_DATA_FILES={sed=4.7}, Workflow={nf-core/ampliseq=2.7.0}}" ], "timestamp": "2023-06-20T01:42:35+0000" }, @@ -45,8 +45,8 @@ }, "multiqc": { "content": [ - "multiqc_fastqc.txt:md5,a64a5b74f7f336909d706a5747ad596d", - "multiqc_general_stats.txt:md5,e50f532aa2a52a79a2f76b66f970771c", + 
"multiqc_fastqc.txt:md5,6f1a5196d0840a35166b88e5b52bfcdd", + "multiqc_general_stats.txt:md5,8b11410948277038a1ba1a77bb30557f", "multiqc_cutadapt.txt:md5,9315dca2bd7b5476e54a9ccd8b1f24d5" ], "timestamp": "2023-06-20T01:42:35+0000" diff --git a/tests/pipeline/multi.nf.test b/tests/pipeline/multi.nf.test index e4fe28a0..3e01ff20 100644 --- a/tests/pipeline/multi.nf.test +++ b/tests/pipeline/multi.nf.test @@ -63,7 +63,9 @@ nextflow_pipeline { { assert new File("$outputDir/qiime2/representative_sequences/filtered-sequences.qza").exists() }, { assert new File("$outputDir/qiime2/representative_sequences/rep-seq.fasta").exists() }, { assert snapshot(path("$outputDir/qiime2/representative_sequences/descriptive_stats.tsv"), - path("$outputDir/qiime2/representative_sequences/seven_number_summary.tsv")).match("qiime2") } + path("$outputDir/qiime2/representative_sequences/seven_number_summary.tsv")).match("qiime2") }, + { assert new File("$outputDir/summary_report/summary_report.html").exists() }, + { assert new File("$outputDir/phyloseq/dada2_phyloseq.rds").exists() } ) } } diff --git a/tests/pipeline/multi.nf.test.snap b/tests/pipeline/multi.nf.test.snap index 82186c6a..98fae3c6 100644 --- a/tests/pipeline/multi.nf.test.snap +++ b/tests/pipeline/multi.nf.test.snap @@ -14,7 +14,7 @@ }, "software_versions": { "content": [ - "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_TAXONOMY={R=4.1.1, dada2=1.22.0}, FASTQC={fastqc=0.11.9}, FILTER_STATS={pandas=1.1.5, python=3.9.1}, QIIME2_INSEQ={qiime2=2022.11.1}, RENAME_RAW_DATA_FILES={sed=4.7}, Workflow={nf-core/ampliseq=2.6.1}}" + "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.4, yaml=6.0}, DADA2_DENOISING={R=4.3.1, dada2=1.28.0}, DADA2_FILTNTRIM={R=4.3.1, dada2=1.28.0}, DADA2_TAXONOMY={R=4.3.1, dada2=1.28.0}, FASTQC={fastqc=0.12.1}, FILTER_STATS={pandas=1.1.5, python=3.9.1}, PHYLOSEQ={R=4.3.0, phyloseq=1.44.0}, QIIME2_INSEQ={qiime2=2023.7.0}, RENAME_RAW_DATA_FILES={sed=4.7}, Workflow={nf-core/ampliseq=2.7.0}}" ], "timestamp": "2023-05-28T21:15:03+0000" }, @@ -47,7 +47,7 @@ "multiqc": { "content": [ "multiqc_general_stats.txt:md5,5692ee73c6933866807706d29b15c880", - "multiqc_fastqc.txt:md5,3a4417c7d95a9bbe17751dc974157cd3" + "multiqc_fastqc.txt:md5,147764e40079c3abf97a17cfe2275c52" ], "timestamp": "2023-05-28T21:15:03+0000" } diff --git a/tests/pipeline/novaseq.nf.test b/tests/pipeline/novaseq.nf.test index a2101d3d..a346898d 100644 --- a/tests/pipeline/novaseq.nf.test +++ b/tests/pipeline/novaseq.nf.test @@ -28,7 +28,8 @@ nextflow_pipeline { { assert new File("$outputDir/fastqc/S2_2_fastqc.html").exists() }, { assert snapshot(path("$outputDir/input/Samplesheet_novaseq.tsv")).match("input") }, { assert snapshot(path("$outputDir/multiqc/multiqc_data/multiqc_fastqc.txt"), - path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt")).match("multiqc") } + path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt")).match("multiqc") }, + { assert new File("$outputDir/summary_report/summary_report.html").exists() } ) } } diff --git a/tests/pipeline/novaseq.nf.test.snap b/tests/pipeline/novaseq.nf.test.snap index 25dcf446..8b56978a 100644 --- a/tests/pipeline/novaseq.nf.test.snap +++ b/tests/pipeline/novaseq.nf.test.snap @@ -7,7 +7,7 @@ }, "software_versions": { "content": [ - "{CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, 
DADA2_QUALITY1={R=4.1.1, ShortRead=1.52.0, dada2=1.22.0}, FASTQC={fastqc=0.11.9}, FILTER_CODONS={pandas=1.1.5, python=3.9.1}, RENAME_RAW_DATA_FILES={sed=4.7}, TRUNCLEN={pandas=1.1.5, python=3.9.1}, Workflow={nf-core/ampliseq=2.6.1}}" + "{CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.4, yaml=6.0}, DADA2_DENOISING={R=4.3.1, dada2=1.28.0}, DADA2_FILTNTRIM={R=4.3.1, dada2=1.28.0}, DADA2_QUALITY1={R=4.3.1, ShortRead=1.58.0, dada2=1.28.0}, FASTQC={fastqc=0.12.1}, FILTER_CODONS={pandas=1.1.5, python=3.9.1}, RENAME_RAW_DATA_FILES={sed=4.7}, TRUNCLEN={pandas=1.1.5, python=3.9.1}, Workflow={nf-core/ampliseq=2.7.0}}" ], "timestamp": "2023-06-20T00:10:02+0000" }, @@ -23,8 +23,8 @@ }, "multiqc": { "content": [ - "multiqc_fastqc.txt:md5,ac8744f9d5d4dc51957a719183074c2f", - "multiqc_general_stats.txt:md5,c4d6a79d9c271a72c2f6a7ef66fe39c6" + "multiqc_fastqc.txt:md5,a99548124efd185a7d4e33245edcb81b", + "multiqc_general_stats.txt:md5,195ce3e0c1b032459040613802d25d50" ], "timestamp": "2023-06-20T00:10:02+0000" } diff --git a/tests/pipeline/pacbio_its.nf.test b/tests/pipeline/pacbio_its.nf.test index 39e1d2a2..ffe4b31c 100644 --- a/tests/pipeline/pacbio_its.nf.test +++ b/tests/pipeline/pacbio_its.nf.test @@ -52,7 +52,9 @@ nextflow_pipeline { path("$outputDir/SBDI/emof.tsv"), path("$outputDir/SBDI/event.tsv")).match("SBDI") }, { assert new File("$outputDir/SBDI/annotation.tsv").exists() }, - { assert new File("$outputDir/SBDI/asv-table.tsv").exists() } + { assert new File("$outputDir/SBDI/asv-table.tsv").exists() }, + { assert new File("$outputDir/summary_report/summary_report.html").exists() }, + { assert new File("$outputDir/phyloseq/dada2_phyloseq.rds").exists() } ) } } diff --git a/tests/pipeline/pacbio_its.nf.test.snap b/tests/pipeline/pacbio_its.nf.test.snap index 6aacb139..f26be552 100644 --- a/tests/pipeline/pacbio_its.nf.test.snap +++ b/tests/pipeline/pacbio_its.nf.test.snap @@ -35,7 +35,7 @@ }, "software_versions": { "content": [ - "{ASSIGNSH={pandas=1.1.5, python=3.9.1}, BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_QUALITY1={R=4.1.1, ShortRead=1.52.0, dada2=1.22.0}, DADA2_TAXONOMY={R=4.1.1, dada2=1.22.0}, FASTQC={fastqc=0.11.9}, FORMAT_TAXRESULTS_STD={pandas=1.1.5, python=3.9.1}, ITSX_CUTASV={ITSx=1.1.3}, RENAME_RAW_DATA_FILES={sed=4.7}, SBDIEXPORT={R=3.6.3}, VSEARCH_USEARCHGLOBAL={vsearch=2.21.1}, Workflow={nf-core/ampliseq=2.6.1}}" + "{ASSIGNSH={pandas=1.1.5, python=3.9.1}, BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.4, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.3.1, dada2=1.28.0}, DADA2_FILTNTRIM={R=4.3.1, dada2=1.28.0}, DADA2_QUALITY1={R=4.3.1, ShortRead=1.58.0, dada2=1.28.0}, DADA2_TAXONOMY={R=4.3.1, dada2=1.28.0}, FASTQC={fastqc=0.12.1}, FORMAT_TAXRESULTS_STD={pandas=1.1.5, python=3.9.1}, ITSX_CUTASV={ITSx=1.1.3}, PHYLOSEQ={R=4.3.0, phyloseq=1.44.0}, RENAME_RAW_DATA_FILES={sed=4.7}, SBDIEXPORT={R=3.6.3}, VSEARCH_USEARCHGLOBAL={vsearch=2.21.1}, Workflow={nf-core/ampliseq=2.7.0}}" ], "timestamp": "2023-06-20T02:07:02+0000" }, @@ -67,8 +67,8 @@ }, "multiqc": { "content": [ - "multiqc_fastqc.txt:md5,d289cd1a2523f9ad66e93a895dcbe52c", - "multiqc_general_stats.txt:md5,05682be32bc30bc4610f0ca608cafe67", + "multiqc_fastqc.txt:md5,e646906d896ac4514235ae263566a1a8", + "multiqc_general_stats.txt:md5,07a1affcf11214b293ea804eb560a2fb", "multiqc_cutadapt.txt:md5,1b3b6833e78db31ab12e5c16b7fa1d73" ], "timestamp": 
"2023-06-20T02:07:02+0000" diff --git a/tests/pipeline/pplace.nf.test b/tests/pipeline/pplace.nf.test index b78c479b..564cf2b9 100644 --- a/tests/pipeline/pplace.nf.test +++ b/tests/pipeline/pplace.nf.test @@ -55,7 +55,9 @@ nextflow_pipeline { { assert new File("$outputDir/pplace/test_pplace.taxonomy.per_query.tsv").exists() }, { assert new File("$outputDir/pplace/test_pplace.graft.test_pplace.epa_result.newick").exists() }, { assert snapshot(path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt"), - path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") } + path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") }, + { assert new File("$outputDir/summary_report/summary_report.html").exists() }, + { assert new File("$outputDir/phyloseq/qiime2_phyloseq.rds").exists() } ) } } diff --git a/tests/pipeline/pplace.nf.test.snap b/tests/pipeline/pplace.nf.test.snap index 10810531..e16b3e86 100644 --- a/tests/pipeline/pplace.nf.test.snap +++ b/tests/pipeline/pplace.nf.test.snap @@ -8,7 +8,7 @@ }, "software_versions": { "content": [ - "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_QUALITY1={R=4.1.1, ShortRead=1.52.0, dada2=1.22.0}, EPANG_PLACE={epang=0.3.8}, FILTER_STATS={pandas=1.1.5, python=3.9.1}, GAPPA_ASSIGN={gappa=0.8.0}, GAPPA_GRAFT={gappa=0.8.0}, GAPPA_HEATTREE={gappa=0.8.0}, HMMER_AFAFORMATQUERY={hmmer/easel=0.48}, HMMER_AFAFORMATREF={hmmer/easel=0.48}, HMMER_HMMALIGNQUERY={hmmer=3.3.2}, HMMER_HMMALIGNREF={hmmer=3.3.2}, HMMER_HMMBUILD={hmmer=3.3.2}, HMMER_MASKQUERY={hmmer/easel=0.48}, HMMER_MASKREF={hmmer/easel=0.48}, HMMER_UNALIGNREF={hmmer/easel=0.48}, QIIME2_INSEQ={qiime2=2022.11.1}, RENAME_RAW_DATA_FILES={sed=4.7}, TRUNCLEN={pandas=1.1.5, python=3.9.1}, Workflow={nf-core/ampliseq=2.6.1}}" + "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.4, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.3.1, dada2=1.28.0}, DADA2_FILTNTRIM={R=4.3.1, dada2=1.28.0}, DADA2_QUALITY1={R=4.3.1, ShortRead=1.58.0, dada2=1.28.0}, EPANG_PLACE={epang=0.3.8}, FILTER_STATS={pandas=1.1.5, python=3.9.1}, GAPPA_ASSIGN={gappa=0.8.0}, GAPPA_GRAFT={gappa=0.8.0}, GAPPA_HEATTREE={gappa=0.8.0}, HMMER_AFAFORMATQUERY={hmmer/easel=0.48}, HMMER_AFAFORMATREF={hmmer/easel=0.48}, HMMER_HMMALIGNQUERY={hmmer=3.3.2}, HMMER_HMMALIGNREF={hmmer=3.3.2}, HMMER_HMMBUILD={hmmer=3.3.2}, HMMER_MASKQUERY={hmmer/easel=0.48}, HMMER_MASKREF={hmmer/easel=0.48}, HMMER_UNALIGNREF={hmmer/easel=0.48}, PHYLOSEQ={R=4.3.0, phyloseq=1.44.0}, QIIME2_INSEQ={qiime2=2023.7.0}, RENAME_RAW_DATA_FILES={sed=4.7}, TRUNCLEN={pandas=1.1.5, python=3.9.1}, Workflow={nf-core/ampliseq=2.7.0}}" ], "timestamp": "2023-06-20T17:24:03+0000" }, diff --git a/tests/pipeline/reftaxcustom.nf.test b/tests/pipeline/reftaxcustom.nf.test index 42e0d104..abd2a38a 100644 --- a/tests/pipeline/reftaxcustom.nf.test +++ b/tests/pipeline/reftaxcustom.nf.test @@ -41,9 +41,14 @@ nextflow_pipeline { { assert new File("$outputDir/fastqc/sampleID_2a_1_fastqc.html").exists() }, { assert new File("$outputDir/fastqc/sampleID_2a_2_fastqc.html").exists() }, { assert snapshot(path("$outputDir/input/Samplesheet.tsv")).match("input") }, + { assert snapshot(path("$outputDir/kraken2/ASV_tax.user.kraken2.classifiedreads.txt"), + path("$outputDir/kraken2/ASV_tax.user.kraken2.complete.tsv"), + path("$outputDir/kraken2/ASV_tax.user.kraken2.tsv")).match("kraken2") }, { 
assert snapshot(path("$outputDir/multiqc/multiqc_data/multiqc_fastqc.txt"), path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt"), - path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") } + path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") }, + { assert new File("$outputDir/summary_report/summary_report.html").exists() }, + { assert new File("$outputDir/phyloseq/dada2_phyloseq.rds").exists() } ) } } diff --git a/tests/pipeline/reftaxcustom.nf.test.snap b/tests/pipeline/reftaxcustom.nf.test.snap index f1f6527e..7ca4c208 100644 --- a/tests/pipeline/reftaxcustom.nf.test.snap +++ b/tests/pipeline/reftaxcustom.nf.test.snap @@ -13,7 +13,7 @@ }, "software_versions": { "content": [ - "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_QUALITY1={R=4.1.1, ShortRead=1.52.0, dada2=1.22.0}, DADA2_TAXONOMY={R=4.1.1, dada2=1.22.0}, FASTQC={fastqc=0.11.9}, RENAME_RAW_DATA_FILES={sed=4.7}, TRUNCLEN={pandas=1.1.5, python=3.9.1}, Workflow={nf-core/ampliseq=2.6.1}}" + "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.4, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.3.1, dada2=1.28.0}, DADA2_FILTNTRIM={R=4.3.1, dada2=1.28.0}, DADA2_QUALITY1={R=4.3.1, ShortRead=1.58.0, dada2=1.28.0}, DADA2_TAXONOMY={R=4.3.1, dada2=1.28.0}, FASTQC={fastqc=0.12.1}, KRAKEN2_KRAKEN2={kraken2=2.1.2, pigz=2.6}, PHYLOSEQ={R=4.3.0, phyloseq=1.44.0}, RENAME_RAW_DATA_FILES={sed=4.7}, TRUNCLEN={pandas=1.1.5, python=3.9.1}, Workflow={nf-core/ampliseq=2.7.0}}" ], "timestamp": "2023-05-28T21:18:54+0000" }, @@ -42,9 +42,17 @@ ], "timestamp": "2023-05-28T21:18:54+0000" }, + "kraken2": { + "content": [ + "ASV_tax.user.kraken2.classifiedreads.txt:md5,8a4693c37d5c24b342ef161b92567764", + "ASV_tax.user.kraken2.complete.tsv:md5,3613dac9ce1bf03f87b57d1523e705f1", + "ASV_tax.user.kraken2.tsv:md5,95c3f9daa5da8fe00159fb07d394c3ce" + ], + "timestamp": "2023-09-15T21:16:26+0000" + }, "multiqc": { "content": [ - "multiqc_fastqc.txt:md5,3a4417c7d95a9bbe17751dc974157cd3", + "multiqc_fastqc.txt:md5,147764e40079c3abf97a17cfe2275c52", "multiqc_general_stats.txt:md5,88c2b9e6d02b83afe4f9551e6c9a91a7", "multiqc_cutadapt.txt:md5,330a7b72dc671ca99fcb3fb84b6776c1" ], diff --git a/tests/pipeline/single.nf.test b/tests/pipeline/single.nf.test index be236c9a..11bb9156 100644 --- a/tests/pipeline/single.nf.test +++ b/tests/pipeline/single.nf.test @@ -24,10 +24,10 @@ nextflow_pipeline { path("$outputDir/barrnap/rrna.mito.gff")).match("barrnap") }, { assert new File("$outputDir/barrnap/summary.tsv").exists() }, { assert snapshot(path("$outputDir/cutadapt/cutadapt_summary.tsv")).match("cutadapt") }, - { assert new File("$outputDir/cutadapt/1a_S103_L001_R1_001.trimmed.cutadapt.log").exists() }, - { assert new File("$outputDir/cutadapt/1_S103_L001_R1_001.trimmed.cutadapt.log").exists() }, - { assert new File("$outputDir/cutadapt/2a_S115_L001_R1_001.trimmed.cutadapt.log").exists() }, - { assert new File("$outputDir/cutadapt/2_S115_L001_R1_001.trimmed.cutadapt.log").exists() }, + { assert new File("$outputDir/cutadapt/sampleID_1a.trimmed.cutadapt.log").exists() }, + { assert new File("$outputDir/cutadapt/sampleID_1.trimmed.cutadapt.log").exists() }, + { assert new File("$outputDir/cutadapt/sampleID_2a.trimmed.cutadapt.log").exists() }, + { assert new File("$outputDir/cutadapt/sampleID_2.trimmed.cutadapt.log").exists() }, { assert new 
File("$outputDir/cutadapt/assignTaxonomy.cutadapt.log").exists() }, { assert snapshot(path("$outputDir/dada2/ASV_seqs.fasta"), path("$outputDir/dada2/ASV_table.tsv"), @@ -37,14 +37,16 @@ nextflow_pipeline { path("$outputDir/dada2/DADA2_table.tsv")).match("dada2") }, { assert new File("$outputDir/dada2/ASV_tax.rdp_18.tsv").exists() }, { assert new File("$outputDir/dada2/ASV_tax_species.rdp_18.tsv").exists() }, - { assert new File("$outputDir/fastqc/1a_S103_L001_R1_001_fastqc.html").exists() }, - { assert new File("$outputDir/fastqc/1_S103_L001_R1_001_fastqc.html").exists() }, - { assert new File("$outputDir/fastqc/2a_S115_L001_R1_001_fastqc.html").exists() }, - { assert new File("$outputDir/fastqc/2_S115_L001_R1_001_fastqc.html").exists() }, - { assert snapshot(path("$outputDir/input/Samplesheet_single_end.tsv")).match("input") }, + { assert new File("$outputDir/fastqc/sampleID_1a_fastqc.html").exists() }, + { assert new File("$outputDir/fastqc/sampleID_1_fastqc.html").exists() }, + { assert new File("$outputDir/fastqc/sampleID_2a_fastqc.html").exists() }, + { assert new File("$outputDir/fastqc/sampleID_2_fastqc.html").exists() }, + { assert snapshot(path("$outputDir/input/Samplesheet.tsv")).match("input") }, { assert snapshot(path("$outputDir/multiqc/multiqc_data/multiqc_fastqc.txt"), path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt"), - path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") } + path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") }, + { assert new File("$outputDir/summary_report/summary_report.html").exists() }, + { assert new File("$outputDir/phyloseq/dada2_phyloseq.rds").exists() } ) } } diff --git a/tests/pipeline/single.nf.test.snap b/tests/pipeline/single.nf.test.snap index 86d628f1..77572d5d 100644 --- a/tests/pipeline/single.nf.test.snap +++ b/tests/pipeline/single.nf.test.snap @@ -1,38 +1,38 @@ { "input": { "content": [ - "Samplesheet_single_end.tsv:md5,71bcb9920b1187571ba9e2a5759ee4a5" + "Samplesheet.tsv:md5,dbf8d1a2b7933dab9e5a139f33c2b1f4" ], - "timestamp": "2023-05-28T20:35:33+0000" + "timestamp": "2023-08-16T20:35:33+0000" }, "cutadapt": { "content": [ - "cutadapt_summary.tsv:md5,5e5bfa4a7324a44f6d9e3cb0978ca291" + "cutadapt_summary.tsv:md5,cde6a72b1f0daccb7b69727834fbb9e5" ], - "timestamp": "2023-05-28T20:35:33+0000" + "timestamp": "2023-08-16T20:35:33+0000" }, "software_versions": { "content": [ - "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_QUALITY1={R=4.1.1, ShortRead=1.52.0, dada2=1.22.0}, DADA2_TAXONOMY={R=4.1.1, dada2=1.22.0}, FASTQC={fastqc=0.11.9}, RENAME_RAW_DATA_FILES={sed=4.7}, Workflow={nf-core/ampliseq=2.6.1}}" + "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.4, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.3.1, dada2=1.28.0}, DADA2_FILTNTRIM={R=4.3.1, dada2=1.28.0}, DADA2_QUALITY1={R=4.3.1, ShortRead=1.58.0, dada2=1.28.0}, DADA2_TAXONOMY={R=4.3.1, dada2=1.28.0}, FASTQC={fastqc=0.12.1}, PHYLOSEQ={R=4.3.0, phyloseq=1.44.0}, RENAME_RAW_DATA_FILES={sed=4.7}, Workflow={nf-core/ampliseq=2.7.0}}" ], "timestamp": "2023-05-28T20:35:33+0000" }, "overall_summary_tsv": { "content": [ - "overall_summary.tsv:md5,0feea9a92fde36cbf63dba6e63617c7e" + "overall_summary.tsv:md5,9c37a0292537273537640cdb0dd8fba5" ], - "timestamp": "2023-05-28T20:35:33+0000" + "timestamp": "2023-08-16T20:35:33+0000" }, "dada2": { 
"content": [ "ASV_seqs.fasta:md5,d452ff8b8a306b52ffc6db7e4396c6db", - "ASV_table.tsv:md5,a1226d8573fc0595161d4b2b5ac63cac", + "ASV_table.tsv:md5,06b93679e1f67a8707d2cc7edf345340", "ref_taxonomy.rdp_18.txt:md5,815c4fce9f3d1de019fb995a43fb66ed", - "DADA2_stats.tsv:md5,8386cc209c1f64237deeec79f75b075b", - "DADA2_table.rds:md5,aefd24f6ac2753a43baca19a93c4e2ee", - "DADA2_table.tsv:md5,3e5280fd5b36c943c0148c4d5b50cb65" + "DADA2_stats.tsv:md5,d4802595db56db3ae706f1650a774e5c", + "DADA2_table.rds:md5,a8e68947cb81f49a36d243619fe5e2f0", + "DADA2_table.tsv:md5,27c340a79b092d8ebea347f9d9324996" ], - "timestamp": "2023-05-28T20:35:33+0000" + "timestamp": "2023-08-16T20:35:33+0000" }, "barrnap": { "content": [ @@ -45,10 +45,10 @@ }, "multiqc": { "content": [ - "multiqc_fastqc.txt:md5,0ea2e6e2d327d66e778e9ff5d03d933b", - "multiqc_general_stats.txt:md5,5040629a246bb3288879d3d30e9d6f40", - "multiqc_cutadapt.txt:md5,48d079bea04fe93d260b980c15793a0c" + "multiqc_fastqc.txt:md5,ede83a16cac9730e6b961ed051c1de0e", + "multiqc_general_stats.txt:md5,d22c32eed33d046503751b23670db5e4", + "multiqc_cutadapt.txt:md5,4311a83074d94040405937e02773c5a9" ], - "timestamp": "2023-05-28T20:35:33+0000" + "timestamp": "2023-08-16T20:35:33+0000" } } diff --git a/tests/pipeline/sintax.nf.test b/tests/pipeline/sintax.nf.test index f6de2995..f4ff3a4f 100644 --- a/tests/pipeline/sintax.nf.test +++ b/tests/pipeline/sintax.nf.test @@ -65,7 +65,9 @@ nextflow_pipeline { { assert new File("$outputDir/sintax/ASV_tax_sintax.unite-fungi.tsv").exists() }, { assert new File("$outputDir/sintax/ref_taxonomy_sintax.txt").exists() }, { assert snapshot(path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt"), - path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") } + path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") }, + { assert new File("$outputDir/summary_report/summary_report.html").exists() }, + { assert new File("$outputDir/phyloseq/sintax_phyloseq.rds").exists() } ) } } diff --git a/tests/pipeline/sintax.nf.test.snap b/tests/pipeline/sintax.nf.test.snap index f4ba8fdd..1c19892e 100644 --- a/tests/pipeline/sintax.nf.test.snap +++ b/tests/pipeline/sintax.nf.test.snap @@ -16,7 +16,7 @@ }, "software_versions": { "content": [ - "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_QUALITY1={R=4.1.1, ShortRead=1.52.0, dada2=1.22.0}, FASTQC={fastqc=0.11.9}, FILTER_STATS={pandas=1.1.5, python=3.9.1}, ITSX_CUTASV={ITSx=1.1.3}, QIIME2_INSEQ={qiime2=2022.11.1}, RENAME_RAW_DATA_FILES={sed=4.7}, SBDIEXPORT={R=3.6.3}, VSEARCH_SINTAX={vsearch=2.21.1}, Workflow={nf-core/ampliseq=2.6.1}}" + "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.4, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.3.1, dada2=1.28.0}, DADA2_FILTNTRIM={R=4.3.1, dada2=1.28.0}, DADA2_QUALITY1={R=4.3.1, ShortRead=1.58.0, dada2=1.28.0}, FASTQC={fastqc=0.12.1}, FILTER_STATS={pandas=1.1.5, python=3.9.1}, ITSX_CUTASV={ITSx=1.1.3}, PHYLOSEQ={R=4.3.0, phyloseq=1.44.0}, QIIME2_INSEQ={qiime2=2023.7.0}, RENAME_RAW_DATA_FILES={sed=4.7}, SBDIEXPORT={R=3.6.3}, VSEARCH_SINTAX={vsearch=2.21.1}, Workflow={nf-core/ampliseq=2.7.0}}" ], "timestamp": "2023-06-20T16:40:18+0000" }, @@ -60,7 +60,7 @@ }, "multiqc": { "content": [ - "multiqc_general_stats.txt:md5,05682be32bc30bc4610f0ca608cafe67", + "multiqc_general_stats.txt:md5,07a1affcf11214b293ea804eb560a2fb", 
"multiqc_cutadapt.txt:md5,1b3b6833e78db31ab12e5c16b7fa1d73" ], "timestamp": "2023-06-20T16:40:18+0000" diff --git a/tests/pipeline/test.nf.test b/tests/pipeline/test.nf.test index 7b295941..f76bb324 100644 --- a/tests/pipeline/test.nf.test +++ b/tests/pipeline/test.nf.test @@ -35,12 +35,12 @@ nextflow_pipeline { { assert new File("$outputDir/cutadapt/sampleID_2.trimmed.cutadapt.log").exists() }, { assert snapshot(path("$outputDir/dada2/ASV_seqs.fasta"), path("$outputDir/dada2/ASV_table.tsv"), - path("$outputDir/dada2/ref_taxonomy.gtdb.txt"), + path("$outputDir/dada2/ref_taxonomy.gtdb_R07-RS207.txt"), path("$outputDir/dada2/DADA2_stats.tsv"), path("$outputDir/dada2/DADA2_table.rds"), path("$outputDir/dada2/DADA2_table.tsv")).match("dada2") }, - { assert new File("$outputDir/dada2/ASV_tax.gtdb.tsv").exists() }, - { assert new File("$outputDir/dada2/ASV_tax_species.gtdb.tsv").exists() }, + { assert new File("$outputDir/dada2/ASV_tax.gtdb_R07-RS207.tsv").exists() }, + { assert new File("$outputDir/dada2/ASV_tax_species.gtdb_R07-RS207.tsv").exists() }, { assert new File("$outputDir/fastqc/sampleID_1_1_fastqc.html").exists() }, { assert new File("$outputDir/fastqc/sampleID_1_2_fastqc.html").exists() }, { assert new File("$outputDir/fastqc/sampleID_1a_1_fastqc.html").exists() }, @@ -93,7 +93,13 @@ nextflow_pipeline { path("$outputDir/SBDI/emof.tsv"), path("$outputDir/SBDI/event.tsv")).match("SBDI") }, { assert new File("$outputDir/SBDI/annotation.tsv").exists() }, - { assert new File("$outputDir/SBDI/asv-table.tsv").exists() } + { assert new File("$outputDir/SBDI/asv-table.tsv").exists() }, + { assert new File("$outputDir/summary_report/summary_report.html").exists() }, + { assert new File("$outputDir/phyloseq/dada2_phyloseq.rds").exists() }, + { assert new File("$outputDir/phyloseq/qiime2_phyloseq.rds").exists() }, + { assert snapshot(path("$outputDir/vsearch_cluster/ASV_post_clustering_filtered.fna"), + path("$outputDir/vsearch_cluster/ASV_post_clustering_filtered.stats.tsv"), + path("$outputDir/vsearch_cluster/ASV_post_clustering_filtered.table.tsv")).match("vsearch_cluster") } ) } } diff --git a/tests/pipeline/test.nf.test.snap b/tests/pipeline/test.nf.test.snap index 11a200cb..d3b9e8ac 100644 --- a/tests/pipeline/test.nf.test.snap +++ b/tests/pipeline/test.nf.test.snap @@ -22,13 +22,13 @@ }, "software_versions": { "content": [ - "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_QUALITY1={R=4.1.1, ShortRead=1.52.0, dada2=1.22.0}, DADA2_TAXONOMY={R=4.1.1, dada2=1.22.0}, FASTQC={fastqc=0.11.9}, FILTER_LEN_ASV={Biostrings=2.58.0, R=4.0.3}, FILTER_STATS={pandas=1.1.5, python=3.9.1}, QIIME2_INSEQ={qiime2=2022.11.1}, RENAME_RAW_DATA_FILES={sed=4.7}, SBDIEXPORT={R=3.6.3}, TRUNCLEN={pandas=1.1.5, python=3.9.1}, Workflow={nf-core/ampliseq=2.6.1}}" + "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.4, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.3.1, dada2=1.28.0}, DADA2_FILTNTRIM={R=4.3.1, dada2=1.28.0}, DADA2_QUALITY1={R=4.3.1, ShortRead=1.58.0, dada2=1.28.0}, DADA2_TAXONOMY={R=4.3.1, dada2=1.28.0}, FASTQC={fastqc=0.12.1}, FILTER_CLUSTERS={pandas=1.1.5, python=3.9.1}, FILTER_LEN_ASV={Biostrings=2.58.0, R=4.0.3}, FILTER_STATS={pandas=1.1.5, python=3.9.1}, PHYLOSEQ={R=4.3.0, phyloseq=1.44.0}, QIIME2_INSEQ={qiime2=2023.7.0}, RENAME_RAW_DATA_FILES={sed=4.7}, SBDIEXPORT={R=3.6.3}, TRUNCLEN={pandas=1.1.5, python=3.9.1}, 
VSEARCH_CLUSTER={vsearch=2.21.1}, Workflow={nf-core/ampliseq=2.7.0}}" ], "timestamp": "2023-05-28T20:55:32+0000" }, "overall_summary_tsv": { "content": [ - "overall_summary.tsv:md5,42879bf840cad10ef96b1da77e0768b7" + "overall_summary.tsv:md5,a20dea9e5f4ca80723d31494399510fc" ], "timestamp": "2023-05-28T20:55:32+0000" }, @@ -36,7 +36,7 @@ "content": [ "ASV_seqs.fasta:md5,864c3e0dc9b4a7649beee0c8665dceb5", "ASV_table.tsv:md5,2618251e597593e4d716dd9bed095539", - "ref_taxonomy.gtdb.txt:md5,eb743d553579d464dcaa6bdd75f69ccd", + "ref_taxonomy.gtdb_R07-RS207.txt:md5,06b393cd18dfe6a26a8003dcef6c521c", "DADA2_stats.tsv:md5,54a1ac8d6c5a3ff15f700c4b2dd40c86", "DADA2_table.rds:md5,d095501019ce7ebccfa0eb801db1ed29", "DADA2_table.tsv:md5,5c9fb0bfd70da165f0ce6a361bfe0b43" @@ -45,11 +45,11 @@ }, "barrnap": { "content": [ - "ASV_seqs.ssu.fasta:md5,10f0d101b63c193b5899c704488f9001", - "ASV_table.ssu.tsv:md5,03556c7c2372ad6c3cecc7390ed9617d", - "rrna.arc.gff:md5,6dae470aace9293d5eb8c318584852dd", - "rrna.bac.gff:md5,439a9084f089120f700f938dfb58fa41", - "rrna.euk.gff:md5,c9bc1d9d8fb77dc19c95dee2d53840eb", + "ASV_seqs.ssu.fasta:md5,12ec676f3a7edeb075705430ad8c208e", + "ASV_table.ssu.tsv:md5,78df5df258b5e24b1fbcb97a3aeddec0", + "rrna.arc.gff:md5,e4c828506e3713c9c74a87d02c941440", + "rrna.bac.gff:md5,4a832ff1efb4c8a306c014481033cb09", + "rrna.euk.gff:md5,ad8b84800ff3f911db88657fd58b7cd9", "stats.ssu.tsv:md5,dbb23448395135334093edbb3efb1cd0", "rrna.mito.gff:md5,df19e1b84ba6f691d20c72b397c88abf" ], @@ -57,7 +57,7 @@ }, "multiqc": { "content": [ - "multiqc_fastqc.txt:md5,3a4417c7d95a9bbe17751dc974157cd3", + "multiqc_fastqc.txt:md5,147764e40079c3abf97a17cfe2275c52", "multiqc_general_stats.txt:md5,88c2b9e6d02b83afe4f9551e6c9a91a7", "multiqc_cutadapt.txt:md5,330a7b72dc671ca99fcb3fb84b6776c1" ], @@ -65,12 +65,20 @@ }, "asv_length_filter": { "content": [ - "ASV_len_filt.tsv:md5,a0a6de452999953191b5df29d0b3176a", - "ASV_len_orig.tsv:md5,09a300b31acf0eee817367dad5083f48", - "ASV_seqs.len.fasta:md5,04e8838f97cabf090b95891cde5c45b7", - "ASV_table.len.tsv:md5,8a68fdf1da3bbb0e24bfe4f90af0fbfd", + "ASV_len_filt.tsv:md5,3099f093f831e21397de2310d4c83ff4", + "ASV_len_orig.tsv:md5,5ad596085cdcb600ce32c09bc786a678", + "ASV_seqs.len.fasta:md5,1d61dbb2e90956a1f2698718a4dc48c5", + "ASV_table.len.tsv:md5,9464eadabcf39ed4b71a75957660e2a7", "stats.len.tsv:md5,9b13b65cf9cc5fd59f6d8717d26202ed" ], "timestamp": "2023-05-28T20:55:32+0000" + }, + "vsearch_cluster": { + "content": [ + "ASV_post_clustering_filtered.fna:md5,57379771c59df16d0c0164db34e8f148", + "ASV_post_clustering_filtered.stats.tsv:md5,222f6a7cdbf2d4b3f745a49359a61c0d", + "ASV_post_clustering_filtered.table.tsv:md5,e607469a32d490bbe139e9196ff4a47d" + ], + "timestamp": "2023-05-28T20:55:32+0000" } } diff --git a/workflows/ampliseq.nf b/workflows/ampliseq.nf index 03e5bf55..05ddfee7 100644 --- a/workflows/ampliseq.nf +++ b/workflows/ampliseq.nf @@ -1,21 +1,19 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - VALIDATE INPUTS + PRINT PARAMS SUMMARY ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) +include { paramsSummaryLog; paramsSummaryMap; fromSamplesheet } from 'plugin/nf-validation' -// Validate input parameters -WorkflowAmpliseq.initialise(params, log) +def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) +def citation = '\n' + WorkflowMain.citation(workflow) + '\n' +def summary_params = 
paramsSummaryMap(workflow) -// Check input path parameters to see if they exist -// params.input may be: folder, samplesheet, fasta file, and therefore should not appear here (because tests only for "file") -def checkPathParamList = [ params.multiqc_config, params.metadata, params.classifier ] -for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } +// Print parameter summary log to screen +log.info logo + paramsSummaryLog(workflow) + citation -// Check mandatory parameters -if (params.input) { ch_input = file(params.input) } else { error('Input samplesheet not specified!') } +WorkflowAmpliseq.initialise(params, log) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -49,7 +47,7 @@ if (params.dada_ref_tax_custom) { ch_assigntax = Channel.fromPath("${params.dada_ref_tax_custom}", checkIfExists: true) if (params.dada_ref_tax_custom_sp) { ch_addspecies = Channel.fromPath("${params.dada_ref_tax_custom_sp}", checkIfExists: true) - } + } else { ch_addspecies = Channel.empty() } ch_dada_ref_taxonomy = Channel.empty() val_dada_ref_taxonomy = "user" } else if (params.dada_ref_taxonomy && !params.skip_dada_taxonomy && !params.skip_taxonomy) { @@ -73,12 +71,27 @@ if (params.sintax_ref_taxonomy && !params.skip_taxonomy) { val_sintax_ref_taxonomy = "none" } +if (params.kraken2_ref_tax_custom) { + //custom ref taxonomy input from params.kraken2_ref_tax_custom + ch_kraken2_ref_taxonomy = Channel.fromPath("${params.kraken2_ref_tax_custom}", checkIfExists: true) + val_kraken2_ref_taxonomy = "user" +} else if (params.kraken2_ref_taxonomy && !params.skip_taxonomy) { + //standard ref taxonomy input from params.dada_ref_taxonomy & conf/ref_databases.config + ch_kraken2_ref_taxonomy = Channel.fromList(params.kraken2_ref_databases[params.kraken2_ref_taxonomy]["file"]).map { file(it) } + val_kraken2_ref_taxonomy = params.kraken2_ref_taxonomy.replace('=','_').replace('.','_') +} else { + ch_kraken2_ref_taxonomy = Channel.empty() + val_kraken2_ref_taxonomy = "none" +} + +// report sources +ch_report_template = Channel.fromPath("${params.report_template}", checkIfExists: true) +ch_report_css = Channel.fromPath("${params.report_css}", checkIfExists: true) +ch_report_logo = Channel.fromPath("${params.report_logo}", checkIfExists: true) +ch_report_abstract = params.report_abstract ? 
Channel.fromPath(params.report_abstract, checkIfExists: true) : [] // Set non-params Variables -String[] fasta_extensions = [".fasta", ".fna", ".fa"] // this is the alternative ASV fasta input -is_fasta_input = WorkflowAmpliseq.checkIfFileHasExtension( params.input.toString().toLowerCase(), fasta_extensions ) - single_end = params.single_end if (params.pacbio || params.iontorrent) { single_end = true @@ -86,15 +99,11 @@ if (params.pacbio || params.iontorrent) { trunclenf = params.trunclenf ?: 0 trunclenr = params.trunclenr ?: 0 -if ( !single_end && !params.illumina_pe_its && (params.trunclenf == null || params.trunclenr == null) && !is_fasta_input ) { +if ( !single_end && !params.illumina_pe_its && (params.trunclenf == null || params.trunclenr == null) && !params.input_fasta ) { find_truncation_values = true log.warn "No DADA2 cutoffs were specified (`--trunclenf` & `--trunclenr`), therefore reads will be truncated where median quality drops below ${params.trunc_qmin} (defined by `--trunc_qmin`) but at least a fraction of ${params.trunc_rmin} (defined by `--trunc_rmin`) of the reads will be retained.\nThe chosen cutoffs do not account for required overlap for merging, therefore DADA2 might have poor merging efficiency or even fail.\n" } else { find_truncation_values = false } -if ( !is_fasta_input && (!params.FW_primer || !params.RV_primer) && !params.skip_cutadapt ) { - error("Incompatible parameters: `--FW_primer` and `--RV_primer` are required for primer trimming. If primer trimming is not needed, use `--skip_cutadapt`.") -} - // save params to values to be able to overwrite it tax_agglom_min = params.tax_agglom_min tax_agglom_max = params.tax_agglom_max @@ -109,6 +118,10 @@ if ( params.sintax_ref_taxonomy ) { } else { sintax_taxlevels = "" } +if ( params.kraken2_ref_taxonomy ) { + kraken2_taxlevels = params.kraken2_assign_taxlevels ? "${params.kraken2_assign_taxlevels}" : + params.kraken2_ref_databases[params.kraken2_ref_taxonomy]["taxlevels"] ?: "" +} else { kraken2_taxlevels = params.kraken2_assign_taxlevels ? "${params.kraken2_assign_taxlevels}" : "" } //make sure that taxlevels adheres to requirements when mixed with addSpecies if ( params.dada_ref_taxonomy && !params.skip_dada_addspecies && !params.skip_dada_taxonomy && !params.skip_taxonomy && taxlevels ) { @@ -118,13 +131,16 @@ if ( params.dada_ref_taxonomy && !params.skip_dada_addspecies && !params.skip_da } //only run QIIME2 when taxonomy is actually calculated and all required data is available -if ( !(workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) && !params.skip_taxonomy && !params.skip_qiime && (!params.skip_dada_taxonomy || params.sintax_ref_taxonomy || params.qiime_ref_taxonomy) ) { +if ( !(workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) && !params.skip_taxonomy && !params.skip_qiime && (!params.skip_dada_taxonomy || params.sintax_ref_taxonomy || params.qiime_ref_taxonomy || params.kraken2_ref_taxonomy || params.kraken2_ref_tax_custom) ) { run_qiime2 = true } else { run_qiime2 = false if ( workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1 ) { log.warn "Conda or mamba is enabled, any steps involving QIIME2 are not available. Use a container engine instead of conda to enable all software." 
} } +// This tracks tax tables produced during pipeline and each table will be used during phyloseq +ch_tax_for_phyloseq = Channel.empty() + /* ======================================================================================== @@ -145,7 +161,7 @@ include { FILTER_SSU } from '../modules/local/filter_ssu' include { FILTER_LEN_ASV } from '../modules/local/filter_len_asv' include { MERGE_STATS as MERGE_STATS_FILTERSSU } from '../modules/local/merge_stats' include { MERGE_STATS as MERGE_STATS_FILTERLENASV } from '../modules/local/merge_stats' -include { MERGE_STATS as MERGE_STATS_CODONS } from '../modules/local/merge_stats' +include { MERGE_STATS as MERGE_STATS_CODONS } from '../modules/local/merge_stats' include { FILTER_CODONS } from '../modules/local/filter_codons' include { FORMAT_FASTAINPUT } from '../modules/local/format_fastainput' include { FORMAT_TAXONOMY } from '../modules/local/format_taxonomy' @@ -165,6 +181,10 @@ include { QIIME2_INTAX } from '../modules/local/qiime2_intax' include { PICRUST } from '../modules/local/picrust' include { SBDIEXPORT } from '../modules/local/sbdiexport' include { SBDIEXPORTREANNOTATE } from '../modules/local/sbdiexportreannotate' +include { SUMMARY_REPORT } from '../modules/local/summary_report' +include { PHYLOSEQ_INTAX as PHYLOSEQ_INTAX_PPLACE } from '../modules/local/phyloseq_intax' +include { PHYLOSEQ_INTAX as PHYLOSEQ_INTAX_QIIME2 } from '../modules/local/phyloseq_intax' +include { FILTER_CLUSTERS } from '../modules/local/filter_clusters' // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules @@ -177,10 +197,12 @@ include { QIIME2_TAXONOMY } from '../subworkflows/local/qiime2_tax include { CUTADAPT_WORKFLOW } from '../subworkflows/local/cutadapt_workflow' include { DADA2_TAXONOMY_WF } from '../subworkflows/local/dada2_taxonomy_wf' include { SINTAX_TAXONOMY_WF } from '../subworkflows/local/sintax_taxonomy_wf' +include { KRAKEN2_TAXONOMY_WF } from '../subworkflows/local/kraken2_taxonomy_wf' include { QIIME2_EXPORT } from '../subworkflows/local/qiime2_export' include { QIIME2_BARPLOTAVG } from '../subworkflows/local/qiime2_barplotavg' include { QIIME2_DIVERSITY } from '../subworkflows/local/qiime2_diversity' include { QIIME2_ANCOM } from '../subworkflows/local/qiime2_ancom' +include { PHYLOSEQ_WORKFLOW } from '../subworkflows/local/phyloseq_workflow' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -195,6 +217,7 @@ include { QIIME2_ANCOM } from '../subworkflows/local/qiime2_anc include { FASTQC } from '../modules/nf-core/fastqc/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' +include { VSEARCH_CLUSTER } from '../modules/nf-core/vsearch/cluster/main' include { FASTA_NEWICK_EPANG_GAPPA } from '../subworkflows/nf-core/fasta_newick_epang_gappa/main' @@ -212,10 +235,49 @@ workflow AMPLISEQ { ch_versions = Channel.empty() // - // Create a channel for input read files + // Create input channels // - PARSE_INPUT ( params.input, is_fasta_input, single_end, params.multiple_sequencing_runs, params.extension ) - ch_reads = PARSE_INPUT.out.reads + ch_input_fasta = Channel.empty() + ch_input_reads = Channel.empty() + if ( params.input ) { + // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/ + ch_input_reads = Channel.fromSamplesheet("input") + .map{ meta, readfw, readrv -> + meta.single_end = single_end.toBoolean() + def reads = single_end 
? readfw : [readfw,readrv] + if ( !meta.single_end && !readrv ) { error("Entry `reverseReads` is missing in $params.input for $meta.id, either correct the samplesheet or use `--single_end`, `--pacbio`, or `--iontorrent`") } // make sure that reverse reads are present when single_end isn't specified + if ( !meta.single_end && ( readfw.getSimpleName() == meta.id || readrv.getSimpleName() == meta.id ) ) { error("Entry `sampleID` cannot be identical to simple name of `forwardReads` or `reverseReads`, please change `sampleID` in $params.input for sample $meta.id") } // sample name and any file name without extensions aren't identical, because rename_raw_data_files.nf would forward 3 files (2 renamed +1 input) instead of 2 in that case + if ( meta.single_end && ( readfw.getSimpleName() == meta.id+"_1" || readfw.getSimpleName() == meta.id+"_2" ) ) { error("Entry `sampleID`+ `_1` or `_2` cannot be identical to simple name of `forwardReads`, please change `sampleID` in $params.input for sample $meta.id") } // sample name and file name without extensions aren't identical, because rename_raw_data_files.nf would forward 2 files (1 renamed +1 input) instead of 1 in that case + return [meta, reads] } + } else if ( params.input_fasta ) { + ch_input_fasta = Channel.fromPath(params.input_fasta, checkIfExists: true) + } else if ( params.input_folder ) { + PARSE_INPUT ( params.input_folder, single_end, params.multiple_sequencing_runs, params.extension ) + ch_input_reads = PARSE_INPUT.out.reads + } else { + error("One of `--input`, `--input_fasta`, `--input_folder` must be provided!") + } + + //Filter empty files + ch_input_reads.dump(tag:'ch_input_reads') + .branch { + failed: it[0].single_end ? it[1].countFastq() < params.min_read_counts : it[1][0].countFastq() < params.min_read_counts || it[1][1].countFastq() < params.min_read_counts + passed: true + } + .set { ch_reads_result } + ch_reads_result.passed.set { ch_reads } + ch_reads_result.failed + .map { meta, reads -> [ meta.id ] } + .collect() + .subscribe { + samples = it.join("\n") + if (params.ignore_empty_input_files) { + log.warn "At least one input file for the following sample(s) had too few reads (<$params.min_read_counts):\n$samples\nThe threshold can be adjusted with `--min_read_counts`. 
Ignoring failed samples and continuing!\n" + } else { + error("At least one input file for the following sample(s) had too few reads (<$params.min_read_counts):\n$samples\nEither remove those samples, adjust the threshold with `--min_read_counts`, or ignore those samples using `--ignore_empty_input_files`.") + } + } + ch_reads.dump(tag: 'ch_reads') // // MODULE: Rename files @@ -282,20 +344,6 @@ workflow AMPLISEQ { //group by sequencing run & group by meta DADA2_PREPROCESSING.out.logs - .map { - info, reads -> - def meta = [:] - meta.run = info.run - meta.single_end = info.single_end - [ meta, reads, info.id ] } - .groupTuple(by: 0 ) - .map { - info, reads, ids -> - def meta = [:] - meta.run = info.run - meta.single_end = info.single_end - meta.id = ids.flatten().sort() - [ meta, reads.flatten().sort() ] } .join( DADA2_DENOISING.out.denoised ) .join( DADA2_DENOISING.out.mergers ) .join( DADA2_RMCHIMERA.out.rds ) @@ -315,18 +363,40 @@ workflow AMPLISEQ { ch_stats = DADA2_MERGE.out.dada2stats } - + // + // MODULE : ASV post-clustering with VSEARCH + // + if (params.vsearch_cluster) { + ch_fasta_for_clustering = DADA2_MERGE.out.fasta + .map { + fasta -> + def meta = [:] + meta.id = "ASV_post_clustering" + [ meta, fasta ] } + VSEARCH_CLUSTER ( ch_fasta_for_clustering ) + ch_versions = ch_versions.mix(VSEARCH_CLUSTER.out.versions.ifEmpty(null)) + FILTER_CLUSTERS ( VSEARCH_CLUSTER.out.clusters, DADA2_MERGE.out.asv ) + ch_versions = ch_versions.mix(FILTER_CLUSTERS.out.versions.ifEmpty(null)) + ch_dada2_fasta = FILTER_CLUSTERS.out.fasta + ch_dada2_asv = FILTER_CLUSTERS.out.asv + } else { + ch_dada2_fasta = DADA2_MERGE.out.fasta + ch_dada2_asv = DADA2_MERGE.out.asv + } // - // Modules : Filter rRNA + // Entry for ASV fasta files via "--input_fasta" // - if ( is_fasta_input ) { - FORMAT_FASTAINPUT( PARSE_INPUT.out.fasta ) + if ( params.input_fasta ) { + FORMAT_FASTAINPUT( ch_input_fasta ) ch_unfiltered_fasta = FORMAT_FASTAINPUT.out.fasta } else { - ch_unfiltered_fasta = DADA2_MERGE.out.fasta + ch_unfiltered_fasta = ch_dada2_fasta } + // + // Modules : Filter rRNA + // if (!params.skip_barrnap && params.filter_ssu) { BARRNAP ( ch_unfiltered_fasta ) BARRNAPSUMMARY ( BARRNAP.out.gff.collect() ) @@ -337,7 +407,7 @@ workflow AMPLISEQ { } ch_barrnapsummary = BARRNAPSUMMARY.out.summary ch_versions = ch_versions.mix(BARRNAP.out.versions.ifEmpty(null)) - FILTER_SSU ( DADA2_MERGE.out.fasta, DADA2_MERGE.out.asv, BARRNAPSUMMARY.out.summary ) + FILTER_SSU ( ch_unfiltered_fasta, ch_dada2_asv.ifEmpty( [] ), BARRNAPSUMMARY.out.summary ) MERGE_STATS_FILTERSSU ( ch_stats, FILTER_SSU.out.stats ) ch_stats = MERGE_STATS_FILTERSSU.out.tsv ch_dada2_fasta = FILTER_SSU.out.fasta @@ -349,11 +419,9 @@ workflow AMPLISEQ { ch_barrnapsummary = BARRNAPSUMMARY.out.summary ch_versions = ch_versions.mix(BARRNAP.out.versions.ifEmpty(null)) ch_dada2_fasta = ch_unfiltered_fasta - ch_dada2_asv = DADA2_MERGE.out.asv } else { ch_barrnapsummary = Channel.empty() ch_dada2_fasta = ch_unfiltered_fasta - ch_dada2_asv = DADA2_MERGE.out.asv } // @@ -366,18 +434,22 @@ workflow AMPLISEQ { ch_stats = MERGE_STATS_FILTERLENASV.out.tsv ch_dada2_fasta = FILTER_LEN_ASV.out.fasta ch_dada2_asv = FILTER_LEN_ASV.out.asv + // Make sure that not all sequences were removed + ch_dada2_fasta.subscribe { if (it.countLines() == 0) error("ASV length filtering activated by '--min_len_asv' or '--max_len_asv' removed all ASVs, please adjust settings.") } } // // Modules : Filtering based on codons in an open reading frame // if (params.filter_codons ) { - FILTER_CODONS ( 
ch_dada2_fasta, ch_dada2_asv, ch_stats ) + FILTER_CODONS ( ch_dada2_fasta, ch_dada2_asv.ifEmpty( [] ) ) ch_versions = ch_versions.mix(FILTER_CODONS.out.versions.ifEmpty(null)) MERGE_STATS_CODONS( ch_stats, FILTER_CODONS.out.stats ) ch_stats = MERGE_STATS_CODONS.out.tsv ch_dada2_fasta = FILTER_CODONS.out.fasta ch_dada2_asv = FILTER_CODONS.out.asv + // Make sure that not all sequences were removed + ch_dada2_fasta.subscribe { if (it.countLines() == 0) error("ASV codon filtering activated by '--filter_codons' removed all ASVs, please adjust settings.") } } // @@ -422,10 +494,25 @@ workflow AMPLISEQ { taxlevels ).tax.set { ch_dada2_tax } ch_versions = ch_versions.mix(DADA2_TAXONOMY_WF.out.versions) + ch_tax_for_phyloseq = ch_tax_for_phyloseq.mix ( ch_dada2_tax.map { it = [ "dada2", file(it) ] } ) } else { ch_dada2_tax = Channel.empty() } + //Kraken2 + if (!params.skip_taxonomy && (params.kraken2_ref_taxonomy || params.kraken2_ref_tax_custom) ) { + KRAKEN2_TAXONOMY_WF ( + ch_kraken2_ref_taxonomy, + val_kraken2_ref_taxonomy, + ch_fasta, + kraken2_taxlevels + ).qiime2_tsv.set { ch_kraken2_tax } + ch_versions = ch_versions.mix(KRAKEN2_TAXONOMY_WF.out.versions) + ch_tax_for_phyloseq = ch_tax_for_phyloseq.mix ( ch_kraken2_tax.map { it = [ "kraken2", file(it) ] } ) + } else { + ch_kraken2_tax = Channel.empty() + } + // SINTAX if (!params.skip_taxonomy && params.sintax_ref_taxonomy) { SINTAX_TAXONOMY_WF ( @@ -436,6 +523,7 @@ workflow AMPLISEQ { sintax_taxlevels ).tax.set { ch_sintax_tax } ch_versions = ch_versions.mix(SINTAX_TAXONOMY_WF.out.versions) + ch_tax_for_phyloseq = ch_tax_for_phyloseq.mix ( ch_sintax_tax.map { it = [ "sintax", file(it) ] } ) } else { ch_sintax_tax = Channel.empty() } @@ -456,8 +544,8 @@ workflow AMPLISEQ { } FASTA_NEWICK_EPANG_GAPPA ( ch_pp_data ) ch_versions = ch_versions.mix( FASTA_NEWICK_EPANG_GAPPA.out.versions ) - ch_pplace_tax = FORMAT_PPLACETAX ( FASTA_NEWICK_EPANG_GAPPA.out.taxonomy_per_query ).tsv + ch_tax_for_phyloseq = ch_tax_for_phyloseq.mix ( PHYLOSEQ_INTAX_PPLACE ( ch_pplace_tax ).tsv.map { it = [ "pplace", file(it) ] } ) } else { ch_pplace_tax = Channel.empty() } @@ -477,6 +565,10 @@ workflow AMPLISEQ { ch_qiime_classifier ) ch_versions = ch_versions.mix( QIIME2_TAXONOMY.out.versions.ifEmpty(null) ) //usually a .first() is here, dont know why this leads here to a warning + ch_qiime2_tax = QIIME2_TAXONOMY.out.tsv + ch_tax_for_phyloseq = ch_tax_for_phyloseq.mix ( PHYLOSEQ_INTAX_QIIME2 ( ch_qiime2_tax ).tsv.map { it = [ "qiime2", file(it) ] } ) + } else { + ch_qiime2_tax = Channel.empty() } // @@ -495,23 +587,33 @@ workflow AMPLISEQ { // Import taxonomic classification into QIIME2, if available if ( params.skip_taxonomy ) { log.info "Skip taxonomy classification" + val_used_taxonomy = "skipped" ch_tax = Channel.empty() tax_agglom_min = 1 tax_agglom_max = 2 - } else if ( params.sintax_ref_taxonomy ) { - log.info "Use SINTAX taxonomy classification" - ch_tax = QIIME2_INTAX ( ch_sintax_tax ).qza } else if ( params.pplace_tree && params.pplace_taxonomy) { log.info "Use EPA-NG / GAPPA taxonomy classification" - ch_tax = QIIME2_INTAX ( ch_pplace_tax ).qza + val_used_taxonomy = "phylogenetic placement" + ch_tax = QIIME2_INTAX ( ch_pplace_tax, "parse_dada2_taxonomy.r" ).qza } else if ( params.dada_ref_taxonomy && !params.skip_dada_taxonomy ) { log.info "Use DADA2 taxonomy classification" - ch_tax = QIIME2_INTAX ( ch_dada2_tax ).qza + val_used_taxonomy = "DADA2" + ch_tax = QIIME2_INTAX ( ch_dada2_tax, "parse_dada2_taxonomy.r" ).qza + } else if ( params.sintax_ref_taxonomy 
) { + log.info "Use SINTAX taxonomy classification" + val_used_taxonomy = "SINTAX" + ch_tax = QIIME2_INTAX ( ch_sintax_tax, "parse_dada2_taxonomy.r" ).qza + } else if ( params.kraken2_ref_taxonomy || params.kraken2_ref_tax_custom ) { + log.info "Use Kraken2 taxonomy classification" + val_used_taxonomy = "Kraken2" + ch_tax = QIIME2_INTAX ( ch_kraken2_tax, "" ).qza } else if ( params.qiime_ref_taxonomy || params.classifier ) { log.info "Use QIIME2 taxonomy classification" + val_used_taxonomy = "QIIME2" ch_tax = QIIME2_TAXONOMY.out.qza } else { log.info "Use no taxonomy classification" + val_used_taxonomy = "none" ch_tax = Channel.empty() tax_agglom_min = 1 tax_agglom_max = 2 @@ -540,7 +642,7 @@ workflow AMPLISEQ { } //Export various ASV tables if (!params.skip_abundance_tables) { - QIIME2_EXPORT ( ch_asv, ch_seq, ch_tax, QIIME2_TAXONOMY.out.tsv, ch_dada2_tax, ch_pplace_tax, ch_sintax_tax, tax_agglom_min, tax_agglom_max ) + QIIME2_EXPORT ( ch_asv, ch_seq, ch_tax, ch_qiime2_tax, ch_dada2_tax, ch_pplace_tax, ch_sintax_tax, tax_agglom_min, tax_agglom_max ) } if (!params.skip_barplot) { @@ -597,13 +699,15 @@ workflow AMPLISEQ { tax_agglom_max ) } + } else { + ch_tsv = ch_dada2_asv } // // MODULE: Predict functional potential of a bacterial community from marker genes with Picrust2 // if ( params.picrust ) { - if ( run_qiime2 && !params.skip_abundance_tables && ( params.dada_ref_taxonomy || params.qiime_ref_taxonomy || params.classifier || params.sintax_ref_taxonomy ) && !params.skip_taxonomy ) { + if ( run_qiime2 && !params.skip_abundance_tables && ( params.dada_ref_taxonomy || params.qiime_ref_taxonomy || params.classifier || params.sintax_ref_taxonomy || params.kraken2_ref_taxonomy || params.kraken2_ref_tax_custom ) && !params.skip_taxonomy ) { PICRUST ( QIIME2_EXPORT.out.abs_fasta, QIIME2_EXPORT.out.abs_tsv, "QIIME2", "This Picrust2 analysis is based on filtered reads from QIIME2" ) } else { PICRUST ( ch_fasta, ch_dada2_asv, "DADA2", "This Picrust2 analysis is based on unfiltered reads from DADA2" ) @@ -627,6 +731,29 @@ workflow AMPLISEQ { ch_versions = ch_versions.mix(SBDIEXPORT.out.versions.first()) } + // + // SUBWORKFLOW: Create phyloseq objects + // + if ( !params.skip_taxonomy ) { + if ( params.pplace_tree ) { + ch_tree_for_phyloseq = FASTA_NEWICK_EPANG_GAPPA.out.grafted_phylogeny + } else { + ch_tree_for_phyloseq = [] + } + + PHYLOSEQ_WORKFLOW ( + ch_tax_for_phyloseq, + ch_tsv, + ch_metadata.ifEmpty([]), + ch_tree_for_phyloseq, + run_qiime2 + ) + ch_versions = ch_versions.mix(PHYLOSEQ_WORKFLOW.out.versions.first()) + } + + // + // MODULE: Sortware versions + // CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') ) @@ -638,7 +765,7 @@ workflow AMPLISEQ { workflow_summary = WorkflowAmpliseq.paramsSummaryMultiqc(workflow, summary_params) ch_workflow_summary = Channel.value(workflow_summary) - methods_description = WorkflowAmpliseq.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description) + methods_description = WorkflowAmpliseq.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params) ch_methods_description = Channel.value(methods_description) ch_multiqc_files = Channel.empty() @@ -661,11 +788,85 @@ workflow AMPLISEQ { multiqc_report = MULTIQC.out.report.toList() } + // + // MODULE: Summary Report + // + if (!params.skip_report) { + SUMMARY_REPORT ( + ch_report_template, + ch_report_css, + ch_report_logo, + ch_report_abstract, + ch_metadata.ifEmpty( [] ), + params.input ? 
file(params.input) : [], // samplesheet input + ch_input_fasta.ifEmpty( [] ), // fasta input + !params.input_fasta && !params.skip_fastqc && !params.skip_multiqc ? MULTIQC.out.plots : [], //.collect().flatten().collectFile(name: "mqc_fastqc_per_sequence_quality_scores_plot_1.svg") + !params.skip_cutadapt ? CUTADAPT_WORKFLOW.out.summary.collect().ifEmpty( [] ) : [], + find_truncation_values, + DADA2_PREPROCESSING.out.args.first().ifEmpty( [] ), + !params.skip_dada_quality ? DADA2_PREPROCESSING.out.qc_svg.ifEmpty( [] ) : [], + !params.skip_dada_quality ? DADA2_PREPROCESSING.out.qc_svg_preprocessed.ifEmpty( [] ) : [], + DADA2_ERR.out.svg + .map { + meta_old, svgs -> + def meta = [:] + meta.single_end = meta_old.single_end + [ meta, svgs, meta_old.run ] } + .groupTuple(by: 0 ) + .map { + meta_old, svgs, runs -> + def meta = [:] + meta.single_end = meta_old.single_end + meta.run = runs.flatten() + [ meta, svgs.flatten() ] + }.ifEmpty( [[],[]] ), + DADA2_MERGE.out.asv.ifEmpty( [] ), + ch_unfiltered_fasta.ifEmpty( [] ), // this is identical to DADA2_MERGE.out.fasta if !params.input_fasta + DADA2_MERGE.out.dada2asv.ifEmpty( [] ), + DADA2_MERGE.out.dada2stats.ifEmpty( [] ), + params.vsearch_cluster ? FILTER_CLUSTERS.out.asv.ifEmpty( [] ) : [], + !params.skip_barrnap ? BARRNAPSUMMARY.out.summary.ifEmpty( [] ) : [], + params.filter_ssu ? FILTER_SSU.out.stats.ifEmpty( [] ) : [], + params.filter_ssu ? FILTER_SSU.out.fasta.ifEmpty( [] ) : [], + params.min_len_asv || params.max_len_asv ? FILTER_LEN_ASV.out.stats.ifEmpty( [] ) : [], + params.min_len_asv || params.max_len_asv ? FILTER_LEN_ASV.out.len_orig.ifEmpty( [] ) : [], + params.filter_codons ? FILTER_CODONS.out.fasta.ifEmpty( [] ) : [], + params.filter_codons ? FILTER_CODONS.out.stats.ifEmpty( [] ) : [], + params.cut_its != "none" ? ITSX_CUTASV.out.summary.ifEmpty( [] ) : [], + !params.skip_taxonomy && params.dada_ref_taxonomy && !params.skip_dada_taxonomy ? ch_dada2_tax.ifEmpty( [] ) : [], + !params.skip_taxonomy && params.dada_ref_taxonomy && !params.skip_dada_taxonomy ? DADA2_TAXONOMY_WF.out.cut_tax.ifEmpty( [[],[]] ) : [[],[]], + !params.skip_taxonomy && params.sintax_ref_taxonomy ? ch_sintax_tax.ifEmpty( [] ) : [], + !params.skip_taxonomy && ( params.kraken2_ref_taxonomy || params.kraken2_ref_tax_custom ) ? KRAKEN2_TAXONOMY_WF.out.tax_tsv.ifEmpty( [] ) : [], + !params.skip_taxonomy && params.pplace_tree ? ch_pplace_tax.ifEmpty( [] ) : [], + !params.skip_taxonomy && params.pplace_tree ? FASTA_NEWICK_EPANG_GAPPA.out.heattree.ifEmpty( [[],[]] ) : [[],[]], + !params.skip_taxonomy && ( params.qiime_ref_taxonomy || params.classifier ) && run_qiime2 ? QIIME2_TAXONOMY.out.tsv.ifEmpty( [] ) : [], + run_qiime2, + run_qiime2 ? val_used_taxonomy : "", + run_qiime2 && ( params.exclude_taxa != "none" || params.min_frequency != 1 || params.min_samples != 1 ) ? ch_dada2_asv.countLines()+","+QIIME2_FILTERTAXA.out.tsv.countLines() : "", + run_qiime2 && ( params.exclude_taxa != "none" || params.min_frequency != 1 || params.min_samples != 1 ) ? FILTER_STATS.out.tsv.ifEmpty( [] ) : [], + run_qiime2 && !params.skip_barplot ? QIIME2_BARPLOT.out.folder.ifEmpty( [] ) : [], + run_qiime2 && !params.skip_abundance_tables ? QIIME2_EXPORT.out.abs_tsv.ifEmpty( [] ) : [], + run_qiime2 && !params.skip_alpha_rarefaction && params.metadata ? "done" : "", + run_qiime2 && !params.skip_diversity_indices && params.metadata ? QIIME2_DIVERSITY.out.depth.ifEmpty( [] ) : [], + run_qiime2 && !params.skip_diversity_indices && params.metadata ? 
QIIME2_DIVERSITY.out.alpha.collect().ifEmpty( [] ) : [], + run_qiime2 && !params.skip_diversity_indices && params.metadata ? QIIME2_DIVERSITY.out.beta.collect().ifEmpty( [] ) : [], + run_qiime2 && !params.skip_diversity_indices && params.metadata ? QIIME2_DIVERSITY.out.adonis.collect().ifEmpty( [] ) : [], + run_qiime2 && !params.skip_ancom && params.metadata ? QIIME2_ANCOM.out.ancom.collect().ifEmpty( [] ) : [], + params.picrust ? PICRUST.out.pathways.ifEmpty( [] ) : [], + params.sbdiexport ? SBDIEXPORT.out.sbditables.mix(SBDIEXPORTREANNOTATE.out.sbdiannottables).collect().ifEmpty( [] ) : [], + !params.skip_taxonomy ? PHYLOSEQ_WORKFLOW.out.rds.map{info,rds -> [rds]}.collect().ifEmpty( [] ) : [] + ) + ch_versions = ch_versions.mix(SUMMARY_REPORT.out.versions) + } + //Save input in results folder - input = file(params.input) - if ( is_fasta_input || input.toString().toLowerCase().endsWith("tsv") ) { + if ( params.input ) { + file("${params.outdir}/input").mkdir() + file("${params.input}").copyTo("${params.outdir}/input") + } + if ( params.input_fasta ) { file("${params.outdir}/input").mkdir() - input.copyTo("${params.outdir}/input") + file("${params.input_fasta}").copyTo("${params.outdir}/input") } //Save metadata in results folder if ( params.metadata ) { @@ -684,6 +885,7 @@ workflow.onComplete { if (params.email || params.email_on_fail) { NfcoreTemplate.email(workflow, params, summary_params, projectDir, log, multiqc_report) } + NfcoreTemplate.dump_parameters(workflow, params) NfcoreTemplate.summary(workflow, params, log) if (params.hook_url) { NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log)