From 870d2ca8cacc3fae0768fe67c450b546ef796fdb Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 16 May 2023 19:17:43 +0200 Subject: [PATCH 01/30] Post patch release bump --- .github/workflows/ci.yml | 6 +++--- Dockerfile | 4 ++-- environment.yml | 2 +- nextflow.config | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 363c4fea4..df69e75cd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -37,13 +37,13 @@ jobs: - name: Build new docker image if: env.MATCHED_FILES - run: docker build --no-cache . -t nfcore/eager:2.4.7 + run: docker build --no-cache . -t nfcore/eager:dev - name: Pull docker image if: ${{ !env.MATCHED_FILES }} run: | docker pull nfcore/eager:dev - docker tag nfcore/eager:dev nfcore/eager:2.4.7 + docker tag nfcore/eager:dev nfcore/eager:dev - name: Install Nextflow env: @@ -216,4 +216,4 @@ jobs: nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_humanbam,docker --skip_fastqc --skip_adapterremoval --skip_deduplication --skip_qualimap --skip_preseq --skip_damage_calculation --run_mtnucratio - name: RESCALING Run basic pipeline with basic pipeline but with mapDamage rescaling of BAM files. 
Note this will be slow run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_mapdamage_rescaling --run_genotyping --genotyping_tool hc --genotyping_source 'rescaled' + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_mapdamage_rescaling --run_genotyping --genotyping_tool hc --genotyping_source 'rescaled' \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index a8a37f54f..302afc92e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,7 +7,7 @@ COPY environment.yml / RUN conda env create --quiet -f /environment.yml && conda clean -a # Add conda installation dir to PATH (instead of doing 'conda activate') -ENV PATH /opt/conda/envs/nf-core-eager-2.4.7/bin:$PATH +ENV PATH /opt/conda/envs/nf-core-eager-2.4.8dev/bin:$PATH # Dump the details of the installed packages to a file for posterity -RUN conda env export --name nf-core-eager-2.4.7 > nf-core-eager-2.4.7.yml \ No newline at end of file +RUN conda env export --name nf-core-eager-2.4.8dev > nf-core-eager-2.4.8dev.yml \ No newline at end of file diff --git a/environment.yml b/environment.yml index 784b7c383..528d207aa 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,6 @@ # You can use this file to create a conda environment for this pipeline: # conda env create -f environment.yml -name: nf-core-eager-2.4.7 +name: nf-core-eager-2.4.8dev channels: - conda-forge - bioconda diff --git a/nextflow.config b/nextflow.config index 67acbd616..5ed6894f0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -285,7 +285,7 @@ params { // Container slug. Stable releases should specify release tag! 
// Developmental code should specify :dev -process.container = 'nfcore/eager:2.4.7' +process.container = 'nfcore/eager:dev' // Load base.config by default for all pipelines includeConfig 'conf/base.config' @@ -415,7 +415,7 @@ manifest { description = 'A fully reproducible and state-of-the-art ancient DNA analysis pipeline' mainScript = 'main.nf' nextflowVersion = '>=20.07.1' - version = '2.4.7' + version = '2.4.8dev' } // Function to ensure that resource requirements don't go beyond From 550225c72e60958e1b7b4ab68fe13e66bbe3f91c Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 15 Aug 2023 15:39:46 +0200 Subject: [PATCH 02/30] Add UDG info to trimbam output bam name. --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 3d2470161..4ac17d697 100644 --- a/main.nf +++ b/main.nf @@ -2245,8 +2245,8 @@ process bam_trim { // def right_clipping = udg == "half" ? "${params.bamutils_clip_half_udg_right}" : "${params.bamutils_clip_none_udg_right}" """ bam trimBam $bam tmp.bam -L ${left_clipping} -R ${right_clipping} ${softclip} - samtools sort -@ ${task.cpus} tmp.bam -o ${libraryid}.trimmed.bam - samtools index ${libraryid}.trimmed.bam ${size} + samtools sort -@ ${task.cpus} tmp.bam -o ${libraryid}_udg${udg}.trimmed.bam + samtools index ${libraryid}_udg${udg}.trimmed.bam ${size} """ } From 1005ce8e5fd417823b4f9759248530d297694e6d Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 15 Aug 2023 16:16:45 +0200 Subject: [PATCH 03/30] bump multiQC version --- CHANGELOG.md | 14 ++++++++++++++ environment.yml | 4 ++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 232c073b5..2b7e90c6e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,20 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). 
+## [2.5.0] - 2023-XX-XX + +### `Added` + +### `Fixed` + +- [#1017](https://github.com/nf-core/eager/issues/1017) Fixed file name collision in niche cases with multiple libraries of multiple UDG treatments. + +### `Dependencies` + +- `multiqc`: 1.14 -> 1.15 + +### `Deprecated` + ## [2.4.7] - 2023-05-16 ### `Added` diff --git a/environment.yml b/environment.yml index 784b7c383..00cc32e88 100644 --- a/environment.yml +++ b/environment.yml @@ -26,7 +26,7 @@ dependencies: - bioconda::qualimap=2.2.2d - bioconda::vcf2genome=0.91 - bioconda::damageprofiler=0.4.9 # Don't upgrade - later versions don't allow java 8 - - bioconda::multiqc=1.14 + - bioconda::multiqc=1.15 - bioconda::pmdtools=0.60 - bioconda::bedtools=2.30.0 - conda-forge::libiconv=1.16 @@ -50,4 +50,4 @@ dependencies: - bioconda::eigenstratdatabasetools=1.0.2 - bioconda::mapdamage2=2.2.1 - bioconda::bbmap=38.92 - - bioconda::bcftools=1.12 \ No newline at end of file + - bioconda::bcftools=1.12 From 09af474a5b622e93ff4f40db996f04c91c948c58 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 18 Aug 2023 11:16:44 +0200 Subject: [PATCH 04/30] add parameters to schema and config --- nextflow.config | 5 +- nextflow_schema.json | 3449 ++++++++++++++++++++++-------------------- 2 files changed, 1788 insertions(+), 1666 deletions(-) diff --git a/nextflow.config b/nextflow.config index 67acbd616..b202c68c3 100644 --- a/nextflow.config +++ b/nextflow.config @@ -120,10 +120,13 @@ params { preseq_cval = 0.95 preseq_terms = 100 - //DamageProfiler settings + //Damage estimation settings + damage_estimation_tool = 'damageprofiler' damageprofiler_length = 100 damageprofiler_threshold = 15 damageprofiler_yaxis = 0.30 + mapdamage_downsample = 10000 + mapdamage_yaxis = 0.30 //PMDTools settings run_pmdtools = false diff --git a/nextflow_schema.json b/nextflow_schema.json index edfe32bbd..af8779f65 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,1697 +1,1816 @@ { - "$schema": 
"http://json-schema.org/draft-07/schema", - "$id": "https://raw.githubusercontent.com/nf-core/eager/master/nextflow_schema.json", - "title": "nf-core/eager pipeline parameters", - "description": "A fully reproducible and state-of-the-art ancient DNA analysis pipeline", - "type": "object", - "definitions": { - "input_output_options": { - "title": "Input/output options", - "type": "object", - "fa_icon": "fas fa-terminal", - "description": "Define where the pipeline should find input data, and additional metadata.", - "required": ["input"], - "properties": { - "input": { - "type": "string", - "description": "Either paths or URLs to FASTQ/BAM data (must be surrounded with quotes). For paired end data, the path must use '{1,2}' notation to specify read pairs. Alternatively, a path to a TSV file (ending .tsv) containing file paths and sequencing/sample metadata. Allows for merging of multiple lanes/libraries/samples. Please see documentation for template.", - "fa_icon": "fas fa-dna", - "help_text": "There are two possible ways of supplying input sequencing data to nf-core/eager. The most efficient but more simplistic is supplying direct paths (with wildcards) to your FASTQ or BAM files, with each file or pair being considered a single library and each one run independently (e.g. for paired-end data: `--input '///*_{R1,R2}_*.fq.gz'`). TSV input requires creation of an extra file by the user (`--input '///eager_data.tsv'`) and extra metadata, but allows more powerful lane and library merging. Please see [usage docs](https://nf-co.re/eager/docs/usage#input-specifications) for detailed instructions and specifications." - }, - "udg_type": { - "type": "string", - "default": "none", - "description": "Specifies whether you have UDG treated libraries. Set to 'half' for partial treatment, or 'full' for UDG. If not set, libraries are assumed to have no UDG treatment ('none'). 
Not required for TSV input.", - "fa_icon": "fas fa-vial", - "help_text": "Defines whether Uracil-DNA glycosylase (UDG) treatment was used to remove DNA\ndamage on the sequencing libraries.\n\nSpecify `'none'` if no treatment was performed. If you have partial UDG treated\ndata ([Rohland et al 2016](http://dx.doi.org/10.1098/rstb.2013.0624)), specify\n`'half'`. If you have complete UDG treated data ([Briggs et al.\n2010](https://doi.org/10.1093/nar/gkp1163)), specify `'full'`. \n\nWhen also using PMDtools specifying `'half'` will use a different model for DNA\ndamage assessment in PMDTools (PMDtools: `--UDGhalf`). Specify `'full'` and the\nPMDtools DNA damage assessment will use CpG context only (PMDtools: `--CpG`).\nDefault: `'none'`.\n\n> **Tip**: You should provide a small decoy reference genome with pre-made indices, e.g.\n> the human mtDNA genome, for the mandatory parameter `--fasta` in order to\n> avoid long computational time for generating the index files of the reference\n> genome, even if you do not actually need a reference genome for any downstream\n> analyses.", - "enum": ["none", "half", "full"] - }, - "single_stranded": { - "type": "boolean", - "description": "Specifies that libraries are single stranded. Always affects MALTExtract but will be ignored by pileupCaller with TSV input. Not required for TSV input.", - "fa_icon": "fas fa-minus", - "help_text": "Indicates libraries are single stranded.\n\nCurrently only affects MALTExtract where it will switch on damage patterns\ncalculation mode to single-stranded, (MaltExtract: `--singleStranded`) and\ngenotyping with pileupCaller where a different method is used (pileupCaller:\n`--singleStrandMode`). Default: false\n\nOnly required when using the 'Path' method of `--input`" - }, - "single_end": { - "type": "boolean", - "description": "Specifies that the input is single end reads. 
Not required for TSV input.", - "fa_icon": "fas fa-align-left", - "help_text": "By default, the pipeline expects paired-end data. If you have single-end data, specify this parameter on the command line when you launch the pipeline. It is not possible to run a mixture of single-end and paired-end files in one run.\n\nOnly required when using the 'Path' method of `--input`" - }, - "colour_chemistry": { - "type": "integer", - "default": 4, - "description": "Specifies which Illumina sequencing chemistry was used. Used to inform whether to poly-G trim if turned on (see below). Not required for TSV input. Options: 2, 4.", - "fa_icon": "fas fa-palette", - "help_text": "Specifies which Illumina colour chemistry a library was sequenced with. This informs whether to perform poly-G trimming (if `--complexity_filter_poly_g` is also supplied). Only 2 colour chemistry sequencers (e.g. NextSeq or NovaSeq) can generate uncertain poly-G tails (due to 'G' being indicated via a no-colour detection). Default is '4' to indicate e.g. HiSeq or MiSeq platforms, which do not require poly-G trimming. Options: 2, 4. Default: 4\n\nOnly required when using the 'Path' method of input." - }, - "bam": { - "type": "boolean", - "description": "Specifies that the input is in BAM format. Not required for TSV input.", - "fa_icon": "fas fa-align-justify", - "help_text": "Specifies the input file type to `--input` is in BAM format. This will automatically also apply `--single_end`.\n\nOnly required when using the 'Path' method of `--input`.\n" - } - }, - "help_text": "There are two possible ways of supplying input sequencing data to nf-core/eager.\nThe most efficient but more simplistic is supplying direct paths (with\nwildcards) to your FASTQ or BAM files, with each file or pair being considered a\nsingle library and each one run independently. TSV input requires creation of an\nextra file by the user and extra metadata, but allows more powerful lane and\nlibrary merging." 
- }, - "input_data_additional_options": { - "title": "Input Data Additional Options", - "type": "object", - "description": "Additional options regarding input data.", - "default": "", - "properties": { - "snpcapture_bed": { - "type": "string", - "fa_icon": "fas fa-magnet", - "description": "If library result of SNP capture, path to BED file containing SNPS positions on reference genome. SNP statistics are qualimap results directory only not MultiQC.", - "help_text": "Can be used to set a path to a BED file (3/6 column format) of SNP positions of a reference genome, to calculate SNP captured libraries on-target efficiency. This should be used for array or in-solution SNP capture protocols such as 390K, 1240K, etc. If supplied, some on-target metrics are automatically generated for you by qualimap in the 'Globals inside' section of the 'genome_results.txt' file in the qualimap results directory. These statistics are currently NOT displayed in MultiQC!" - }, - "run_convertinputbam": { - "type": "boolean", - "description": "Turns on conversion of an input BAM file into FASTQ format to allow re-preprocessing (e.g. AdapterRemoval etc.).", - "fa_icon": "fas fa-undo-alt", - "help_text": "Allows you to convert an input BAM file back to FASTQ for downstream processing. Note this is required if you need to perform AdapterRemoval and/or polyG clipping.\n\nIf not turned on, BAMs will automatically be sent to post-mapping steps." - } - }, - "fa_icon": "far fa-plus-square" - }, - "reference_genome_options": { - "title": "Reference genome options", - "type": "object", - "fa_icon": "fas fa-dna", - "properties": { - "fasta": { - "type": "string", - "fa_icon": "fas fa-font", - "description": "Path or URL to a FASTA reference file (required if not iGenome reference). File suffixes can be: '.fa', '.fn', '.fna', '.fasta'.", - "help_text": "You specify the full path to your reference genome here. The FASTA file can have any file suffix, such as `.fasta`, `.fna`, `.fa`, `.FastA` etc. 
You may also supply a gzipped reference files, which will be unzipped automatically for you.\n\nFor example:\n\n```bash\n--fasta '///my_reference.fasta'\n```\n\n> If you don't specify appropriate `--bwa_index`, `--fasta_index` parameters, the pipeline will create these indices for you automatically. Note that you can save the indices created for you for later by giving the `--save_reference` flag.\n> You must select either a `--fasta` or `--genome`\n" - }, - "genome": { - "type": "string", - "description": "Name of iGenomes reference (required if not FASTA reference). Requires argument `--igenomes_ignore false`, as iGenomes is ignored by default in nf-core/eager", - "fa_icon": "fas fa-book", - "help_text": "Alternatively to `--fasta`, the pipeline config files come bundled with paths to the Illumina iGenomes reference index files. If running with docker or AWS, the configuration is set up to use the [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) resource.\n\nThere are 31 different species supported in the iGenomes references. To run the pipeline, you must specify which to use with the `--genome` flag.\n\nYou can find the keys to specify the genomes in the [iGenomes config file](../conf/igenomes.config). Common genomes that are supported are:\n\n- Human\n - `--genome GRCh37`\n - `--genome GRCh38`\n- Mouse *\n - `--genome GRCm38`\n- _Drosophila_ *\n - `--genome BDGP6`\n- _S. cerevisiae_ *\n - `--genome 'R64-1-1'`\n\n> \\* Not bundled with nf-core eager by default.\n\nNote that you can use the same configuration setup to save sets of reference files for your own use, even if they are not part of the iGenomes resource. 
See the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for instructions on where to save such a file.\n\nThe syntax for this reference configuration is as follows:\n\n```nextflow\nparams {\n genomes {\n 'GRCh37' {\n fasta = ''\n }\n // Any number of additional genomes, key is used with --genome\n }\n}\n**NB** Requires argument `--igenomes_ignore false` as iGenomes ignored by default in nf-core/eager\n\n```" - }, - "igenomes_base": { - "type": "string", - "description": "Directory / URL base for iGenomes references.", - "default": "s3://ngi-igenomes/igenomes", - "fa_icon": "fas fa-cloud-download-alt", - "hidden": true - }, - "igenomes_ignore": { - "type": "boolean", - "description": "Do not load the iGenomes reference config.", - "fa_icon": "fas fa-ban", - "hidden": true, - "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." - }, - "bwa_index": { - "type": "string", - "description": "Path to directory containing pre-made BWA indices (i.e. the directory before the files ending in '.amb' '.ann' '.bwt'. Do not include the files themselves. Most likely the same directory of the file provided with --fasta). 
If not supplied will be made for you.", - "fa_icon": "fas fa-address-book", - "help_text": "If you want to use pre-existing `bwa index` indices, please supply the **directory** to the FASTA you also specified in `--fasta` nf-core/eager will automagically detect the index files by searching for the FASTA filename with the corresponding `bwa` index file suffixes.\n\nFor example:\n\n```bash\nnextflow run nf-core/eager \\\n-profile test,docker \\\n--input '*{R1,R2}*.fq.gz'\n--fasta 'results/reference_genome/bwa_index/BWAIndex/Mammoth_MT_Krause.fasta' \\\n--bwa_index 'results/reference_genome/bwa_index/BWAIndex/'\n```\n\n> `bwa index` does not give you an option to supply alternative suffixes/names for these indices. Thus, the file names generated by this command _must not_ be changed, otherwise nf-core/eager will not be able to find them." - }, - "bt2_index": { - "type": "string", - "description": "Path to directory containing pre-made Bowtie2 indices (i.e. everything before the endings e.g. '.1.bt2', '.2.bt2', '.rev.1.bt2'. Most likely the same value as --fasta). If not supplied will be made for you.", - "fa_icon": "far fa-address-book", - "help_text": "If you want to use pre-existing `bt2 index` indices, please supply the **directory** to the FASTA you also specified in `--fasta`. nf-core/eager will automagically detect the index files by searching for the FASTA filename with the corresponding `bt2` index file suffixes.\n\nFor example:\n\n```bash\nnextflow run nf-core/eager \\\n-profile test,docker \\\n--input '*{R1,R2}*.fq.gz'\n--fasta 'results/reference_genome/bwa_index/BWAIndex/Mammoth_MT_Krause.fasta' \\\n--bwa_index 'results/reference_genome/bt2_index/BT2Index/'\n```\n\n> `bowtie2-build` does not give you an option to supply alternative suffixes/names for these indices. Thus, the file names generated by this command _must not_ be changed, otherwise nf-core/eager will not be able to find them." 
- }, - "fasta_index": { - "type": "string", - "description": "Path to samtools FASTA index (typically ending in '.fai'). If not supplied will be made for you.", - "fa_icon": "far fa-bookmark", - "help_text": "If you want to use a pre-existing `samtools faidx` index, use this to specify the required FASTA index file for the selected reference genome. This should be generated by `samtools faidx` and has a file suffix of `.fai`\n\nFor example:\n\n```bash\n--fasta_index 'Mammoth_MT_Krause.fasta.fai'\n```" - }, - "seq_dict": { - "type": "string", - "description": "Path to picard sequence dictionary file (typically ending in '.dict'). If not supplied will be made for you.", - "fa_icon": "fas fa-spell-check", - "help_text": "If you want to use a pre-existing `picard CreateSequenceDictionary` dictionary file, use this to specify the required `.dict` file for the selected reference genome.\n\nFor example:\n\n```bash\n--seq_dict 'Mammoth_MT_Krause.dict'\n```" - }, - "large_ref": { - "type": "boolean", - "description": "Specify to generate more recent '.csi' BAM indices. If your reference genome is larger than 3.5GB, this is recommended due to more efficient data handling with the '.csi' format over the older '.bai'.", - "fa_icon": "fas fa-mountain", - "help_text": "This parameter is required to be set for large reference genomes. If your\nreference genome is larger than 3.5GB, the `samtools index` calls in the\npipeline need to generate `CSI` indices instead of `BAI` indices to compensate\nfor the size of the reference genome (with samtools: `-c`). This parameter is\nnot required for smaller references (including the human `hg19` or\n`grch37`/`grch38` references), but `>4GB` genomes have been shown to need `CSI`\nindices. 
Default: off" - }, - "save_reference": { - "type": "boolean", - "description": "If not already supplied by user, turns on saving of generated reference genome indices for later re-usage.", - "fa_icon": "far fa-save", - "help_text": "Use this if you do not have pre-made reference FASTA indices for `bwa`, `samtools` and `picard`. If you turn this on, the indices nf-core/eager generates for you and will be saved in the `/results/reference_genomes` for you. If not supplied, nf-core/eager generated index references will be deleted.\n\n> modifies SAMtools index command: `-c`" - } - }, - "description": "Specify locations of references and optionally, additional pre-made indices", - "help_text": "All nf-core/eager runs require a reference genome in FASTA format to map reads\nagainst to.\n\nIn addition we provide various options for indexing of different types of\nreference genomes (based on the tools used in the pipeline). nf-core/eager can\nindex reference genomes for you (with options to save these for other analysis),\nbut you can also supply your pre-made indices.\n\nSupplying pre-made indices saves time in pipeline execution and is especially\nadvised when running multiple times on the same cluster system for example. You\ncan even add a resource [specific profile](#profile) that sets paths to\npre-computed reference genomes, saving time when specifying these.\n\n> :warning: you must always supply a reference file. If you want to use\n functionality that does not require one, supply a small decoy genome such as\n phiX or the human mtDNA genome." 
- }, - "output_options": { - "title": "Output options", - "type": "object", - "description": "Specify where to put output files and optional saving of intermediate files", - "default": "", - "properties": { - "outdir": { - "type": "string", - "description": "The output directory where the results will be saved.", - "default": "./results", - "fa_icon": "fas fa-folder-open", - "help_text": "The output directory where the results will be saved. By default will be made in the directory you run the command in under `./results`." - }, - "publish_dir_mode": { - "type": "string", - "default": "copy", - "hidden": true, - "description": "Method used to save pipeline results to output directory.", - "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", - "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"] - } - }, - "fa_icon": "fas fa-cloud-download-alt" - }, - "generic_options": { - "title": "Generic options", - "type": "object", - "properties": { - "help": { - "type": "boolean", - "description": "Display help text.", - "hidden": true, - "fa_icon": "fas fa-question-circle" - }, - "validate_params": { - "type": "boolean", - "description": "Boolean whether to validate parameters against the schema at runtime", - "default": true, - "fa_icon": "fas fa-check-square", - "hidden": true - }, - "email": { - "type": "string", - "description": "Email address for completion summary.", - "fa_icon": "fas fa-envelope", - "help_text": "An email address to send a summary email to when the pipeline is completed.", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" - }, - "email_on_fail": { - "type": "string", - "description": "Email address for completion summary, only when 
pipeline fails.", - "fa_icon": "fas fa-exclamation-triangle", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$", - "hidden": true, - "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run if it **fails**. Normally would be the same as in `--email` but can be different. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.\n\n> Note that this functionality requires either `mail` or `sendmail` to be installed on your system." - }, - "plaintext_email": { - "type": "boolean", - "description": "Send plain-text email instead of HTML.", - "fa_icon": "fas fa-remove-format", - "hidden": true, - "help_text": "Set to receive plain-text e-mails instead of HTML formatted." - }, - "max_multiqc_email_size": { - "type": "string", - "description": "File size limit when attaching MultiQC reports to summary emails.", - "default": "25.MB", - "fa_icon": "fas fa-file-upload", - "hidden": true, - "help_text": "If file generated by pipeline exceeds the threshold, it will not be attached." - }, - "monochrome_logs": { - "type": "boolean", - "description": "Do not use coloured log outputs.", - "fa_icon": "fas fa-palette", - "hidden": true, - "help_text": "Set to disable colourful command line output and live life in monochrome." 
- }, - "multiqc_config": { - "type": "string", - "description": "Custom config file to supply to MultiQC.", - "fa_icon": "fas fa-cog", - "hidden": true - }, - "tracedir": { - "type": "string", - "description": "Directory to keep pipeline Nextflow logs and reports.", - "default": "${params.outdir}/pipeline_info", - "fa_icon": "fas fa-cogs", - "hidden": true - }, - "show_hidden_params": { - "type": "boolean", - "fa_icon": "far fa-eye-slash", - "description": "Show all params when using `--help`", - "hidden": true, - "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." - }, - "enable_conda": { - "type": "boolean", - "hidden": true, - "description": "Parameter used for checking conda channels to be set correctly." - }, - "schema_ignore_params": { - "type": "string", - "fa_icon": "fas fa-not-equal", - "description": "String to specify ignored parameters for parameter validation", - "hidden": true, - "default": "genomes" - } - }, - "fa_icon": "fas fa-file-import", - "description": "Less common options for the pipeline, typically set in a config file.", - "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`." - }, - "max_job_request_options": { - "title": "Max job request options", - "type": "object", - "fa_icon": "fab fa-acquisitions-incorporated", - "description": "Set the top limit for requested resources for any single job.", - "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. 
These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. See [the nf-core website](https://nf-co.re/usage/configuration) for details.", - "properties": { - "max_cpus": { - "type": "integer", - "description": "Maximum number of CPUs that can be requested for any single job.", - "default": 16, - "fa_icon": "fas fa-microchip", - "hidden": true, - "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" - }, - "max_memory": { - "type": "string", - "description": "Maximum amount of memory that can be requested for any single job.", - "default": "128.GB", - "fa_icon": "fas fa-memory", - "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", - "hidden": true, - "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" - }, - "max_time": { - "type": "string", - "description": "Maximum amount of time that can be requested for any single job.", - "default": "240.h", - "fa_icon": "far fa-clock", - "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", - "hidden": true, - "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" - } - } - }, - "institutional_config_options": { - "title": "Institutional config options", - "type": "object", - "fa_icon": "fas fa-university", - "description": "Parameters used to describe centralised config profiles. These generally should not be edited.", - "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. 
You should not need to change these values when you run a pipeline.", - "properties": { - "custom_config_version": { - "type": "string", - "description": "Git commit id for Institutional configs.", - "default": "master", - "hidden": true, - "fa_icon": "fas fa-users-cog", - "help_text": "Provide git commit id for custom Institutional configs hosted at `nf-core/configs`. This was implemented for reproducibility purposes. Default: `master`.\n\n```bash\n## Download and use config file with following git commit id\n--custom_config_version d52db660777c4bf36546ddb188ec530c3ada1b96\n```" - }, - "custom_config_base": { - "type": "string", - "description": "Base directory for Institutional configs.", - "default": "https://raw.githubusercontent.com/nf-core/configs/master", - "hidden": true, - "help_text": "If you're running offline, nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell nextflow where to find them with the `custom_config_base` option. 
For example:\n\n```bash\n## Download and unzip the config files\ncd /path/to/my/configs\nwget https://github.com/nf-core/configs/archive/master.zip\nunzip master.zip\n\n## Run the pipeline\ncd /path/to/my/data\nnextflow run /path/to/pipeline/ --custom_config_base /path/to/my/configs/configs-master/\n```\n\n> Note that the nf-core/tools helper package has a `download` command to download all required pipeline files + singularity containers + institutional configs in one go for you, to make this process easier.", - "fa_icon": "fas fa-users-cog" - }, - "hostnames": { - "type": "string", - "description": "Institutional configs hostname.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "config_profile_name": { - "type": "string", - "description": "Institutional config name.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "config_profile_description": { - "type": "string", - "description": "Institutional config description.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "config_profile_contact": { - "type": "string", - "description": "Institutional config contact information.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "config_profile_url": { - "type": "string", - "description": "Institutional config URL link.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "awsqueue": { - "type": "string", - "description": "The AWSBatch JobQueue that needs to be set when running on AWSBatch", - "fa_icon": "fab fa-aws" - }, - "awsregion": { - "type": "string", - "default": "eu-west-1", - "description": "The AWS Region for your AWS Batch job to run on", - "fa_icon": "fab fa-aws" - }, - "awscli": { - "type": "string", - "description": "Path to the AWS CLI tool", - "fa_icon": "fab fa-aws" - } - } - }, - "skip_steps": { - "title": "Skip steps", - "type": "object", - "description": "Skip any of the mentioned steps.", - "default": "", - "properties": { - "skip_fastqc": { - "type": "boolean", - "fa_icon": "fas fa-fast-forward", - 
"help_text": "Turns off FastQC pre- and post-Adapter Removal, to speed up the pipeline. Use of this flag is most common when data has been previously pre-processed and the post-Adapter Removal mapped reads are being re-mapped to a new reference genome." - }, - "skip_adapterremoval": { - "type": "boolean", - "fa_icon": "fas fa-fast-forward", - "help_text": "Turns off adapter trimming and paired-end read merging. Equivalent to setting both `--skip_collapse` and `--skip_trim`." - }, - "skip_preseq": { - "type": "boolean", - "fa_icon": "fas fa-fast-forward", - "help_text": "Turns off the computation of library complexity estimation." - }, - "skip_deduplication": { - "type": "boolean", - "fa_icon": "fas fa-fast-forward", - "help_text": "Turns off duplicate removal methods DeDup and MarkDuplicates respectively. No duplicates will be removed on any data in the pipeline.\n" - }, - "skip_damage_calculation": { - "type": "boolean", - "fa_icon": "fas fa-fast-forward", - "help_text": "Turns off the DamageProfiler module to compute DNA damage profiles.\n" - }, - "skip_qualimap": { - "type": "boolean", - "fa_icon": "fas fa-fast-forward", - "help_text": "Turns off QualiMap and thus does not compute coverage and other mapping metrics.\n" - } - }, - "fa_icon": "fas fa-fast-forward", - "help_text": "Some of the steps in the pipeline can be executed optionally. If you specify\nspecific steps to be skipped, there won't be any output related to these\nmodules." - }, - "complexity_filtering": { - "title": "Complexity filtering", - "type": "object", - "description": "Processing of Illumina two-colour chemistry data.", - "default": "", - "properties": { - "complexity_filter_poly_g": { - "type": "boolean", - "description": "Turn on running poly-G removal on FASTQ files. Will only be performed on 2 colour chemistry machine sequenced libraries.", - "fa_icon": "fas fa-power-off", - "help_text": "Performs a poly-G tail removal step in the beginning of the pipeline using `fastp`, if turned on. 
This can be useful for trimming ploy-G tails from short-fragments sequenced on two-colour Illumina chemistry such as NextSeqs (where no-fluorescence is read as a G on two-colour chemistry), which can inflate reported GC content values.\n" - }, - "complexity_filter_poly_g_min": { - "type": "integer", - "default": 10, - "description": "Specify length of poly-g min for clipping to be performed.", - "fa_icon": "fas fa-ruler-horizontal", - "help_text": "This option can be used to define the minimum length of a poly-G tail to begin low complexity trimming. By default, this is set to a value of `10` unless the user has chosen something specifically using this option.\n\n> Modifies fastp parameter: `--poly_g_min_len`" - } - }, - "fa_icon": "fas fa-filter", - "help_text": "More details can be seen in the [fastp\ndocumentation](https://github.com/OpenGene/fastp)\n\nIf using TSV input, this is performed per lane separately" - }, - "read_merging_and_adapter_removal": { - "title": "Read merging and adapter removal", - "type": "object", - "description": "Options for adapter clipping and paired-end merging.", - "default": "", - "properties": { - "clip_forward_adaptor": { - "type": "string", - "default": "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC", - "description": "Specify adapter sequence to be clipped off (forward strand).", - "fa_icon": "fas fa-cut", - "help_text": "Defines the adapter sequence to be used for the forward read. By default, this is set to `'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC'`.\n\n> Modifies AdapterRemoval parameter: `--adapter1`" - }, - "clip_reverse_adaptor": { - "type": "string", - "default": "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA", - "description": "Specify adapter sequence to be clipped off (reverse strand).", - "fa_icon": "fas fa-cut", - "help_text": "Defines the adapter sequence to be used for the reverse read in paired end sequencing projects. 
This is set to `'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA'` by default.\n\n> Modifies AdapterRemoval parameter: `--adapter2`" - }, - "clip_adapters_list": { - "type": "string", - "description": "Path to AdapterRemoval adapter list file. Overrides `--clip_*_adaptor` parameters", - "fa_icon": "fas fa-cut", - "help_text": "Allows to supply a file with a list of adapter (combinations) to remove from all files. **Overrides** the `--clip_*_adaptor` parameters . First column represents forward strand, second column for reverse strand. You must supply all possibly combinations, one per line, and this list is applied to all files. See [AdapterRemoval documentation](https://adapterremoval.readthedocs.io/en/latest/manpage.html) for more information.\n\n> Modifies AdapterRemoval parameter: `--adapter-list`" - }, - "clip_readlength": { - "type": "integer", - "default": 30, - "description": "Specify read minimum length to be kept for downstream analysis.", - "fa_icon": "fas fa-ruler", - "help_text": "Defines the minimum read length that is required for reads after merging to be considered for downstream analysis after read merging. Default is `30`.\n\nNote that when you have a large percentage of very short reads in your library (< 20 bp) - such as retrieved in single-stranded library protocols - that performing read length filtering at this step is not _always_ reliable for correct endogenous DNA calculation. When you have very few reads passing this length filter, it will artificially inflate your 'endogenous DNA' value by creating a very small denominator. \n\nIf you notice you have ultra short reads (< 20 bp), it is recommended to set this parameter to 0, and use `--bam_filter_minreadlength` instead, to filter out 'un-usable' short reads after mapping. 
A caveat, however, is that this will cause a very large increase in computational run time, due to all reads in the library will be being mapped.\n\n> Modifies AdapterRemoval parameter: `--minlength`\n" - }, - "clip_min_read_quality": { - "type": "integer", - "default": 20, - "description": "Specify minimum base quality for trimming off bases.", - "fa_icon": "fas fa-medal", - "help_text": "Defines the minimum read quality per base that is required for a base to be kept. Individual bases at the ends of reads falling below this threshold will be clipped off. Default is set to `20`.\n\n> Modifies AdapterRemoval parameter: `--minquality`" - }, - "min_adap_overlap": { - "type": "integer", - "default": 1, - "description": "Specify minimum adapter overlap required for clipping.", - "fa_icon": "fas fa-hands-helping", - "help_text": "Specifies a minimum number of bases that overlap with the adapter sequence before adapters are trimmed from reads. Default is set to `1` base overlap.\n\n> Modifies AdapterRemoval parameter: `--minadapteroverlap`" - }, - "skip_collapse": { - "type": "boolean", - "description": "Skip of merging forward and reverse reads together and turns on paired-end alignment for downstream mapping. Only applicable for paired-end libraries.", - "fa_icon": "fas fa-fast-forward", - "help_text": "Turns off the paired-end read merging.\n\nFor example\n\n```bash\n--skip_collapse --input '*_{R1,R2}_*.fastq'\n```\n\nIt is important to use the paired-end wildcard globbing as `--skip_collapse` can only be used on paired-end data!\n\n:warning: If you run this and also with `--clip_readlength` set to something (as is by default), you may end up removing single reads from either the pair1 or pair2 file. These will be NOT be mapped when aligning with either `bwa` or `bowtie`, as both can only accept one (forward) or two (forward and reverse) FASTQs as input.\n\nAlso note that supplying this flag will then also cause downstream mapping steps to run in paired-end mode. 
This may be more suitable for modern data, or when you want to utilise mate-pair spatial information.\n\n> Modifies AdapterRemoval parameter: `--collapse`" - }, - "skip_trim": { - "type": "boolean", - "description": "Skip adapter and quality trimming.", - "fa_icon": "fas fa-fast-forward", - "help_text": "Turns off adapter AND quality trimming.\n\nFor example:\n\n```bash\n--skip_trim --input '*.fastq'\n```\n\n:warning: it is not possible to keep quality trimming (n or base quality) on,\n_and_ skip adapter trimming.\n\n:warning: it is not possible to turn off one or the other of quality\ntrimming or n trimming. i.e. --trimns --trimqualities are both given\nor neither. However setting quality in `--clip_min_read_quality` to 0 would\ntheoretically turn off base quality trimming.\n\n> Modifies AdapterRemoval parameters: `--trimns --trimqualities --adapter1 --adapter2`" - }, - "preserve5p": { - "type": "boolean", - "description": "Skip quality base trimming (n, score, window) of 5 prime end.", - "fa_icon": "fas fa-life-ring", - "help_text": "Turns off quality based trimming at the 5p end of reads when any of the --trimns, --trimqualities, or --trimwindows options are used. Only 3p end of reads will be removed.\n\nThis also entirely disables quality based trimming of collapsed reads, since both ends of these are informative for PCR duplicate filtering. Described [here](https://github.com/MikkelSchubert/adapterremoval/issues/32#issuecomment-504758137).\n\n> Modifies AdapterRemoval parameters: `--preserve5p`" - }, - "mergedonly": { - "type": "boolean", - "description": "Only use merged reads downstream (un-merged reads and singletons are discarded).", - "fa_icon": "fas fa-handshake", - "help_text": "Specify that only merged reads are sent downstream for analysis.\n\nSingletons (i.e. 
reads missing a pair), or un-merged reads (where there wasn't sufficient overlap) are discarded.\n\nYou may want to use this if you want ensure only the best quality reads for your analysis, but with the penalty of potentially losing still valid data (even if some reads have slightly lower quality). It is highly recommended when using `--dedupper 'dedup'` (see below)." - }, - "qualitymax": { - "type": "integer", - "description": "Specify the maximum Phred score used in input FASTQ files", - "help_text": "Specify maximum Phred score of the quality field of FASTQ files. The quality-score range can vary depending on the machine and version (e.g. see diagram [here](https://en.wikipedia.org/wiki/FASTQ_format#Encoding), and this allows you to increase from the default AdapterRemoval value of `41`.\n\n> Modifies AdapterRemoval parameters: `--qualitymax`", - "default": 41, - "fa_icon": "fas fa-arrow-up" - }, - "run_post_ar_trimming": { - "type": "boolean", - "description": "Turn on trimming of inline barcodes (i.e. 
internal barcodes after adapter removal)", - "help_text": "In some cases, you may want to additionally trim reads in a FASTQ file after adapter removal.\n\nThis could be to remove short 'inline' or 'internal' barcodes that are ligated directly onto DNA molecules prior ligation of adapters and indicies (the former of which allow ultra-multiplexing and/or checks for barcode hopping).\n\nIn other cases, you may wish to already remove known high-frequency damage bases to allow stricter mapping.\n\nTurning on this module uses `fastp` to trim one, or both ends of a merged read, or in cases where you have not collapsed your read, R1 and R2.\n" - }, - "post_ar_trim_front": { - "type": "integer", - "default": 7, - "description": "Specify the number of bases to trim off the front of a merged read or R1", - "help_text": "Specify the number of bases to trim off the start of a read in a merged- or forward read FASTQ file.\n\n> Modifies fastp parameters: `--trim_front1`" - }, - "post_ar_trim_tail": { - "type": "integer", - "default": 7, - "description": "Specify the number of bases to trim off the tail of of a merged read or R1", - "help_text": "Specify the number of bases to trim off the end of a read in a merged- or forward read FASTQ file.\n\n> Modifies fastp parameters: `--trim_tail1`" - }, - "post_ar_trim_front2": { - "type": "integer", - "default": 7, - "description": "Specify the number of bases to trim off the front of R2", - "help_text": "Specify the number of bases to trim off the start of a read in an unmerged forward read (R1) FASTQ file.\n\n> Modifies fastp parameters: `--trim_front2`" - }, - "post_ar_trim_tail2": { - "type": "integer", - "default": 7, - "description": "Specify the number of bases to trim off the tail of R2", - "help_text": "Specify the number of bases to trim off the end of a read in an unmerged reverse read (R2) FASTQ file.\n\n> Modifies fastp parameters: `--trim_tail2`" - } - }, - "fa_icon": "fas fa-cut", - "help_text": "These options handle 
various parts of adapter clipping and read merging steps.\n\nMore details can be seen in the [AdapterRemoval\ndocumentation](https://adapterremoval.readthedocs.io/en/latest/)\n\nIf using TSV input, this is performed per lane separately.\n\n> :warning: `--skip_trim` will skip adapter clipping AND quality trimming\n> (n, base quality). It is currently not possible skip one or the other." - }, - "mapping": { - "title": "Read mapping to reference genome", - "type": "object", - "description": "Options for reference-genome mapping", - "default": "", - "properties": { - "mapper": { - "title": "Mapper", - "type": "string", - "description": "Specify which mapper to use. Options: 'bwaaln', 'bwamem', 'circularmapper', 'bowtie2'.", - "default": "bwaaln", - "fa_icon": "fas fa-layer-group", - "help_text": "Specify which mapping tool to use. Options are BWA aln (`'bwaaln'`), BWA mem (`'bwamem'`), circularmapper (`'circularmapper'`), or bowtie2 (`bowtie2`). BWA aln is the default and highly suited for short-read ancient DNA. BWA mem can be quite useful for modern DNA, but is rarely used in projects for ancient DNA. CircularMapper enhances the mapping procedure to circular references, using the BWA algorithm but utilizing a extend-remap procedure (see Peltzer et al 2016, Genome Biology for details). Bowtie2 is similar to BWA aln, and has recently been suggested to provide slightly better results under certain conditions ([Poullet and Orlando 2020](https://doi.org/10.3389/fevo.2020.00105)), as well as providing extra functionality (such as FASTQ trimming). 
Default is 'bwaaln'\n\nMore documentation can be seen for each tool under:\n\n- [BWA aln](http://bio-bwa.sourceforge.net/bwa.shtml#3)\n- [BWA mem](http://bio-bwa.sourceforge.net/bwa.shtml#3)\n- [CircularMapper](https://circularmapper.readthedocs.io/en/latest/contents/userguide.html)\n- [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line)\n", - "enum": ["bwaaln", "bwamem", "circularmapper", "bowtie2"] - }, - "bwaalnn": { - "type": "number", - "default": 0.01, - "description": "Specify the -n parameter for BWA aln, i.e. amount of allowed mismatches in the alignment.", - "fa_icon": "fas fa-sort-numeric-down", - "help_text": "Configures the `bwa aln -n` parameter, defining how many mismatches are allowed in a read. By default set to `0.04` (following recommendations of [Schubert et al. (2012 _BMC Genomics_)](https://doi.org/10.1186/1471-2164-13-178)), if you're uncertain what to set check out [this](https://apeltzer.shinyapps.io/bwa-mismatches/) Shiny App for more information on how to set this parameter efficiently.\n\n> Modifies bwa aln parameter: `-n`" - }, - "bwaalnk": { - "type": "integer", - "default": 2, - "description": "Specify the -k parameter for BWA aln, i.e. maximum edit distance allowed in a seed.", - "fa_icon": "fas fa-drafting-compass", - "help_text": "Configures the `bwa aln -k` parameter for the seeding phase in the mapping algorithm. Default is set to `2`.\n\n> Modifies BWA aln parameter: `-k`" - }, - "bwaalnl": { - "type": "integer", - "default": 1024, - "description": "Specify the -l parameter for BWA aln i.e. the length of seeds to be used.", - "fa_icon": "fas fa-ruler-horizontal", - "help_text": "Configures the length of the seed used in `bwa aln -l`. Default is set to be 'turned off' at the recommendation of Schubert et al. 
([2012 _BMC Genomics_](https://doi.org/10.1186/1471-2164-13-178)) for ancient DNA with `1024`.\n\nNote: Despite being recommended, turning off seeding can result in long runtimes!\n\n> Modifies BWA aln parameter: `-l`\n" - }, - "bwaalno": { - "type": "integer", - "default": 2, - "fa_icon": "fas fa-people-arrows", - "description": "Specify the -o parameter for BWA aln i.e. the number of gaps allowed.", - "help_text": "Configures the number of gaps used in `bwa aln`. Default is set to `bwa` default.\n\n> Modifies BWA aln parameter: `-o`\n" - }, - "circularextension": { - "type": "integer", - "default": 500, - "description": "Specify the number of bases to extend reference by (circularmapper only).", - "fa_icon": "fas fa-external-link-alt", - "help_text": "The number of bases to extend the reference genome with. By default this is set to `500` if not specified otherwise.\n\n> Modifies circulargenerator and realignsamfile parameter: `-e`" - }, - "circulartarget": { - "type": "string", - "default": "MT", - "description": "Specify the FASTA header of the target chromosome to extend (circularmapper only).", - "fa_icon": "fas fa-bullseye", - "help_text": "The chromosome in your FASTA reference that you'd like to be treated as circular. By default this is set to `MT` but can be configured to match any other chromosome.\n\n> Modifies circulargenerator parameter: `-s`" - }, - "circularfilter": { - "type": "boolean", - "description": "Turn on to remove reads that did not map to the circularised genome (circularmapper only).", - "fa_icon": "fas fa-filter", - "help_text": "If you want to filter out reads that don't map to a circular chromosome (and also non-circular chromosome headers) from the resulting BAM file, turn this on. By default this option is turned off.\n> Modifies -f and -x parameters of CircularMapper's realignsamfile\n" - }, - "bt2_alignmode": { - "type": "string", - "default": "local", - "description": "Specify the bowtie2 alignment mode. 
Options: 'local', 'end-to-end'.", - "fa_icon": "fas fa-arrows-alt-h", - "help_text": "The type of read alignment to use. Options are 'local' or 'end-to-end'. Local allows only partial alignment of read, with ends of reads possibly 'soft-clipped' (i.e. remain unaligned/ignored), if the soft-clipped alignment provides best alignment score. End-to-end requires all nucleotides to be aligned. Default is 'local', following [Cahill et al (2018)](https://doi.org/10.1093/molbev/msy018) and [Poullet and Orlando 2020](https://doi.org/10.3389/fevo.2020.00105).\n\n> Modifies Bowtie2 parameters: `--very-fast --fast --sensitive --very-sensitive --very-fast-local --fast-local --sensitive-local --very-sensitive-local`", - "enum": ["local", "end-to-end"] - }, - "bt2_sensitivity": { - "type": "string", - "default": "sensitive", - "description": "Specify the level of sensitivity for the bowtie2 alignment mode. Options: 'no-preset', 'very-fast', 'fast', 'sensitive', 'very-sensitive'.", - "fa_icon": "fas fa-microscope", - "help_text": "The Bowtie2 'preset' to use. Options: 'no-preset' 'very-fast', 'fast', 'sensitive', or 'very-sensitive'. These strings apply to both `--bt2_alignmode` options. See the Bowtie2 [manual](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line) for actual settings. Default is 'sensitive' (following [Poullet and Orlando (2020)](https://doi.org/10.3389/fevo.2020.00105), when running damaged-data _without_ UDG treatment)\n\n> Modifies Bowtie2 parameters: `--very-fast --fast --sensitive --very-sensitive --very-fast-local --fast-local --sensitive-local --very-sensitive-local`", - "enum": [ - "no-preset", - "very-fast", - "fast", - "sensitive", - "very-sensitive" - ] - }, - "bt2n": { - "type": "integer", - "description": "Specify the -N parameter for bowtie2 (mismatches in seed). 
This will override defaults from alignmode/sensitivity.", - "fa_icon": "fas fa-sort-numeric-down", - "help_text": "The number of mismatches allowed in the seed during seed-and-extend procedure of Bowtie2. This will override any values set with `--bt2_sensitivity`. Can either be 0 or 1. Default: 0 (i.e. use`--bt2_sensitivity` defaults).\n\n> Modifies Bowtie2 parameters: `-N`", - "default": 0 - }, - "bt2l": { - "type": "integer", - "description": "Specify the -L parameter for bowtie2 (length of seed substrings). This will override defaults from alignmode/sensitivity.", - "fa_icon": "fas fa-ruler-horizontal", - "help_text": "The length of the seed sub-string to use during seeding. This will override any values set with `--bt2_sensitivity`. Default: 0 (i.e. use`--bt2_sensitivity` defaults: [20 for local and 22 for end-to-end](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line).\n\n> Modifies Bowtie2 parameters: `-L`", - "default": 0 - }, - "bt2_trim5": { - "type": "integer", - "description": "Specify number of bases to trim off from 5' (left) end of read before alignment.", - "fa_icon": "fas fa-cut", - "help_text": "Number of bases to trim at the 5' (left) end of read prior alignment. Maybe useful when left-over sequencing artefacts of in-line barcodes present Default: 0\n\n> Modifies Bowtie2 parameters: `-bt2_trim5`", - "default": 0 - }, - "bt2_trim3": { - "type": "integer", - "description": "Specify number of bases to trim off from 3' (right) end of read before alignment.", - "fa_icon": "fas fa-cut", - "help_text": "Number of bases to trim at the 3' (right) end of read prior alignment. 
Maybe useful when left-over sequencing artefacts of in-line barcodes present Default: 0.\n\n> Modifies Bowtie2 parameters: `-bt2_trim3`", - "default": 0 - }, - "bt2_maxins": { - "type": "integer", - "default": 500, - "fa_icon": "fas fa-exchange-alt", - "description": "Specify the maximum fragment length for Bowtie2 paired-end mapping mode only.", - "help_text": "The maximum fragment for valid paired-end alignments. Only for paired-end mapping (i.e. unmerged), and therefore typically only useful for modern data.\n\n See [Bowtie2 documentation](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml) for more information.\n\n> Modifies Bowtie2 parameters: `--maxins`" - } - }, - "fa_icon": "fas fa-layer-group", - "help_text": "If using TSV input, mapping is performed at the library level, i.e. after lane merging.\n" - }, - "host_removal": { - "title": "Removal of Host-Mapped Reads", - "type": "object", - "description": "Options for production of host-read removed FASTQ files for privacy reasons.", - "default": "", - "properties": { - "hostremoval_input_fastq": { - "type": "boolean", - "description": "Turn on per-library creation pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data)", - "fa_icon": "fas fa-power-off", - "help_text": "Create pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data)\n" - }, - "hostremoval_mode": { - "type": "string", - "default": "remove", - "description": "Host removal mode. Remove mapped reads completely from FASTQ (remove) or just mask mapped reads sequence by N (replace).", - "fa_icon": "fas fa-mask", - "help_text": "Read removal mode. 
Remove mapped reads completely (`'remove'`) or just replace mapped reads sequence by N (`'replace'`)\n\n> Modifies extract_map_reads.py parameter: `-m`", - "enum": ["strip", "replace", "remove"] - } - }, - "fa_icon": "fas fa-user-shield", - "help_text": "These parameters are used for removing mapped reads from the original input\nFASTQ files, usually in the context of uploading the original FASTQ files to a\npublic read archive (NCBI SRA/EBI ENA/DDBJ SRA).\n\nThese flags will produce FASTQ files almost identical to your input files,\nexcept that reads with the same read ID as one found in the mapped bam file, are\neither removed or 'masked' (every base replaced with Ns).\n\nThis functionality allows you to provide other researchers who wish to re-use\nyour data to apply their own adapter removal/read merging procedures, while\nmaintaining anonymity for sample donors - for example with microbiome\nresearch.\n\nIf using TSV input, stripping is performed library, i.e. after lane merging." - }, - "bam_filtering": { - "title": "BAM Filtering", - "type": "object", - "description": "Options for quality filtering and how to deal with off-target unmapped reads.", - "default": "", - "properties": { - "run_bam_filtering": { - "type": "boolean", - "description": "Turn on filtering of mapping quality, read lengths, or unmapped reads of BAM files.", - "fa_icon": "fas fa-power-off", - "help_text": "Turns on the bam filtering module for either mapping quality filtering or unmapped read treatment.\n" - }, - "bam_mapping_quality_threshold": { - "type": "integer", - "description": "Minimum mapping quality for reads filter.", - "fa_icon": "fas fa-greater-than-equal", - "help_text": "Specify a mapping quality threshold for mapped reads to be kept for downstream analysis. 
By default keeps all reads and is therefore set to `0` (basically doesn't filter anything).\n\n> Modifies samtools view parameter: `-q`", - "default": 0 - }, - "bam_filter_minreadlength": { - "type": "integer", - "fa_icon": "fas fa-ruler-horizontal", - "description": "Specify minimum read length to be kept after mapping.", - "help_text": "Specify minimum length of mapped reads. This filtering will apply at the same time as mapping quality filtering.\n\nIf used _instead_ of minimum length read filtering at AdapterRemoval, this can be useful to get more realistic endogenous DNA percentages, when most of your reads are very short (e.g. in single-stranded libraries) and would otherwise be discarded by AdapterRemoval (thus making an artificially small denominator for a typical endogenous DNA calculation). Note in this context you should not perform mapping quality filtering nor discarding of unmapped reads to ensure a correct denominator of all reads, for the endogenous DNA calculation.\n\n> Modifies filter_bam_fragment_length.py parameter: `-l`", - "default": 0 - }, - "bam_unmapped_type": { - "type": "string", - "default": "discard", - "description": "Defines whether to discard all unmapped reads, keep only bam and/or keep only fastq format Options: 'discard', 'bam', 'fastq', 'both'.", - "fa_icon": "fas fa-trash-alt", - "help_text": "Defines how to proceed with unmapped reads: `'discard'` removes all unmapped reads, `keep` keeps both unmapped and mapped reads in the same BAM file, `'bam'` keeps unmapped reads as BAM file, `'fastq'` keeps unmapped reads as FastQ file, `both` keeps both BAM and FASTQ files. Default is `discard`. 
`keep` is what would happen if `--run_bam_filtering` was _not_ supplied.\n\nNote that in all cases, if `--bam_mapping_quality_threshold` is also supplied, mapping quality filtering will still occur on the mapped reads.\n\n> Modifies samtools view parameter: `-f4 -F4`", - "enum": ["discard", "keep", "bam", "fastq", "both"] - } - }, - "fa_icon": "fas fa-sort-amount-down", - "help_text": "Users can configure to keep/discard/extract certain groups of reads efficiently\nin the nf-core/eager pipeline.\n\nIf using TSV input, filtering is performed library, i.e. after lane merging.\n\nThis module utilises `samtools view` and `filter_bam_fragment_length.py`" - }, - "deduplication": { - "title": "DeDuplication", - "type": "object", - "description": "Options for removal of PCR amplicon duplicates that can artificially inflate coverage.", - "default": "", - "properties": { - "dedupper": { - "type": "string", - "default": "markduplicates", - "description": "Deduplication method to use. Options: 'markduplicates', 'dedup'.", - "fa_icon": "fas fa-object-group", - "help_text": "Sets the duplicate read removal tool. By default uses `markduplicates` from Picard. Alternatively an ancient DNA specific read deduplication tool `dedup` ([Peltzer et al. 2016](http://dx.doi.org/10.1186/s13059-016-0918-z)) is offered.\n\nThis utilises both ends of paired-end data to remove duplicates (i.e. true exact duplicates, as markduplicates will over-zealously deduplicate anything with the same starting position even if the ends are different). DeDup should generally only be used solely on paired-end data otherwise suboptimal deduplication can occur if applied to either single-end or a mix of single-end/paired-end data.\n", - "enum": ["markduplicates", "dedup"] - }, - "dedup_all_merged": { - "type": "boolean", - "description": "Turn on treating all reads as merged reads.", - "fa_icon": "fas fa-handshake", - "help_text": "Sets DeDup to treat all reads as merged reads. 
This is useful if reads are for example not prefixed with `M_` in all cases. Therefore, this can be used as a workaround when also using a mixture of paired-end and single-end data, however this is not recommended (see above).\n\n> Modifies dedup parameter: `-m`" - } - }, - "fa_icon": "fas fa-clone", - "help_text": "If using TSV input, deduplication is performed per library, i.e. after lane merging." - }, - "library_complexity_analysis": { - "title": "Library Complexity Analysis", - "type": "object", - "description": "Options for calculating library complexity (i.e. how many unique reads are present).", - "default": "", - "properties": { - "preseq_mode": { - "type": "string", - "default": "c_curve", - "description": "Specify which mode of preseq to run.", - "fa_icon": "fas fa-toggle-on", - "help_text": "Specify which mode of preseq to run.\n\nFrom the [PreSeq documentation](http://smithlabresearch.org/wp-content/uploads/manual.pdf): \n\n`c curve` is used to compute the expected complexity curve of a mapped read file with a hypergeometric\nformula\n\n`lc extrap` is used to generate the expected yield for theoretical larger experiments and bounds on the\nnumber of distinct reads in the library and the associated confidence intervals, which is computed by\nbootstrapping the observed duplicate counts histogram", - "enum": ["c_curve", "lc_extrap"] - }, - "preseq_step_size": { - "type": "integer", - "default": 1000, - "description": "Specify the step size of Preseq.", - "fa_icon": "fas fa-shoe-prints", - "help_text": "Can be used to configure the step size of Preseq's `c_curve` and `lc_extrap` method. 
Can be useful when only few and thus shallow sequencing results are used for extrapolation.\n\n> Modifies preseq c_curve and lc_extrap parameter: `-s`" - }, - "preseq_maxextrap": { - "type": "integer", - "default": 10000000000, - "description": "Specify the maximum extrapolation (lc_extrap mode only)", - "fa_icon": "fas fa-ban", - "help_text": "Specify the maximum extrapolation that `lc_extrap` mode will perform.\n\n> Modifies preseq lc_extrap parameter: `-e`" - }, - "preseq_terms": { - "type": "integer", - "default": 100, - "description": "Specify the maximum number of terms for extrapolation (lc_extrap mode only)", - "fa_icon": "fas fa-sort-numeric-up-alt", - "help_text": "Specify the maximum number of terms that `lc_extrap` mode will use.\n\n> Modifies preseq lc_extrap parameter: `-x`" - }, - "preseq_bootstrap": { - "type": "integer", - "default": 100, - "description": "Specify number of bootstraps to perform (lc_extrap mode only)", - "fa_icon": "fab fa-bootstrap", - "help_text": "Specify the number of bootstraps `lc_extrap` mode will perform to calculate confidence intervals.\n\n> Modifies preseq lc_extrap parameter: `-n`" - }, - "preseq_cval": { - "type": "number", - "default": 0.95, - "description": "Specify confidence interval level (lc_extrap mode only)", - "fa_icon": "fas fa-check-circle", - "help_text": "Specify the allowed level of confidence intervals used for `lc_extrap` mode.\n\n> Modifies preseq lc_extrap parameter: `-c`" - } - }, - "fa_icon": "fas fa-bezier-curve", - "help_text": "nf-core/eager uses Preseq on mapped reads as one method to calculate library\ncomplexity. If DeDup is used, Preseq uses the histogram output of DeDup,\notherwise the sorted non-duplicated BAM file is supplied. Furthermore, if\npaired-end read collapsing is not performed, the `-P` flag is used." 
- }, - "adna_damage_analysis": { - "title": "(aDNA) Damage Analysis", - "type": "object", - "description": "Options for calculating and filtering for characteristic ancient DNA damage patterns.", - "default": "", - "properties": { - "damageprofiler_length": { - "type": "integer", - "default": 100, - "description": "Specify length filter for DamageProfiler.", - "fa_icon": "fas fa-sort-amount-up", - "help_text": "Specifies the length filter for DamageProfiler. By default set to `100`.\n\n> Modifies DamageProfile parameter: `-l`" - }, - "damageprofiler_threshold": { - "type": "integer", - "default": 15, - "description": "Specify number of bases of each read to consider for DamageProfiler calculations.", - "fa_icon": "fas fa-ruler-horizontal", - "help_text": "Specifies the length of the read start and end to be considered for profile generation in DamageProfiler. By default set to `15` bases.\n\n> Modifies DamageProfile parameter: `-t`" - }, - "damageprofiler_yaxis": { - "type": "number", - "default": 0.3, - "description": "Specify the maximum misincorporation frequency that should be displayed on damage plot. Set to 0 to 'autoscale'.", - "fa_icon": "fas fa-ruler-vertical", - "help_text": "Specifies what the maximum misincorporation frequency should be displayed as, in the DamageProfiler damage plot. This is set to `0.30` (i.e. 30%) by default as this matches the popular [mapDamage2.0](https://ginolhac.github.io/mapDamage) program. However, the default behaviour of DamageProfiler is to 'autoscale' the y-axis maximum to zoom in on any _possible_ damage that may occur (e.g. if the damage is about 10%, the highest value on the y-axis would be set to 0.12). This 'autoscale' behaviour can be turned on by specifying the number to `0`. 
Default: `0.30`.\n\n> Modifies DamageProfile parameter: `-yaxis_damageplot`" - }, - "run_pmdtools": { - "type": "boolean", - "description": "Turn on PMDtools", - "fa_icon": "fas fa-power-off", - "help_text": "Specifies to run PMDTools for damage based read filtering and assessment of DNA damage in sequencing libraries. By default turned off.\n" - }, - "pmdtools_range": { - "type": "integer", - "default": 10, - "description": "Specify range of bases for PMDTools to scan for damage.", - "fa_icon": "fas fa-arrows-alt-h", - "help_text": "Specifies the range in which to consider DNA damage from the ends of reads. By default set to `10`.\n\n> Modifies PMDTools parameter: `--range`" - }, - "pmdtools_threshold": { - "type": "integer", - "default": 3, - "description": "Specify PMDScore threshold for PMDTools.", - "fa_icon": "fas fa-chart-bar", - "help_text": "Specifies the PMDScore threshold to use in the pipeline when filtering BAM files for DNA damage. Only reads which surpass this damage score are considered for downstream DNA analysis. By default set to `3` if not set specifically by the user.\n\n> Modifies PMDTools parameter: `--threshold`" - }, - "pmdtools_reference_mask": { - "type": "string", - "description": "Specify a bedfile to be used to mask the reference fasta prior to running pmdtools.", - "fa_icon": "fas fa-mask", - "help_text": "Activates masking of the reference fasta prior to running pmdtools. Positions that are in the provided bedfile will be replaced by Ns in the reference genome. This is useful for capture data, where you might not want the allele of a SNP to be counted as damage when it is a transition. Masking of the reference is done using `bedtools maskfasta`." - }, - "pmdtools_max_reads": { - "type": "integer", - "default": 10000, - "description": "Specify the maximum number of reads to consider for metrics generation.", - "fa_icon": "fas fa-greater-than-equal", - "help_text": "The maximum number of reads used for damage assessment in PMDtools. 
Can be used to significantly reduce the amount of time required for damage assessment in PMDTools. Note that a too low value can also obtain incorrect results.\n\n> Modifies PMDTools parameter: `-n`" - }, - "pmdtools_platypus": { - "type": "boolean", - "description": "Append big list of base frequencies for platypus to output.", - "fa_icon": "fas fa-power-off", - "help_text": "Enables the printing of a wider list of base frequencies used by platypus as an addition to the output base misincorporation frequency table. By default turned off.\n" - }, - "run_mapdamage_rescaling": { - "type": "boolean", - "fa_icon": "fas fa-map", - "description": "Turn on damage rescaling of BAM files using mapDamage2 to probabilistically remove damage.", - "help_text": "Turns on mapDamage2's BAM rescaling functionality. This probablistically replaces Ts back to Cs depending on the likelihood this reference-mismatch was originally caused by damage. If the library is specified to be single stranded, this will automatically use the `--single-stranded` mode.\n\nThis functionality does not have any MultiQC output.\n\n:warning: rescaled libraries will not be merged with non-scaled libraries of the same sample for downstream genotyping, as the model may be different for each library. If you wish to merge these, please do this manually and re-run nf-core/eager using the merged BAMs as input. \n\n> Modifies the `--rescale` parameter of mapDamage2" - }, - "rescale_seqlength": { - "type": "integer", - "default": 12, - "fa_icon": "fas fa-ruler-horizontal", - "description": "Length of read sequence to use from each side for rescaling. Can be overridden by --rescale_length_*p.", - "help_text": "Specify the length from the end of the read that mapDamage should rescale at both ends.\n\n> Modifies the `--seq-length` parameter of mapDamage2." 
- }, - "rescale_length_5p": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-balance-scale-right", - "description": "Length of read for mapDamage2 to rescale from 5p end. Only used if not 0 otherwise --rescale_seqlength used.", - "help_text": "Specify the length from the end of the read that mapDamage should rescale. Overrides `--rescale_seqlength`.\n\n> Modifies the `--rescale-length-5p` parameter of mapDamage2." - }, - "rescale_length_3p": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-balance-scale-left", - "description": "Length of read for mapDamage2 to rescale from 3p end. Only used if not 0 otherwise --rescale_seqlength used..", - "help_text": "Specify the length from the end of the read that mapDamage should rescale.\n\n> Modifies the `--rescale-length-3p` parameter of mapDamage2." + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/nf-core/eager/master/nextflow_schema.json", + "title": "nf-core/eager pipeline parameters", + "description": "A fully reproducible and state-of-the-art ancient DNA analysis pipeline", + "type": "object", + "definitions": { + "input_output_options": { + "title": "Input/output options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Define where the pipeline should find input data, and additional metadata.", + "required": [ + "input" + ], + "properties": { + "input": { + "type": "string", + "description": "Either paths or URLs to FASTQ/BAM data (must be surrounded with quotes). For paired end data, the path must use '{1,2}' notation to specify read pairs. Alternatively, a path to a TSV file (ending .tsv) containing file paths and sequencing/sample metadata. Allows for merging of multiple lanes/libraries/samples. Please see documentation for template.", + "fa_icon": "fas fa-dna", + "help_text": "There are two possible ways of supplying input sequencing data to nf-core/eager. 
The most efficient but more simplistic is supplying direct paths (with wildcards) to your FASTQ or BAM files, with each file or pair being considered a single library and each one run independently (e.g. for paired-end data: `--input '///*_{R1,R2}_*.fq.gz'`). TSV input requires creation of an extra file by the user (`--input '///eager_data.tsv'`) and extra metadata, but allows more powerful lane and library merging. Please see [usage docs](https://nf-co.re/eager/docs/usage#input-specifications) for detailed instructions and specifications." + }, + "udg_type": { + "type": "string", + "default": "none", + "description": "Specifies whether you have UDG treated libraries. Set to 'half' for partial treatment, or 'full' for UDG. If not set, libraries are assumed to have no UDG treatment ('none'). Not required for TSV input.", + "fa_icon": "fas fa-vial", + "help_text": "Defines whether Uracil-DNA glycosylase (UDG) treatment was used to remove DNA\ndamage on the sequencing libraries.\n\nSpecify `'none'` if no treatment was performed. If you have partial UDG treated\ndata ([Rohland et al 2016](http://dx.doi.org/10.1098/rstb.2013.0624)), specify\n`'half'`. If you have complete UDG treated data ([Briggs et al.\n2010](https://doi.org/10.1093/nar/gkp1163)), specify `'full'`. \n\nWhen also using PMDtools specifying `'half'` will use a different model for DNA\ndamage assessment in PMDTools (PMDtools: `--UDGhalf`). 
Specify `'full'` and the\nPMDtools DNA damage assessment will use CpG context only (PMDtools: `--CpG`).\nDefault: `'none'`.\n\n> **Tip**: You should provide a small decoy reference genome with pre-made indices, e.g.\n> the human mtDNA genome, for the mandatory parameter `--fasta` in order to\n> avoid long computational time for generating the index files of the reference\n> genome, even if you do not actually need a reference genome for any downstream\n> analyses.", + "enum": [ + "none", + "half", + "full" + ] + }, + "single_stranded": { + "type": "boolean", + "description": "Specifies that libraries are single stranded. Always affects MALTExtract but will be ignored by pileupCaller with TSV input. Not required for TSV input.", + "fa_icon": "fas fa-minus", + "help_text": "Indicates libraries are single stranded.\n\nCurrently only affects MALTExtract where it will switch on damage patterns\ncalculation mode to single-stranded, (MaltExtract: `--singleStranded`) and\ngenotyping with pileupCaller where a different method is used (pileupCaller:\n`--singleStrandMode`). Default: false\n\nOnly required when using the 'Path' method of `--input`" + }, + "single_end": { + "type": "boolean", + "description": "Specifies that the input is single end reads. Not required for TSV input.", + "fa_icon": "fas fa-align-left", + "help_text": "By default, the pipeline expects paired-end data. If you have single-end data, specify this parameter on the command line when you launch the pipeline. It is not possible to run a mixture of single-end and paired-end files in one run.\n\nOnly required when using the 'Path' method of `--input`" + }, + "colour_chemistry": { + "type": "integer", + "default": 4, + "description": "Specifies which Illumina sequencing chemistry was used. Used to inform whether to poly-G trim if turned on (see below). Not required for TSV input. 
Options: 2, 4.", + "fa_icon": "fas fa-palette", + "help_text": "Specifies which Illumina colour chemistry a library was sequenced with. This informs whether to perform poly-G trimming (if `--complexity_filter_poly_g` is also supplied). Only 2 colour chemistry sequencers (e.g. NextSeq or NovaSeq) can generate uncertain poly-G tails (due to 'G' being indicated via a no-colour detection). Default is '4' to indicate e.g. HiSeq or MiSeq platforms, which do not require poly-G trimming. Options: 2, 4. Default: 4\n\nOnly required when using the 'Path' method of input." + }, + "bam": { + "type": "boolean", + "description": "Specifies that the input is in BAM format. Not required for TSV input.", + "fa_icon": "fas fa-align-justify", + "help_text": "Specifies the input file type to `--input` is in BAM format. This will automatically also apply `--single_end`.\n\nOnly required when using the 'Path' method of `--input`.\n" + } + }, + "help_text": "There are two possible ways of supplying input sequencing data to nf-core/eager.\nThe most efficient but more simplistic is supplying direct paths (with\nwildcards) to your FASTQ or BAM files, with each file or pair being considered a\nsingle library and each one run independently. TSV input requires creation of an\nextra file by the user and extra metadata, but allows more powerful lane and\nlibrary merging." + }, + "input_data_additional_options": { + "title": "Input Data Additional Options", + "type": "object", + "description": "Additional options regarding input data.", + "default": "", + "properties": { + "snpcapture_bed": { + "type": "string", + "fa_icon": "fas fa-magnet", + "description": "If library result of SNP capture, path to BED file containing SNPS positions on reference genome. SNP statistics are qualimap results directory only not MultiQC.", + "help_text": "Can be used to set a path to a BED file (3/6 column format) of SNP positions of a reference genome, to calculate SNP captured libraries on-target efficiency. 
This should be used for array or in-solution SNP capture protocols such as 390K, 1240K, etc. If supplied, some on-target metrics are automatically generated for you by qualimap in the 'Globals inside' section of the 'genome_results.txt' file in the qualimap results directory. These statistics are currently NOT displayed in MultiQC!" + }, + "run_convertinputbam": { + "type": "boolean", + "description": "Turns on conversion of an input BAM file into FASTQ format to allow re-preprocessing (e.g. AdapterRemoval etc.).", + "fa_icon": "fas fa-undo-alt", + "help_text": "Allows you to convert an input BAM file back to FASTQ for downstream processing. Note this is required if you need to perform AdapterRemoval and/or polyG clipping.\n\nIf not turned on, BAMs will automatically be sent to post-mapping steps." + } + }, + "fa_icon": "far fa-plus-square" + }, + "reference_genome_options": { + "title": "Reference genome options", + "type": "object", + "fa_icon": "fas fa-dna", + "properties": { + "fasta": { + "type": "string", + "fa_icon": "fas fa-font", + "description": "Path or URL to a FASTA reference file (required if not iGenome reference). File suffixes can be: '.fa', '.fn', '.fna', '.fasta'.", + "help_text": "You specify the full path to your reference genome here. The FASTA file can have any file suffix, such as `.fasta`, `.fna`, `.fa`, `.FastA` etc. You may also supply a gzipped reference files, which will be unzipped automatically for you.\n\nFor example:\n\n```bash\n--fasta '///my_reference.fasta'\n```\n\n> If you don't specify appropriate `--bwa_index`, `--fasta_index` parameters, the pipeline will create these indices for you automatically. Note that you can save the indices created for you for later by giving the `--save_reference` flag.\n> You must select either a `--fasta` or `--genome`\n" + }, + "genome": { + "type": "string", + "description": "Name of iGenomes reference (required if not FASTA reference). 
Requires argument `--igenomes_ignore false`, as iGenomes is ignored by default in nf-core/eager", + "fa_icon": "fas fa-book", + "help_text": "Alternatively to `--fasta`, the pipeline config files come bundled with paths to the Illumina iGenomes reference index files. If running with docker or AWS, the configuration is set up to use the [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) resource.\n\nThere are 31 different species supported in the iGenomes references. To run the pipeline, you must specify which to use with the `--genome` flag.\n\nYou can find the keys to specify the genomes in the [iGenomes config file](../conf/igenomes.config). Common genomes that are supported are:\n\n- Human\n - `--genome GRCh37`\n - `--genome GRCh38`\n- Mouse *\n - `--genome GRCm38`\n- _Drosophila_ *\n - `--genome BDGP6`\n- _S. cerevisiae_ *\n - `--genome 'R64-1-1'`\n\n> \\* Not bundled with nf-core eager by default.\n\nNote that you can use the same configuration setup to save sets of reference files for your own use, even if they are not part of the iGenomes resource. See the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for instructions on where to save such a file.\n\nThe syntax for this reference configuration is as follows:\n\n```nextflow\nparams {\n genomes {\n 'GRCh37' {\n fasta = ''\n }\n // Any number of additional genomes, key is used with --genome\n }\n}\n```\n\n**NB** Requires argument `--igenomes_ignore false` as iGenomes is ignored by default in nf-core/eager" + }, + "igenomes_base": { + "type": "string", + "description": "Directory / URL base for iGenomes references.", + "default": "s3://ngi-igenomes/igenomes", + "fa_icon": "fas fa-cloud-download-alt", + "hidden": true + }, + "igenomes_ignore": { + "type": "boolean", + "description": "Do not load the iGenomes reference config.", + "fa_icon": "fas fa-ban", + "hidden": true, + "help_text": "Do not load `igenomes.config` when running the pipeline. 
You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." + }, + "bwa_index": { + "type": "string", + "description": "Path to directory containing pre-made BWA indices (i.e. the directory before the files ending in '.amb' '.ann' '.bwt'. Do not include the files themselves. Most likely the same directory of the file provided with --fasta). If not supplied will be made for you.", + "fa_icon": "fas fa-address-book", + "help_text": "If you want to use pre-existing `bwa index` indices, please supply the **directory** to the FASTA you also specified in `--fasta`. nf-core/eager will automagically detect the index files by searching for the FASTA filename with the corresponding `bwa` index file suffixes.\n\nFor example:\n\n```bash\nnextflow run nf-core/eager \\\n-profile test,docker \\\n--input '*{R1,R2}*.fq.gz' \\\n--fasta 'results/reference_genome/bwa_index/BWAIndex/Mammoth_MT_Krause.fasta' \\\n--bwa_index 'results/reference_genome/bwa_index/BWAIndex/'\n```\n\n> `bwa index` does not give you an option to supply alternative suffixes/names for these indices. Thus, the file names generated by this command _must not_ be changed, otherwise nf-core/eager will not be able to find them." + }, + "bt2_index": { + "type": "string", + "description": "Path to directory containing pre-made Bowtie2 indices (i.e. everything before the endings e.g. '.1.bt2', '.2.bt2', '.rev.1.bt2'. Most likely the same value as --fasta). If not supplied will be made for you.", + "fa_icon": "far fa-address-book", + "help_text": "If you want to use pre-existing `bt2 index` indices, please supply the **directory** to the FASTA you also specified in `--fasta`. 
nf-core/eager will automagically detect the index files by searching for the FASTA filename with the corresponding `bt2` index file suffixes.\n\nFor example:\n\n```bash\nnextflow run nf-core/eager \\\n-profile test,docker \\\n--input '*{R1,R2}*.fq.gz' \\\n--fasta 'results/reference_genome/bwa_index/BWAIndex/Mammoth_MT_Krause.fasta' \\\n--bt2_index 'results/reference_genome/bt2_index/BT2Index/'\n```\n\n> `bowtie2-build` does not give you an option to supply alternative suffixes/names for these indices. Thus, the file names generated by this command _must not_ be changed, otherwise nf-core/eager will not be able to find them." + }, + "fasta_index": { + "type": "string", + "description": "Path to samtools FASTA index (typically ending in '.fai'). If not supplied will be made for you.", + "fa_icon": "far fa-bookmark", + "help_text": "If you want to use a pre-existing `samtools faidx` index, use this to specify the required FASTA index file for the selected reference genome. This should be generated by `samtools faidx` and has a file suffix of `.fai`.\n\nFor example:\n\n```bash\n--fasta_index 'Mammoth_MT_Krause.fasta.fai'\n```" + }, + "seq_dict": { + "type": "string", + "description": "Path to picard sequence dictionary file (typically ending in '.dict'). If not supplied will be made for you.", + "fa_icon": "fas fa-spell-check", + "help_text": "If you want to use a pre-existing `picard CreateSequenceDictionary` dictionary file, use this to specify the required `.dict` file for the selected reference genome.\n\nFor example:\n\n```bash\n--seq_dict 'Mammoth_MT_Krause.dict'\n```" + }, + "large_ref": { + "type": "boolean", + "description": "Specify to generate more recent '.csi' BAM indices. If your reference genome is larger than 3.5GB, this is recommended due to more efficient data handling with the '.csi' format over the older '.bai'.", + "fa_icon": "fas fa-mountain", + "help_text": "This parameter is required to be set for large reference genomes. 
If your\nreference genome is larger than 3.5GB, the `samtools index` calls in the\npipeline need to generate `CSI` indices instead of `BAI` indices to compensate\nfor the size of the reference genome (with samtools: `-c`). This parameter is\nnot required for smaller references (including the human `hg19` or\n`grch37`/`grch38` references), but `>4GB` genomes have been shown to need `CSI`\nindices. Default: off" + }, + "save_reference": { + "type": "boolean", + "description": "If not already supplied by user, turns on saving of generated reference genome indices for later re-usage.", + "fa_icon": "far fa-save", + "help_text": "Use this if you do not have pre-made reference FASTA indices for `bwa`, `samtools` and `picard`. If you turn this on, the indices nf-core/eager generates for you will be saved in `/results/reference_genomes`. If not supplied, nf-core/eager generated index references will be deleted.\n\n> modifies SAMtools index command: `-c`" + } + }, + "description": "Specify locations of references and optionally, additional pre-made indices", + "help_text": "All nf-core/eager runs require a reference genome in FASTA format to map reads\nagainst to.\n\nIn addition we provide various options for indexing of different types of\nreference genomes (based on the tools used in the pipeline). nf-core/eager can\nindex reference genomes for you (with options to save these for other analysis),\nbut you can also supply your pre-made indices.\n\nSupplying pre-made indices saves time in pipeline execution and is especially\nadvised when running multiple times on the same cluster system for example. You\ncan even add a resource [specific profile](#profile) that sets paths to\npre-computed reference genomes, saving time when specifying these.\n\n> :warning: you must always supply a reference file. If you want to use\n functionality that does not require one, supply a small decoy genome such as\n phiX or the human mtDNA genome." 
+ }, + "output_options": { + "title": "Output options", + "type": "object", + "description": "Specify where to put output files and optional saving of intermediate files", + "default": "", + "properties": { + "outdir": { + "type": "string", + "description": "The output directory where the results will be saved.", + "default": "./results", + "fa_icon": "fas fa-folder-open", + "help_text": "The output directory where the results will be saved. By default will be made in the directory you run the command in under `./results`." + }, + "publish_dir_mode": { + "type": "string", + "default": "copy", + "hidden": true, + "description": "Method used to save pipeline results to output directory.", + "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", + "fa_icon": "fas fa-copy", + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ] + } + }, + "fa_icon": "fas fa-cloud-download-alt" + }, + "generic_options": { + "title": "Generic options", + "type": "object", + "properties": { + "help": { + "type": "boolean", + "description": "Display help text.", + "hidden": true, + "fa_icon": "fas fa-question-circle" + }, + "validate_params": { + "type": "boolean", + "description": "Boolean whether to validate parameters against the schema at runtime", + "default": true, + "fa_icon": "fas fa-check-square", + "hidden": true + }, + "email": { + "type": "string", + "description": "Email address for completion summary.", + "fa_icon": "fas fa-envelope", + "help_text": "An email address to send a summary email to when the pipeline is completed.", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + }, + "email_on_fail": { + "type": "string", + "description": "Email address for completion summary, 
only when pipeline fails.", + "fa_icon": "fas fa-exclamation-triangle", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$", + "hidden": true, + "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run if it **fails**. Normally would be the same as in `--email` but can be different. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.\n\n> Note that this functionality requires either `mail` or `sendmail` to be installed on your system." + }, + "plaintext_email": { + "type": "boolean", + "description": "Send plain-text email instead of HTML.", + "fa_icon": "fas fa-remove-format", + "hidden": true, + "help_text": "Set to receive plain-text e-mails instead of HTML formatted." + }, + "max_multiqc_email_size": { + "type": "string", + "description": "File size limit when attaching MultiQC reports to summary emails.", + "default": "25.MB", + "fa_icon": "fas fa-file-upload", + "hidden": true, + "help_text": "If file generated by pipeline exceeds the threshold, it will not be attached." + }, + "monochrome_logs": { + "type": "boolean", + "description": "Do not use coloured log outputs.", + "fa_icon": "fas fa-palette", + "hidden": true, + "help_text": "Set to disable colourful command line output and live life in monochrome." 
+ }, + "multiqc_config": { + "type": "string", + "description": "Custom config file to supply to MultiQC.", + "fa_icon": "fas fa-cog", + "hidden": true + }, + "tracedir": { + "type": "string", + "description": "Directory to keep pipeline Nextflow logs and reports.", + "default": "${params.outdir}/pipeline_info", + "fa_icon": "fas fa-cogs", + "hidden": true + }, + "show_hidden_params": { + "type": "boolean", + "fa_icon": "far fa-eye-slash", + "description": "Show all params when using `--help`", + "hidden": true, + "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." + }, + "enable_conda": { + "type": "boolean", + "hidden": true, + "description": "Parameter used for checking conda channels to be set correctly." + }, + "schema_ignore_params": { + "type": "string", + "fa_icon": "fas fa-not-equal", + "description": "String to specify ignored parameters for parameter validation", + "hidden": true, + "default": "genomes" + } + }, + "fa_icon": "fas fa-file-import", + "description": "Less common options for the pipeline, typically set in a config file.", + "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`." + }, + "max_job_request_options": { + "title": "Max job request options", + "type": "object", + "fa_icon": "fab fa-acquisitions-incorporated", + "description": "Set the top limit for requested resources for any single job.", + "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. 
These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. See [the nf-core website](https://nf-co.re/usage/configuration) for details.", + "properties": { + "max_cpus": { + "type": "integer", + "description": "Maximum number of CPUs that can be requested for any single job.", + "default": 16, + "fa_icon": "fas fa-microchip", + "hidden": true, + "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" + }, + "max_memory": { + "type": "string", + "description": "Maximum amount of memory that can be requested for any single job.", + "default": "128.GB", + "fa_icon": "fas fa-memory", + "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", + "hidden": true, + "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" + }, + "max_time": { + "type": "string", + "description": "Maximum amount of time that can be requested for any single job.", + "default": "240.h", + "fa_icon": "far fa-clock", + "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", + "hidden": true, + "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" + } } - }, - "fa_icon": "fas fa-chart-line", - "help_text": "More documentation can be seen in the follow links for:\n\n- [DamageProfiler](https://github.com/Integrative-Transcriptomics/DamageProfiler)\n- [PMDTools documentation](https://github.com/pontussk/PMDtools)\n\nIf using TSV input, DamageProfiler is performed per library, i.e. after lane\nmerging. PMDtools and BAM Trimming is run after library merging of same-named\nlibrary BAMs that have the same type of UDG treatment. 
BAM Trimming is only\nperformed on non-UDG and half-UDG treated data.\n" - }, - "feature_annotation_statistics": { - "title": "Feature Annotation Statistics", - "type": "object", - "description": "Options for getting reference annotation statistics (e.g. gene coverages)", - "default": "", - "properties": { - "run_bedtools_coverage": { - "type": "boolean", - "description": "Turn on ability to calculate no. reads, depth and breadth coverage of features in reference.", - "fa_icon": "fas fa-chart-area", - "help_text": "Specifies to turn on the bedtools module, producing statistics for breadth (or percent coverage), and depth (or X fold) coverages.\n" - }, - "anno_file": { - "type": "string", - "description": "Path to GFF or BED file containing positions of features in reference file (--fasta). Path should be enclosed in quotes.", - "fa_icon": "fas fa-file-signature", - "help_text": "Specify the path to a GFF/BED containing the feature coordinates (or any acceptable input for [`bedtools coverage`](https://bedtools.readthedocs.io/en/latest/content/tools/coverage.html)). Must be in quotes.\n" - } - }, - "fa_icon": "fas fa-scroll", - "help_text": "If you're interested in looking at coverage stats for certain features on your\nreference such as genes, SNPs etc., you can use the following bedtools module\nfor this purpose.\n\nMore documentation on bedtools can be seen in the [bedtools\ndocumentation](https://bedtools.readthedocs.io/en/latest/)\n\nIf using TSV input, bedtools is run after library merging of same-named library\nBAMs that have the same type of UDG treatment.\n" - }, - "bam_trimming": { - "title": "BAM Trimming", - "type": "object", - "description": "Options for trimming of aligned reads (e.g. to remove damage prior genotyping).", - "default": "", - "properties": { - "run_trim_bam": { - "type": "boolean", - "description": "Turn on BAM trimming. 
Will only run on non-UDG or half-UDG libraries", - "fa_icon": "fas fa-power-off", - "help_text": "Turns on the BAM trimming method. Trims off `[n]` bases from reads in the deduplicated BAM file. Damage assessment in PMDTools or DamageProfiler remains untouched, as data is routed through this independently. BAM trimming is typically performed to reduce errors during genotyping that can be caused by aDNA damage.\n\nBAM trimming will only be performed on libraries indicated as `--udg_type 'none'` or `--udg_type 'half'`. Complete UDG treatment ('full') should have removed all damage. The amount of bases that will be trimmed off can be set separately for libraries with `--udg_type` `'none'` and `'half'` (see `--bamutils_clip_half_udg_left` / `--bamutils_clip_half_udg_right` / `--bamutils_clip_none_udg_left` / `--bamutils_clip_none_udg_right`).\n\n> Note: additional artefacts such as bar-codes or adapters that could potentially also be trimmed should be removed prior mapping." - }, - "bamutils_clip_double_stranded_half_udg_left": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-ruler-combined", - "description": "Specify the number of bases to clip off reads from 'left' end of read for double-stranded half-UDG libraries.", - "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from double_stranded libraries whose UDG treatment is set to `half`. 
Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" - }, - "bamutils_clip_double_stranded_half_udg_right": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-ruler", - "description": "Specify the number of bases to clip off reads from 'right' end of read for double-stranded half-UDG libraries.", - "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from double_stranded libraries whose UDG treatment is set to `half`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" - }, - "bamutils_clip_double_stranded_none_udg_left": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-ruler-combined", - "description": "Specify the number of bases to clip off reads from 'left' end of read for double-stranded non-UDG libraries.", - "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from double_stranded libraries whose UDG treatment is set to `none`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" - }, - "bamutils_clip_double_stranded_none_udg_right": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-ruler", - "description": "Specify the number of bases to clip off reads from 'right' end of read for double-stranded non-UDG libraries.", - "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from double_stranded libraries whose UDG treatment is set to `none`. 
Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" - }, - "bamutils_clip_single_stranded_half_udg_left": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-ruler-combined", - "description": "Specify the number of bases to clip off reads from 'left' end of read for single-stranded half-UDG libraries.", - "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from single-stranded libraries whose UDG treatment is set to `half`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" - }, - "bamutils_clip_single_stranded_half_udg_right": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-ruler", - "description": "Specify the number of bases to clip off reads from 'right' end of read for single-stranded half-UDG libraries.", - "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from single-stranded libraries whose UDG treatment is set to `half`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" - }, - "bamutils_clip_single_stranded_none_udg_left": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-ruler-combined", - "description": "Specify the number of bases to clip off reads from 'left' end of read for single-stranded non-UDG libraries.", - "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from single-stranded libraries whose UDG treatment is set to `none`. 
Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" - }, - "bamutils_clip_single_stranded_none_udg_right": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-ruler", - "description": "Specify the number of bases to clip off reads from 'right' end of read for single-stranded non-UDG libraries.", - "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from single-stranded libraries whose UDG treatment is set to `none`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" - }, - "bamutils_softclip": { - "type": "boolean", - "description": "Turn on using softclip instead of hard masking.", - "fa_icon": "fas fa-paint-roller", - "help_text": "By default, nf-core/eager uses hard clipping and sets clipped bases to `N` with quality `!` in the BAM output. Turn this on to use soft-clipping instead, masking reads at the read ends respectively using the CIGAR string.\n\n> Modifies bam trimBam parameter: `-c`" - } - }, - "fa_icon": "fas fa-eraser", - "help_text": "For some library preparation protocols, users might want to clip off damaged\nbases before applying genotyping methods. 
This can be done in nf-core/eager\nautomatically by turning on the `--run_trim_bam` parameter.\n\nMore documentation can be seen in the [bamUtil\ndocumentation](https://genome.sph.umich.edu/wiki/BamUtil:_trimBam)\n" - }, - "genotyping": { - "title": "Genotyping", - "type": "object", - "description": "Options for variant calling.", - "default": "", - "properties": { - "run_genotyping": { - "type": "boolean", - "description": "Turn on genotyping of BAM files.", - "fa_icon": "fas fa-power-off", - "help_text": "Turns on genotyping to run on all post-dedup and downstream BAMs. For example if `--run_pmdtools` and `--trim_bam` are both supplied, the genotyper will be run on all three BAM files i.e. post-deduplication, post-pmd and post-trimmed BAM files." - }, - "genotyping_tool": { - "type": "string", - "description": "Specify which genotyper to use either GATK UnifiedGenotyper, GATK HaplotypeCaller, Freebayes, or pileupCaller. Options: 'ug', 'hc', 'freebayes', 'pileupcaller', 'angsd'.", - "fa_icon": "fas fa-tools", - "help_text": "Specifies which genotyper to use. Current options are: GATK (v3.5) UnifiedGenotyper or GATK Haplotype Caller (v4); and the FreeBayes Caller. Specify 'ug', 'hc', 'freebayes', 'pileupcaller' and 'angsd' respectively.\n\n> > Note that while UnifiedGenotyper is more suitable for low-coverage ancient DNA (HaplotypeCaller does _de novo_ assembly around each variant site), be aware GATK 3.5 it is officially deprecated by the Broad Institute.", - "enum": ["ug", "hc", "freebayes", "pileupcaller", "angsd"] - }, - "genotyping_source": { - "type": "string", - "default": "raw", - "description": "Specify which input BAM to use for genotyping. Options: 'raw', 'trimmed', 'pmd' or 'rescaled'.", - "fa_icon": "fas fa-faucet", - "help_text": "Indicates which BAM file to use for genotyping, depending on what BAM processing modules you have turned on. 
Options are: `'raw'` for mapped only, filtered, or DeDup BAMs (with priority right to left); `'trimmed'` (for base clipped BAMs); `'pmd'` (for pmdtools output); `'rescaled'` (for mapDamage2 rescaling output). Default is: `'raw'`.\n", - "enum": ["raw", "pmd", "trimmed", "rescaled"] - }, - "gatk_call_conf": { - "type": "integer", - "default": 30, - "description": "Specify GATK phred-scaled confidence threshold.", - "fa_icon": "fas fa-balance-scale-right", - "help_text": "If selected, specify a GATK genotyper phred-scaled confidence threshold of a given SNP/INDEL call. Default: `30`\n\n> Modifies GATK UnifiedGenotyper or HaplotypeCaller parameter: `-stand_call_conf`" - }, - "gatk_ploidy": { - "type": "integer", - "default": 2, - "description": "Specify GATK organism ploidy.", - "fa_icon": "fas fa-pastafarianism", - "help_text": "If selected, specify a GATK genotyper ploidy value of your reference organism. E.g. if you want to allow heterozygous calls from >= diploid organisms. Default: `2`\n\n> Modifies GATK UnifiedGenotyper or HaplotypeCaller parameter: `--sample-ploidy`" - }, - "gatk_downsample": { - "type": "integer", - "default": 250, - "description": "Maximum depth coverage allowed for genotyping before down-sampling is turned on.", - "fa_icon": "fas fa-icicles", - "help_text": "Maximum depth coverage allowed for genotyping before down-sampling is turned on. Any position with a coverage higher than this value will be randomly down-sampled to 250 reads. Default: `250`\n\n> Modifies GATK UnifiedGenotyper parameter: `-dcov`" - }, - "gatk_dbsnp": { - "type": "string", - "description": "Specify VCF file for SNP annotation of output VCF files. Optional. Gzip not accepted.", - "fa_icon": "fas fa-marker", - "help_text": "(Optional) Specify VCF file for output VCF SNP annotation e.g. if you want to annotate your VCF file with 'rs' SNP IDs. Check GATK documentation for more information. 
Gzip not accepted.\n" - }, - "gatk_hc_out_mode": { - "type": "string", - "default": "EMIT_VARIANTS_ONLY", - "description": "Specify GATK output mode. Options: 'EMIT_VARIANTS_ONLY', 'EMIT_ALL_CONFIDENT_SITES', 'EMIT_ALL_ACTIVE_SITES'.", - "fa_icon": "fas fa-bullhorn", - "help_text": "If the GATK genotyper HaplotypeCaller is selected, what type of VCF to create, i.e. produce calls for every site or just confidence sites. Options: `'EMIT_VARIANTS_ONLY'`, `'EMIT_ALL_CONFIDENT_SITES'`, `'EMIT_ALL_ACTIVE_SITES'`. Default: `'EMIT_VARIANTS_ONLY'`\n\n> Modifies GATK HaplotypeCaller parameter: `-output_mode`", - "enum": [ - "EMIT_ALL_ACTIVE_SITES", - "EMIT_ALL_CONFIDENT_SITES", - "EMIT_VARIANTS_ONLY" - ] - }, - "gatk_hc_emitrefconf": { - "type": "string", - "default": "GVCF", - "description": "Specify HaplotypeCaller mode for emitting reference confidence calls . Options: 'NONE', 'BP_RESOLUTION', 'GVCF'.", - "fa_icon": "fas fa-bullhorn", - "help_text": "If the GATK HaplotypeCaller is selected, mode for emitting reference confidence calls. Options: `'NONE'`, `'BP_RESOLUTION'`, `'GVCF'`. Default: `'GVCF'`\n\n> Modifies GATK HaplotypeCaller parameter: `--emit-ref-confidence`\n", - "enum": ["NONE", "GVCF", "BP_RESOLUTION"] - }, - "gatk_ug_out_mode": { - "type": "string", - "default": "EMIT_VARIANTS_ONLY", - "description": "Specify GATK output mode. Options: 'EMIT_VARIANTS_ONLY', 'EMIT_ALL_CONFIDENT_SITES', 'EMIT_ALL_SITES'.", - "fa_icon": "fas fa-bullhorn", - "help_text": "If the GATK UnifiedGenotyper is selected, what type of VCF to create, i.e. produce calls for every site or just confidence sites. Options: `'EMIT_VARIANTS_ONLY'`, `'EMIT_ALL_CONFIDENT_SITES'`, `'EMIT_ALL_SITES'`. 
Default: `'EMIT_VARIANTS_ONLY'`\n\n> Modifies GATK UnifiedGenotyper parameter: `--output_mode`", - "enum": [ - "EMIT_ALL_SITES", - "EMIT_ALL_CONFIDENT_SITES", - "EMIT_VARIANTS_ONLY" - ] - }, - "gatk_ug_genotype_model": { - "type": "string", - "default": "SNP", - "description": "Specify UnifiedGenotyper likelihood model. Options: 'SNP', 'INDEL', 'BOTH', 'GENERALPLOIDYSNP', 'GENERALPLOIDYINDEL'.", - "fa_icon": "fas fa-project-diagram", - "help_text": "If the GATK UnifiedGenotyper is selected, which likelihood model to follow, i.e. whether to call use SNPs or INDELS etc. Options: `'SNP'`, `'INDEL'`, `'BOTH'`, `'GENERALPLOIDYSNP'`, `'GENERALPLOIDYINDEL`'. Default: `'SNP'`\n\n> Modifies GATK UnifiedGenotyper parameter: `--genotype_likelihoods_model`", - "enum": [ - "SNP", - "INDEL", - "BOTH", - "GENERALPLOIDYSNP", - "GENERALPLOIDYINDEL" - ] - }, - "gatk_ug_keep_realign_bam": { - "type": "boolean", - "description": "Specify to keep the BAM output of re-alignment around variants from GATK UnifiedGenotyper.", - "fa_icon": "fas fa-align-left", - "help_text": "If provided when running GATK's UnifiedGenotyper, this will put into the output folder the BAMs that have realigned reads (with GATK's (v3) IndelRealigner) around possible variants for improved genotyping.\n\nThese BAMs will be stored in the same folder as the corresponding VCF files." - }, - "gatk_ug_defaultbasequalities": { - "type": "string", - "description": "Supply a default base quality if a read is missing a base quality score. Setting to -1 turns this off.", - "fa_icon": "fas fa-undo-alt", - "help_text": "When running GATK's UnifiedGenotyper, specify a value to set base quality scores, if reads are missing this information. Might be useful if you have 'synthetically' generated reads (e.g. chopping up a reference genome). Default is set to -1 which is to not set any default quality (turned off). 
Default: `-1`\n\n> Modifies GATK UnifiedGenotyper parameter: `--defaultBaseQualities`" - }, - "freebayes_C": { - "type": "integer", - "default": 1, - "description": "Specify minimum required supporting observations to consider a variant.", - "fa_icon": "fas fa-align-center", - "help_text": "Specify minimum required supporting observations to consider a variant. Default: `1`\n\n> Modifies freebayes parameter: `-C`" - }, - "freebayes_g": { - "type": "integer", - "description": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified in --freebayes_C.", - "fa_icon": "fab fa-think-peaks", - "help_text": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified C. Not set by default.\n\n> Modifies freebayes parameter: `-g`", - "default": 0 - }, - "freebayes_p": { - "type": "integer", - "default": 2, - "description": "Specify ploidy of sample in FreeBayes.", - "fa_icon": "fas fa-pastafarianism", - "help_text": "Specify ploidy of sample in FreeBayes. Default is diploid. Default: `2`\n\n> Modifies freebayes parameter: `-p`" - }, - "pileupcaller_bedfile": { - "type": "string", - "description": "Specify path to SNP panel in bed format for pileupCaller.", - "fa_icon": "fas fa-bed", - "help_text": "Specify a SNP panel in the form of a bed file of sites at which to generate pileup for pileupCaller.\n" - }, - "pileupcaller_snpfile": { - "type": "string", - "description": "Specify path to SNP panel in EIGENSTRAT format for pileupCaller.", - "fa_icon": "fas fa-sliders-h", - "help_text": "Specify a SNP panel in [EIGENSTRAT](https://github.com/DReichLab/EIG/tree/master/CONVERTF) format, pileupCaller will call these sites.\n" - }, - "pileupcaller_method": { - "type": "string", - "default": "randomHaploid", - "description": "Specify calling method to use. 
Options: 'randomHaploid', 'randomDiploid', 'majorityCall'.", - "fa_icon": "fas fa-toolbox", - "help_text": "Specify calling method to use. Options: randomHaploid, randomDiploid, majorityCall. Default: `'randomHaploid'`\n\n> Modifies pileupCaller parameter: `--randomHaploid --randomDiploid --majorityCall`", - "enum": ["randomHaploid", "randomDiploid", "majorityCall"] - }, - "pileupcaller_transitions_mode": { - "type": "string", - "default": "AllSites", - "description": "Specify the calling mode for transitions. Options: 'AllSites', 'TransitionsMissing', 'SkipTransitions'.", - "fa_icon": "fas fa-toggle-on", - "help_text": "Specify if genotypes of transition SNPs should be called, set to missing, or excluded from the genotypes respectively. Options: `'AllSites'`, `'TransitionsMissing'`, `'SkipTransitions'`. Default: `'AllSites'`\n\n> Modifies pileupCaller parameter: `--skipTransitions --transitionsMissing`", - "enum": ["AllSites", "TransitionsMissing", "SkipTransitions"] - }, - "pileupcaller_min_map_quality": { - "type": "integer", - "default": 30, - "description": "The minimum mapping quality to be used for genotyping.", - "fa_icon": "fas fa-filter", - "help_text": "The minimum mapping quality to be used for genotyping. Affects the `samtools pileup` output that is used by pileupcaller. Affects `-q` parameter of samtools mpileup." - }, - "pileupcaller_min_base_quality": { - "type": "integer", - "default": 30, - "description": "The minimum base quality to be used for genotyping.", - "fa_icon": "fas fa-filter", - "help_text": "The minimum base quality to be used for genotyping. Affects the `samtools pileup` output that is used by pileupcaller. Affects `-Q` parameter of samtools mpileup." - }, - "angsd_glmodel": { - "type": "string", - "default": "samtools", - "description": "Specify which ANGSD genotyping likelihood model to use. 
Options: 'samtools', 'gatk', 'soapsnp', 'syk'.", - "fa_icon": "fas fa-project-diagram", - "help_text": "Specify which genotype likelihood model to use. Options: `'samtools`, `'gatk'`, `'soapsnp'`, `'syk'`. Default: `'samtools'`\n\n> Modifies ANGSD parameter: `-GL`", - "enum": ["samtools", "gatk", "soapsnp", "syk"] - }, - "angsd_glformat": { - "type": "string", - "default": "binary", - "description": "Specify which output type to output ANGSD genotyping likelihood results: Options: 'text', 'binary', 'binary_three', 'beagle'.", - "fa_icon": "fas fa-text-height", - "help_text": "Specifies what type of genotyping likelihood file format will be output. Options: `'text'`, `'binary'`, `'binary_three'`, `'beagle_binary'`. Default: `'text'`.\n\nThe options refer to the following descriptions respectively:\n\n- `text`: textoutput of all 10 log genotype likelihoods.\n- `binary`: binary all 10 log genotype likelihood\n- `binary_three`: binary 3 times likelihood\n- `beagle_binary`: beagle likelihood file\n\nSee the [ANGSD documentation](http://www.popgen.dk/angsd/) for more information on which to select for your downstream applications.\n\n> Modifies ANGSD parameter: `-doGlF`", - "enum": ["text", "binary", "binary_three", "beagle"] - }, - "angsd_createfasta": { - "type": "boolean", - "description": "Turn on creation of FASTA from ANGSD genotyping likelihood.", - "fa_icon": "fas fa-align-justify", - "help_text": "Turns on the ANGSD creation of a FASTA file from the BAM file.\n" - }, - "angsd_fastamethod": { - "type": "string", - "default": "random", - "description": "Specify which genotype type of 'base calling' to use for ANGSD FASTA generation. Options: 'random', 'common'.", - "fa_icon": "fas fa-toolbox", - "help_text": "The type of base calling to be performed when creating the ANGSD FASTA file. Options: `'random'` or `'common'`. Will output the most common non-N base at each given position, whereas 'random' will pick one at random. 
Default: `'random'`.\n\n> Modifies ANGSD parameter: `-doFasta -doCounts`", - "enum": ["random", "common"] }, - "run_bcftools_stats": { - "type": "boolean", - "default": true, - "description": "Turn on bcftools stats generation for VCF based variant calling statistics", - "help_text": "Runs `bcftools stats` against VCF files from GATK and FreeBayes genotypers.\n\nIt will automatically include the FASTA reference for INDEL-related statistics.", - "fa_icon": "far fa-chart-bar" - } - }, - "fa_icon": "fas fa-sliders-h", - "help_text": "There are options for different genotypers (or genotype likelihood calculators)\nto be used. We suggest you read the documentation of each tool to find the ones that\nsuit your needs.\n\nDocumentation for each tool:\n\n- [GATK\n UnifiedGenotyper](https://software.broadinstitute.org/gatk/documentation/tooldocs/3.5-0/org_broadinstitute_gatk_tools_walkers_genotyper_UnifiedGenotyper.php)\n- [GATK\n HaplotypeCaller](https://software.broadinstitute.org/gatk/documentation/tooldocs/3.8-0/org_broadinstitute_gatk_tools_walkers_haplotypecaller_HaplotypeCaller.php)\n- [FreeBayes](https://github.com/ekg/freebayes)\n- [ANGSD](http://www.popgen.dk/angsd/index.php/Genotype_Likelihoods)\n- [sequenceTools pileupCaller](https://github.com/stschiff/sequenceTools)\n\nIf using TSV input, genotyping is performed per sample (i.e. after all types of\nlibraries are merged), except for pileupCaller which gathers all double-stranded and\nsingle-stranded (same-type merged) libraries respectively." - }, - "consensus_sequence_generation": { - "title": "Consensus Sequence Generation", - "type": "object", - "description": "Options for creation of a per-sample FASTA sequence useful for downstream analysis (e.g. 
multi sequence alignment)", - "default": "", - "properties": { - "run_vcf2genome": { - "type": "boolean", - "description": "Turns on ability to create a consensus sequence FASTA file based on a UnifiedGenotyper VCF file and the original reference (only considers SNPs).", - "fa_icon": "fas fa-power-off", - "help_text": "Turn on consensus sequence genome creation via VCF2Genome. Only accepts GATK UnifiedGenotyper VCF files with the `--gatk_ug_out_mode 'EMIT_ALL_SITES'` and `--gatk_ug_genotype_model 'SNP` flags. Typically useful for small genomes such as mitochondria.\n" - }, - "vcf2genome_outfile": { - "type": "string", - "description": "Specify name of the output FASTA file containing the consensus sequence. Do not include `.vcf` in the file name.", - "fa_icon": "fas fa-file-alt", - "help_text": "The name of your requested output FASTA file. Do not include `.fasta` suffix.\n" - }, - "vcf2genome_header": { - "type": "string", - "description": "Specify the header name of the consensus sequence entry within the FASTA file.", - "fa_icon": "fas fa-heading", - "help_text": "The name of the FASTA entry you would like in your FASTA file.\n" - }, - "vcf2genome_minc": { - "type": "integer", - "default": 5, - "description": "Minimum depth coverage required for a call to be included (else N will be called).", - "fa_icon": "fas fa-sort-amount-up", - "help_text": "Minimum depth coverage for a SNP to be made. Else, a SNP will be called as N. Default: `5`\n\n> Modifies VCF2Genome parameter: `-minc`" - }, - "vcf2genome_minq": { - "type": "integer", - "default": 30, - "description": "Minimum genotyping quality of a call to be called. Else N will be called.", - "fa_icon": "fas fa-medal", - "help_text": "Minimum genotyping quality of a call to be made. Else N will be called. 
Default: `30`\n\n> Modifies VCF2Genome parameter: `-minq`" + "institutional_config_options": { + "title": "Institutional config options", + "type": "object", + "fa_icon": "fas fa-university", + "description": "Parameters used to describe centralised config profiles. These generally should not be edited.", + "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. You should not need to change these values when you run a pipeline.", + "properties": { + "custom_config_version": { + "type": "string", + "description": "Git commit id for Institutional configs.", + "default": "master", + "hidden": true, + "fa_icon": "fas fa-users-cog", + "help_text": "Provide git commit id for custom Institutional configs hosted at `nf-core/configs`. This was implemented for reproducibility purposes. Default: `master`.\n\n```bash\n## Download and use config file with following git commit id\n--custom_config_version d52db660777c4bf36546ddb188ec530c3ada1b96\n```" + }, + "custom_config_base": { + "type": "string", + "description": "Base directory for Institutional configs.", + "default": "https://raw.githubusercontent.com/nf-core/configs/master", + "hidden": true, + "help_text": "If you're running offline, nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell nextflow where to find them with the `custom_config_base` option. 
For example:\n\n```bash\n## Download and unzip the config files\ncd /path/to/my/configs\nwget https://github.com/nf-core/configs/archive/master.zip\nunzip master.zip\n\n## Run the pipeline\ncd /path/to/my/data\nnextflow run /path/to/pipeline/ --custom_config_base /path/to/my/configs/configs-master/\n```\n\n> Note that the nf-core/tools helper package has a `download` command to download all required pipeline files + singularity containers + institutional configs in one go for you, to make this process easier.", + "fa_icon": "fas fa-users-cog" + }, + "hostnames": { + "type": "string", + "description": "Institutional configs hostname.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_name": { + "type": "string", + "description": "Institutional config name.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_description": { + "type": "string", + "description": "Institutional config description.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_contact": { + "type": "string", + "description": "Institutional config contact information.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_url": { + "type": "string", + "description": "Institutional config URL link.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "awsqueue": { + "type": "string", + "description": "The AWSBatch JobQueue that needs to be set when running on AWSBatch", + "fa_icon": "fab fa-aws" + }, + "awsregion": { + "type": "string", + "default": "eu-west-1", + "description": "The AWS Region for your AWS Batch job to run on", + "fa_icon": "fab fa-aws" + }, + "awscli": { + "type": "string", + "description": "Path to the AWS CLI tool", + "fa_icon": "fab fa-aws" + } + } }, - "vcf2genome_minfreq": { - "type": "number", - "default": 0.8, - "description": "Minimum fraction of reads supporting a call to be included. 
Else N will be called.", - "fa_icon": "fas fa-percent", - "help_text": "In the case of two possible alleles, the frequency of the majority allele required for a call to be made. Else, a SNP will be called as N. Default: `0.8`\n\n> Modifies VCF2Genome parameter: `-minfreq`" + "skip_steps": { + "title": "Skip steps", + "type": "object", + "description": "Skip any of the mentioned steps.", + "default": "", + "properties": { + "skip_fastqc": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "help_text": "Turns off FastQC pre- and post-Adapter Removal, to speed up the pipeline. Use of this flag is most common when data has been previously pre-processed and the post-Adapter Removal mapped reads are being re-mapped to a new reference genome." + }, + "skip_adapterremoval": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "help_text": "Turns off adapter trimming and paired-end read merging. Equivalent to setting both `--skip_collapse` and `--skip_trim`." + }, + "skip_preseq": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "help_text": "Turns off the computation of library complexity estimation." + }, + "skip_deduplication": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "help_text": "Turns off duplicate removal methods DeDup and MarkDuplicates respectively. No duplicates will be removed on any data in the pipeline.\n" + }, + "skip_damage_calculation": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "help_text": "Turns off the DamageProfiler module to compute DNA damage profiles.\n" + }, + "skip_qualimap": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "help_text": "Turns off QualiMap and thus does not compute coverage and other mapping metrics.\n" + } + }, + "fa_icon": "fas fa-fast-forward", + "help_text": "Some of the steps in the pipeline can be executed optionally. If you specify\nspecific steps to be skipped, there won't be any output related to these\nmodules." 
+ }, + "complexity_filtering": { + "title": "Complexity filtering", + "type": "object", + "description": "Processing of Illumina two-colour chemistry data.", + "default": "", + "properties": { + "complexity_filter_poly_g": { + "type": "boolean", + "description": "Turn on running poly-G removal on FASTQ files. Will only be performed on 2 colour chemistry machine sequenced libraries.", + "fa_icon": "fas fa-power-off", + "help_text": "Performs a poly-G tail removal step in the beginning of the pipeline using `fastp`, if turned on. This can be useful for trimming poly-G tails from short-fragments sequenced on two-colour Illumina chemistry such as NextSeqs (where no-fluorescence is read as a G on two-colour chemistry), which can inflate reported GC content values.\n" + }, + "complexity_filter_poly_g_min": { + "type": "integer", + "default": 10, + "description": "Specify length of poly-g min for clipping to be performed.", + "fa_icon": "fas fa-ruler-horizontal", + "help_text": "This option can be used to define the minimum length of a poly-G tail to begin low complexity trimming. By default, this is set to a value of `10` unless the user has chosen something specifically using this option.\n\n> Modifies fastp parameter: `--poly_g_min_len`" + } + }, + "fa_icon": "fas fa-filter", + "help_text": "More details can be seen in the [fastp\ndocumentation](https://github.com/OpenGene/fastp)\n\nIf using TSV input, this is performed per lane separately" + }, + "read_merging_and_adapter_removal": { + "title": "Read merging and adapter removal", + "type": "object", + "description": "Options for adapter clipping and paired-end merging.", + "default": "", + "properties": { + "clip_forward_adaptor": { + "type": "string", + "default": "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC", + "description": "Specify adapter sequence to be clipped off (forward strand).", + "fa_icon": "fas fa-cut", + "help_text": "Defines the adapter sequence to be used for the forward read.
By default, this is set to `'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC'`.\n\n> Modifies AdapterRemoval parameter: `--adapter1`" + }, + "clip_reverse_adaptor": { + "type": "string", + "default": "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA", + "description": "Specify adapter sequence to be clipped off (reverse strand).", + "fa_icon": "fas fa-cut", + "help_text": "Defines the adapter sequence to be used for the reverse read in paired end sequencing projects. This is set to `'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA'` by default.\n\n> Modifies AdapterRemoval parameter: `--adapter2`" + }, + "clip_adapters_list": { + "type": "string", + "description": "Path to AdapterRemoval adapter list file. Overrides `--clip_*_adaptor` parameters", + "fa_icon": "fas fa-cut", + "help_text": "Allows you to supply a file with a list of adapter (combinations) to remove from all files. **Overrides** the `--clip_*_adaptor` parameters. The first column represents the forward strand, the second column the reverse strand. You must supply all possible combinations, one per line, and this list is applied to all files. See [AdapterRemoval documentation](https://adapterremoval.readthedocs.io/en/latest/manpage.html) for more information.\n\n> Modifies AdapterRemoval parameter: `--adapter-list`" + }, + "clip_readlength": { + "type": "integer", + "default": 30, + "description": "Specify read minimum length to be kept for downstream analysis.", + "fa_icon": "fas fa-ruler", + "help_text": "Defines the minimum read length that is required for reads after merging to be considered for downstream analysis after read merging. Default is `30`.\n\nNote that when you have a large percentage of very short reads in your library (< 20 bp) - such as retrieved in single-stranded library protocols - that performing read length filtering at this step is not _always_ reliable for correct endogenous DNA calculation.
When you have very few reads passing this length filter, it will artificially inflate your 'endogenous DNA' value by creating a very small denominator. \n\nIf you notice you have ultra short reads (< 20 bp), it is recommended to set this parameter to 0, and use `--bam_filter_minreadlength` instead, to filter out 'un-usable' short reads after mapping. A caveat, however, is that this will cause a very large increase in computational run time, because all reads in the library will be mapped.\n\n> Modifies AdapterRemoval parameter: `--minlength`\n" + }, + "clip_min_read_quality": { + "type": "integer", + "default": 20, + "description": "Specify minimum base quality for trimming off bases.", + "fa_icon": "fas fa-medal", + "help_text": "Defines the minimum read quality per base that is required for a base to be kept. Individual bases at the ends of reads falling below this threshold will be clipped off. Default is set to `20`.\n\n> Modifies AdapterRemoval parameter: `--minquality`" + }, + "min_adap_overlap": { + "type": "integer", + "default": 1, + "description": "Specify minimum adapter overlap required for clipping.", + "fa_icon": "fas fa-hands-helping", + "help_text": "Specifies a minimum number of bases that overlap with the adapter sequence before adapters are trimmed from reads. Default is set to `1` base overlap.\n\n> Modifies AdapterRemoval parameter: `--minadapteroverlap`" + }, + "skip_collapse": { + "type": "boolean", + "description": "Skip merging of forward and reverse reads together and turn on paired-end alignment for downstream mapping.
Only applicable for paired-end libraries.", + "fa_icon": "fas fa-fast-forward", + "help_text": "Turns off the paired-end read merging.\n\nFor example\n\n```bash\n--skip_collapse --input '*_{R1,R2}_*.fastq'\n```\n\nIt is important to use the paired-end wildcard globbing as `--skip_collapse` can only be used on paired-end data!\n\n:warning: If you run this together with `--clip_readlength` set to something (as is by default), you may end up removing single reads from either the pair1 or pair2 file. These will NOT be mapped when aligning with either `bwa` or `bowtie`, as both can only accept one (forward) or two (forward and reverse) FASTQs as input.\n\nAlso note that supplying this flag will then also cause downstream mapping steps to run in paired-end mode. This may be more suitable for modern data, or when you want to utilise mate-pair spatial information.\n\n> Modifies AdapterRemoval parameter: `--collapse`" + }, + "skip_trim": { + "type": "boolean", + "description": "Skip adapter and quality trimming.", + "fa_icon": "fas fa-fast-forward", + "help_text": "Turns off adapter AND quality trimming.\n\nFor example:\n\n```bash\n--skip_trim --input '*.fastq'\n```\n\n:warning: it is not possible to keep quality trimming (n or base quality) on,\n_and_ skip adapter trimming.\n\n:warning: it is not possible to turn off one or the other of quality\ntrimming or n trimming. i.e. --trimns --trimqualities are both given\nor neither. However setting quality in `--clip_min_read_quality` to 0 would\ntheoretically turn off base quality trimming.\n\n> Modifies AdapterRemoval parameters: `--trimns --trimqualities --adapter1 --adapter2`" + }, + "preserve5p": { + "type": "boolean", + "description": "Skip quality base trimming (n, score, window) of 5 prime end.", + "fa_icon": "fas fa-life-ring", + "help_text": "Turns off quality based trimming at the 5p end of reads when any of the --trimns, --trimqualities, or --trimwindows options are used.
Only 3p end of reads will be removed.\n\nThis also entirely disables quality based trimming of collapsed reads, since both ends of these are informative for PCR duplicate filtering. Described [here](https://github.com/MikkelSchubert/adapterremoval/issues/32#issuecomment-504758137).\n\n> Modifies AdapterRemoval parameters: `--preserve5p`" + }, + "mergedonly": { + "type": "boolean", + "description": "Only use merged reads downstream (un-merged reads and singletons are discarded).", + "fa_icon": "fas fa-handshake", + "help_text": "Specify that only merged reads are sent downstream for analysis.\n\nSingletons (i.e. reads missing a pair), or un-merged reads (where there wasn't sufficient overlap) are discarded.\n\nYou may want to use this if you want to ensure only the best quality reads for your analysis, but with the penalty of potentially losing still valid data (even if some reads have slightly lower quality). It is highly recommended when using `--dedupper 'dedup'` (see below)." + }, + "qualitymax": { + "type": "integer", + "description": "Specify the maximum Phred score used in input FASTQ files", + "help_text": "Specify maximum Phred score of the quality field of FASTQ files. The quality-score range can vary depending on the machine and version (e.g. see diagram [here](https://en.wikipedia.org/wiki/FASTQ_format#Encoding)), and this allows you to increase from the default AdapterRemoval value of `41`.\n\n> Modifies AdapterRemoval parameters: `--qualitymax`", + "default": 41, + "fa_icon": "fas fa-arrow-up" + }, + "run_post_ar_trimming": { + "type": "boolean", + "description": "Turn on trimming of inline barcodes (i.e.
internal barcodes after adapter removal)", + "help_text": "In some cases, you may want to additionally trim reads in a FASTQ file after adapter removal.\n\nThis could be to remove short 'inline' or 'internal' barcodes that are ligated directly onto DNA molecules prior to ligation of adapters and indices (the former of which allow ultra-multiplexing and/or checks for barcode hopping).\n\nIn other cases, you may wish to already remove known high-frequency damage bases to allow stricter mapping.\n\nTurning on this module uses `fastp` to trim one, or both ends of a merged read, or in cases where you have not collapsed your read, R1 and R2.\n" + }, + "post_ar_trim_front": { + "type": "integer", + "default": 7, + "description": "Specify the number of bases to trim off the front of a merged read or R1", + "help_text": "Specify the number of bases to trim off the start of a read in a merged- or forward read FASTQ file.\n\n> Modifies fastp parameters: `--trim_front1`" + }, + "post_ar_trim_tail": { + "type": "integer", + "default": 7, + "description": "Specify the number of bases to trim off the tail of a merged read or R1", + "help_text": "Specify the number of bases to trim off the end of a read in a merged- or forward read FASTQ file.\n\n> Modifies fastp parameters: `--trim_tail1`" + }, + "post_ar_trim_front2": { + "type": "integer", + "default": 7, + "description": "Specify the number of bases to trim off the front of R2", + "help_text": "Specify the number of bases to trim off the start of a read in an unmerged reverse read (R2) FASTQ file.\n\n> Modifies fastp parameters: `--trim_front2`" + }, + "post_ar_trim_tail2": { + "type": "integer", + "default": 7, + "description": "Specify the number of bases to trim off the tail of R2", + "help_text": "Specify the number of bases to trim off the end of a read in an unmerged reverse read (R2) FASTQ file.\n\n> Modifies fastp parameters: `--trim_tail2`" + } + }, + "fa_icon": "fas fa-cut", + "help_text": "These options handle
various parts of adapter clipping and read merging steps.\n\nMore details can be seen in the [AdapterRemoval\ndocumentation](https://adapterremoval.readthedocs.io/en/latest/)\n\nIf using TSV input, this is performed per lane separately.\n\n> :warning: `--skip_trim` will skip adapter clipping AND quality trimming\n> (n, base quality). It is currently not possible to skip one or the other." + }, + "mapping": { + "title": "Read mapping to reference genome", + "type": "object", + "description": "Options for reference-genome mapping", + "default": "", + "properties": { + "mapper": { + "title": "Mapper", + "type": "string", + "description": "Specify which mapper to use. Options: 'bwaaln', 'bwamem', 'circularmapper', 'bowtie2'.", + "fa_icon": "fas fa-layer-group", + "help_text": "Specify which mapping tool to use. Options are BWA aln (`'bwaaln'`), BWA mem (`'bwamem'`), circularmapper (`'circularmapper'`), or bowtie2 (`'bowtie2'`). BWA aln is the default and highly suited for short-read ancient DNA. BWA mem can be quite useful for modern DNA, but is rarely used in projects for ancient DNA. CircularMapper enhances the mapping procedure to circular references, using the BWA algorithm but utilizing an extend-remap procedure (see Peltzer et al 2016, Genome Biology for details). Bowtie2 is similar to BWA aln, and has recently been suggested to provide slightly better results under certain conditions ([Poullet and Orlando 2020](https://doi.org/10.3389/fevo.2020.00105)), as well as providing extra functionality (such as FASTQ trimming).
Default is 'bwaaln'.\n\nMore documentation can be seen for each tool under:\n\n- [BWA aln](http://bio-bwa.sourceforge.net/bwa.shtml#3)\n- [BWA mem](http://bio-bwa.sourceforge.net/bwa.shtml#3)\n- [CircularMapper](https://circularmapper.readthedocs.io/en/latest/contents/userguide.html)\n- [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line)\n", + "enum": [ + "bwaaln", + "bwamem", + "circularmapper", + "bowtie2" + ] + }, + "bwaalnn": { + "type": "number", + "default": 0.01, + "description": "Specify the -n parameter for BWA aln, i.e. amount of allowed mismatches in the alignment.", + "fa_icon": "fas fa-sort-numeric-down", + "help_text": "Configures the `bwa aln -n` parameter, defining how many mismatches are allowed in a read. By default set to `0.01`. Note that `0.04` follows the recommendations of [Schubert et al. (2012 _BMC Genomics_)](https://doi.org/10.1186/1471-2164-13-178). If you're uncertain what to set, check out [this](https://apeltzer.shinyapps.io/bwa-mismatches/) Shiny App for more information on how to set this parameter efficiently.\n\n> Modifies bwa aln parameter: `-n`" + }, + "bwaalnk": { + "type": "integer", + "default": 2, + "description": "Specify the -k parameter for BWA aln, i.e. maximum edit distance allowed in a seed.", + "fa_icon": "fas fa-drafting-compass", + "help_text": "Configures the `bwa aln -k` parameter for the seeding phase in the mapping algorithm. Default is set to `2`.\n\n> Modifies BWA aln parameter: `-k`" + }, + "bwaalnl": { + "type": "integer", + "default": 1024, + "description": "Specify the -l parameter for BWA aln i.e. the length of seeds to be used.", + "fa_icon": "fas fa-ruler-horizontal", + "help_text": "Configures the length of the seed used in `bwa aln -l`. Default is set to be 'turned off' at the recommendation of Schubert et al.
([2012 _BMC Genomics_](https://doi.org/10.1186/1471-2164-13-178)) for ancient DNA with `1024`.\n\nNote: Despite being recommended, turning off seeding can result in long runtimes!\n\n> Modifies BWA aln parameter: `-l`\n" + }, + "bwaalno": { + "type": "integer", + "default": 2, + "fa_icon": "fas fa-people-arrows", + "description": "Specify the -o parameter for BWA aln i.e. the number of gaps allowed.", + "help_text": "Configures the number of gaps used in `bwa aln`. Default is set to `bwa` default.\n\n> Modifies BWA aln parameter: `-o`\n" + }, + "circularextension": { + "type": "integer", + "default": 500, + "description": "Specify the number of bases to extend reference by (circularmapper only).", + "fa_icon": "fas fa-external-link-alt", + "help_text": "The number of bases to extend the reference genome with. By default this is set to `500` if not specified otherwise.\n\n> Modifies circulargenerator and realignsamfile parameter: `-e`" + }, + "circulartarget": { + "type": "string", + "default": "MT", + "description": "Specify the FASTA header of the target chromosome to extend (circularmapper only).", + "fa_icon": "fas fa-bullseye", + "help_text": "The chromosome in your FASTA reference that you'd like to be treated as circular. By default this is set to `MT` but can be configured to match any other chromosome.\n\n> Modifies circulargenerator parameter: `-s`" + }, + "circularfilter": { + "type": "boolean", + "description": "Turn on to remove reads that did not map to the circularised genome (circularmapper only).", + "fa_icon": "fas fa-filter", + "help_text": "If you want to filter out reads that don't map to a circular chromosome (and also non-circular chromosome headers) from the resulting BAM file, turn this on. By default this option is turned off.\n> Modifies -f and -x parameters of CircularMapper's realignsamfile\n" + }, + "bt2_alignmode": { + "type": "string", + "default": "local", + "description": "Specify the bowtie2 alignment mode. 
Options: 'local', 'end-to-end'.", + "fa_icon": "fas fa-arrows-alt-h", + "help_text": "The type of read alignment to use. Options are 'local' or 'end-to-end'. Local allows only partial alignment of read, with ends of reads possibly 'soft-clipped' (i.e. remain unaligned/ignored), if the soft-clipped alignment provides best alignment score. End-to-end requires all nucleotides to be aligned. Default is 'local', following [Cahill et al (2018)](https://doi.org/10.1093/molbev/msy018) and [Poullet and Orlando 2020](https://doi.org/10.3389/fevo.2020.00105).\n\n> Modifies Bowtie2 parameters: `--very-fast --fast --sensitive --very-sensitive --very-fast-local --fast-local --sensitive-local --very-sensitive-local`", + "enum": [ + "local", + "end-to-end" + ] + }, + "bt2_sensitivity": { + "type": "string", + "default": "sensitive", + "description": "Specify the level of sensitivity for the bowtie2 alignment mode. Options: 'no-preset', 'very-fast', 'fast', 'sensitive', 'very-sensitive'.", + "fa_icon": "fas fa-microscope", + "help_text": "The Bowtie2 'preset' to use. Options: 'no-preset' 'very-fast', 'fast', 'sensitive', or 'very-sensitive'. These strings apply to both `--bt2_alignmode` options. See the Bowtie2 [manual](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line) for actual settings. Default is 'sensitive' (following [Poullet and Orlando (2020)](https://doi.org/10.3389/fevo.2020.00105), when running damaged-data _without_ UDG treatment)\n\n> Modifies Bowtie2 parameters: `--very-fast --fast --sensitive --very-sensitive --very-fast-local --fast-local --sensitive-local --very-sensitive-local`", + "enum": [ + "no-preset", + "very-fast", + "fast", + "sensitive", + "very-sensitive" + ] + }, + "bt2n": { + "type": "integer", + "description": "Specify the -N parameter for bowtie2 (mismatches in seed). 
This will override defaults from alignmode/sensitivity.", + "fa_icon": "fas fa-sort-numeric-down", + "help_text": "The number of mismatches allowed in the seed during seed-and-extend procedure of Bowtie2. This will override any values set with `--bt2_sensitivity`. Can either be 0 or 1. Default: 0 (i.e. use `--bt2_sensitivity` defaults).\n\n> Modifies Bowtie2 parameters: `-N`", + "default": 0 + }, + "bt2l": { + "type": "integer", + "description": "Specify the -L parameter for bowtie2 (length of seed substrings). This will override defaults from alignmode/sensitivity.", + "fa_icon": "fas fa-ruler-horizontal", + "help_text": "The length of the seed sub-string to use during seeding. This will override any values set with `--bt2_sensitivity`. Default: 0 (i.e. use `--bt2_sensitivity` defaults: [20 for local and 22 for end-to-end](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line)).\n\n> Modifies Bowtie2 parameters: `-L`", + "default": 0 + }, + "bt2_trim5": { + "type": "integer", + "description": "Specify number of bases to trim off from 5' (left) end of read before alignment.", + "fa_icon": "fas fa-cut", + "help_text": "Number of bases to trim at the 5' (left) end of read prior to alignment. May be useful when left-over sequencing artefacts of in-line barcodes are present. Default: 0.\n\n> Modifies Bowtie2 parameters: `--trim5`", + "default": 0 + }, + "bt2_trim3": { + "type": "integer", + "description": "Specify number of bases to trim off from 3' (right) end of read before alignment.", + "fa_icon": "fas fa-cut", + "help_text": "Number of bases to trim at the 3' (right) end of read prior to alignment.
May be useful when left-over sequencing artefacts of in-line barcodes are present. Default: 0.\n\n> Modifies Bowtie2 parameters: `--trim3`", + "default": 0 + }, + "bt2_maxins": { + "type": "integer", + "default": 500, + "fa_icon": "fas fa-exchange-alt", + "description": "Specify the maximum fragment length for Bowtie2 paired-end mapping mode only.", + "help_text": "The maximum fragment length for valid paired-end alignments. Only for paired-end mapping (i.e. unmerged), and therefore typically only useful for modern data.\n\n See [Bowtie2 documentation](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml) for more information.\n\n> Modifies Bowtie2 parameters: `--maxins`" + } + }, + "fa_icon": "fas fa-layer-group", + "help_text": "If using TSV input, mapping is performed at the library level, i.e. after lane merging.\n" + }, + "host_removal": { + "title": "Removal of Host-Mapped Reads", + "type": "object", + "description": "Options for production of host-read removed FASTQ files for privacy reasons.", + "default": "", + "properties": { + "hostremoval_input_fastq": { + "type": "boolean", + "description": "Turn on per-library creation of pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data)", + "fa_icon": "fas fa-power-off", + "help_text": "Create pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data)\n" + }, + "hostremoval_mode": { + "type": "string", + "default": "remove", + "description": "Host removal mode. Remove mapped reads completely from FASTQ (remove) or just mask mapped reads sequence by N (replace).", + "fa_icon": "fas fa-mask", + "help_text": "Read removal mode.
Remove mapped reads completely (`'remove'`) or just replace mapped reads sequence by N (`'replace'`)\n\n> Modifies extract_map_reads.py parameter: `-m`", + "enum": [ + "strip", + "replace", + "remove" + ] + } + }, + "fa_icon": "fas fa-user-shield", + "help_text": "These parameters are used for removing mapped reads from the original input\nFASTQ files, usually in the context of uploading the original FASTQ files to a\npublic read archive (NCBI SRA/EBI ENA/DDBJ SRA).\n\nThese flags will produce FASTQ files almost identical to your input files,\nexcept that reads with the same read ID as one found in the mapped bam file are\neither removed or 'masked' (every base replaced with Ns).\n\nThis functionality allows other researchers who wish to re-use\nyour data to apply their own adapter removal/read merging procedures, while\nmaintaining anonymity for sample donors - for example with microbiome\nresearch.\n\nIf using TSV input, stripping is performed per library, i.e. after lane merging." + }, + "bam_filtering": { + "title": "BAM Filtering", + "type": "object", + "description": "Options for quality filtering and how to deal with off-target unmapped reads.", + "default": "", + "properties": { + "run_bam_filtering": { + "type": "boolean", + "description": "Turn on filtering of mapping quality, read lengths, or unmapped reads of BAM files.", + "fa_icon": "fas fa-power-off", + "help_text": "Turns on the bam filtering module for either mapping quality filtering or unmapped read treatment.\n" + }, + "bam_mapping_quality_threshold": { + "type": "integer", + "description": "Minimum mapping quality for reads filter.", + "fa_icon": "fas fa-greater-than-equal", + "help_text": "Specify a mapping quality threshold for mapped reads to be kept for downstream analysis.
By default keeps all reads and is therefore set to `0` (basically doesn't filter anything).\n\n> Modifies samtools view parameter: `-q`", + "default": 0 + }, + "bam_filter_minreadlength": { + "type": "integer", + "fa_icon": "fas fa-ruler-horizontal", + "description": "Specify minimum read length to be kept after mapping.", + "help_text": "Specify minimum length of mapped reads. This filtering will apply at the same time as mapping quality filtering.\n\nIf used _instead_ of minimum length read filtering at AdapterRemoval, this can be useful to get more realistic endogenous DNA percentages, when most of your reads are very short (e.g. in single-stranded libraries) and would otherwise be discarded by AdapterRemoval (thus making an artificially small denominator for a typical endogenous DNA calculation). Note in this context you should not perform mapping quality filtering nor discarding of unmapped reads to ensure a correct denominator of all reads, for the endogenous DNA calculation.\n\n> Modifies filter_bam_fragment_length.py parameter: `-l`", + "default": 0 + }, + "bam_unmapped_type": { + "type": "string", + "default": "discard", + "description": "Defines whether to discard all unmapped reads, keep only bam and/or keep only fastq format Options: 'discard', 'bam', 'fastq', 'both'.", + "fa_icon": "fas fa-trash-alt", + "help_text": "Defines how to proceed with unmapped reads: `'discard'` removes all unmapped reads, `keep` keeps both unmapped and mapped reads in the same BAM file, `'bam'` keeps unmapped reads as BAM file, `'fastq'` keeps unmapped reads as FastQ file, `both` keeps both BAM and FASTQ files. Default is `discard`. 
`keep` is what would happen if `--run_bam_filtering` was _not_ supplied.\n\nNote that in all cases, if `--bam_mapping_quality_threshold` is also supplied, mapping quality filtering will still occur on the mapped reads.\n\n> Modifies samtools view parameter: `-f4 -F4`", + "enum": [ + "discard", + "keep", + "bam", + "fastq", + "both" + ] + } + }, + "fa_icon": "fas fa-sort-amount-down", + "help_text": "Users can configure to keep/discard/extract certain groups of reads efficiently\nin the nf-core/eager pipeline.\n\nIf using TSV input, filtering is performed per library, i.e. after lane merging.\n\nThis module utilises `samtools view` and `filter_bam_fragment_length.py`" + }, + "deduplication": { + "title": "DeDuplication", + "type": "object", + "description": "Options for removal of PCR amplicon duplicates that can artificially inflate coverage.", + "default": "", + "properties": { + "dedupper": { + "type": "string", + "default": "markduplicates", + "description": "Deduplication method to use. Options: 'markduplicates', 'dedup'.", + "fa_icon": "fas fa-object-group", + "help_text": "Sets the duplicate read removal tool. By default uses `markduplicates` from Picard. Alternatively an ancient DNA specific read deduplication tool `dedup` ([Peltzer et al. 2016](http://dx.doi.org/10.1186/s13059-016-0918-z)) is offered.\n\nThis utilises both ends of paired-end data to remove duplicates (i.e. true exact duplicates, as markduplicates will over-zealously deduplicate anything with the same starting position even if the ends are different). DeDup should generally only be used on paired-end data, otherwise suboptimal deduplication can occur if applied to either single-end or a mix of single-end/paired-end data.\n", + "enum": [ + "markduplicates", + "dedup" + ] + }, + "dedup_all_merged": { + "type": "boolean", + "description": "Turn on treating all reads as merged reads.", + "fa_icon": "fas fa-handshake", + "help_text": "Sets DeDup to treat all reads as merged reads.
This is useful if reads are for example not prefixed with `M_` in all cases. Therefore, this can be used as a workaround when also using a mixture of paired-end and single-end data, however this is not recommended (see above).\n\n> Modifies dedup parameter: `-m`" + } + }, + "fa_icon": "fas fa-clone", + "help_text": "If using TSV input, deduplication is performed per library, i.e. after lane merging." + }, + "library_complexity_analysis": { + "title": "Library Complexity Analysis", + "type": "object", + "description": "Options for calculating library complexity (i.e. how many unique reads are present).", + "default": "", + "properties": { + "preseq_mode": { + "type": "string", + "default": "c_curve", + "description": "Specify which mode of preseq to run.", + "fa_icon": "fas fa-toggle-on", + "help_text": "Specify which mode of preseq to run.\n\nFrom the [PreSeq documentation](http://smithlabresearch.org/wp-content/uploads/manual.pdf): \n\n`c curve` is used to compute the expected complexity curve of a mapped read file with a hypergeometric\nformula\n\n`lc extrap` is used to generate the expected yield for theoretical larger experiments and bounds on the\nnumber of distinct reads in the library and the associated confidence intervals, which is computed by\nbootstrapping the observed duplicate counts histogram", + "enum": [ + "c_curve", + "lc_extrap" + ] + }, + "preseq_step_size": { + "type": "integer", + "default": 1000, + "description": "Specify the step size of Preseq.", + "fa_icon": "fas fa-shoe-prints", + "help_text": "Can be used to configure the step size of Preseq's `c_curve` and `lc_extrap` method. 
Can be useful when only few and thus shallow sequencing results are used for extrapolation.\n\n> Modifies preseq c_curve and lc_extrap parameter: `-s`" + }, + "preseq_maxextrap": { + "type": "integer", + "default": 10000000000, + "description": "Specify the maximum extrapolation (lc_extrap mode only)", + "fa_icon": "fas fa-ban", + "help_text": "Specify the maximum extrapolation that `lc_extrap` mode will perform.\n\n> Modifies preseq lc_extrap parameter: `-e`" + }, + "preseq_terms": { + "type": "integer", + "default": 100, + "description": "Specify the maximum number of terms for extrapolation (lc_extrap mode only)", + "fa_icon": "fas fa-sort-numeric-up-alt", + "help_text": "Specify the maximum number of terms that `lc_extrap` mode will use.\n\n> Modifies preseq lc_extrap parameter: `-x`" + }, + "preseq_bootstrap": { + "type": "integer", + "default": 100, + "description": "Specify number of bootstraps to perform (lc_extrap mode only)", + "fa_icon": "fab fa-bootstrap", + "help_text": "Specify the number of bootstraps `lc_extrap` mode will perform to calculate confidence intervals.\n\n> Modifies preseq lc_extrap parameter: `-n`" + }, + "preseq_cval": { + "type": "number", + "default": 0.95, + "description": "Specify confidence interval level (lc_extrap mode only)", + "fa_icon": "fas fa-check-circle", + "help_text": "Specify the allowed level of confidence intervals used for `lc_extrap` mode.\n\n> Modifies preseq lc_extrap parameter: `-c`" + } + }, + "fa_icon": "fas fa-bezier-curve", + "help_text": "nf-core/eager uses Preseq on mapped reads as one method to calculate library\ncomplexity. If DeDup is used, Preseq uses the histogram output of DeDup,\notherwise the sorted non-duplicated BAM file is supplied. Furthermore, if\npaired-end read collapsing is not performed, the `-P` flag is used." 
+ }, + "adna_damage_analysis": { + "title": "(aDNA) Damage Analysis", + "type": "object", + "description": "Options for calculating and filtering for characteristic ancient DNA damage patterns.", + "default": "", + "properties": { + "damage_estimation_tool": { + "type": "string", + "default": "damageprofiler", + "description": "Specify the tool to use for damage estimation.", + "fa_icon": "fas fa-tools", + "help_text": "Specify the tool to be used for damage estimation. Options: `damageprofiler`, `mapdamage`. By default, DamageProfiler is used.", + "enum": [ + "damageprofiler", + "mapdamage" + ] + }, + "damageprofiler_length": { + "type": "integer", + "default": 100, + "description": "Specify length filter for DamageProfiler.", + "fa_icon": "fas fa-sort-amount-up", + "help_text": "Specifies the length filter for DamageProfiler. By default set to `100`.\n\n> Modifies DamageProfile parameter: `-l`" + }, + "damageprofiler_threshold": { + "type": "integer", + "default": 15, + "description": "Specify number of bases of each read to consider for DamageProfiler calculations.", + "fa_icon": "fas fa-ruler-horizontal", + "help_text": "Specifies the length of the read start and end to be considered for profile generation in DamageProfiler. By default set to `15` bases.\n\n> Modifies DamageProfile parameter: `-t`" + }, + "damageprofiler_yaxis": { + "type": "number", + "default": 0.3, + "description": "Specify the maximum misincorporation frequency that should be displayed on the damage plot. Set to 0 to 'autoscale'.", + "fa_icon": "fas fa-ruler-vertical", + "help_text": "Specifies what the maximum misincorporation frequency should be displayed as, in the DamageProfiler damage plot. This is set to `0.30` (i.e. 30%) by default as this matches the popular [mapDamage2.0](https://ginolhac.github.io/mapDamage) program. However, the default behaviour of DamageProfiler is to 'autoscale' the y-axis maximum to zoom in on any _possible_ damage that may occur (e.g. 
if the damage is about 10%, the highest value on the y-axis would be set to 0.12). This 'autoscale' behaviour can be turned on by specifying the number to `0`. Default: `0.30`.\n\n> Modifies DamageProfile parameter: `-yaxis_damageplot`" + }, + "mapdamage_downsample": { + "type": "integer", + "default": 10000, + "description": "Specify the maximum number of reads to consider for damage estimation.", + "fa_icon": "fas fa-greater-than-equal", + "help_text": "The maximum number of reads used for damage estimation in mapDamage2. Can be used to significantly reduce the amount of time required for damage assessment. Note that a too low value can also obtain incorrect results.\n\n> Modifies mapDamage2 parameter: `-n`" + }, + "mapdamage_yaxis": { + "type": "number", + "default": 0.3, + "description": "Specify the maximum misincorporation frequency that should be displayed on the damage plot.", + "fa_icon": "fas fa-ruler-vertical", + "help_text": "Specifies what the maximum misincorporation frequency should be displayed as, in the mapDamage2 damage plot. This defaults to `0.30` (i.e. 30%).\n\n> Modifies mapDamage2 parameter: `-y`" + }, + "run_pmdtools": { + "type": "boolean", + "description": "Turn on PMDtools", + "fa_icon": "fas fa-power-off", + "help_text": "Specifies to run PMDTools for damage based read filtering and assessment of DNA damage in sequencing libraries. By default turned off.\n" + }, + "pmdtools_range": { + "type": "integer", + "default": 10, + "description": "Specify range of bases for PMDTools to scan for damage.", + "fa_icon": "fas fa-arrows-alt-h", + "help_text": "Specifies the range in which to consider DNA damage from the ends of reads. 
By default set to `10`.\n\n> Modifies PMDTools parameter: `--range`" + }, + "pmdtools_threshold": { + "type": "integer", + "default": 3, + "description": "Specify PMDScore threshold for PMDTools.", + "fa_icon": "fas fa-chart-bar", + "help_text": "Specifies the PMDScore threshold to use in the pipeline when filtering BAM files for DNA damage. Only reads which surpass this damage score are considered for downstream DNA analysis. By default set to `3` if not set specifically by the user.\n\n> Modifies PMDTools parameter: `--threshold`" + }, + "pmdtools_reference_mask": { + "type": "string", + "description": "Specify a bedfile to be used to mask the reference fasta prior to running pmdtools.", + "fa_icon": "fas fa-mask", + "help_text": "Activates masking of the reference fasta prior to running pmdtools. Positions that are in the provided bedfile will be replaced by Ns in the reference genome. This is useful for capture data, where you might not want the allele of a SNP to be counted as damage when it is a transition. Masking of the reference is done using `bedtools maskfasta`." + }, + "pmdtools_max_reads": { + "type": "integer", + "default": 10000, + "description": "Specify the maximum number of reads to consider for metrics generation.", + "fa_icon": "fas fa-greater-than-equal", + "help_text": "The maximum number of reads used for damage assessment in PMDtools. Can be used to significantly reduce the amount of time required for damage assessment in PMDTools. Note that a too low value can also obtain incorrect results.\n\n> Modifies PMDTools parameter: `-n`" + }, + "pmdtools_platypus": { + "type": "boolean", + "description": "Append big list of base frequencies for platypus to output.", + "fa_icon": "fas fa-power-off", + "help_text": "Enables the printing of a wider list of base frequencies used by platypus as an addition to the output base misincorporation frequency table. 
By default turned off.\n" + }, + "run_mapdamage_rescaling": { + "type": "boolean", + "fa_icon": "fas fa-map", + "description": "Turn on damage rescaling of BAM files using mapDamage2 to probabilistically remove damage.", + "help_text": "Turns on mapDamage2's BAM rescaling functionality. This probabilistically replaces Ts back to Cs depending on the likelihood this reference-mismatch was originally caused by damage. If the library is specified to be single stranded, this will automatically use the `--single-stranded` mode.\n\nThis functionality does not have any MultiQC output.\n\n:warning: rescaled libraries will not be merged with non-rescaled libraries of the same sample for downstream genotyping, as the model may be different for each library. If you wish to merge these, please do this manually and re-run nf-core/eager using the merged BAMs as input.\n\n> Modifies the `--rescale` parameter of mapDamage2" + }, + "rescale_seqlength": { + "type": "integer", + "default": 12, + "fa_icon": "fas fa-ruler-horizontal", + "description": "Length of read sequence to use from each side for rescaling. Can be overridden by --rescale_length_*p.", + "help_text": "Specify the length from the end of the read that mapDamage should rescale at both ends.\n\n> Modifies the `--seq-length` parameter of mapDamage2." + }, + "rescale_length_5p": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-balance-scale-right", + "description": "Length of read for mapDamage2 to rescale from 5p end. Only used if not 0, otherwise --rescale_seqlength is used.", + "help_text": "Specify the length from the end of the read that mapDamage should rescale. Overrides `--rescale_seqlength`.\n\n> Modifies the `--rescale-length-5p` parameter of mapDamage2." + }, + "rescale_length_3p": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-balance-scale-left", + "description": "Length of read for mapDamage2 to rescale from 3p end.
Only used if not 0, otherwise --rescale_seqlength is used.", + "help_text": "Specify the length from the end of the read that mapDamage should rescale. Overrides `--rescale_seqlength`.\n\n> Modifies the `--rescale-length-3p` parameter of mapDamage2." + } + }, + "fa_icon": "fas fa-chart-line", + "help_text": "More documentation can be seen in the following links:\n\n- [DamageProfiler](https://github.com/Integrative-Transcriptomics/DamageProfiler)\n- [PMDTools documentation](https://github.com/pontussk/PMDtools)\n\nIf using TSV input, DamageProfiler is performed per library, i.e. after lane\nmerging. PMDtools and BAM Trimming are run after library merging of same-named\nlibrary BAMs that have the same type of UDG treatment. BAM Trimming is only\nperformed on non-UDG and half-UDG treated data.\n" + }, + "feature_annotation_statistics": { + "title": "Feature Annotation Statistics", + "type": "object", + "description": "Options for getting reference annotation statistics (e.g. gene coverages)", + "default": "", + "properties": { + "run_bedtools_coverage": { + "type": "boolean", + "description": "Turn on ability to calculate no. reads, depth and breadth coverage of features in reference.", + "fa_icon": "fas fa-chart-area", + "help_text": "Specifies to turn on the bedtools module, producing statistics for breadth (or percent coverage), and depth (or X fold) coverages.\n" + }, + "anno_file": { + "type": "string", + "description": "Path to GFF or BED file containing positions of features in reference file (--fasta). Path should be enclosed in quotes.", + "fa_icon": "fas fa-file-signature", + "help_text": "Specify the path to a GFF/BED containing the feature coordinates (or any acceptable input for [`bedtools coverage`](https://bedtools.readthedocs.io/en/latest/content/tools/coverage.html)).
Must be in quotes.\n" + } + }, + "fa_icon": "fas fa-scroll", + "help_text": "If you're interested in looking at coverage stats for certain features on your\nreference such as genes, SNPs etc., you can use the following bedtools module\nfor this purpose.\n\nMore documentation on bedtools can be seen in the [bedtools\ndocumentation](https://bedtools.readthedocs.io/en/latest/)\n\nIf using TSV input, bedtools is run after library merging of same-named library\nBAMs that have the same type of UDG treatment.\n" + }, + "bam_trimming": { + "title": "BAM Trimming", + "type": "object", + "description": "Options for trimming of aligned reads (e.g. to remove damage prior genotyping).", + "default": "", + "properties": { + "run_trim_bam": { + "type": "boolean", + "description": "Turn on BAM trimming. Will only run on non-UDG or half-UDG libraries", + "fa_icon": "fas fa-power-off", + "help_text": "Turns on the BAM trimming method. Trims off `[n]` bases from reads in the deduplicated BAM file. Damage assessment in PMDTools or DamageProfiler remains untouched, as data is routed through this independently. BAM trimming is typically performed to reduce errors during genotyping that can be caused by aDNA damage.\n\nBAM trimming will only be performed on libraries indicated as `--udg_type 'none'` or `--udg_type 'half'`. Complete UDG treatment ('full') should have removed all damage. The amount of bases that will be trimmed off can be set separately for libraries with `--udg_type` `'none'` and `'half'` (see `--bamutils_clip_half_udg_left` / `--bamutils_clip_half_udg_right` / `--bamutils_clip_none_udg_left` / `--bamutils_clip_none_udg_right`).\n\n> Note: additional artefacts such as bar-codes or adapters that could potentially also be trimmed should be removed prior mapping." 
+ }, + "bamutils_clip_double_stranded_half_udg_left": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-ruler-combined", + "description": "Specify the number of bases to clip off reads from 'left' end of read for double-stranded half-UDG libraries.", + "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from double_stranded libraries whose UDG treatment is set to `half`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" + }, + "bamutils_clip_double_stranded_half_udg_right": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-ruler", + "description": "Specify the number of bases to clip off reads from 'right' end of read for double-stranded half-UDG libraries.", + "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from double_stranded libraries whose UDG treatment is set to `half`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" + }, + "bamutils_clip_double_stranded_none_udg_left": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-ruler-combined", + "description": "Specify the number of bases to clip off reads from 'left' end of read for double-stranded non-UDG libraries.", + "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from double_stranded libraries whose UDG treatment is set to `none`. 
Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" + }, + "bamutils_clip_double_stranded_none_udg_right": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-ruler", + "description": "Specify the number of bases to clip off reads from 'right' end of read for double-stranded non-UDG libraries.", + "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from double_stranded libraries whose UDG treatment is set to `none`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" + }, + "bamutils_clip_single_stranded_half_udg_left": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-ruler-combined", + "description": "Specify the number of bases to clip off reads from 'left' end of read for single-stranded half-UDG libraries.", + "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from single-stranded libraries whose UDG treatment is set to `half`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" + }, + "bamutils_clip_single_stranded_half_udg_right": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-ruler", + "description": "Specify the number of bases to clip off reads from 'right' end of read for single-stranded half-UDG libraries.", + "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from single-stranded libraries whose UDG treatment is set to `half`. 
Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" + }, + "bamutils_clip_single_stranded_none_udg_left": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-ruler-combined", + "description": "Specify the number of bases to clip off reads from 'left' end of read for single-stranded non-UDG libraries.", + "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from single-stranded libraries whose UDG treatment is set to `none`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" + }, + "bamutils_clip_single_stranded_none_udg_right": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-ruler", + "description": "Specify the number of bases to clip off reads from 'right' end of read for single-stranded non-UDG libraries.", + "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from single-stranded libraries whose UDG treatment is set to `none`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" + }, + "bamutils_softclip": { + "type": "boolean", + "description": "Turn on using softclip instead of hard masking.", + "fa_icon": "fas fa-paint-roller", + "help_text": "By default, nf-core/eager uses hard clipping and sets clipped bases to `N` with quality `!` in the BAM output. 
Turn this on to use soft-clipping instead, masking reads at the read ends respectively using the CIGAR string.\n\n> Modifies bam trimBam parameter: `-c`" + } + }, + "fa_icon": "fas fa-eraser", + "help_text": "For some library preparation protocols, users might want to clip off damaged\nbases before applying genotyping methods. This can be done in nf-core/eager\nautomatically by turning on the `--run_trim_bam` parameter.\n\nMore documentation can be seen in the [bamUtil\ndocumentation](https://genome.sph.umich.edu/wiki/BamUtil:_trimBam)\n" + }, + "genotyping": { + "title": "Genotyping", + "type": "object", + "description": "Options for variant calling.", + "default": "", + "properties": { + "run_genotyping": { + "type": "boolean", + "description": "Turn on genotyping of BAM files.", + "fa_icon": "fas fa-power-off", + "help_text": "Turns on genotyping to run on all post-dedup and downstream BAMs. For example if `--run_pmdtools` and `--trim_bam` are both supplied, the genotyper will be run on all three BAM files i.e. post-deduplication, post-pmd and post-trimmed BAM files." + }, + "genotyping_tool": { + "type": "string", + "description": "Specify which genotyper to use either GATK UnifiedGenotyper, GATK HaplotypeCaller, Freebayes, or pileupCaller. Options: 'ug', 'hc', 'freebayes', 'pileupcaller', 'angsd'.", + "fa_icon": "fas fa-tools", + "help_text": "Specifies which genotyper to use. Current options are: GATK (v3.5) UnifiedGenotyper or GATK Haplotype Caller (v4); and the FreeBayes Caller. 
Specify 'ug', 'hc', 'freebayes', 'pileupcaller' and 'angsd' respectively.\n\n> Note that while UnifiedGenotyper is more suitable for low-coverage ancient DNA (HaplotypeCaller does _de novo_ assembly around each variant site), be aware GATK 3.5 is officially deprecated by the Broad Institute.", + "enum": [ + "ug", + "hc", + "freebayes", + "pileupcaller", + "angsd" + ] + }, + "genotyping_source": { + "type": "string", + "default": "raw", + "description": "Specify which input BAM to use for genotyping. Options: 'raw', 'trimmed', 'pmd' or 'rescaled'.", + "fa_icon": "fas fa-faucet", + "help_text": "Indicates which BAM file to use for genotyping, depending on what BAM processing modules you have turned on. Options are: `'raw'` for mapped only, filtered, or DeDup BAMs (with priority right to left); `'trimmed'` (for base clipped BAMs); `'pmd'` (for pmdtools output); `'rescaled'` (for mapDamage2 rescaling output). Default is: `'raw'`.\n", + "enum": [ + "raw", + "pmd", + "trimmed", + "rescaled" + ] + }, + "gatk_call_conf": { + "type": "integer", + "default": 30, + "description": "Specify GATK phred-scaled confidence threshold.", + "fa_icon": "fas fa-balance-scale-right", + "help_text": "If selected, specify a GATK genotyper phred-scaled confidence threshold of a given SNP/INDEL call. Default: `30`\n\n> Modifies GATK UnifiedGenotyper or HaplotypeCaller parameter: `-stand_call_conf`" + }, + "gatk_ploidy": { + "type": "integer", + "default": 2, + "description": "Specify GATK organism ploidy.", + "fa_icon": "fas fa-pastafarianism", + "help_text": "If selected, specify a GATK genotyper ploidy value of your reference organism. E.g. if you want to allow heterozygous calls from >= diploid organisms. 
Default: `2`\n\n> Modifies GATK UnifiedGenotyper or HaplotypeCaller parameter: `--sample-ploidy`" + }, + "gatk_downsample": { + "type": "integer", + "default": 250, + "description": "Maximum depth coverage allowed for genotyping before down-sampling is turned on.", + "fa_icon": "fas fa-icicles", + "help_text": "Maximum depth coverage allowed for genotyping before down-sampling is turned on. Any position with a coverage higher than this value will be randomly down-sampled to 250 reads. Default: `250`\n\n> Modifies GATK UnifiedGenotyper parameter: `-dcov`" + }, + "gatk_dbsnp": { + "type": "string", + "description": "Specify VCF file for SNP annotation of output VCF files. Optional. Gzip not accepted.", + "fa_icon": "fas fa-marker", + "help_text": "(Optional) Specify VCF file for output VCF SNP annotation e.g. if you want to annotate your VCF file with 'rs' SNP IDs. Check GATK documentation for more information. Gzip not accepted.\n" + }, + "gatk_hc_out_mode": { + "type": "string", + "default": "EMIT_VARIANTS_ONLY", + "description": "Specify GATK output mode. Options: 'EMIT_VARIANTS_ONLY', 'EMIT_ALL_CONFIDENT_SITES', 'EMIT_ALL_ACTIVE_SITES'.", + "fa_icon": "fas fa-bullhorn", + "help_text": "If the GATK genotyper HaplotypeCaller is selected, what type of VCF to create, i.e. produce calls for every site or just confidence sites. Options: `'EMIT_VARIANTS_ONLY'`, `'EMIT_ALL_CONFIDENT_SITES'`, `'EMIT_ALL_ACTIVE_SITES'`. Default: `'EMIT_VARIANTS_ONLY'`\n\n> Modifies GATK HaplotypeCaller parameter: `-output_mode`", + "enum": [ + "EMIT_ALL_ACTIVE_SITES", + "EMIT_ALL_CONFIDENT_SITES", + "EMIT_VARIANTS_ONLY" + ] + }, + "gatk_hc_emitrefconf": { + "type": "string", + "default": "GVCF", + "description": "Specify HaplotypeCaller mode for emitting reference confidence calls . Options: 'NONE', 'BP_RESOLUTION', 'GVCF'.", + "fa_icon": "fas fa-bullhorn", + "help_text": "If the GATK HaplotypeCaller is selected, mode for emitting reference confidence calls. 
Options: `'NONE'`, `'BP_RESOLUTION'`, `'GVCF'`. Default: `'GVCF'`\n\n> Modifies GATK HaplotypeCaller parameter: `--emit-ref-confidence`\n", + "enum": [ + "NONE", + "GVCF", + "BP_RESOLUTION" + ] + }, + "gatk_ug_out_mode": { + "type": "string", + "default": "EMIT_VARIANTS_ONLY", + "description": "Specify GATK output mode. Options: 'EMIT_VARIANTS_ONLY', 'EMIT_ALL_CONFIDENT_SITES', 'EMIT_ALL_SITES'.", + "fa_icon": "fas fa-bullhorn", + "help_text": "If the GATK UnifiedGenotyper is selected, what type of VCF to create, i.e. produce calls for every site or just confidence sites. Options: `'EMIT_VARIANTS_ONLY'`, `'EMIT_ALL_CONFIDENT_SITES'`, `'EMIT_ALL_SITES'`. Default: `'EMIT_VARIANTS_ONLY'`\n\n> Modifies GATK UnifiedGenotyper parameter: `--output_mode`", + "enum": [ + "EMIT_ALL_SITES", + "EMIT_ALL_CONFIDENT_SITES", + "EMIT_VARIANTS_ONLY" + ] + }, + "gatk_ug_genotype_model": { + "type": "string", + "default": "SNP", + "description": "Specify UnifiedGenotyper likelihood model. Options: 'SNP', 'INDEL', 'BOTH', 'GENERALPLOIDYSNP', 'GENERALPLOIDYINDEL'.", + "fa_icon": "fas fa-project-diagram", + "help_text": "If the GATK UnifiedGenotyper is selected, which likelihood model to follow, i.e. whether to call SNPs or INDELS etc. Options: `'SNP'`, `'INDEL'`, `'BOTH'`, `'GENERALPLOIDYSNP'`, `'GENERALPLOIDYINDEL'`. 
Default: `'SNP'`\n\n> Modifies GATK UnifiedGenotyper parameter: `--genotype_likelihoods_model`", + "enum": [ + "SNP", + "INDEL", + "BOTH", + "GENERALPLOIDYSNP", + "GENERALPLOIDYINDEL" + ] + }, + "gatk_ug_keep_realign_bam": { + "type": "boolean", + "description": "Specify to keep the BAM output of re-alignment around variants from GATK UnifiedGenotyper.", + "fa_icon": "fas fa-align-left", + "help_text": "If provided when running GATK's UnifiedGenotyper, this will put into the output folder the BAMs that have realigned reads (with GATK's (v3) IndelRealigner) around possible variants for improved genotyping.\n\nThese BAMs will be stored in the same folder as the corresponding VCF files." + }, + "gatk_ug_defaultbasequalities": { + "type": "string", + "description": "Supply a default base quality if a read is missing a base quality score. Setting to -1 turns this off.", + "fa_icon": "fas fa-undo-alt", + "help_text": "When running GATK's UnifiedGenotyper, specify a value to set base quality scores, if reads are missing this information. Might be useful if you have 'synthetically' generated reads (e.g. chopping up a reference genome). Default is set to -1 which is to not set any default quality (turned off). Default: `-1`\n\n> Modifies GATK UnifiedGenotyper parameter: `--defaultBaseQualities`" + }, + "freebayes_C": { + "type": "integer", + "default": 1, + "description": "Specify minimum required supporting observations to consider a variant.", + "fa_icon": "fas fa-align-center", + "help_text": "Specify minimum required supporting observations to consider a variant. 
Default: `1`\n\n> Modifies freebayes parameter: `-C`" + }, + "freebayes_g": { + "type": "integer", + "description": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified in --freebayes_C.", + "fa_icon": "fab fa-think-peaks", + "help_text": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified C. Not set by default.\n\n> Modifies freebayes parameter: `-g`", + "default": 0 + }, + "freebayes_p": { + "type": "integer", + "default": 2, + "description": "Specify ploidy of sample in FreeBayes.", + "fa_icon": "fas fa-pastafarianism", + "help_text": "Specify ploidy of sample in FreeBayes. Default is diploid. Default: `2`\n\n> Modifies freebayes parameter: `-p`" + }, + "pileupcaller_bedfile": { + "type": "string", + "description": "Specify path to SNP panel in bed format for pileupCaller.", + "fa_icon": "fas fa-bed", + "help_text": "Specify a SNP panel in the form of a bed file of sites at which to generate pileup for pileupCaller.\n" + }, + "pileupcaller_snpfile": { + "type": "string", + "description": "Specify path to SNP panel in EIGENSTRAT format for pileupCaller.", + "fa_icon": "fas fa-sliders-h", + "help_text": "Specify a SNP panel in [EIGENSTRAT](https://github.com/DReichLab/EIG/tree/master/CONVERTF) format, pileupCaller will call these sites.\n" + }, + "pileupcaller_method": { + "type": "string", + "default": "randomHaploid", + "description": "Specify calling method to use. Options: 'randomHaploid', 'randomDiploid', 'majorityCall'.", + "fa_icon": "fas fa-toolbox", + "help_text": "Specify calling method to use. Options: randomHaploid, randomDiploid, majorityCall. 
Default: `'randomHaploid'`\n\n> Modifies pileupCaller parameter: `--randomHaploid --randomDiploid --majorityCall`", + "enum": [ + "randomHaploid", + "randomDiploid", + "majorityCall" + ] + }, + "pileupcaller_transitions_mode": { + "type": "string", + "default": "AllSites", + "description": "Specify the calling mode for transitions. Options: 'AllSites', 'TransitionsMissing', 'SkipTransitions'.", + "fa_icon": "fas fa-toggle-on", + "help_text": "Specify if genotypes of transition SNPs should be called, set to missing, or excluded from the genotypes respectively. Options: `'AllSites'`, `'TransitionsMissing'`, `'SkipTransitions'`. Default: `'AllSites'`\n\n> Modifies pileupCaller parameter: `--skipTransitions --transitionsMissing`", + "enum": [ + "AllSites", + "TransitionsMissing", + "SkipTransitions" + ] + }, + "pileupcaller_min_map_quality": { + "type": "integer", + "default": 30, + "description": "The minimum mapping quality to be used for genotyping.", + "fa_icon": "fas fa-filter", + "help_text": "The minimum mapping quality to be used for genotyping. Affects the `samtools mpileup` output that is used by pileupcaller. Affects `-q` parameter of samtools mpileup." + }, + "pileupcaller_min_base_quality": { + "type": "integer", + "default": 30, + "description": "The minimum base quality to be used for genotyping.", + "fa_icon": "fas fa-filter", + "help_text": "The minimum base quality to be used for genotyping. Affects the `samtools mpileup` output that is used by pileupcaller. Affects `-Q` parameter of samtools mpileup." + }, + "angsd_glmodel": { + "type": "string", + "default": "samtools", + "description": "Specify which ANGSD genotyping likelihood model to use. Options: 'samtools', 'gatk', 'soapsnp', 'syk'.", + "fa_icon": "fas fa-project-diagram", + "help_text": "Specify which genotype likelihood model to use. Options: `'samtools'`, `'gatk'`, `'soapsnp'`, `'syk'`. 
Default: `'samtools'`\n\n> Modifies ANGSD parameter: `-GL`", + "enum": [ + "samtools", + "gatk", + "soapsnp", + "syk" + ] + }, + "angsd_glformat": { + "type": "string", + "default": "binary", + "description": "Specify which output type to output ANGSD genotyping likelihood results: Options: 'text', 'binary', 'binary_three', 'beagle'.", + "fa_icon": "fas fa-text-height", + "help_text": "Specifies what type of genotyping likelihood file format will be output. Options: `'text'`, `'binary'`, `'binary_three'`, `'beagle'`. Default: `'binary'`.\n\nThe options refer to the following descriptions respectively:\n\n- `text`: text output of all 10 log genotype likelihoods.\n- `binary`: binary all 10 log genotype likelihood\n- `binary_three`: binary 3 times likelihood\n- `beagle`: beagle likelihood file\n\nSee the [ANGSD documentation](http://www.popgen.dk/angsd/) for more information on which to select for your downstream applications.\n\n> Modifies ANGSD parameter: `-doGlf`", + "enum": [ + "text", + "binary", + "binary_three", + "beagle" + ] + }, + "angsd_createfasta": { + "type": "boolean", + "description": "Turn on creation of FASTA from ANGSD genotyping likelihood.", + "fa_icon": "fas fa-align-justify", + "help_text": "Turns on the ANGSD creation of a FASTA file from the BAM file.\n" + }, + "angsd_fastamethod": { + "type": "string", + "default": "random", + "description": "Specify which genotype type of 'base calling' to use for ANGSD FASTA generation. Options: 'random', 'common'.", + "fa_icon": "fas fa-toolbox", + "help_text": "The type of base calling to be performed when creating the ANGSD FASTA file. Options: `'random'` or `'common'`. 'common' will output the most common non-N base at each given position, whereas 'random' will pick one at random. 
Default: `'random'`.\n\n> Modifies ANGSD parameter: `-doFasta -doCounts`", + "enum": [ + "random", + "common" + ] + }, + "run_bcftools_stats": { + "type": "boolean", + "default": true, + "description": "Turn on bcftools stats generation for VCF based variant calling statistics", + "help_text": "Runs `bcftools stats` against VCF files from GATK and FreeBayes genotypers.\n\nIt will automatically include the FASTA reference for INDEL-related statistics.", + "fa_icon": "far fa-chart-bar" + } + }, + "fa_icon": "fas fa-sliders-h", + "help_text": "There are options for different genotypers (or genotype likelihood calculators)\nto be used. We suggest you read the documentation of each tool to find the ones that\nsuit your needs.\n\nDocumentation for each tool:\n\n- [GATK\n UnifiedGenotyper](https://software.broadinstitute.org/gatk/documentation/tooldocs/3.5-0/org_broadinstitute_gatk_tools_walkers_genotyper_UnifiedGenotyper.php)\n- [GATK\n HaplotypeCaller](https://software.broadinstitute.org/gatk/documentation/tooldocs/3.8-0/org_broadinstitute_gatk_tools_walkers_haplotypecaller_HaplotypeCaller.php)\n- [FreeBayes](https://github.com/ekg/freebayes)\n- [ANGSD](http://www.popgen.dk/angsd/index.php/Genotype_Likelihoods)\n- [sequenceTools pileupCaller](https://github.com/stschiff/sequenceTools)\n\nIf using TSV input, genotyping is performed per sample (i.e. after all types of\nlibraries are merged), except for pileupCaller which gathers all double-stranded and\nsingle-stranded (same-type merged) libraries respectively." + }, + "consensus_sequence_generation": { + "title": "Consensus Sequence Generation", + "type": "object", + "description": "Options for creation of a per-sample FASTA sequence useful for downstream analysis (e.g. 
multi sequence alignment)", + "default": "", + "properties": { + "run_vcf2genome": { + "type": "boolean", + "description": "Turns on ability to create a consensus sequence FASTA file based on a UnifiedGenotyper VCF file and the original reference (only considers SNPs).", + "fa_icon": "fas fa-power-off", + "help_text": "Turn on consensus sequence genome creation via VCF2Genome. Only accepts GATK UnifiedGenotyper VCF files with the `--gatk_ug_out_mode 'EMIT_ALL_SITES'` and `--gatk_ug_genotype_model 'SNP'` flags. Typically useful for small genomes such as mitochondria.\n" + }, + "vcf2genome_outfile": { + "type": "string", + "description": "Specify name of the output FASTA file containing the consensus sequence. Do not include `.fasta` in the file name.", + "fa_icon": "fas fa-file-alt", + "help_text": "The name of your requested output FASTA file. Do not include `.fasta` suffix.\n" + }, + "vcf2genome_header": { + "type": "string", + "description": "Specify the header name of the consensus sequence entry within the FASTA file.", + "fa_icon": "fas fa-heading", + "help_text": "The name of the FASTA entry you would like in your FASTA file.\n" + }, + "vcf2genome_minc": { + "type": "integer", + "default": 5, + "description": "Minimum depth coverage required for a call to be included (else N will be called).", + "fa_icon": "fas fa-sort-amount-up", + "help_text": "Minimum depth coverage for a SNP call to be made. Else, a SNP will be called as N. Default: `5`\n\n> Modifies VCF2Genome parameter: `-minc`" + }, + "vcf2genome_minq": { + "type": "integer", + "default": 30, + "description": "Minimum genotyping quality of a call to be called. Else N will be called.", + "fa_icon": "fas fa-medal", + "help_text": "Minimum genotyping quality of a call to be made. Else N will be called. Default: `30`\n\n> Modifies VCF2Genome parameter: `-minq`" + }, + "vcf2genome_minfreq": { + "type": "number", + "default": 0.8, + "description": "Minimum fraction of reads supporting a call to be included. 
Else N will be called.", + "fa_icon": "fas fa-percent", + "help_text": "In the case of two possible alleles, the frequency of the majority allele required for a call to be made. Else, a SNP will be called as N. Default: `0.8`\n\n> Modifies VCF2Genome parameter: `-minfreq`" + } + }, + "fa_icon": "fas fa-handshake", + "help_text": "If using TSV input, consensus generation is performed per sample (i.e. after all\ntypes of libraries are merged)." + }, + "snp_table_generation": { + "title": "SNP Table Generation", + "type": "object", + "description": "Options for creation of a SNP table useful for downstream analysis (e.g. estimation of cross-mapping of different species and multi-sequence alignment)", + "default": "", + "properties": { + "run_multivcfanalyzer": { + "type": "boolean", + "description": "Turn on MultiVCFAnalyzer. Note: This currently only supports diploid GATK UnifiedGenotyper input.", + "fa_icon": "fas fa-power-off", + "help_text": "Turns on MultiVCFAnalyzer. Will only work in combination with the UnifiedGenotyper genotyping module.\n" + }, + "write_allele_frequencies": { + "type": "boolean", + "description": "Turn on writing allele frequencies in the SNP table.", + "fa_icon": "fas fa-pen", + "help_text": "Specify whether to tell MultiVCFAnalyzer to write within the SNP table the frequencies of the allele at that position e.g. A (70%).\n" + }, + "min_genotype_quality": { + "type": "integer", + "default": 30, + "description": "Specify the minimum genotyping quality threshold for a SNP to be called.", + "fa_icon": "fas fa-medal", + "help_text": "The minimal genotyping quality for a SNP to be considered for processing by MultiVCFAnalyzer. 
The default threshold is `30`.\n" + }, + "min_base_coverage": { + "type": "integer", + "default": 5, + "description": "Specify the minimum number of reads a position needs to be covered to be considered for base calling.", + "fa_icon": "fas fa-sort-amount-up", + "help_text": "The minimal number of reads covering a base for a SNP at that position to be considered for processing by MultiVCFAnalyzer. The default depth is `5`.\n" + }, + "min_allele_freq_hom": { + "type": "number", + "default": 0.9, + "description": "Specify the minimum allele frequency that a base requires to be considered a 'homozygous' call.", + "fa_icon": "fas fa-percent", + "help_text": "The minimal frequency of a nucleotide for a 'homozygous' SNP to be called. In other words, e.g. 90% of the reads covering that position must have that SNP to be called. If the threshold is not reached, and the previous two parameters are matched, a reference call is made (displayed as . in the SNP table). If the above two parameters are not met, an 'N' is called. The default allele frequency is `0.9`.\n" + }, + "min_allele_freq_het": { + "type": "number", + "default": 0.9, + "description": "Specify the minimum allele frequency that a base requires to be considered a 'heterozygous' call.", + "fa_icon": "fas fa-percent", + "help_text": "The minimum frequency of a nucleotide for a 'heterozygous' SNP to be called. If\nthis parameter is set to the same as `--min_allele_freq_hom`, then only\nhomozygous calls are made. If this value is less than the previous parameter,\nthen a SNP call will be made. If it is between this and the previous parameter,\nit will be displayed as a IUPAC uncertainty call. Default is `0.9`." + }, + "additional_vcf_files": { + "type": "string", + "description": "Specify paths to additional pre-made VCF files to be included in the SNP table generation. 
Use wildcard(s) for multiple files.", + "fa_icon": "fas fa-copy", + "help_text": "If you wish to add to the table previously created VCF files, specify here a path with wildcards (in quotes). These VCF files must be created the same way as your settings for the [GATK UnifiedGenotyper](#genotyping-parameters) module above." + }, + "reference_gff_annotations": { + "type": "string", + "default": "NA", + "description": "Specify path to the reference genome annotations in '.gff' format. Optional.", + "fa_icon": "fas fa-file-signature", + "help_text": "If you wish to report in the SNP table annotation information for the regions\nSNPs fall in, provide a file in GFF format (the path must be in quotes).\n" + }, + "reference_gff_exclude": { + "type": "string", + "default": "NA", + "description": "Specify path to the positions to be excluded in '.gff' format. Optional.", + "fa_icon": "fas fa-times", + "help_text": "If you wish to exclude SNP regions from consideration by MultiVCFAnalyzer (such as for problematic regions), provide a file in GFF format (the path must be in quotes).\n" + }, + "snp_eff_results": { + "type": "string", + "default": "NA", + "description": "Specify path to the output file from SNP effect analysis in '.txt' format. Optional.", + "fa_icon": "fas fa-magic", + "help_text": "If you wish to include results from SNPEff effect analysis, supply the output\nfrom SNPEff in txt format (the path must be in quotes)." + } + }, + "fa_icon": "fas fa-table", + "help_text": "SNP Table Generation here is performed by MultiVCFAnalyzer. The current version\nof MultiVCFAnalyzer only accepts GATK UnifiedGenotyper 3.5 VCF files,\nand when the ploidy was set to 2 (this allows MultiVCFAnalyzer to report\nfrequencies of polymorphic positions). 
A description of how the tool works can\nbe seen in the Supplementary Information of [Bos et al.\n(2014)](https://doi.org/10.1038/nature13591) under \"SNP Calling and Phylogenetic\nAnalysis\".\n\nMore can be seen in the [MultiVCFAnalyzer\ndocumentation](https://github.com/alexherbig/MultiVCFAnalyzer).\n\nIf using TSV input, MultiVCFAnalyzer is performed on all samples gathered\ntogether." + }, + "mitochondrial_to_nuclear_ratio": { + "title": "Mitochondrial to Nuclear Ratio", + "type": "object", + "description": "Options for the calculation of ratio of reads to one chromosome/FASTA entry against all others.", + "default": "", + "properties": { + "run_mtnucratio": { + "type": "boolean", + "description": "Turn on mitochondrial to nuclear ratio calculation.", + "fa_icon": "fas fa-balance-scale-left", + "help_text": "Turn on the module to estimate the ratio of mitochondrial to nuclear reads.\n" + }, + "mtnucratio_header": { + "type": "string", + "default": "MT", + "description": "Specify the name of the reference FASTA entry corresponding to the mitochondrial genome (up to the first space).", + "fa_icon": "fas fa-heading", + "help_text": "Specify the FASTA entry in the reference file specified as `--fasta`, which acts\nas the mitochondrial 'chromosome' to base the ratio calculation on. The tool\nonly accepts the first section of the header before the first space. The default\nchromosome name is based on the hs37d5/GRCh37 human reference genome. 
Default: 'MT'" + } + }, + "fa_icon": "fas fa-balance-scale-left", + "help_text": "If using TSV input, Mitochondrial to Nuclear Ratio calculation is calculated per\ndeduplicated library (after lane merging)" + }, + "human_sex_determination": { + "title": "Human Sex Determination", + "type": "object", + "description": "Options for the calculation of biological sex of human individuals.", + "default": "", + "properties": { + "run_sexdeterrmine": { + "type": "boolean", + "description": "Turn on sex determination for human reference genomes. This will run on single- and double-stranded variants of a library separately.", + "fa_icon": "fas fa-transgender-alt", + "help_text": "Specify to run the optional process of sex determination.\n" + }, + "sexdeterrmine_bedfile": { + "type": "string", + "description": "Specify path to SNP panel in bed format for error bar calculation. Optional (see documentation).", + "fa_icon": "fas fa-bed", + "help_text": "Specify an optional bedfile of the list of SNPs to be used for X-/Y-rate calculation. Running without this parameter will considerably increase runtime, and render the resulting error bars untrustworthy. Theoretically, any set of SNPs that are distant enough that two SNPs are unlikely to be covered by the same read can be used here. The programme was coded with the 1240K panel in mind. The path must be in quotes." + } + }, + "fa_icon": "fas fa-transgender", + "help_text": "An optional process for human DNA. It can be used to calculate the relative\ncoverage of X and Y chromosomes compared to the autosomes (X-/Y-rate). Standard\nerrors for these measurements are also calculated, assuming a binomial\ndistribution of reads across the SNPs.\n\nIf using TSV input, SexDetERRmine is performed on all samples gathered together." 
+ }, + "nuclear_contamination_for_human_dna": { + "title": "Nuclear Contamination for Human DNA", + "type": "object", + "description": "Options for the estimation of contamination of human DNA.", + "default": "", + "properties": { + "run_nuclear_contamination": { + "type": "boolean", + "description": "Turn on nuclear contamination estimation for human reference genomes.", + "fa_icon": "fas fa-power-off", + "help_text": "Specify to run the optional processes for (human) nuclear DNA contamination estimation.\n" + }, + "contamination_chrom_name": { + "type": "string", + "default": "X", + "description": "The name of the X chromosome in your bam/FASTA header. 'X' for hs37d5, 'chrX' for HG19.", + "fa_icon": "fas fa-address-card", + "help_text": "The name of the human chromosome X in your bam. `'X'` for hs37d5, `'chrX'` for HG19. Defaults to `'X'`." + } + }, + "fa_icon": "fas fa-radiation-alt" + }, + "metagenomic_screening": { + "title": "Metagenomic Screening", + "type": "object", + "description": "Options for metagenomic screening of off-target reads.", + "default": "", + "properties": { + "metagenomic_complexity_filter": { + "type": "boolean", + "description": "Turn on removal of low-sequence complexity reads for metagenomic screening with bbduk", + "help_text": "Turns on low-sequence complexity filtering of off-target reads using `bbduk`.\n\nThis is typically performed to reduce the number of uninformative reads or potential false-positive reads, typically for input for metagenomic screening. This thus reduces false positive species IDs and also run-time and resource requirements.\n\nSee `--metagenomic_complexity_entropy` for how complexity is calculated. 
**Important** There are no MultiQC output results for this module, you must check the number of reads removed with the `_bbduk.stats` output file.\n\nDefault: off\n", + "fa_icon": "fas fa-filter" + }, + "metagenomic_complexity_entropy": { + "type": "number", + "default": 0.3, + "description": "Specify the entropy threshold under which a sequencing read will be complexity filtered out. This should be between 0-1.", + "minimum": 0, + "maximum": 1, + "help_text": "Specify a minimum entropy threshold under which a read will be _removed_ from the FASTQ file that goes into metagenomic screening. \n\nA mono-nucleotide read such as GGGGGG will have an entropy of 0, a completely random sequence has an entropy of almost 1.\n\nSee the `bbduk` [documentation](https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/bbduk-guide/-filter) on entropy for more information.\n\n> Modifies `bbduk` parameter `entropy=`", + "fa_icon": "fas fa-percent" + }, + "run_metagenomic_screening": { + "type": "boolean", + "description": "Turn on metagenomic screening module for reference-unmapped reads.", + "fa_icon": "fas fa-power-off", + "help_text": "Turn on the metagenomic screening module.\n" + }, + "metagenomic_tool": { + "type": "string", + "description": "Specify which classifier to use. Options: 'malt', 'kraken'.", + "fa_icon": "fas fa-tools", + "help_text": "Specify which taxonomic classifier to use. There are two options available:\n\n- `kraken` for [Kraken2](https://ccb.jhu.edu/software/kraken2)\n- `malt` for [MALT](https://software-ab.informatik.uni-tuebingen.de/download/malt/welcome.html)\n\n:warning: **Important** It is very important to run `nextflow clean -f` on your\nNextflow run directory once completed. RMA6 files are VERY large and are\n_copied_ from a `work/` directory into the results folder. You should clean the\nwork directory with the command to ensure non-redundancy and avoid large HDD\nfootprints!" 
+ }, + "database": { + "type": "string", + "description": "Specify path to classifier database directory. For Kraken2 this can also be a `.tar.gz` of the directory.", + "fa_icon": "fas fa-database", + "help_text": "Specify the path to the _directory_ containing your taxonomic classifier's database (malt or kraken).\n\nFor Kraken2, it can be either the path to the _directory_ or the path to the `.tar.gz` compressed directory of the Kraken2 database." + }, + "metagenomic_min_support_reads": { + "type": "integer", + "default": 1, + "description": "Specify a minimum number of reads a taxon of sample total is required to have to be retained. Not compatible with --malt_min_support_mode 'percent'.", + "fa_icon": "fas fa-sort-numeric-up-alt", + "help_text": "Specify the minimum number of reads a given taxon is required to have to be retained as a positive 'hit'. \nFor malt, this only applies when `--malt_min_support_mode` is set to 'reads'. Default: 1.\n\n> Modifies MALT or kraken_parse.py parameter: `-sup` and `-c` respectively\n" + }, + "percent_identity": { + "type": "integer", + "default": 85, + "description": "Percent identity value threshold for MALT.", + "fa_icon": "fas fa-id-card", + "help_text": "Specify the minimum percent identity (or similarity) a sequence must have to the reference for it to be retained. Default is `85`\n\nOnly used when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-id`" + }, + "malt_mode": { + "type": "string", + "default": "BlastN", + "description": "Specify which alignment mode to use for MALT. Options: 'Unknown', 'BlastN', 'BlastP', 'BlastX', 'Classifier'.", + "fa_icon": "fas fa-align-left", + "help_text": "Use this to run the program in 'BlastN', 'BlastP', 'BlastX' modes to align DNA\nand DNA, protein and protein, or DNA reads against protein references\nrespectively. Ensure your database matches the mode. 
Check the\n[MALT\nmanual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf)\nfor more details. Default: `'BlastN'`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-m`\n", + "enum": [ + "BlastN", + "BlastP", + "BlastX" + ] + }, + "malt_alignment_mode": { + "type": "string", + "default": "SemiGlobal", + "description": "Specify alignment method for MALT. Options: 'Local', 'SemiGlobal'.", + "fa_icon": "fas fa-align-center", + "help_text": "Specify what alignment algorithm to use. Options are 'Local' or 'SemiGlobal'. Local is a BLAST like alignment, but is much slower. Semi-global alignment aligns reads end-to-end. Default: `'SemiGlobal'`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-at`", + "enum": [ + "Local", + "SemiGlobal" + ] + }, + "malt_top_percent": { + "type": "integer", + "default": 1, + "description": "Specify the percent for LCA algorithm for MALT (see MEGAN6 CE manual).", + "fa_icon": "fas fa-percent", + "help_text": "Specify the top percent value of the LCA algorithm. From the [MALT manual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf): \"For each\nread, only those matches are used for taxonomic placement whose bit disjointScore is within\n10% of the best disjointScore for that read.\". Default: `1`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-top`" + }, + "malt_min_support_mode": { + "type": "string", + "default": "percent", + "description": "Specify whether to use percent or raw number of reads for minimum support required for taxon to be retained for MALT. 
Options: 'percent', 'reads'.", + "fa_icon": "fas fa-drumstick-bite", + "help_text": "Specify whether to use a percentage, or raw number of reads as the value used to decide the minimum support a taxon requires to be retained.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-sup -supp`", + "enum": [ + "percent", + "reads" + ] + }, + "malt_min_support_percent": { + "type": "number", + "default": 0.01, + "description": "Specify the minimum percentage of reads a taxon of sample total is required to have to be retained for MALT.", + "fa_icon": "fas fa-percentage", + "help_text": "Specify the minimum number of reads (as a percentage of all assigned reads) a given taxon is required to have to be retained as a positive 'hit' in the RMA6 file. This only applies when `--malt_min_support_mode` is set to 'percent'. Default 0.01.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-supp`" + }, + "malt_max_queries": { + "type": "integer", + "default": 100, + "description": "Specify the maximum number of queries a read can have for MALT.", + "fa_icon": "fas fa-phone", + "help_text": "Specify the maximum number of alignments a read can have. All further alignments are discarded. Default: `100`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-mq`" + }, + "malt_memory_mode": { + "type": "string", + "default": "load", + "description": "Specify the memory load method. Do not use 'map' with GPFS file systems for MALT as can be very slow. Options: 'load', 'page', 'map'.", + "fa_icon": "fas fa-memory", + "help_text": "\nHow to load the database into memory. Options are `'load'`, `'page'` or `'map'`.\n'load' directly loads the entire database into memory prior seed look up, this\nis slow but compatible with all servers/file systems. `'page'` and `'map'`\nperform a sort of 'chunked' database loading, allowing seed look up prior entire\ndatabase loading. 
Note that Page and Map modes do not work properly with\nmany remote file-systems such as GPFS. Default is `'load'`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `--memoryMode`", + "enum": [ + "load", + "page", + "map" + ] + }, + "malt_sam_output": { + "type": "boolean", + "description": "Specify to also produce SAM alignment files. Note this includes both aligned and unaligned reads, and are gzipped. Note this will result in very large file sizes.", + "fa_icon": "fas fa-file-alt", + "help_text": "Specify to _also_ produce gzipped SAM files of all alignments and un-aligned reads in addition to RMA6 files. These are **not** soft-clipped or in 'sparse' format. Can be useful for downstream analyses due to more common file format. \n\n:warning: can result in very large run output directories as this is essentially duplication of the RMA6 files.\n\n> Modifies MALT parameter `-a -f`" + } + }, + "fa_icon": "fas fa-search", + "help_text": "\nAn increasingly common line of analysis in high-throughput aDNA analysis today\nis simultaneously screening off target reads of the host for endogenous\nmicrobial signals - particularly of pathogens. Metagenomic screening is\ncurrently offered via MALT with aDNA specific verification via MaltExtract, or\nKraken2.\n\nPlease note the following:\n\n- :warning: Metagenomic screening is only performed on _unmapped_ reads from a\n mapping step.\n - You _must_ supply the `--run_bam_filtering` flag with unmapped reads in\n FASTQ format.\n - If you wish to run solely MALT (i.e. 
the HOPS pipeline), you must still\n supply a small decoy genome like phiX or human mtDNA `--fasta`.\n- MALT database construction functionality is _not_ included within the pipeline\n - this should be done independently, **prior to** the nf-core/eager run.\n - To use `malt-build` from the same version as `malt-run`, load either the\n Docker, Singularity or Conda environment.\n- MALT can often require very large computing resources depending on your\n database. We set an absolute minimum of 16 cores and 128GB of memory (which is\n 1/4 of the recommendation from the developer). Please leave an issue on the\n [nf-core github](https://github.com/nf-core/eager/issues) if you would like to\n see this changed.\n\n> :warning: Running MALT on a server with less than 128GB of memory should be\n> performed at your own risk.\n\nIf using TSV input, metagenomic screening is performed on all samples gathered\ntogether." + }, + "metagenomic_authentication": { + "title": "Metagenomic Authentication", + "type": "object", + "description": "Options for authentication of metagenomic screening performed by MALT.", + "default": "", + "properties": { + "run_maltextract": { + "type": "boolean", + "description": "Turn on MaltExtract for MALT aDNA characteristics authentication.", + "fa_icon": "fas fa-power-off", + "help_text": "Turn on MaltExtract for MALT aDNA characteristics authentication of metagenomic output from MALT.\n\nMore can be seen in the [MaltExtract documentation](https://github.com/rhuebler/)\n\nOnly when `--metagenomic_tool malt` is also supplied" + }, + "maltextract_taxon_list": { + "type": "string", + "description": "Path to a text file with taxa of interest (one taxon per row, NCBI taxonomy name format)", + "fa_icon": "fas fa-list-ul", + "help_text": "\nPath to a `.txt` file with taxa of interest you wish to assess for aDNA characteristics. 
The `.txt` file should contain one taxon per row, and the taxon should be in a valid [NCBI taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy) name format.\n\nOnly when `--metagenomic_tool malt` is also supplied." + }, + "maltextract_ncbifiles": { + "type": "string", + "description": "Path to directory containing NCBI resource files (ncbi.tre and ncbi.map; available: https://github.com/rhuebler/HOPS/)", + "fa_icon": "fas fa-database", + "help_text": "Path to directory containing the NCBI resource tree and taxonomy table files (ncbi.tre and ncbi.map; available at the [HOPS repository](https://github.com/rhuebler/HOPS/Resources)).\n\nOnly when `--metagenomic_tool malt` is also supplied." + }, + "maltextract_filter": { + "type": "string", + "default": "def_anc", + "description": "Specify which MaltExtract filter to use. Options: 'def_anc', 'ancient', 'default', 'crawl', 'scan', 'srna', 'assignment'.", + "fa_icon": "fas fa-filter", + "help_text": "Specify which MaltExtract filter to use. This is used to specify what types of characteristics to scan for. The default will output statistics on all alignments, and then a second set with just reads with one C to T mismatch in the first 5 bases. Further details on other parameters can be seen in the [HOPS documentation](https://github.com/rhuebler/HOPS/#maltextract-parameters). Options: `'def_anc'`, `'ancient'`, `'default'`, `'crawl'`, `'scan'`, `'srna'`, 'assignment'. Default: `'def_anc'`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `-f`", + "enum": [ + "def_anc", + "default", + "ancient", + "scan", + "crawl", + "srna" + ] + }, + "maltextract_toppercent": { + "type": "number", + "default": 0.01, + "description": "Specify percent of top alignments to use.", + "fa_icon": "fas fa-percent", + "help_text": "Specify frequency of top alignments for each read to be considered for each node.\nDefault is 0.01, i.e. 
1% of all reads (where 1 would correspond to 100%).\n\n> :warning: this parameter follows the same concept as `--malt_top_percent` but\n> uses a different notation i.e. integer (MALT) versus float (MALTExtract)\n\nDefault: `0.01`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `-a`" + }, + "maltextract_destackingoff": { + "type": "boolean", + "description": "Turn off destacking.", + "fa_icon": "fas fa-align-center", + "help_text": "Turn off destacking. If left on, a read that overlaps with another read will be\nremoved (leaving a depth coverage of 1).\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--destackingOff`" + }, + "maltextract_downsamplingoff": { + "type": "boolean", + "description": "Turn off downsampling.", + "fa_icon": "fab fa-creative-commons-sampling", + "help_text": "Turn off downsampling. By default, downsampling is on and will randomly select 10,000 reads if the number of reads on a node exceeds this number. This is to speed up processing, under the assumption at 10,000 reads the species is a 'true positive'.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--downSampOff`" + }, + "maltextract_duplicateremovaloff": { + "type": "boolean", + "description": "Turn off duplicate removal.", + "fa_icon": "fas fa-align-left", + "help_text": "\nTurn off duplicate removal. By default, reads that are an exact copy (i.e. same start, stop coordinate and exact sequence match) will be removed as it is considered a PCR duplicate.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--dupRemOff`" + }, + "maltextract_matches": { + "type": "boolean", + "description": "Turn on exporting alignments of hits in BLAST format.", + "fa_icon": "fas fa-equals", + "help_text": "\nExport alignments of hits for each node in BLAST format. 
By default turned off.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--matches`" + }, + "maltextract_megansummary": { + "type": "boolean", + "description": "Turn on export of MEGAN summary files.", + "fa_icon": "fas fa-download", + "help_text": "Export 'minimal' summary files (i.e. without alignments) that can be loaded into [MEGAN6](https://doi.org/10.1371/journal.pcbi.1004957). By default turned off.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--meganSummary`" + }, + "maltextract_percentidentity": { + "type": "number", + "description": "Minimum percent identity alignments are required to have to be reported. Recommended to set same as MALT parameter.", + "default": 85.0, + "fa_icon": "fas fa-id-card", + "help_text": "Minimum percent identity alignments are required to have to be reported. Higher values allows fewer mismatches between read and reference sequence, but therefore will provide greater confidence in the hit. Lower values allow more mismatches, which can account for damage and divergence of a related strain/species to the reference. Recommended to set same as MALT parameter or higher. Default: `85`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--minPI`" + }, + "maltextract_topalignment": { + "type": "boolean", + "description": "Turn on using top alignments per read after filtering.", + "fa_icon": "fas fa-star-half-alt", + "help_text": "Use the best alignment of each read for every statistic, except for those concerning read distribution and coverage. 
Default: off.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--useTopAlignment`" + } + }, + "fa_icon": "fas fa-tasks", + "help_text": "Turn on MaltExtract for MALT aDNA characteristics authentication of metagenomic\noutput from MALT.\n\nMore can be seen in the [MaltExtract\ndocumentation](https://github.com/rhuebler/)\n\nOnly when `--metagenomic_tool malt` is also supplied" } - }, - "fa_icon": "fas fa-handshake", - "help_text": "If using TSV input, consensus generation is performed per sample (i.e. after all\ntypes of libraries are merged)." }, - "snp_table_generation": { - "title": "SNP Table Generation", - "type": "object", - "description": "Options for creation of a SNP table useful for downstream analysis (e.g. estimation of cross-mapping of different species and multi-sequence alignment)", - "default": "", - "properties": { - "run_multivcfanalyzer": { - "type": "boolean", - "description": "Turn on MultiVCFAnalyzer. Note: This currently only supports diploid GATK UnifiedGenotyper input.", - "fa_icon": "fas fa-power-off", - "help_text": "Turns on MultiVCFAnalyzer. Will only work when in combination with UnifiedGenotyper genotyping module.\n" + "allOf": [ + { + "$ref": "#/definitions/input_output_options" }, - "write_allele_frequencies": { - "type": "boolean", - "description": "Turn on writing write allele frequencies in the SNP table.", - "fa_icon": "fas fa-pen", - "help_text": "Specify whether to tell MultiVCFAnalyzer to write within the SNP table the frequencies of the allele at that position e.g. A (70%).\n" + { + "$ref": "#/definitions/input_data_additional_options" }, - "min_genotype_quality": { - "type": "integer", - "default": 30, - "description": "Specify the minimum genotyping quality threshold for a SNP to be called.", - "fa_icon": "fas fa-medal", - "help_text": "The minimal genotyping quality for a SNP to be considered for processing by MultiVCFAnalyzer. 
The default threshold is `30`.\n" + { + "$ref": "#/definitions/reference_genome_options" }, - "min_base_coverage": { - "type": "integer", - "default": 5, - "description": "Specify the minimum number of reads a position needs to be covered to be considered for base calling.", - "fa_icon": "fas fa-sort-amount-up", - "help_text": "The minimal number of reads covering a base for a SNP at that position to be considered for processing by MultiVCFAnalyzer. The default depth is `5`.\n" + { + "$ref": "#/definitions/output_options" }, - "min_allele_freq_hom": { - "type": "number", - "default": 0.9, - "description": "Specify the minimum allele frequency that a base requires to be considered a 'homozygous' call.", - "fa_icon": "fas fa-percent", - "help_text": "The minimal frequency of a nucleotide for a 'homozygous' SNP to be called. In other words, e.g. 90% of the reads covering that position must have that SNP to be called. If the threshold is not reached, and the previous two parameters are matched, a reference call is made (displayed as . in the SNP table). If the above two parameters are not met, an 'N' is called. The default allele frequency is `0.9`.\n" + { + "$ref": "#/definitions/generic_options" }, - "min_allele_freq_het": { - "type": "number", - "default": 0.9, - "description": "Specify the minimum allele frequency that a base requires to be considered a 'heterozygous' call.", - "fa_icon": "fas fa-percent", - "help_text": "The minimum frequency of a nucleotide for a 'heterozygous' SNP to be called. If\nthis parameter is set to the same as `--min_allele_freq_hom`, then only\nhomozygous calls are made. If this value is less than the previous parameter,\nthen a SNP call will be made. If it is between this and the previous parameter,\nit will be displayed as a IUPAC uncertainty call. Default is `0.9`." 
+ { + "$ref": "#/definitions/max_job_request_options" }, - "additional_vcf_files": { - "type": "string", - "description": "Specify paths to additional pre-made VCF files to be included in the SNP table generation. Use wildcard(s) for multiple files.", - "fa_icon": "fas fa-copy", - "help_text": "If you wish to add to the table previously created VCF files, specify here a path with wildcards (in quotes). These VCF files must be created the same way as your settings for [GATK UnifiedGenotyping](#genotyping-parameters) module above." + { + "$ref": "#/definitions/institutional_config_options" }, - "reference_gff_annotations": { - "type": "string", - "default": "NA", - "description": "Specify path to the reference genome annotations in '.gff' format. Optional.", - "fa_icon": "fas fa-file-signature", - "help_text": "If you wish to report in the SNP table annotation information for the regions\nSNPs fall in, provide a file in GFF format (the path must be in quotes).\n" + { + "$ref": "#/definitions/skip_steps" }, - "reference_gff_exclude": { - "type": "string", - "default": "NA", - "description": "Specify path to the positions to be excluded in '.gff' format. Optional.", - "fa_icon": "fas fa-times", - "help_text": "If you wish to exclude SNP regions from consideration by MultiVCFAnalyzer (such as for problematic regions), provide a file in GFF format (the path must be in quotes).\n" + { + "$ref": "#/definitions/complexity_filtering" }, - "snp_eff_results": { - "type": "string", - "default": "NA", - "description": "Specify path to the output file from SNP effect analysis in '.txt' format. Optional.", - "fa_icon": "fas fa-magic", - "help_text": "If you wish to include results from SNPEff effect analysis, supply the output\nfrom SNPEff in txt format (the path must be in quotes)." - } - }, - "fa_icon": "fas fa-table", - "help_text": "SNP Table Generation here is performed by MultiVCFAnalyzer. 
The current version\nof MultiVCFAnalyzer version only accepts GATK UnifiedGenotyper 3.5 VCF files,\nand when the ploidy was set to 2 (this allows MultiVCFAnalyzer to report\nfrequencies of polymorphic positions). A description of how the tool works can\nbe seen in the Supplementary Information of [Bos et al.\n(2014)](https://doi.org/10.1038/nature13591) under \"SNP Calling and Phylogenetic\nAnalysis\".\n\nMore can be seen in the [MultiVCFAnalyzer\ndocumentation](https://github.com/alexherbig/MultiVCFAnalyzer).\n\nIf using TSV input, MultiVCFAnalyzer is performed on all samples gathered\ntogether." - }, - "mitochondrial_to_nuclear_ratio": { - "title": "Mitochondrial to Nuclear Ratio", - "type": "object", - "description": "Options for the calculation of ratio of reads to one chromosome/FASTA entry against all others.", - "default": "", - "properties": { - "run_mtnucratio": { - "type": "boolean", - "description": "Turn on mitochondrial to nuclear ratio calculation.", - "fa_icon": "fas fa-balance-scale-left", - "help_text": "Turn on the module to estimate the ratio of mitochondrial to nuclear reads.\n" + { + "$ref": "#/definitions/read_merging_and_adapter_removal" }, - "mtnucratio_header": { - "type": "string", - "default": "MT", - "description": "Specify the name of the reference FASTA entry corresponding to the mitochondrial genome (up to the first space).", - "fa_icon": "fas fa-heading", - "help_text": "Specify the FASTA entry in the reference file specified as `--fasta`, which acts\nas the mitochondrial 'chromosome' to base the ratio calculation on. The tool\nonly accepts the first section of the header before the first space. The default\nchromosome name is based on hs37d5/GrCH37 human reference genome. 
Default: 'MT'" - } - }, - "fa_icon": "fas fa-balance-scale-left", - "help_text": "If using TSV input, Mitochondrial to Nuclear Ratio calculation is calculated per\ndeduplicated library (after lane merging)" - }, - "human_sex_determination": { - "title": "Human Sex Determination", - "type": "object", - "description": "Options for the calculation of biological sex of human individuals.", - "default": "", - "properties": { - "run_sexdeterrmine": { - "type": "boolean", - "description": "Turn on sex determination for human reference genomes. This will run on single- and double-stranded variants of a library separately.", - "fa_icon": "fas fa-transgender-alt", - "help_text": "Specify to run the optional process of sex determination.\n" - }, - "sexdeterrmine_bedfile": { - "type": "string", - "description": "Specify path to SNP panel in bed format for error bar calculation. Optional (see documentation).", - "fa_icon": "fas fa-bed", - "help_text": "Specify an optional bedfile of the list of SNPs to be used for X-/Y-rate calculation. Running without this parameter will considerably increase runtime, and render the resulting error bars untrustworthy. Theoretically, any set of SNPs that are distant enough that two SNPs are unlikely to be covered by the same read can be used here. The programme was coded with the 1240K panel in mind. The path must be in quotes." - } - }, - "fa_icon": "fas fa-transgender", - "help_text": "An optional process for human DNA. It can be used to calculate the relative\ncoverage of X and Y chromosomes compared to the autosomes (X-/Y-rate). Standard\nerrors for these measurements are also calculated, assuming a binomial\ndistribution of reads across the SNPs.\n\nIf using TSV input, SexDetERRmine is performed on all samples gathered together." 
- }, - "nuclear_contamination_for_human_dna": { - "title": "Nuclear Contamination for Human DNA", - "type": "object", - "description": "Options for the estimation of contamination of human DNA.", - "default": "", - "properties": { - "run_nuclear_contamination": { - "type": "boolean", - "description": "Turn on nuclear contamination estimation for human reference genomes.", - "fa_icon": "fas fa-power-off", - "help_text": "Specify to run the optional processes for (human) nuclear DNA contamination estimation.\n" + { + "$ref": "#/definitions/mapping" }, - "contamination_chrom_name": { - "type": "string", - "default": "X", - "description": "The name of the X chromosome in your bam/FASTA header. 'X' for hs37d5, 'chrX' for HG19.", - "fa_icon": "fas fa-address-card", - "help_text": "The name of the human chromosome X in your bam. `'X'` for hs37d5, `'chrX'` for HG19. Defaults to `'X'`." - } - }, - "fa_icon": "fas fa-radiation-alt" - }, - "metagenomic_screening": { - "title": "Metagenomic Screening", - "type": "object", - "description": "Options for metagenomic screening of off-target reads.", - "default": "", - "properties": { - "metagenomic_complexity_filter": { - "type": "boolean", - "description": "Turn on removal of low-sequence complexity reads for metagenomic screening with bbduk", - "help_text": "Turns on low-sequence complexity filtering of off-target reads using `bbduk`.\n\nThis is typically performed to reduce the number of uninformative reads or potential false-positive reads, typically for input for metagenomic screening. This thus reduces false positive species IDs and also run-time and resource requirements.\n\nSee `--metagenomic_complexity_entropy` for how complexity is calculated. 
**Important** There are no MultiQC output results for this module, you must check the number of reads removed with the `_bbduk.stats` output file.\n\nDefault: off\n", - "fa_icon": "fas fa-filter" + { + "$ref": "#/definitions/host_removal" }, - "metagenomic_complexity_entropy": { - "type": "number", - "default": 0.3, - "description": "Specify the entropy threshold that under which a sequencing read will be complexity filtered out. This should be between 0-1.", - "minimum": 0, - "maximum": 1, - "help_text": "Specify a minimum entropy threshold that under which it will be _removed_ from the FASTQ file that goes into metagenomic screening. \n\nA mono-nucleotide read such as GGGGGG will have an entropy of 0, a completely random sequence has an entropy of almost 1.\n\nSee the `bbduk` [documentation](https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/bbduk-guide/-filter) on entropy for more information.\n\n> Modifies`bbduk` parameter `entropy=`", - "fa_icon": "fas fa-percent" + { + "$ref": "#/definitions/bam_filtering" }, - "run_metagenomic_screening": { - "type": "boolean", - "description": "Turn on metagenomic screening module for reference-unmapped reads.", - "fa_icon": "fas fa-power-off", - "help_text": "Turn on the metagenomic screening module.\n" + { + "$ref": "#/definitions/deduplication" }, - "metagenomic_tool": { - "type": "string", - "description": "Specify which classifier to use. Options: 'malt', 'kraken'.", - "fa_icon": "fas fa-tools", - "help_text": "Specify which taxonomic classifier to use. There are two options available:\n\n- `kraken` for [Kraken2](https://ccb.jhu.edu/software/kraken2)\n- `malt` for [MALT](https://software-ab.informatik.uni-tuebingen.de/download/malt/welcome.html)\n\n:warning: **Important** It is very important to run `nextflow clean -f` on your\nNextflow run directory once completed. RMA6 files are VERY large and are\n_copied_ from a `work/` directory into the results folder. 
You should clean the\nwork directory with the command to ensure non-redundancy and large HDD\nfootprints!" + { + "$ref": "#/definitions/library_complexity_analysis" }, - "database": { - "type": "string", - "description": "Specify path to classifier database directory. For Kraken2 this can also be a `.tar.gz` of the directory.", - "fa_icon": "fas fa-database", - "help_text": "Specify the path to the _directory_ containing your taxonomic classifier's database (malt or kraken).\n\nFor Kraken2, it can be either the path to the _directory_ or the path to the `.tar.gz` compressed directory of the Kraken2 database." + { + "$ref": "#/definitions/adna_damage_analysis" }, - "metagenomic_min_support_reads": { - "type": "integer", - "default": 1, - "description": "Specify a minimum number of reads a taxon of sample total is required to have to be retained. Not compatible with --malt_min_support_mode 'percent'.", - "fa_icon": "fas fa-sort-numeric-up-alt", - "help_text": "Specify the minimum number of reads a given taxon is required to have to be retained as a positive 'hit'. \nFor malt, this only applies when `--malt_min_support_mode` is set to 'reads'. Default: 1.\n\n> Modifies MALT or kraken_parse.py parameter: `-sup` and `-c` respectively\n" + { + "$ref": "#/definitions/feature_annotation_statistics" }, - "percent_identity": { - "type": "integer", - "default": 85, - "description": "Percent identity value threshold for MALT.", - "fa_icon": "fas fa-id-card", - "help_text": "Specify the minimum percent identity (or similarity) a sequence must have to the reference for it to be retained. Default is `85`\n\nOnly used when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-id`" + { + "$ref": "#/definitions/bam_trimming" }, - "malt_mode": { - "type": "string", - "default": "BlastN", - "description": "Specify which alignment mode to use for MALT. 
Options: 'Unknown', 'BlastN', 'BlastP', 'BlastX', 'Classifier'.", - "fa_icon": "fas fa-align-left", - "help_text": "Use this to run the program in 'BlastN', 'BlastP', 'BlastX' modes to align DNA\nand DNA, protein and protein, or DNA reads against protein references\nrespectively. Ensure your database matches the mode. Check the\n[MALT\nmanual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf)\nfor more details. Default: `'BlastN'`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-m`\n", - "enum": ["BlastN", "BlastP", "BlastX"] + { + "$ref": "#/definitions/genotyping" }, - "malt_alignment_mode": { - "type": "string", - "default": "SemiGlobal", - "description": "Specify alignment method for MALT. Options: 'Local', 'SemiGlobal'.", - "fa_icon": "fas fa-align-center", - "help_text": "Specify what alignment algorithm to use. Options are 'Local' or 'SemiGlobal'. Local is a BLAST like alignment, but is much slower. Semi-global alignment aligns reads end-to-end. Default: `'SemiGlobal'`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-at`", - "enum": ["Local", "SemiGlobal"] + { + "$ref": "#/definitions/consensus_sequence_generation" }, - "malt_top_percent": { - "type": "integer", - "default": 1, - "description": "Specify the percent for LCA algorithm for MALT (see MEGAN6 CE manual).", - "fa_icon": "fas fa-percent", - "help_text": "Specify the top percent value of the LCA algorithm. From the [MALT manual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf): \"For each\nread, only those matches are used for taxonomic placement whose bit disjointScore is within\n10% of the best disjointScore for that read.\". 
Default: `1`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-top`" + { + "$ref": "#/definitions/snp_table_generation" }, - "malt_min_support_mode": { - "type": "string", - "default": "percent", - "description": "Specify whether to use percent or raw number of reads for minimum support required for taxon to be retained for MALT. Options: 'percent', 'reads'.", - "fa_icon": "fas fa-drumstick-bite", - "help_text": "Specify whether to use a percentage, or raw number of reads as the value used to decide the minimum support a taxon requires to be retained.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-sup -supp`", - "enum": ["percent", "reads"] + { + "$ref": "#/definitions/mitochondrial_to_nuclear_ratio" }, - "malt_min_support_percent": { - "type": "number", - "default": 0.01, - "description": "Specify the minimum percentage of reads a taxon of sample total is required to have to be retained for MALT.", - "fa_icon": "fas fa-percentage", - "help_text": "Specify the minimum number of reads (as a percentage of all assigned reads) a given taxon is required to have to be retained as a positive 'hit' in the RMA6 file. This only applies when `--malt_min_support_mode` is set to 'percent'. Default 0.01.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-supp`" + { + "$ref": "#/definitions/human_sex_determination" }, - "malt_max_queries": { - "type": "integer", - "default": 100, - "description": "Specify the maximum number of queries a read can have for MALT.", - "fa_icon": "fas fa-phone", - "help_text": "Specify the maximum number of alignments a read can have. All further alignments are discarded. 
Default: `100`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-mq`" + { + "$ref": "#/definitions/nuclear_contamination_for_human_dna" }, - "malt_memory_mode": { - "type": "string", - "default": "load", - "description": "Specify the memory load method. Do not use 'map' with GPFS file systems for MALT as can be very slow. Options: 'load', 'page', 'map'.", - "fa_icon": "fas fa-memory", - "help_text": "\nHow to load the database into memory. Options are `'load'`, `'page'` or `'map'`.\n'load' directly loads the entire database into memory prior seed look up, this\nis slow but compatible with all servers/file systems. `'page'` and `'map'`\nperform a sort of 'chunked' database loading, allowing seed look up prior entire\ndatabase loading. Note that Page and Map modes do not work properly not with\nmany remote file-systems such as GPFS. Default is `'load'`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `--memoryMode`", - "enum": ["load", "page", "map"] + { + "$ref": "#/definitions/metagenomic_screening" }, - "malt_sam_output": { - "type": "boolean", - "description": "Specify to also produce SAM alignment files. Note this includes both aligned and unaligned reads, and are gzipped. Note this will result in very large file sizes.", - "fa_icon": "fas fa-file-alt", - "help_text": "Specify to _also_ produce gzipped SAM files of all alignments and un-aligned reads in addition to RMA6 files. These are **not** soft-clipped or in 'sparse' format. Can be useful for downstream analyses due to more common file format. 
\n\n:warning: can result in very large run output directories as this is essentially duplication of the RMA6 files.\n\n> Modifies MALT parameter `-a -f`" + { + "$ref": "#/definitions/metagenomic_authentication" } - }, - "fa_icon": "fas fa-search", - "help_text": "\nAn increasingly common line of analysis in high-throughput aDNA analysis today\nis simultaneously screening off target reads of the host for endogenous\nmicrobial signals - particularly of pathogens. Metagenomic screening is\ncurrently offered via MALT with aDNA specific verification via MaltExtract, or\nKraken2.\n\nPlease note the following:\n\n- :warning: Metagenomic screening is only performed on _unmapped_ reads from a\n mapping step.\n - You _must_ supply the `--run_bam_filtering` flag with unmapped reads in\n FASTQ format.\n - If you wish to run solely MALT (i.e. the HOPS pipeline), you must still\n supply a small decoy genome like phiX or human mtDNA `--fasta`.\n- MALT database construction functionality is _not_ included within the pipeline\n - this should be done independently, **prior** the nf-core/eager run.\n - To use `malt-build` from the same version as `malt-run`, load either the\n Docker, Singularity or Conda environment.\n- MALT can often require very large computing resources depending on your\n database. We set a absolute minimum of 16 cores and 128GB of memory (which is\n 1/4 of the recommendation from the developer). Please leave an issue on the\n [nf-core github](https://github.com/nf-core/eager/issues) if you would like to\n see this changed.\n\n> :warning: Running MALT on a server with less than 128GB of memory should be\n> performed at your own risk.\n\nIf using TSV input, metagenomic screening is performed on all samples gathered\ntogether." 
- }, - "metagenomic_authentication": { - "title": "Metagenomic Authentication", - "type": "object", - "description": "Options for authentication of metagenomic screening performed by MALT.", - "default": "", - "properties": { - "run_maltextract": { - "type": "boolean", - "description": "Turn on MaltExtract for MALT aDNA characteristics authentication.", - "fa_icon": "fas fa-power-off", - "help_text": "Turn on MaltExtract for MALT aDNA characteristics authentication of metagenomic output from MALT.\n\nMore can be seen in the [MaltExtract documentation](https://github.com/rhuebler/)\n\nOnly when `--metagenomic_tool malt` is also supplied" - }, - "maltextract_taxon_list": { - "type": "string", - "description": "Path to a text file with taxa of interest (one taxon per row, NCBI taxonomy name format)", - "fa_icon": "fas fa-list-ul", - "help_text": "\nPath to a `.txt` file with taxa of interest you wish to assess for aDNA characteristics. In `.txt` file should be one taxon per row, and the taxon should be in a valid [NCBI taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy) name format.\n\nOnly when `--metagenomic_tool malt` is also supplied." - }, - "maltextract_ncbifiles": { - "type": "string", - "description": "Path to directory containing containing NCBI resource files (ncbi.tre and ncbi.map; available: https://github.com/rhuebler/HOPS/)", - "fa_icon": "fas fa-database", - "help_text": "Path to directory containing containing the NCBI resource tree and taxonomy table files (ncbi.tre and ncbi.map; available at the [HOPS repository](https://github.com/rhuebler/HOPS/Resources)).\n\nOnly when `--metagenomic_tool malt` is also supplied." - }, - "maltextract_filter": { - "type": "string", - "default": "def_anc", - "description": "Specify which MaltExtract filter to use. Options: 'def_anc', 'ancient', 'default', 'crawl', 'scan', 'srna', 'assignment'.", - "fa_icon": "fas fa-filter", - "help_text": "Specify which MaltExtract filter to use. 
This is used to specify what types of characteristics to scan for. The default will output statistics on all alignments, and then a second set with just reads with one C to T mismatch in the first 5 bases. Further details on other parameters can be seen in the [HOPS documentation](https://github.com/rhuebler/HOPS/#maltextract-parameters). Options: `'def_anc'`, `'ancient'`, `'default'`, `'crawl'`, `'scan'`, `'srna'`, 'assignment'. Default: `'def_anc'`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `-f`", - "enum": ["def_anc", "default", "ancient", "scan", "crawl", "srna"] - }, - "maltextract_toppercent": { - "type": "number", - "default": 0.01, - "description": "Specify percent of top alignments to use.", - "fa_icon": "fas fa-percent", - "help_text": "Specify frequency of top alignments for each read to be considered for each node.\nDefault is 0.01, i.e. 1% of all reads (where 1 would correspond to 100%).\n\n> :warning: this parameter follows the same concept as `--malt_top_percent` but\n> uses a different notation i.e. integer (MALT) versus float (MALTExtract)\n\nDefault: `0.01`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `-a`" - }, - "maltextract_destackingoff": { - "type": "boolean", - "description": "Turn off destacking.", - "fa_icon": "fas fa-align-center", - "help_text": "Turn off destacking. If left on, a read that overlaps with another read will be\nremoved (leaving a depth coverage of 1).\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--destackingOff`" - }, - "maltextract_downsamplingoff": { - "type": "boolean", - "description": "Turn off downsampling.", - "fa_icon": "fab fa-creative-commons-sampling", - "help_text": "Turn off downsampling. By default, downsampling is on and will randomly select 10,000 reads if the number of reads on a node exceeds this number. 
This is to speed up processing, under the assumption at 10,000 reads the species is a 'true positive'.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--downSampOff`" - }, - "maltextract_duplicateremovaloff": { - "type": "boolean", - "description": "Turn off duplicate removal.", - "fa_icon": "fas fa-align-left", - "help_text": "\nTurn off duplicate removal. By default, reads that are an exact copy (i.e. same start, stop coordinate and exact sequence match) will be removed as it is considered a PCR duplicate.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--dupRemOff`" - }, - "maltextract_matches": { - "type": "boolean", - "description": "Turn on exporting alignments of hits in BLAST format.", - "fa_icon": "fas fa-equals", - "help_text": "\nExport alignments of hits for each node in BLAST format. By default turned off.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--matches`" - }, - "maltextract_megansummary": { - "type": "boolean", - "description": "Turn on export of MEGAN summary files.", - "fa_icon": "fas fa-download", - "help_text": "Export 'minimal' summary files (i.e. without alignments) that can be loaded into [MEGAN6](https://doi.org/10.1371/journal.pcbi.1004957). By default turned off.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--meganSummary`" - }, - "maltextract_percentidentity": { - "type": "number", - "description": "Minimum percent identity alignments are required to have to be reported. Recommended to set same as MALT parameter.", - "default": 85, - "fa_icon": "fas fa-id-card", - "help_text": "Minimum percent identity alignments are required to have to be reported. Higher values allows fewer mismatches between read and reference sequence, but therefore will provide greater confidence in the hit. 
Lower values allow more mismatches, which can account for damage and divergence of a related strain/species to the reference. Recommended to set same as MALT parameter or higher. Default: `85`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--minPI`" - }, - "maltextract_topalignment": { - "type": "boolean", - "description": "Turn on using top alignments per read after filtering.", - "fa_icon": "fas fa-star-half-alt", - "help_text": "Use the best alignment of each read for every statistic, except for those concerning read distribution and coverage. Default: off.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--useTopAlignment`" - } - }, - "fa_icon": "fas fa-tasks", - "help_text": "Turn on MaltExtract for MALT aDNA characteristics authentication of metagenomic\noutput from MALT.\n\nMore can be seen in the [MaltExtract\ndocumentation](https://github.com/rhuebler/)\n\nOnly when `--metagenomic_tool malt` is also supplied" - } - }, - "allOf": [ - { - "$ref": "#/definitions/input_output_options" - }, - { - "$ref": "#/definitions/input_data_additional_options" - }, - { - "$ref": "#/definitions/reference_genome_options" - }, - { - "$ref": "#/definitions/output_options" - }, - { - "$ref": "#/definitions/generic_options" - }, - { - "$ref": "#/definitions/max_job_request_options" - }, - { - "$ref": "#/definitions/institutional_config_options" - }, - { - "$ref": "#/definitions/skip_steps" - }, - { - "$ref": "#/definitions/complexity_filtering" - }, - { - "$ref": "#/definitions/read_merging_and_adapter_removal" - }, - { - "$ref": "#/definitions/mapping" - }, - { - "$ref": "#/definitions/host_removal" - }, - { - "$ref": "#/definitions/bam_filtering" - }, - { - "$ref": "#/definitions/deduplication" - }, - { - "$ref": "#/definitions/library_complexity_analysis" - }, - { - "$ref": "#/definitions/adna_damage_analysis" - }, - { - "$ref": "#/definitions/feature_annotation_statistics" - }, - 
{ - "$ref": "#/definitions/bam_trimming" - }, - { - "$ref": "#/definitions/genotyping" - }, - { - "$ref": "#/definitions/consensus_sequence_generation" - }, - { - "$ref": "#/definitions/snp_table_generation" - }, - { - "$ref": "#/definitions/mitochondrial_to_nuclear_ratio" - }, - { - "$ref": "#/definitions/human_sex_determination" - }, - { - "$ref": "#/definitions/nuclear_contamination_for_human_dna" - }, - { - "$ref": "#/definitions/metagenomic_screening" - }, - { - "$ref": "#/definitions/metagenomic_authentication" - } - ] -} + ] +} \ No newline at end of file From 3d878c6e4317915784b448409b451f0295975048 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 18 Aug 2023 11:17:30 +0200 Subject: [PATCH 05/30] Add mapdamage as alternative damage estimator --- main.nf | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/main.nf b/main.nf index 4ac17d697..b8564ea1a 100644 --- a/main.nf +++ b/main.nf @@ -186,7 +186,7 @@ if("${params.fasta}".endsWith(".gz")){ path zipped_fasta from file(params.fasta) // path doesn't like it if a string of an object is not prefaced with a root dir (/), so use file() to resolve string before parsing to `path` output: - path "$unzip" into ch_fasta into ch_fasta_for_bwaindex,ch_fasta_for_bt2index,ch_fasta_for_faidx,ch_fasta_for_seqdict,ch_fasta_for_circulargenerator,ch_fasta_for_circularmapper,ch_fasta_for_damageprofiler,ch_fasta_for_qualimap,ch_unmasked_fasta_for_masking,ch_unmasked_fasta_for_pmdtools,ch_fasta_for_genotyping_ug,ch_fasta_for_genotyping_hc,ch_fasta_for_genotyping_freebayes,ch_fasta_for_genotyping_pileupcaller,ch_fasta_for_vcf2genome,ch_fasta_for_multivcfanalyzer,ch_fasta_for_genotyping_angsd,ch_fasta_for_damagerescaling,ch_fasta_for_bcftools_stats + path "$unzip" into ch_fasta into ch_fasta_for_bwaindex,ch_fasta_for_bt2index,ch_fasta_for_faidx,ch_fasta_for_seqdict,ch_fasta_for_circulargenerator,ch_fasta_for_circularmapper,ch_fasta_for_damageprofiler, 
ch_fasta_for_mapdamage ,ch_fasta_for_qualimap,ch_unmasked_fasta_for_masking,ch_unmasked_fasta_for_pmdtools,ch_fasta_for_genotyping_ug,ch_fasta_for_genotyping_hc,ch_fasta_for_genotyping_freebayes,ch_fasta_for_genotyping_pileupcaller,ch_fasta_for_vcf2genome,ch_fasta_for_multivcfanalyzer,ch_fasta_for_genotyping_angsd,ch_fasta_for_damagerescaling,ch_fasta_for_bcftools_stats script: unzip = zipped_fasta.toString() - '.gz' @@ -197,7 +197,7 @@ if("${params.fasta}".endsWith(".gz")){ } else { fasta_for_indexing = Channel .fromPath("${params.fasta}", checkIfExists: true) - .into{ ch_fasta_for_bwaindex; ch_fasta_for_bt2index; ch_fasta_for_faidx; ch_fasta_for_seqdict; ch_fasta_for_circulargenerator; ch_fasta_for_circularmapper; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_unmasked_fasta_for_masking; ch_unmasked_fasta_for_pmdtools; ch_fasta_for_genotyping_ug; ch_fasta__for_genotyping_hc; ch_fasta_for_genotyping_hc; ch_fasta_for_genotyping_freebayes; ch_fasta_for_genotyping_pileupcaller; ch_fasta_for_vcf2genome; ch_fasta_for_multivcfanalyzer; ch_fasta_for_genotyping_angsd; ch_fasta_for_damagerescaling; ch_fasta_for_bcftools_stats } + .into{ ch_fasta_for_bwaindex; ch_fasta_for_bt2index; ch_fasta_for_faidx; ch_fasta_for_seqdict; ch_fasta_for_circulargenerator; ch_fasta_for_circularmapper; ch_fasta_for_damageprofiler; ch_fasta_for_mapdamage; ch_fasta_for_qualimap; ch_unmasked_fasta_for_masking; ch_unmasked_fasta_for_pmdtools; ch_fasta_for_genotyping_ug; ch_fasta__for_genotyping_hc; ch_fasta_for_genotyping_hc; ch_fasta_for_genotyping_freebayes; ch_fasta_for_genotyping_pileupcaller; ch_fasta_for_vcf2genome; ch_fasta_for_multivcfanalyzer; ch_fasta_for_genotyping_angsd; ch_fasta_for_damagerescaling; ch_fasta_for_bcftools_stats } } // Check that fasta index file path ends in '.fai' @@ -1903,10 +1903,10 @@ process markduplicates{ // form of library merging. 
if ( params.skip_deduplication ) { ch_skiprmdup_for_libeval.mix(ch_dedup_for_libeval, ch_markdup_for_libeval) - .into{ ch_rmdup_for_preseq; ch_rmdup_for_damageprofiler; ch_for_nuclear_contamination; ch_rmdup_formtnucratio } + .into{ ch_rmdup_for_preseq; ch_rmdup_for_damageprofiler; ch_rmdup_for_mapdamage; ch_for_nuclear_contamination; ch_rmdup_formtnucratio } } else { ch_dedup_for_libeval.mix(ch_markdup_for_libeval) - .into{ ch_rmdup_for_preseq; ch_rmdup_for_damageprofiler; ch_for_nuclear_contamination; ch_rmdup_formtnucratio } + .into{ ch_rmdup_for_preseq; ch_rmdup_for_damageprofiler; ch_rmdup_for_mapdamage; ch_for_nuclear_contamination; ch_rmdup_formtnucratio } } // Merge independent libraries sequenced but with same treatment (often done to @@ -2077,7 +2077,7 @@ process bedtools { /* -- ANCIENT DNA EVALUATION AND BAM MODIFICATION -- */ ////////////////////////////////////////////////////////////// -// Calculate typical aDNA damage frequency distribution +// Calculate typical aDNA damage frequency distribution with DamageProfiler process damageprofiler { label 'sc_small' @@ -2086,7 +2086,7 @@ process damageprofiler { publishDir "${params.outdir}/damageprofiler", mode: params.publish_dir_mode when: - !params.skip_damage_calculation + !params.skip_damage_calculation && params.damage_estimation_tool == 'damageprofiler' input: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(bam), path(bai) from ch_rmdup_for_damageprofiler @@ -2106,6 +2106,33 @@ process damageprofiler { """ } +// Calculate typical aDNA damage frequency distribution with mapDamage + +process mapdamage_estimation { + label 'sc_small' + tag "${libraryid}" + + publishDir "${params.outdir}/mapdamage", mode: params.publish_dir_mode + + when: + !params.skip_damage_calculation && params.damage_estimation_tool == 'mapdamage' + + input: + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(bam), path(bai) from ch_rmdup_for_mapdamage + file fasta from 
ch_fasta_for_mapdamage.collect() + + output: + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("results_*") into ch_output_from_mapdamage + + script: + def base = "${bam.baseName}" + def singlestranded = strandedness == "single" ? '--single-stranded' : '' + def downsample = params.mapdamage_downsample != 0 ? "-n ${params.mapdamage_downsample} --downsample-seed=1" : '' // Include seed to make results consistent between runs + """ + mapDamage -i ${bam} -r ${fasta} ${singlestranded} ${downsample} --ymax=${params.mapdamage_yaxis} --no-stats + """ +} + // Damage rescaling with mapDamage process mapdamage_rescaling { From 72854eebb928bc5b91627026a3bb50981812a706 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 18 Aug 2023 11:21:39 +0200 Subject: [PATCH 06/30] rename parameter --- main.nf | 4 ++-- nextflow.config | 2 +- nextflow_schema.json | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/main.nf b/main.nf index b8564ea1a..e2ecf5acd 100644 --- a/main.nf +++ b/main.nf @@ -2086,7 +2086,7 @@ process damageprofiler { publishDir "${params.outdir}/damageprofiler", mode: params.publish_dir_mode when: - !params.skip_damage_calculation && params.damage_estimation_tool == 'damageprofiler' + !params.skip_damage_calculation && params.damage_calculation_tool == 'damageprofiler' input: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(bam), path(bai) from ch_rmdup_for_damageprofiler @@ -2115,7 +2115,7 @@ process mapdamage_estimation { publishDir "${params.outdir}/mapdamage", mode: params.publish_dir_mode when: - !params.skip_damage_calculation && params.damage_estimation_tool == 'mapdamage' + !params.skip_damage_calculation && params.damage_calculation_tool == 'mapdamage' input: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(bam), path(bai) from ch_rmdup_for_mapdamage diff --git a/nextflow.config b/nextflow.config index b202c68c3..6874a22a0 100644 --- 
a/nextflow.config +++ b/nextflow.config @@ -121,7 +121,7 @@ params { preseq_terms = 100 //Damage estimation settings - damage_estimation_tool = 'damageprofiler' + damage_calculation_tool = 'damageprofiler' damageprofiler_length = 100 damageprofiler_threshold = 15 damageprofiler_yaxis = 0.30 diff --git a/nextflow_schema.json b/nextflow_schema.json index af8779f65..6f1b32a74 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -848,12 +848,12 @@ "description": "Options for calculating and filtering for characteristic ancient DNA damage patterns.", "default": "", "properties": { - "damage_estimation_tool": { + "damage_calculation_tool": { "type": "string", "default": "damageprofiler", - "description": "Specify the tool to use for damage estimation.", + "description": "Specify the tool to use for damage calculation.", "fa_icon": "fas fa-tools", - "help_text": "Specify the tool to be used for damage estimation. Options: `damageprofiler`, `mapdamage`. By default, DamageProfiler is used.", + "help_text": "Specify the tool to be used for damage calculation. Options: `damageprofiler`, `mapdamage`. By default, DamageProfiler is used.", "enum": [ "damageprofiler", "mapdamage" @@ -883,9 +883,9 @@ "mapdamage_downsample": { "type": "integer", "default": 10000, - "description": "Specify the maximum number of reads to consider for damage estimation.", + "description": "Specify the maximum number of reads to consider for damage calculation.", "fa_icon": "fas fa-greater-than-equal", - "help_text": "The maximum number of reads used for damage estimation in mapDamage2. Can be used to significantly reduce the amount of time required for damage assessment. Note that a too low value can also obtain incorrect results.\n\n> Modifies mapDamage2 parameter: `-n`" + "help_text": "The maximum number of reads used for damage calculation in mapDamage2. Can be used to significantly reduce the amount of time required for damage assessment. 
Note that a too low value can also obtain incorrect results.\n\n> Modifies mapDamage2 parameter: `-n`" }, "mapdamage_yaxis": { "type": "number", From 09ff881bca56da4ce3ed0a7b9c425d8cc57a7b79 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 18 Aug 2023 11:34:33 +0200 Subject: [PATCH 07/30] Do not subsample by default in mapdamage --- nextflow_schema.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 6f1b32a74..042f69074 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -882,8 +882,8 @@ }, "mapdamage_downsample": { "type": "integer", - "default": 10000, - "description": "Specify the maximum number of reads to consider for damage calculation.", + "default": 0, + "description": "Specify the maximum number of reads to consider for damage calculation. Default value is `0` (i.e. no downsampling is performed).", "fa_icon": "fas fa-greater-than-equal", "help_text": "The maximum number of reads used for damage calculation in mapDamage2. Can be used to significantly reduce the amount of time required for damage assessment.
Note that a too low value can also obtain incorrect results.\n\n> Modifies mapDamage2 parameter: `-n`" }, From eba08a2ab323b6bc1d13c138276bc13c1c5f8712 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 18 Aug 2023 11:39:47 +0200 Subject: [PATCH 08/30] Disable downsampling by default --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 6874a22a0..be549865d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -125,7 +125,7 @@ params { damageprofiler_length = 100 damageprofiler_threshold = 15 damageprofiler_yaxis = 0.30 - mapdamage_downsample = 10000 + mapdamage_downsample = 0 mapdamage_yaxis = 0.30 //PMDTools settings From d794b8e240251f5951114bff45ce82558899a35a Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 18 Aug 2023 12:00:03 +0200 Subject: [PATCH 09/30] Add tests --- .github/workflows/ci.yml | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 363c4fea4..2dd7fcbcb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,7 +23,7 @@ jobs: strategy: matrix: # Nextflow versions: check pipeline minimum and current latest - nxf_ver: ['20.07.1', '22.10.6'] + nxf_ver: ["20.07.1", "22.10.6"] steps: - name: Check out pipeline code uses: actions/checkout@v2 @@ -58,7 +58,7 @@ jobs: run: | git clone --single-branch --branch eager https://github.com/nf-core/test-datasets.git data - name: DELAY to try address some odd behaviour with what appears to be a conflict between parallel htslib jobs leading to CI hangs - run: | + run: | if [[ $NXF_VER = '' ]]; then sleep 1200; fi - name: BASIC Run the basic pipeline with directly supplied single-end FASTQ run: | @@ -74,7 +74,7 @@ jobs: nextflow run ${GITHUB_WORKSPACE} -profile test,docker --save_reference - name: REFERENCE Basic workflow, with supplied indices run: | - nextflow run ${GITHUB_WORKSPACE} -profile 
test,docker --bwa_index 'results/reference_genome/bwa_index/BWAIndex/' --fasta_index 'https://github.com/nf-core/test-datasets/blob/eager/reference/Mammoth/Mammoth_MT_Krause.fasta.fai' + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --bwa_index 'results/reference_genome/bwa_index/BWAIndex/' --fasta_index 'https://github.com/nf-core/test-datasets/blob/eager/reference/Mammoth/Mammoth_MT_Krause.fasta.fai' - name: REFERENCE Run the basic pipeline with FastA reference with `fna` extension run: | nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_fna,docker @@ -107,7 +107,7 @@ jobs: nextflow run ${GITHUB_WORKSPACE} -profile test,docker --clip_adapters_list 'https://github.com/nf-core/test-datasets/raw/eager/databases/adapters/adapter-list.txt' - name: ADAPTER LIST Run the basic pipeline using an adapter list, skipping adapter removal run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --clip_adapters_list 'https://github.com/nf-core/test-datasets/raw/eager/databases/adapters/adapter-list.txt' --skip_adapterremoval + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --clip_adapters_list 'https://github.com/nf-core/test-datasets/raw/eager/databases/adapters/adapter-list.txt' --skip_adapterremoval - name: POST_AR_FASTQ_TRIMMING Run the basic pipeline post-adapterremoval FASTQ trimming run: | nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_post_ar_trimming @@ -141,6 +141,9 @@ jobs: - name: BEDTOOLS Test bedtools feature annotation run: | nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_bedtools_coverage --anno_file 'https://github.com/nf-core/test-datasets/raw/eager/reference/Mammoth/Mammoth_MT_Krause.gff3' + - name: MAPDAMAGE2 damage calculation + run: | + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --damage_calculation_tool 'mapdamage' - name: GENOTYPING_HC Test running GATK HaplotypeCaller run: | nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_fna,docker --run_genotyping --genotyping_tool 'hc' --gatk_hc_out_mode 
'EMIT_ALL_ACTIVE_SITES' --gatk_hc_emitrefconf 'BP_RESOLUTION' @@ -193,11 +196,11 @@ jobs: nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_bam_filtering --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt/" --metagenomic_complexity_filter - name: MALTEXTRACT Download resource files run: | - mkdir -p databases/maltextract - for i in ncbi.tre ncbi.map; do wget https://github.com/rhuebler/HOPS/raw/0.33/Resources/"$i" -P databases/maltextract/; done + mkdir -p databases/maltextract + for i in ncbi.tre ncbi.map; do wget https://github.com/rhuebler/HOPS/raw/0.33/Resources/"$i" -P databases/maltextract/; done - name: MALTEXTRACT Basic with MALT plus MaltExtract run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_bam_filtering --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt" --run_maltextract --maltextract_ncbifiles "/home/runner/work/eager/eager/databases/maltextract/" --maltextract_taxon_list 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/maltextract/MaltExtract_list.txt' + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_bam_filtering --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt" --run_maltextract --maltextract_ncbifiles "/home/runner/work/eager/eager/databases/maltextract/" --maltextract_taxon_list 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/maltextract/MaltExtract_list.txt' - name: METAGENOMIC Run the basic pipeline but with unmapped reads going into Kraken run: | nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_kraken,docker --run_bam_filtering --bam_unmapped_type 'fastq' From 6aedb96dde76c1cc5d8ea4cda2df7bb7a515ff05 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 18 Aug 
2023 12:00:29 +0200 Subject: [PATCH 10/30] Update CHANGELOG.md --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b7e90c6e..1fe4fc026 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### `Added` +- [#1020](https://github.com/nf-core/eager/issues/1020) Added mapdamage2 as an alternative for damage calculation. + ### `Fixed` - [#1017](https://github.com/nf-core/eager/issues/1017) Fixed file name collision in niche cases with multiple libraries of multiple UDG treatments. From 1cae42d8253d800cca79a5cda9f8c32a4e7fc67b Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 18 Aug 2023 12:00:52 +0200 Subject: [PATCH 11/30] Partial update to output.md --- docs/output.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/output.md b/docs/output.md index 9b6950779..d6561b88f 100644 --- a/docs/output.md +++ b/docs/output.md @@ -494,11 +494,11 @@ Plateauing can be caused by a number of reasons: * You have an over-amplified library with many PCR duplicates. You should consider rebuilding the library to maximise data to cost ratio * You have a low quality library made up of mappable sequencing artefacts that were able to pass filtering (e.g. adapters) -### DamageProfiler +### Damage Calculation #### Background -DamageProfiler is a tool which calculates a variety of standard 'aDNA' metrics from a BAM file. The primary plots here are the misincorporation and length distribution plots. Ancient DNA undergoes depurination and hydrolysis, causing fragmentation of molecules into gradually shorter fragments, and cytosine to thymine deamination damage, that occur on the subsequent single-stranded overhangs at the ends of molecules. +DamageProfiler and mapDamage2 are tools which calculate a variety of standard 'aDNA' metrics from a BAM file. 
The primary plots here are the misincorporation and length distribution plots. Ancient DNA undergoes depurination and hydrolysis, causing fragmentation of molecules into gradually shorter fragments, and cytosine to thymine deamination damage, that occur on the subsequent single-stranded overhangs at the ends of molecules. Therefore, three main characteristics of ancient DNA are: @@ -509,7 +509,7 @@ Therefore, three main characteristics of ancient DNA are: You will receive output for each deduplicated _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. #### Misincorporation Plots - + The MultiQC DamageProfiler module misincorporation plots shows the percent frequency (Y axis) of C to T mismatches at 5' read ends and complementary G to A mismatches at the 3' ends. The X axis represents base pairs from the end of the molecule from the given prime end, going into the middle of the molecule i.e. 1st base of molecule, 2nd base of molecule etc until the 14th base pair. The mismatches are when compared to the base of the reference genome at that position. When looking at the misincorporation plots, keep the following in mind: @@ -525,7 +525,7 @@ When looking at the misincorporation plots, keep the following in mind:

-> **NB:** An important difference to note compared to the MapDamage tool, which DamageProfiler is an exact-reimplementation of, is that the percent frequency on the Y axis is not fixed between 0 and 0.3, and will 'zoom' into small values the less damage there is +> **NB:** An important difference to note compared to the MapDamage2 tool, which DamageProfiler is an exact-reimplementation of, is that the percent frequency on the Y axis is not fixed between 0 and 0.3, and will 'zoom' into small values the less damage there is #### Length Distribution @@ -686,6 +686,7 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir * `preseq/`: this contains a `.preseq` file for every BAM file that had enough deduplication statistics to generate a complexity curve for estimating the amount unique reads that will be yield if the library is re-sequenced. You can use this file for plotting e.g. in `R` to find your sequencing target depth. * `qualimap/`: this contains a sub-directory for every sample, which includes a qualimap report and associated raw statistic files. You can open the `.html` file in your internet browser to see the in-depth report (this will be more detailed than in MultiQC). This includes stuff like percent coverage, depth coverage, GC content and so on of your mapped reads. * `damageprofiler/`: this contains sample specific directories containing raw statistics and damage plots from DamageProfiler. The `.pdf` files can be used to visualise C to T miscoding lesions or read length distributions of your mapped reads. All raw statistics used for the PDF plots are contained in the `.txt` files. +* `mapdamage/`: this contains sample specific directories containing raw statistics and damage plots from mapDamage2. The `.pdf` files can be used to visualise C to T miscoding lesions or read length distributions of your mapped reads. All raw statistics used for the PDF plots are contained in the `.txt` files. 
The `Runtime_log.txt` file contains runtime information. * `pmdtools/`: this contains raw output statistics of pmdtools (estimates of frequencies of substitutions), and BAM files which have been filtered to remove reads that do not have a Post-mortem damage (PMD) score of `--pmdtools_threshold`. * `trimmed_bam/`: this contains the BAM files with X number of bases trimmed off as defined with the `--bamutils_clip_half_udg_left`, `--bamutils_clip_half_udg_right`, `--bamutils_clip_none_udg_left`, and `--bamutils_clip_none_udg_right` flags and corresponding index files. You can use these BAM files for downstream analysis such as re-mapping data with more stringent parameters (if you set trimming to remove the most likely places containing damage in the read). * `damage_rescaling/`: this contains rescaled BAM files from mapDamage2. These BAM files have damage probabilistically removed via a bayesian model, and can be used for downstream genotyping. From 7dc233413f941b59726ba487b541157d88cd0fb8 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 22 Aug 2023 14:57:43 +0200 Subject: [PATCH 12/30] pass mapdamage output to MQC --- main.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main.nf b/main.nf index e2ecf5acd..9e91e4fdb 100644 --- a/main.nf +++ b/main.nf @@ -2123,6 +2123,7 @@ process mapdamage_estimation { output: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("results_*") into ch_output_from_mapdamage + path ("results_*") into ch_mapdamage_for_multiqc script: def base = "${bam.baseName}" @@ -3257,6 +3258,7 @@ process multiqc { file ('flagstat_filtered/*') from ch_bam_filtered_flagstat_for_multiqc.collect().ifEmpty([]) file ('preseq/*') from ch_preseq_for_multiqc.collect().ifEmpty([]) file ('damageprofiler/dmgprof*/*') from ch_damageprofiler_results.collect().ifEmpty([]) + path ('mapdamage/*') from ch_mapdamage_for_multiqc.collect().ifEmpty([]).dump(tag:'ch_mapdamage_for_multiqc') file ('qualimap/qualimap*/*') from 
ch_qualimap_results.collect().ifEmpty([]) file ('markdup/*') from ch_markdup_results_for_multiqc.collect().ifEmpty([]) file ('dedup*/*') from ch_dedup_results_for_multiqc.collect().ifEmpty([]) From b133d8becdf1cf2e185d4f0c0cf54e0089db687f Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 22 Aug 2023 15:10:39 +0200 Subject: [PATCH 13/30] Update output.md --- docs/output.md | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/docs/output.md b/docs/output.md index d6561b88f..09f5d07d9 100644 --- a/docs/output.md +++ b/docs/output.md @@ -80,7 +80,7 @@ The possible columns displayed by default are as follows (note you may see addit * **Endogenous DNA Post-Filter (%)** This is from the endorS.py tool. It displays a percentage of mapped reads _after_ BAM filtering (i.e. for mapping quality and/or bam-level length filtering) over total reads that went into mapped (i.e. the percentage DNA content of the library that matches the reference). This column will only be displayed if BAM filtering is turned on and is based on the original mapping for total reads, and mapped reads as calculated from the post-filtering BAM. * **ClusterFactor** This is from **DeDup only**. This is a value representing how many duplicates in the library exist for each unique read. This ratio is calculated as `reads_before_deduplication / reads_after_deduplication`. Can be converted to %Dups by calculating `1 - (1 / CF)`. A cluster factor close to one indicates a highly complex library and could be sequenced further. Generally with a value of more than 2 you will not be gaining much more information by sequencing deeper. * **% Dup. Mapped Reads** This is from **Picard's markDuplicates only**. It represents the percentage of reads in your library that were exact duplicates of other reads in your library. The lower the better, as high duplication rate means lots of sequencing of the same information (and therefore is not time or cost effective). 
-* **X Prime Y>Z N base** These columns are from DamageProfiler. The prime numbers represent which end of the reads the damage is referring to. The Y>Z is the type of substitution (C>T is the true damage, G>A is the complementary). You should see for no- and half-UDG treatment a decrease in frequency from the 1st to 2nd base. +* **X Prime Y>Z N base** These columns are from DamageProfiler or mapDamage. The prime numbers represent which end of the reads the damage is referring to. The Y>Z is the type of substitution (C>T is the true damage, G>A is the complementary). You should see for no- and half-UDG treatment a decrease in frequency from the 1st to 2nd base. * **Mean Length Mapped Reads** This is from DamageProfiler. This is the mean length of all de-duplicated mapped reads. Ancient DNA normally will have a mean between 30-75, however this can vary. * **Median Length Mapped Reads** This is from DamageProfiler. This is the median length of all de-duplicated mapped reads. Ancient DNA normally will have a mean between 30-75, however this can vary. * **Nr. Dedup. Mapped Reads** This is from Qualimap. This is the total number of _deduplicated_ reads that mapped to your reference genome. This is the **best** number to report for final mapped reads in final publications. @@ -498,7 +498,7 @@ Plateauing can be caused by a number of reasons: #### Background -DamageProfiler and mapDamage2 are tools which calculate a variety of standard 'aDNA' metrics from a BAM file. The primary plots here are the misincorporation and length distribution plots. Ancient DNA undergoes depurination and hydrolysis, causing fragmentation of molecules into gradually shorter fragments, and cytosine to thymine deamination damage, that occur on the subsequent single-stranded overhangs at the ends of molecules. +DamageProfiler and mapDamage are tools which calculate a variety of standard 'aDNA' metrics from a BAM file. The primary plots here are the misincorporation and length distribution plots. 
Ancient DNA undergoes depurination and hydrolysis, causing fragmentation of molecules into gradually shorter fragments, and cytosine to thymine deamination damage, that occur on the subsequent single-stranded overhangs at the ends of molecules. Therefore, three main characteristics of ancient DNA are: @@ -509,8 +509,8 @@ Therefore, three main characteristics of ancient DNA are: You will receive output for each deduplicated _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. #### Misincorporation Plots - -The MultiQC DamageProfiler module misincorporation plots shows the percent frequency (Y axis) of C to T mismatches at 5' read ends and complementary G to A mismatches at the 3' ends. The X axis represents base pairs from the end of the molecule from the given prime end, going into the middle of the molecule i.e. 1st base of molecule, 2nd base of molecule etc until the 14th base pair. The mismatches are when compared to the base of the reference genome at that position. + +The MultiQC DamageProfiler and mapDamage module misincorporation plots shows the percent frequency (Y axis) of C to T mismatches at 5' read ends and complementary G to A mismatches at the 3' ends. The X axis represents base pairs from the end of the molecule from the given prime end, going into the middle of the molecule i.e. 1st base of molecule, 2nd base of molecule etc until the 14th base pair. The mismatches are when compared to the base of the reference genome at that position. When looking at the misincorporation plots, keep the following in mind: @@ -520,6 +520,7 @@ When looking at the misincorporation plots, keep the following in mind: * If your library is **single-stranded**, you will expect to see only C to T misincorporations at both 5' and 3' ends of the fragments. 
* We generally expect that the older the sample, or the less-ideal preservational environment (hot/wet) the greater the frequency of C to T/G to A. * The curve will be not smooth then you have few reads informing the frequency calculation. Read counts of less than 500 are likely not reliable. +* If the `mapdamage_downsample` parameter was specified and mapDamage was used for damage calculation, the damage frequency for each base is based only on the specified number of reads.

@@ -529,13 +530,14 @@ When looking at the misincorporation plots, keep the following in mind: #### Length Distribution -The MultiQC DamageProfiler module length distribution plots show the frequency of read lengths across forward and reverse reads respectively. +The MultiQC DamageProfiler and mapDamage module length distribution plots show the frequency of read lengths across forward and reverse reads respectively. When looking at the length distribution plots, keep in mind the following: * Your curves will likely not start at 0, and will start wherever your minimum read-length setting was when removing adapters. * You should typically see the bulk of the distribution falling between 40-120bp, which is normal for aDNA * You may see large peaks at paired-end turn-arounds, due to very-long reads that could not overlap for merging being present, however this reads are normally from modern contamination. +* If the `mapdamage_downsample` parameter was specified and mapDamage was used for damage calculation, the length distribution is based only on the specified number of reads. ### QualiMap @@ -686,10 +688,10 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir * `preseq/`: this contains a `.preseq` file for every BAM file that had enough deduplication statistics to generate a complexity curve for estimating the amount unique reads that will be yield if the library is re-sequenced. You can use this file for plotting e.g. in `R` to find your sequencing target depth. * `qualimap/`: this contains a sub-directory for every sample, which includes a qualimap report and associated raw statistic files. You can open the `.html` file in your internet browser to see the in-depth report (this will be more detailed than in MultiQC). This includes stuff like percent coverage, depth coverage, GC content and so on of your mapped reads. 
* `damageprofiler/`: this contains sample specific directories containing raw statistics and damage plots from DamageProfiler. The `.pdf` files can be used to visualise C to T miscoding lesions or read length distributions of your mapped reads. All raw statistics used for the PDF plots are contained in the `.txt` files. -* `mapdamage/`: this contains sample specific directories containing raw statistics and damage plots from mapDamage2. The `.pdf` files can be used to visualise C to T miscoding lesions or read length distributions of your mapped reads. All raw statistics used for the PDF plots are contained in the `.txt` files. The `Runtime_log.txt` file contains runtime information. +* `mapdamage/`: this contains sample specific directories containing raw statistics and damage plots from mapDamage. The `.pdf` files can be used to visualise C to T miscoding lesions or read length distributions of your mapped reads. All raw statistics used for the PDF plots are contained in the `.txt` files. The `Runtime_log.txt` file contains runtime information. * `pmdtools/`: this contains raw output statistics of pmdtools (estimates of frequencies of substitutions), and BAM files which have been filtered to remove reads that do not have a Post-mortem damage (PMD) score of `--pmdtools_threshold`. * `trimmed_bam/`: this contains the BAM files with X number of bases trimmed off as defined with the `--bamutils_clip_half_udg_left`, `--bamutils_clip_half_udg_right`, `--bamutils_clip_none_udg_left`, and `--bamutils_clip_none_udg_right` flags and corresponding index files. You can use these BAM files for downstream analysis such as re-mapping data with more stringent parameters (if you set trimming to remove the most likely places containing damage in the read). -* `damage_rescaling/`: this contains rescaled BAM files from mapDamage2. These BAM files have damage probabilistically removed via a bayesian model, and can be used for downstream genotyping. 
+* `damage_rescaling/`: this contains rescaled BAM files from mapDamage. These BAM files have damage probabilistically removed via a bayesian model, and can be used for downstream genotyping. * `genotyping/`: this contains all the (gzipped) genotyping files produced by your genotyping module. The file suffix will have the genotyping tool name. You will have files corresponding to each of your deduplicated BAM files (except pileupcaller), or any turned-on downstream processes that create BAMs (e.g. trimmed bams or pmd tools). If `--gatk_ug_keep_realign_bam` supplied, this may also contain BAM files from InDel realignment when using GATK 3 and UnifiedGenotyping for variant calling. When pileupcaller is used to create eigenstrat genotypes, this directory also contains eigenstrat SNP coverage statistics. * `multivcfanalyzer/`: this contains all output from MultiVCFAnalyzer, including SNP calling statistics, various SNP table(s) and FASTA alignment files. * `sex_determination/`: this contains the output for the sex determination run. This is a single `.tsv` file that includes a table with the sample name, the number of autosomal SNPs, number of SNPs on the X/Y chromosome, the number of reads mapping to the autosomes, the number of reads mapping to the X/Y chromosome, the relative coverage on the X/Y chromosomes, and the standard error associated with the relative coverages. These measures are provided for each bam file, one row per file. If the `sexdeterrmine_bedfile` option has not been provided, the error bars cannot be trusted, and runtime will be considerably longer. From 1d03083ea8778a8745e22b7bfb6d896bbb8015bb Mon Sep 17 00:00:00 2001 From: "Thiseas C. Lamnidis" Date: Wed, 23 Aug 2023 15:18:14 +0200 Subject: [PATCH 14/30] Apply suggestions from code review Co-authored-by: James A. 
Fellows Yates --- docs/output.md | 2 +- main.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/output.md b/docs/output.md index 09f5d07d9..21230e3e6 100644 --- a/docs/output.md +++ b/docs/output.md @@ -498,7 +498,7 @@ Plateauing can be caused by a number of reasons: #### Background -DamageProfiler and mapDamage are tools which calculate a variety of standard 'aDNA' metrics from a BAM file. The primary plots here are the misincorporation and length distribution plots. Ancient DNA undergoes depurination and hydrolysis, causing fragmentation of molecules into gradually shorter fragments, and cytosine to thymine deamination damage, that occur on the subsequent single-stranded overhangs at the ends of molecules. +DamageProfiler and mapDamage are tools that calculate a variety of standard 'aDNA' metrics from a BAM file. The primary plots here are the misincorporation and length distribution plots. Ancient DNA undergoes depurination and hydrolysis, causing fragmentation of molecules into gradually shorter fragments, and cytosine to thymine deamination damage, that occur on the subsequent single-stranded overhangs at the ends of molecules. 
Therefore, three main characteristics of ancient DNA are: diff --git a/main.nf b/main.nf index 9e91e4fdb..1b238cdaf 100644 --- a/main.nf +++ b/main.nf @@ -2108,7 +2108,7 @@ process damageprofiler { // Calculate typical aDNA damage frequency distribution with mapDamage -process mapdamage_estimation { +process mapdamage_calculation { label 'sc_small' tag "${libraryid}" From 8190c7540d3bab7690abbd3dfca27baf5b50facb Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Wed, 23 Aug 2023 15:30:54 +0200 Subject: [PATCH 15/30] Standardise formatting of mapDamage across --- README.md | 4 ++-- docs/output.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5e41133ba..7797a58b8 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,7 @@ By default the pipeline currently performs the following: * Sequencing adapter removal, paired-end data merging (`AdapterRemoval`) * Read mapping to reference using (`bwa aln`, `bwa mem`, `CircularMapper`, or `bowtie2`) * Post-mapping processing, statistics and conversion to bam (`samtools`) -* Ancient DNA C-to-T damage pattern visualisation (`DamageProfiler`) +* Ancient DNA C-to-T damage pattern visualisation (`DamageProfiler` or `mapDamage`) * PCR duplicate removal (`DeDup` or `MarkDuplicates`) * Post-mapping statistics and BAM quality control (`Qualimap`) * Library Complexity Estimation (`preseq`) @@ -246,7 +246,7 @@ In addition, references of tools and data used in this pipeline are as follows: * **Bowtie2** Langmead, B. and Salzberg, S. L. 2012 Fast gapped-read alignment with Bowtie 2. Nature methods, 9(4), p. 357–359. doi: [10.1038/nmeth.1923](https:/dx.doi.org/10.1038/nmeth.1923). * **sequenceTools** Stephan Schiffels (Unpublished). Download: [https://github.com/stschiff/sequenceTools](https://github.com/stschiff/sequenceTools) * **EigenstratDatabaseTools** Thiseas C. Lamnidis (Unpublished). 
Download: [https://github.com/TCLamnidis/EigenStratDatabaseTools.git](https://github.com/TCLamnidis/EigenStratDatabaseTools.git) -* **mapDamage2** Jónsson, H., et al 2013. mapDamage2.0: fast approximate Bayesian estimates of ancient DNA damage parameters. Bioinformatics , 29(13), 1682–1684. [https://doi.org/10.1093/bioinformatics/btt193](https://doi.org/10.1093/bioinformatics/btt193) +* **mapDamage** Jónsson, H., et al 2013. mapDamage2.0: fast approximate Bayesian estimates of ancient DNA damage parameters. Bioinformatics , 29(13), 1682–1684. [https://doi.org/10.1093/bioinformatics/btt193](https://doi.org/10.1093/bioinformatics/btt193) * **BBduk** Brian Bushnell (Unpublished). Download: [https://sourceforge.net/projects/bbmap/](sourceforge.net/projects/bbmap/) ## Data References diff --git a/docs/output.md b/docs/output.md index 21230e3e6..107df5c38 100644 --- a/docs/output.md +++ b/docs/output.md @@ -526,7 +526,7 @@ When looking at the misincorporation plots, keep the following in mind:

-> **NB:** An important difference to note compared to the MapDamage2 tool, which DamageProfiler is an exact-reimplementation of, is that the percent frequency on the Y axis is not fixed between 0 and 0.3, and will 'zoom' into small values the less damage there is +> **NB:** An important difference to note compared to the mapDamage tool, which DamageProfiler is an exact-reimplementation of, is that the percent frequency on the Y axis is not fixed between 0 and 0.3, and will 'zoom' into small values the less damage there is #### Length Distribution From 42d9dca6d79b61c650acc6ea15e6562d6e14363f Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Wed, 30 Aug 2023 16:02:45 +0200 Subject: [PATCH 16/30] document beeswarm plot fix --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1fe4fc026..3d3e052b3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,10 +12,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### `Fixed` - [#1017](https://github.com/nf-core/eager/issues/1017) Fixed file name collision in niche cases with multiple libraries of multiple UDG treatments. +- [#1024](https://github.com/nf-core/eager/issues/1024) `multiqc_general_stats.txt` is now generated even if the table is a beeswarm plot in the report. 
### `Dependencies` -- `multiqc`: 1.14 -> 1.15 +- `multiqc`: 1.14 -> 1.15 ### `Deprecated` From 3a84510c863a768931204a3729cfeba9064fad61 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 13 Oct 2023 10:43:16 +0200 Subject: [PATCH 17/30] bump MultiQC version --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 00cc32e88..a1813bebb 100644 --- a/environment.yml +++ b/environment.yml @@ -26,7 +26,7 @@ dependencies: - bioconda::qualimap=2.2.2d - bioconda::vcf2genome=0.91 - bioconda::damageprofiler=0.4.9 # Don't upgrade - later versions don't allow java 8 - - bioconda::multiqc=1.15 + - bioconda::multiqc=1.16 - bioconda::pmdtools=0.60 - bioconda::bedtools=2.30.0 - conda-forge::libiconv=1.16 From 83163bbb1362483f0b97336a5c0b44cf661b4503 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 13 Oct 2023 10:53:42 +0200 Subject: [PATCH 18/30] mention NXF version cap. fix some broken URLs --- README.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 7797a58b8..633fda086 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,9 @@ [![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23eager-4A154B?logo=slack)](https://nfcore.slack.com/channels/eager) +>[!IMPORTANT] +> nf-core/eager versions 2.* are only compatible with Nextflow versions up to 22.10.6! + ## Introduction @@ -28,7 +31,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool ## Quick Start -1. Install [`nextflow`](https://nf-co.re/usage/installation) (`>=20.07.1`) +1. Install [`nextflow`](https://nf-co.re/usage/installation) (`>=20.07.1` && `<=22.10.6`) 2. 
Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/), [`Podman`](https://podman.io/), [`Shifter`](https://nersc.gitlab.io/development/shifter/how-to-use/) or [`Charliecloud`](https://hpc.github.io/charliecloud/) for full pipeline reproducibility _(please only use [`Conda`](https://conda.io/miniconda.html) as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_ @@ -52,7 +55,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool nextflow clean -f -k ``` -See [usage docs](https://nf-co.re/eager/docs/usage.md) for all of the available options when running the pipeline. +See [usage docs](https://nf-co.re/eager/usage) for all of the available options when running the pipeline. **N.B.** You can see an overview of the run in the MultiQC report located at `./results/MultiQC/multiqc_report.html` @@ -133,9 +136,9 @@ The nf-core/eager pipeline comes with documentation about the pipeline: [usage]( * [Pipeline installation](https://nf-co.re/usage/local_installation) * [Adding your own system config](https://nf-co.re/usage/adding_own_config) * [Reference genomes](https://nf-co.re/usage/reference_genomes) -3. [Running the pipeline](https://nf-co.re/eager/docs/usage.md) +3. [Running the pipeline](https://nf-co.re/eager/usage) * This includes tutorials, FAQs, and troubleshooting instructions -4. [Output and how to interpret the results](https://nf-co.re/eager/docs/output.md) +4. 
[Output and how to interpret the results](https://nf-co.re/eager/output) ## Credits From 23b74e96765443567095d24e344db33aff7d26a6 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 13 Oct 2023 11:54:37 +0200 Subject: [PATCH 19/30] attempt to add mapdamage to multiqc --- assets/multiqc_config.yaml | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index 1f8b66e03..820bd04a9 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -17,6 +17,7 @@ run_modules: - gatk - kraken - malt + - mapdamage - mtnucratio - multivcfanalyzer - picard @@ -91,6 +92,7 @@ top_modules: path_filters: - "*.preseq" - "damageprofiler" + - "mapdamage" - "mtnucratio" - "qualimap" - "sexdeterrmine" @@ -155,6 +157,11 @@ table_columns_visible: 3 Prime2: False mean_readlength: True median: True + mapDamage: + 5 Prime1: True + 5 Prime2: True + 3 Prime1: False + 3 Prime2: False mtnucratio: mt_nuc_ratio: True QualiMap: @@ -240,10 +247,15 @@ table_columns_placement: 3 Prime2: 730 mean_readlength: 740 median: 750 + mapDamage: + 5 Prime1: 760 + 5 Prime2: 765 + 3 Prime1: 770 + 3 Prime2: 775 mtnucratio: - mtreads: 760 - mt_cov_avg: 770 - mt_nuc_ratio: 780 + mtreads: 780 + mt_cov_avg: 785 + mt_nuc_ratio: 790 QualiMap: mapped_reads: 800 mean_coverage: 805 From 322fdcd857aaa06cf8ca7247db3614a717c85e91 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 27 Oct 2023 10:54:17 +0200 Subject: [PATCH 20/30] tweak mqc mapdamage linking --- main.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index 1b238cdaf..2d597795f 100644 --- a/main.nf +++ b/main.nf @@ -2122,11 +2122,11 @@ process mapdamage_calculation { file fasta from ch_fasta_for_mapdamage.collect() output: - tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("results_*") into ch_output_from_mapdamage - path ("results_*") into 
ch_mapdamage_for_multiqc + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("results_${base}") into ch_output_from_mapdamage + path ("results_${base}") into ch_mapdamage_for_multiqc script: - def base = "${bam.baseName}" + base = "${bam.baseName}" def singlestranded = strandedness == "single" ? '--single-stranded' : '' def downsample = params.mapdamage_downsample != 0 ? "-n ${params.mapdamage_downsample} --downsample-seed=1" : '' // Include seed to make results consistent between runs """ @@ -3258,7 +3258,7 @@ process multiqc { file ('flagstat_filtered/*') from ch_bam_filtered_flagstat_for_multiqc.collect().ifEmpty([]) file ('preseq/*') from ch_preseq_for_multiqc.collect().ifEmpty([]) file ('damageprofiler/dmgprof*/*') from ch_damageprofiler_results.collect().ifEmpty([]) - path ('mapdamage/*') from ch_mapdamage_for_multiqc.collect().ifEmpty([]).dump(tag:'ch_mapdamage_for_multiqc') + file ('mapdamage/*') from ch_mapdamage_for_multiqc.collect().ifEmpty([]).dump(tag:'ch_mapdamage_for_multiqc') file ('qualimap/qualimap*/*') from ch_qualimap_results.collect().ifEmpty([]) file ('markdup/*') from ch_markdup_results_for_multiqc.collect().ifEmpty([]) file ('dedup*/*') from ch_dedup_results_for_multiqc.collect().ifEmpty([]) From dabdee2a9082215d54582b008859232e035487c7 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 27 Oct 2023 10:54:27 +0200 Subject: [PATCH 21/30] correct mqc version --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d3e052b3..594f1a04b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,7 +16,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
### `Dependencies` -- `multiqc`: 1.14 -> 1.15 +- `multiqc`: 1.14 -> 1.16 ### `Deprecated` From a61e83a651f7b54b8ee2aecaa91a7bac6b2fabd8 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 27 Oct 2023 11:14:16 +0200 Subject: [PATCH 22/30] Update RG tags --- CHANGELOG.md | 1 + main.nf | 16 ++++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 594f1a04b..b4e1f1641 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - [#1017](https://github.com/nf-core/eager/issues/1017) Fixed file name collision in niche cases with multiple libraries of multiple UDG treatments. - [#1024](https://github.com/nf-core/eager/issues/1024) `multiqc_general_stats.txt` is now generated even if the table is a beeswarm plot in the report. +- Updated RG tags for all mappers. RG-id now includes Sample as well as Library ID. Added `LB:` tag with the library ID. 
### `Dependencies` diff --git a/main.nf b/main.nf index 2d597795f..306a2a1c2 100644 --- a/main.nf +++ b/main.nf @@ -1276,14 +1276,14 @@ process bwa { """ bwa aln -t ${task.cpus} $fasta ${r1} -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -o ${params.bwaalno} -f ${libraryid}.r1.sai bwa aln -t ${task.cpus} $fasta ${r2} -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -o ${params.bwaalno} -f ${libraryid}.r2.sai - bwa sampe -r "@RG\\tID:ILLUMINA-${libraryid}\\tSM:${samplename}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" $fasta ${libraryid}.r1.sai ${libraryid}.r2.sai ${r1} ${r2} | samtools sort -@ ${task.cpus - 1} -O bam - > ${libraryid}_"${seqtype}".mapped.bam + bwa sampe -r "@RG\\tID:ILLUMINA-${samplename}_${libraryid}\\tSM:${samplename}\\tLB:${libraryid}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" $fasta ${libraryid}.r1.sai ${libraryid}.r2.sai ${r1} ${r2} | samtools sort -@ ${task.cpus - 1} -O bam - > ${libraryid}_"${seqtype}".mapped.bam samtools index "${libraryid}"_"${seqtype}".mapped.bam ${size} """ } else { //PE collapsed, or SE data """ bwa aln -t ${task.cpus} ${fasta} ${r1} -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -o ${params.bwaalno} -f ${libraryid}.sai - bwa samse -r "@RG\\tID:ILLUMINA-${libraryid}\\tSM:${samplename}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" $fasta ${libraryid}.sai $r1 | samtools sort -@ ${task.cpus - 1} -O bam - > "${libraryid}"_"${seqtype}".mapped.bam + bwa samse -r "@RG\\tID:ILLUMINA-${samplename}_${libraryid}\\tSM:${samplename}\\tLB:${libraryid}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" $fasta ${libraryid}.sai $r1 | samtools sort -@ ${task.cpus - 1} -O bam - > "${libraryid}"_"${seqtype}".mapped.bam samtools index "${libraryid}"_"${seqtype}".mapped.bam ${size} """ } @@ -1314,12 +1314,12 @@ process bwamem { if (!params.single_end && params.skip_collapse){ """ - bwa mem -t ${split_cpus} $fasta $r1 $r2 -R 
"@RG\\tID:ILLUMINA-${libraryid}\\tSM:${samplename}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" | samtools sort -@ ${split_cpus} -O bam - > "${libraryid}"_"${seqtype}".mapped.bam + bwa mem -t ${split_cpus} $fasta $r1 $r2 -R ""@RG\\tID:ILLUMINA-${samplename}_${libraryid}\\tSM:${samplename}\\tLB:${libraryid}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" | samtools sort -@ ${split_cpus} -O bam - > "${libraryid}"_"${seqtype}".mapped.bam samtools index ${size} -@ ${task.cpus} "${libraryid}"_"${seqtype}".mapped.bam """ } else { """ - bwa mem -t ${split_cpus} $fasta $r1 -R "@RG\\tID:ILLUMINA-${libraryid}\\tSM:${samplename}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" | samtools sort -@ ${split_cpus} -O bam - > "${libraryid}"_"${seqtype}".mapped.bam + bwa mem -t ${split_cpus} $fasta $r1 -R ""@RG\\tID:ILLUMINA-${samplename}_${libraryid}\\tSM:${samplename}\\tLB:${libraryid}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" | samtools sort -@ ${split_cpus} -O bam - > "${libraryid}"_"${seqtype}".mapped.bam samtools index -@ ${task.cpus} "${libraryid}"_"${seqtype}".mapped.bam ${size} """ } @@ -1382,7 +1382,7 @@ process circularmapper{ """ bwa aln -t ${task.cpus} $elongated_root $r1 -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${libraryid}.r1.sai bwa aln -t ${task.cpus} $elongated_root $r2 -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${libraryid}.r2.sai - bwa sampe -r "@RG\\tID:ILLUMINA-${libraryid}\\tSM:${samplename}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" $elongated_root ${libraryid}.r1.sai ${libraryid}.r2.sai $r1 $r2 > tmp.out + bwa sampe -r "@RG\\tID:ILLUMINA-${samplename}_${libraryid}\\tSM:${samplename}\\tLB:${libraryid}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" $elongated_root ${libraryid}.r1.sai ${libraryid}.r2.sai $r1 $r2 > tmp.out realignsamfile -Xmx${task.memory.toGiga()}g -e ${params.circularextension} -i tmp.out -r $fasta $filter samtools sort -@ ${task.cpus} -O bam 
tmp_realigned.bam > ${libraryid}_"${seqtype}".mapped.bam samtools index "${libraryid}"_"${seqtype}".mapped.bam ${size} @@ -1390,7 +1390,7 @@ process circularmapper{ } else { """ bwa aln -t ${task.cpus} $elongated_root $r1 -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${libraryid}.sai - bwa samse -r "@RG\\tID:ILLUMINA-${libraryid}\\tSM:${samplename}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" $elongated_root ${libraryid}.sai $r1 > tmp.out + bwa samse -r "@RG\\tID:ILLUMINA-${samplename}_${libraryid}\\tSM:${samplename}\\tLB:${libraryid}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" $elongated_root ${libraryid}.sai $r1 > tmp.out realignsamfile -Xmx${task.memory.toGiga()}g -e ${params.circularextension} -i tmp.out -r $fasta $filter samtools sort -@ ${task.cpus} -O bam tmp_realigned.bam > "${libraryid}"_"${seqtype}".mapped.bam samtools index "${libraryid}"_"${seqtype}".mapped.bam ${size} @@ -1460,13 +1460,13 @@ process bowtie2 { //PE data without merging, PE data without any AR applied if ( seqtype == 'PE' && ( params.skip_collapse || params.skip_adapterremoval ) ){ """ - bowtie2 -x ${fasta} -1 ${r1} -2 ${r2} -p ${split_cpus} ${sensitivity} ${bt2n} ${bt2l} ${trim5} ${trim3} --maxins ${params.bt2_maxins} --rg-id ILLUMINA-${libraryid} --rg SM:${samplename} --rg PL:illumina --rg PU:ILLUMINA-${libraryid}-${seqtype} 2> "${libraryid}"_bt2.log | samtools sort -@ ${split_cpus} -O bam > "${libraryid}"_"${seqtype}".mapped.bam + bowtie2 -x ${fasta} -1 ${r1} -2 ${r2} -p ${split_cpus} ${sensitivity} ${bt2n} ${bt2l} ${trim5} ${trim3} --maxins ${params.bt2_maxins} --rg-id ILLUMINA-${samplename}_${libraryid} --rg SM:${samplename} --rg LB:${libraryid} --rg PL:illumina --rg PU:ILLUMINA-${libraryid}-${seqtype} 2> "${libraryid}"_bt2.log | samtools sort -@ ${split_cpus} -O bam > "${libraryid}"_"${seqtype}".mapped.bam samtools index "${libraryid}"_"${seqtype}".mapped.bam ${size} """ } else { //PE collapsed, or SE data """ - bowtie2 -x ${fasta} -U ${r1} -p 
${split_cpus} ${sensitivity} ${bt2n} ${bt2l} ${trim5} ${trim3} --rg-id ILLUMINA-${libraryid} --rg SM:${samplename} --rg PL:illumina --rg PU:ILLUMINA-${libraryid}-${seqtype} 2> "${libraryid}"_bt2.log | samtools sort -@ ${split_cpus} -O bam > "${libraryid}"_"${seqtype}".mapped.bam + bowtie2 -x ${fasta} -U ${r1} -p ${split_cpus} ${sensitivity} ${bt2n} ${bt2l} ${trim5} ${trim3} --rg-id ILLUMINA-${samplename}_${libraryid} --rg SM:${samplename} --rg LB:${libraryid} --rg PL:illumina --rg PU:ILLUMINA-${libraryid}-${seqtype} 2> "${libraryid}"_bt2.log | samtools sort -@ ${split_cpus} -O bam > "${libraryid}"_"${seqtype}".mapped.bam samtools index "${libraryid}"_"${seqtype}".mapped.bam ${size} """ } From e87bad6503d23e7350d41a915e91b2604a2717a1 Mon Sep 17 00:00:00 2001 From: "Thiseas C. Lamnidis" Date: Fri, 27 Oct 2023 11:17:57 +0200 Subject: [PATCH 23/30] Update docs/output.md Co-authored-by: James A. Fellows Yates --- docs/output.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index 107df5c38..9ad6cb9ba 100644 --- a/docs/output.md +++ b/docs/output.md @@ -526,7 +526,7 @@ When looking at the misincorporation plots, keep the following in mind:

-> **NB:** An important difference to note compared to the mapDamage tool, which DamageProfiler is an exact-reimplementation of, is that the percent frequency on the Y axis is not fixed between 0 and 0.3, and will 'zoom' into small values the less damage there is +> **NB:** An important difference to note compared to the mapDamage tool, which DamageProfiler is otherwise an exact-re-implementation of, is that the percent frequency on the Y axis is not fixed between 0 and 0.3, and will 'zoom' into small values the less damage there is #### Length Distribution From 92d8377d836a70b321b60c760d75b55703e926d3 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Wed, 1 Nov 2023 12:19:49 +0100 Subject: [PATCH 24/30] always index fasta if no fasta_index --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 306a2a1c2..185eff2b0 100644 --- a/main.nf +++ b/main.nf @@ -561,7 +561,7 @@ process makeFastaIndex { else null } - when: !params.fasta_index && params.fasta && ( params.mapper == 'bwaaln' || params.mapper == 'bwamem' || params.mapper == 'circularmapper') + when: !params.fasta_index && params.fasta input: path fasta from ch_fasta_for_faidx From f0bd5beac1846b34232779b6d7ca5160e9f52585 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Wed, 1 Nov 2023 12:28:46 +0100 Subject: [PATCH 25/30] Update Changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b4e1f1641..d73868edc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - [#1017](https://github.com/nf-core/eager/issues/1017) Fixed file name collision in niche cases with multiple libraries of multiple UDG treatments. - [#1024](https://github.com/nf-core/eager/issues/1024) `multiqc_general_stats.txt` is now generated even if the table is a beeswarm plot in the report. - Updated RG tags for all mappers. 
RG-id now includes Sample as well as Library ID. Added `LB:` tag with the library ID. +- [#1031](https://github.com/nf-core/eager/issues/1031) Always index fasta regardless of mapper. This ensures that DamageProfiler and genotyping processes get submitted when using bowtie2 and not providing a fasta index. ### `Dependencies` From 754d5d86b23e274cf2c1a93309b55cfd1b1deb34 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Wed, 1 Nov 2023 12:50:55 +0100 Subject: [PATCH 26/30] add date to changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d73868edc..ebb41f875 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). -## [2.5.0] - 2023-XX-XX +## [2.5.0] - 2023-11-01 ### `Added` From b9c15ef0c27a46e6642401af29ed941b77e7b1bc Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Wed, 1 Nov 2023 16:22:44 +0100 Subject: [PATCH 27/30] small bugfix --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 185eff2b0..11df2b80c 100644 --- a/main.nf +++ b/main.nf @@ -1314,12 +1314,12 @@ process bwamem { if (!params.single_end && params.skip_collapse){ """ - bwa mem -t ${split_cpus} $fasta $r1 $r2 -R ""@RG\\tID:ILLUMINA-${samplename}_${libraryid}\\tSM:${samplename}\\tLB:${libraryid}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" | samtools sort -@ ${split_cpus} -O bam - > "${libraryid}"_"${seqtype}".mapped.bam + bwa mem -t ${split_cpus} $fasta $r1 $r2 -R "@RG\\tID:ILLUMINA-${samplename}_${libraryid}\\tSM:${samplename}\\tLB:${libraryid}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" | samtools sort -@ ${split_cpus} -O bam - > "${libraryid}"_"${seqtype}".mapped.bam samtools index ${size} -@ ${task.cpus} "${libraryid}"_"${seqtype}".mapped.bam """ } else { """ - 
bwa mem -t ${split_cpus} $fasta $r1 -R ""@RG\\tID:ILLUMINA-${samplename}_${libraryid}\\tSM:${samplename}\\tLB:${libraryid}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" | samtools sort -@ ${split_cpus} -O bam - > "${libraryid}"_"${seqtype}".mapped.bam + bwa mem -t ${split_cpus} $fasta $r1 -R "@RG\\tID:ILLUMINA-${samplename}_${libraryid}\\tSM:${samplename}\\tLB:${libraryid}\\tPL:illumina\\tPU:ILLUMINA-${libraryid}-${seqtype}" | samtools sort -@ ${split_cpus} -O bam - > "${libraryid}"_"${seqtype}".mapped.bam samtools index -@ ${task.cpus} "${libraryid}"_"${seqtype}".mapped.bam ${size} """ } From 37d1482abb890bb82986575ffc2bc614b82602d0 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Wed, 1 Nov 2023 18:40:39 +0100 Subject: [PATCH 28/30] bump versions --- .github/workflows/ci.yml | 4 ++-- Dockerfile | 4 ++-- environment.yml | 4 ++-- nextflow.config | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8c6a0fff3..015c3e972 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -37,13 +37,13 @@ jobs: - name: Build new docker image if: env.MATCHED_FILES - run: docker build --no-cache . -t nfcore/eager:dev + run: docker build --no-cache . 
-t nfcore/eager:2.5.0 - name: Pull docker image if: ${{ !env.MATCHED_FILES }} run: | docker pull nfcore/eager:dev - docker tag nfcore/eager:dev nfcore/eager:dev + docker tag nfcore/eager:dev nfcore/eager:2.5.0 - name: Install Nextflow env: diff --git a/Dockerfile b/Dockerfile index 302afc92e..7ca5bccca 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,7 +7,7 @@ COPY environment.yml / RUN conda env create --quiet -f /environment.yml && conda clean -a # Add conda installation dir to PATH (instead of doing 'conda activate') -ENV PATH /opt/conda/envs/nf-core-eager-2.4.8dev/bin:$PATH +ENV PATH /opt/conda/envs/nf-core-eager-2.5.0/bin:$PATH # Dump the details of the installed packages to a file for posterity -RUN conda env export --name nf-core-eager-2.4.8dev > nf-core-eager-2.4.8dev.yml \ No newline at end of file +RUN conda env export --name nf-core-eager-2.5.0 > nf-core-eager-2.5.0.yml \ No newline at end of file diff --git a/environment.yml b/environment.yml index 2da2056b3..c349a8cd5 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,6 @@ # You can use this file to create a conda environment for this pipeline: # conda env create -f environment.yml -name: nf-core-eager-2.4.8dev +name: nf-core-eager-2.5.0 channels: - conda-forge - bioconda @@ -50,4 +50,4 @@ dependencies: - bioconda::eigenstratdatabasetools=1.0.2 - bioconda::mapdamage2=2.2.1 - bioconda::bbmap=38.92 - - bioconda::bcftools=1.12 + - bioconda::bcftools=1.12 \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index f2f55e5c5..b53b80b76 100644 --- a/nextflow.config +++ b/nextflow.config @@ -288,7 +288,7 @@ params { // Container slug. Stable releases should specify release tag! 
// Developmental code should specify :dev -process.container = 'nfcore/eager:dev' +process.container = 'nfcore/eager:2.5.0' // Load base.config by default for all pipelines includeConfig 'conf/base.config' @@ -418,7 +418,7 @@ manifest { description = 'A fully reproducible and state-of-the-art ancient DNA analysis pipeline' mainScript = 'main.nf' nextflowVersion = '>=20.07.1' - version = '2.4.8dev' + version = '2.5.0' } // Function to ensure that resource requirements don't go beyond From 5f2bffd49fe119eb54c171374f6fb3a026fd68e5 Mon Sep 17 00:00:00 2001 From: "Thiseas C. Lamnidis" Date: Fri, 3 Nov 2023 10:37:38 +0100 Subject: [PATCH 29/30] Apply suggestions from code review Co-authored-by: James A. Fellows Yates --- CHANGELOG.md | 6 +++--- main.nf | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ebb41f875..f57a119ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,17 +3,17 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). -## [2.5.0] - 2023-11-01 +## [2.5.0] - Bopfingen - 2023-11-03 ### `Added` -- [#1020](https://github.com/nf-core/eager/issues/1020) Added mapdamage2 as an alternative for damage calculation. +- [#1020](https://github.com/nf-core/eager/issues/1020) Added mapDamage2 as an alternative for damage calculation. ### `Fixed` - [#1017](https://github.com/nf-core/eager/issues/1017) Fixed file name collision in niche cases with multiple libraries of multiple UDG treatments. - [#1024](https://github.com/nf-core/eager/issues/1024) `multiqc_general_stats.txt` is now generated even if the table is a beeswarm plot in the report. -- Updated RG tags for all mappers. RG-id now includes Sample as well as Library ID. Added `LB:` tag with the library ID. +- [#655](https://github.com/nf-core/eager/issues/655) Updated RG tags for all mappers. RG-id now includes Sample as well as Library ID. 
Added `LB:` tag with the library ID. - [#1031](https://github.com/nf-core/eager/issues/1031) Always index fasta regardless of mapper. This ensures that DamageProfiler and genotyping processes get submitted when using bowtie2 and not providing a fasta index. ### `Dependencies` diff --git a/main.nf b/main.nf index 11df2b80c..b7bc3432c 100644 --- a/main.nf +++ b/main.nf @@ -3258,7 +3258,7 @@ process multiqc { file ('flagstat_filtered/*') from ch_bam_filtered_flagstat_for_multiqc.collect().ifEmpty([]) file ('preseq/*') from ch_preseq_for_multiqc.collect().ifEmpty([]) file ('damageprofiler/dmgprof*/*') from ch_damageprofiler_results.collect().ifEmpty([]) - file ('mapdamage/*') from ch_mapdamage_for_multiqc.collect().ifEmpty([]).dump(tag:'ch_mapdamage_for_multiqc') + file ('mapdamage/*') from ch_mapdamage_for_multiqc.collect().ifEmpty([]) file ('qualimap/qualimap*/*') from ch_qualimap_results.collect().ifEmpty([]) file ('markdup/*') from ch_markdup_results_for_multiqc.collect().ifEmpty([]) file ('dedup*/*') from ch_dedup_results_for_multiqc.collect().ifEmpty([]) From f3cde496b7b7d7565cfab696efc8aa596852d4c3 Mon Sep 17 00:00:00 2001 From: "Thiseas C. Lamnidis" Date: Fri, 3 Nov 2023 15:00:45 +0100 Subject: [PATCH 30/30] Update nextflow_schema.json --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 042f69074..1c11403ab 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -853,7 +853,7 @@ "default": "damageprofiler", "description": "Specify the tool to use for damage calculation.", "fa_icon": "fas fa-tools", - "help_text": "Specify the tool to be used for damage calculation. Options: `damageprofiler`, `mapdamage`. By default, DamageProfiler is used.", + "help_text": "Specify the tool to be used for damage calculation. DamageProfiler is generally faster than mapDamage2, but the latter has an option to limit the number of reads used. 
This can significantly speed up the processing of very large files, where the damage estimates are already accurate after processing only a fraction of the input. Options: `damageprofiler`, `mapdamage`. By default, DamageProfiler is used.", "enum": [ "damageprofiler", "mapdamage"