Skip to content

Commit

Permalink
Merge pull request #762 from nf-core/fix-database-inputs
Browse files Browse the repository at this point in the history
Fix parameter input validation for file/directory based parameters
  • Loading branch information
jfy133 authored Feb 6, 2025
2 parents aed22fb + 7c32cf2 commit a822bdd
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 25 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#748](https://github.com/nf-core/mag/pull/748) - Fix broken phix reference channel when skipping phix removal (reported by @amizeranschi, fix by @muabnezor)
- [#752](https://github.com/nf-core/mag/pull/752) - Fix QUAST results not being displayed when skipping certain steps (reported by @amizeranschi, fix by @jfy133)
- [#753](https://github.com/nf-core/mag/pull/753) - Fix iGenomes reference support for host removal reference genome (reported by @Thomieh73, fix by @jfy133)
- [#759](https://github.com/nf-core/mag/pull/758) - Fixed parameters that allow both files or directories to not error with directories, and general file input validation improvements (reported by @mjfi2sb3, fix by @jfy133)

### `Deprecated`

Expand Down
79 changes: 54 additions & 25 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"input": {
"type": "string",
"mimetype": "text/csv",
"format": "file-path-pattern",
"format": "file-path",
"exists": true,
"schema": "assets/schema_input.json",
"pattern": "^\\S+\\.csv$",
Expand All @@ -32,7 +32,7 @@
"assembly_input": {
"type": "string",
"mimetype": "text/csv",
"format": "file-path-pattern",
"format": "file-path",
"exists": true,
"schema": "assets/schema_assembly_input.json",
"pattern": "^\\S+\\.csv$",
Expand Down Expand Up @@ -324,12 +324,16 @@
"host_fasta": {
"type": "string",
"description": "Fasta reference file for host contamination removal.",
"help_text": "This parameter is mutually exclusive with `--host_genome`. The reference can be masked. Host read removal is done with Bowtie2."
"help_text": "This parameter is mutually exclusive with `--host_genome`. The reference can be masked. Host read removal is done with Bowtie2.",
"format": "file-path",
"exists": true
},
"host_fasta_bowtie2index": {
"type": "string",
"description": "Bowtie2 index directory corresponding to `--host_fasta` reference file for host contamination removal.",
"help_text": "This parameter must be used in combination with `--host_fasta`, and should be a directory containing files from the output of `bowtie2-build`, i.e. files ending in `.bt2`"
"help_text": "This parameter must be used in combination with `--host_fasta`, and should be a directory containing files from the output of `bowtie2-build`, i.e. files ending in `.bt2`",
"format": "directory-path",
"exists": true
},
"host_removal_verysensitive": {
"type": "boolean",
Expand All @@ -351,7 +355,9 @@
"type": "string",
"default": "${baseDir}/assets/data/GCA_002596845.1_ASM259684v1_genomic.fna.gz",
"description": "Genome reference used to remove Illumina PhiX contaminant reads.",
"hidden": true
"hidden": true,
"format": "file-path",
"exists": true
},
"skip_clipping": {
"type": "boolean",
Expand Down Expand Up @@ -419,7 +425,9 @@
"type": "string",
"default": "${baseDir}/assets/data/GCA_000840245.1_ViralProj14204_genomic.fna.gz",
"hidden": true,
"description": "Genome reference used to remove ONT Lambda contaminant reads."
"description": "Genome reference used to remove ONT Lambda contaminant reads.",
"format": "file-path",
"exists": true
},
"save_lambdaremoved_reads": {
"type": "boolean",
Expand Down Expand Up @@ -455,21 +463,24 @@
"properties": {
"centrifuge_db": {
"type": "string",
"format": "file-path",
"format": "path",
"exists": true,
"description": "Database for taxonomic binning with centrifuge.",
"help_text": "Local directory containing `*.cf` files, or a URL or local path to a downloaded compressed tar archive of a Centrifuge database. E.g. ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz."
},
"kraken2_db": {
"type": "string",
"format": "file-path",
"format": "path",
"exists": true,
"description": "Database for taxonomic binning with kraken2.",
"help_text": "Path to a local directory, archive file, or a URL to compressed tar archive that contains at least the three files `hash.k2d`, `opts.k2d` and `taxo.k2d`. E.g. ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/minikraken_8GB_202003.tgz."
},
"krona_db": {
"type": "string",
"description": "Database for taxonomic binning with krona",
"help_text": "Path to `taxonomy.tab` file for Krona, instead of downloading the default file. Point at the `.tab` file."
"help_text": "Path to `taxonomy.tab` file for Krona, instead of downloading the default file. Point at the `.tab` file.",
"format": "file-path",
"exists": true
},
"skip_krona": {
"type": "boolean",
Expand All @@ -478,7 +489,9 @@
"cat_db": {
"type": "string",
"description": "Database for taxonomic classification of metagenome assembled genomes. Can be either a zipped file or a directory containing the extracted output of such.",
"help_text": "E.g. https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz. This parameter is mutually exclusive with `--cat_db_generate`. The file needs to contain a folder named `*taxonomy*` and `*database*` that hold the respective files."
"help_text": "E.g. https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz. This parameter is mutually exclusive with `--cat_db_generate`. The file needs to contain a folder named `*taxonomy*` and `*database*` that hold the respective files.",
"format": "path",
"exists": true
},
"cat_db_generate": {
"type": "boolean",
Expand All @@ -501,31 +514,35 @@
"gtdb_db": {
"type": "string",
"description": "Specify the location of a GTDBTK database. Can be either an uncompressed directory or a `.tar.gz` archive. If not specified will be downloaded for you when GTDBTK or binning QC is not skipped.",
"default": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz"
"default": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz",
"format": "path",
"exists": true
},
"gtdb_mash": {
"type": "string",
"description": "Specify the location of a GTDBTK mash database. If missing, GTDB-Tk will skip the ani_screening step"
"description": "Specify the location of a GTDBTK mash database. If missing, GTDB-Tk will skip the ani_screening step",
"format": "path",
"exists": true
},
"gtdbtk_min_completeness": {
"type": "number",
"default": 50.0,
"default": 50,
"description": "Min. bin completeness (in %) required to apply GTDB-tk classification.",
"help_text": "Completeness assessed with BUSCO analysis (100% - %Missing). Must be greater than 0 (min. 0.01) to avoid GTDB-tk errors. If too low, GTDB-tk classification results can be impaired due to not enough marker genes!",
"minimum": 0.01,
"maximum": 100
},
"gtdbtk_max_contamination": {
"type": "number",
"default": 10.0,
"default": 10,
"description": "Max. bin contamination (in %) allowed to apply GTDB-tk classification.",
"help_text": "Contamination approximated based on BUSCO analysis (%Complete and duplicated). If too high, GTDB-tk classification results can be impaired due to contamination!",
"minimum": 0,
"maximum": 100
},
"gtdbtk_min_perc_aa": {
"type": "number",
"default": 10.0,
"default": 10,
"description": "Min. fraction of AA (in %) in the MSA for bins to be kept.",
"minimum": 0,
"maximum": 100
Expand All @@ -547,11 +564,6 @@
"type": "boolean",
"description": "Speed up pplacer step of GTDB-Tk by loading to memory.",
"help_text": "Will be faster than writing to disk (default setting), however at the expense of much larger memory (RAM) requirements for GTDBTK/CLASSIFY."
},
"genomad_db": {
"type": "string",
"description": "Database for virus classification with geNomad",
"help_text": "Must be a directory containing the uncompressed contents from https://zenodo.org/doi/10.5281/zenodo.6994741 (nf-core/mag tested with v1.1)"
}
}
},
Expand Down Expand Up @@ -629,7 +641,9 @@
"metaeuk_db": {
"type": "string",
"description": "Path to either a local fasta file of protein sequences, or to a directory containing an mmseqs2-formatted database, for annotation of eukaryotic genomes.",
"help_text": "One option would be the databases from the MetaEuk publication (https://wwwuser.gwdg.de/~compbiol/metaeuk/), however it should be noted that these are focused on marine eukaryotes."
"help_text": "One option would be the databases from the MetaEuk publication (https://wwwuser.gwdg.de/~compbiol/metaeuk/), however it should be noted that these are focused on marine eukaryotes.",
"format": "file-path",
"exists": true
},
"save_mmseqs_db": {
"type": "boolean",
Expand All @@ -646,6 +660,13 @@
"type": "boolean",
"description": "Run virus identification."
},
"genomad_db": {
"type": "string",
"description": "Database for virus classification with geNomad",
"help_text": "Must be a directory containing the uncompressed contents from https://zenodo.org/doi/10.5281/zenodo.6994741 (nf-core/mag tested with v1.1)",
"format": "path",
"exists": true
},
"genomad_min_score": {
"type": "number",
"default": 0.7,
Expand Down Expand Up @@ -757,7 +778,9 @@
"busco_db": {
"type": "string",
"description": "Download URL for BUSCO lineage dataset, or path to a tar.gz archive, or local directory containing already downloaded and unpacked lineage datasets.",
"help_text": "E.g. https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz or '/path/to/buscodb' (files still need to be unpacked manually). Available databases are listed here: https://busco-data.ezlab.org/v5/data/lineages/."
"help_text": "E.g. https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz or '/path/to/buscodb' (files still need to be unpacked manually). Available databases are listed here: https://busco-data.ezlab.org/v5/data/lineages/.",
"format": "path",
"exists": true
},
"busco_auto_lineage_prok": {
"type": "boolean",
Expand All @@ -783,7 +806,9 @@
"checkm_db": {
"type": "string",
"description": "Path to local folder containing already downloaded and uncompressed CheckM database.",
"help_text": "The pipeline can also download this for you if not specified, and you can save the resulting directory into your output directory by specifying `--save_checkm_data`. You should move this directory to somewhere else on your machine (and supply back to the pipeline in future runs again with `--checkm_db`."
"help_text": "The pipeline can also download this for you if not specified, and you can save the resulting directory into your output directory by specifying `--save_checkm_data`. You should move this directory to somewhere else on your machine (and supply back to the pipeline in future runs again with `--checkm_db`).",
"format": "directory-path",
"exists": true
},
"save_checkm_data": {
"type": "boolean",
Expand All @@ -793,7 +818,9 @@
"checkm2_db": {
"type": "string",
"description": "Path to local folder containing already downloaded and uncompressed CheckM2 database (.dmnd file).",
"help_text": "The pipeline can also download this for you if not specified, and you can save the resulting directory into your output directory by specifying `--save_checkm2_data`. You should move this directory to somewhere else on your machine (and supply back to the pipeline in future runs again with `--checkm2_db`)."
"help_text": "The pipeline can also download this for you if not specified, and you can save the resulting directory into your output directory by specifying `--save_checkm2_data`. You should move this directory to somewhere else on your machine (and supply back to the pipeline in future runs again with `--checkm2_db`).",
"format": "directory-path",
"exists": true
},
"checkm2_db_version": {
"type": "integer",
Expand Down Expand Up @@ -828,7 +855,9 @@
},
"gunc_db": {
"type": "string",
"description": "Specify a path to a pre-downloaded GUNC dmnd database file"
"description": "Specify a path to a pre-downloaded GUNC dmnd database file",
"format": "file-path",
"exists": true
},
"gunc_database_type": {
"type": "string",
Expand Down

0 comments on commit a822bdd

Please sign in to comment.