diff --git a/CHANGELOG.md b/CHANGELOG.md
index ce0fc99e..df1449e1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### `Added`

 - [#454](https://github.com/nf-core/taxprofiler/pull/454) Updated to nf-core pipeline template v2.13.1 (added by @LilyAnderssonLee & @sofstam)
+- [#461](https://github.com/nf-core/taxprofiler/pull/461) Turned on 'strict' Nextflow evaluation for all pipeline runs (added by @jfy133)
+- [#461](https://github.com/nf-core/taxprofiler/pull/461) Optimised database decompression so each compressed input database is untarred only once and shared amongst runs with different parameters (added by @jfy133)
+- [#461](https://github.com/nf-core/taxprofiler/pull/461) Added a new parameter to optionally save the decompressed databases (added by @jfy133)

 ### `Fixed`
diff --git a/conf/modules.config b/conf/modules.config
index e3fb0f18..f5a5e631 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -18,6 +18,15 @@ process {
         saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
     ]

+    withName: UNTAR {
+        ext.prefix = { "${archive.simpleName}" }
+        publishDir = [
+            path: { "${params.outdir}/untar/databases" },
+            mode: params.publish_dir_mode,
+            enabled: params.save_untarred_databases
+        ]
+    }
+
     withName: FASTQC {
         ext.args   = '--quiet'
         ext.prefix = { "${meta.id}_${meta.run_accession}_raw" }
@@ -512,6 +521,14 @@ process {
         ]
     }

+    withName: KRAKENTOOLS_KREPORT2KRONA {
+        publishDir = [
+            enabled: false,
+            mode: params.publish_dir_mode,
+            pattern: '*.txt'
+        ]
+    }
+
     withName: KRONA_CLEANUP {
         ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}" }
         publishDir = [
diff --git a/docs/output.md b/docs/output.md
index cf4678c3..2cebd463 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -10,6 +10,7 @@ The directories listed below will be created in the results directory after the

 The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:

+- [UNTAR](#untar) - Optionally saved decompressed input databases
 - [FastQC](#fastqc) - Raw read QC
 - [falco](#fastqc) - Alternative to FastQC for raw read QC
 - [fastp](#fastp) - Adapter trimming for Illumina data
@@ -40,6 +41,21 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d

 ![](images/taxprofiler_tube.png)

+### untar
+
+untar is used in nf-core/taxprofiler to decompress input files ending in `.tar.gz`, primarily the input database archives.
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `untar/`
+  - `database/`
+    - `<database>`: directory containing contents of the decompressed archive
+
+</details>
+
+This directory will only be present if `--save_untarred_databases` is supplied. The contained directories can be moved to a central 'cache' location, allowing you to re-use the same decompressed databases across runs and avoid spending compute time decompressing the archives every time.
+
 ### FastQC or Falco
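The publishing behaviour documented in the docs/output.md section above comes down to the `publishDir` block added to `conf/modules.config`. Below is a minimal, self-contained sketch of that pattern (hypothetical process name `UNTAR_SKETCH` and toy file names; the pipeline itself uses the shared nf-core UNTAR module): the archive is always decompressed into the Nextflow work directory, and `enabled: params.save_untarred_databases` only toggles whether a copy is also published under `<outdir>/untar/databases`. The `--strip-components=1` flag assumes a single top-level directory inside the tarball.

```nextflow
#!/usr/bin/env nextflow

params.outdir                  = 'results'
params.save_untarred_databases = false

// Illustrative stand-in for the nf-core UNTAR module. The decompressed
// directory always exists in the work dir; publishDir's `enabled:` flag
// only controls whether a copy also lands in --outdir.
process UNTAR_SKETCH {
    publishDir "${params.outdir}/untar/databases", mode: 'copy', enabled: params.save_untarred_databases

    input:
    tuple val(meta), path(archive)

    output:
    tuple val(meta), path("${archive.simpleName}")

    script:
    """
    mkdir '${archive.simpleName}'
    tar -xzf '${archive}' -C '${archive.simpleName}' --strip-components=1
    """
}

workflow {
    UNTAR_SKETCH ( Channel.of( [ [ id: 'kraken2_db' ], file('kraken2_db.tar.gz') ] ) )
}
```

On a later run, pointing the database sheet at the published `untar/databases/<database>` directory skips decompression entirely, since only paths ending in `.tar.gz` are routed to UNTAR.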
diff --git a/nextflow.config b/nextflow.config
index 26d23d96..9cf67533 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -6,6 +6,8 @@
 ----------------------------------------------------------------------------------------
 */

+nextflow.enable.strict = true
+
 // Global default params, used in configs
 params {

@@ -30,6 +32,7 @@ params {
     email_on_fail              = null
     plaintext_email            = false
     monochrome_logs            = false
+    monochromeLogs             = false // required so that nf-validation and nextflow.enable.strict work nicely together
     hook_url                   = null
     help                       = false
     version                    = false
@@ -51,12 +54,13 @@ params {
     // Schema validation default options
     validationFailUnrecognisedParams = false
     validationLenientMode            = false
-    validationSchemaIgnoreParams     = 'genomes,igenomes_base,fasta'
+    validationSchemaIgnoreParams     = 'genomes,igenomes_base,fasta,monochromeLogs'
     validationShowHiddenParams       = false
     validate_params                  = true

     // Databases
-    databases = null
+    databases               = null
+    save_untarred_databases = false

     // FASTQ preprocessing
     skip_preprocessing_qc             = false
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 73364791..1ce1ee54 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -34,6 +34,12 @@
             "description": "Path to comma-separated file containing information about databases and profiling parameters for each taxonomic profiler",
             "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 4 columns, and a header row. See [usage docs](https://nf-co.re/taxprofiler/dev/usage#full-database-sheet).\n\nProfilers will only be executed if a corresponding database is supplied.\n\nWe recommend storing this database sheet somewhere central and accessible to other members of your lab/institution, as this file will likely be regularly reused."
         },
+        "save_untarred_databases": {
+            "type": "boolean",
+            "fa_icon": "fas fa-database",
+            "description": "Specify to save decompressed user-supplied TAR archives of databases",
+            "help_text": "If input databases are supplied as gzipped TAR archives, in some cases you may want to move and re-use these for future runs. Specifying this parameter will save the decompressed databases to the directory given to `--outdir`, under a directory called `untar`."
+        },
         "outdir": {
             "type": "string",
             "format": "directory-path",
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index cc6955cf..6e079164 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -153,21 +153,37 @@ workflow TAXPROFILER {
             skip: true
         }

     // Filter the channel to untar only those databases for tools that are selected to be run by the user.
-    ch_input_untar = ch_dbs_for_untar.untar
+    // Also, to ensure each file is only untarred once, group together all databases of one file
+    ch_inputdb_untar = ch_dbs_for_untar.untar
         .filter { db_meta, db_path -> params[ "run_${db_meta.tool}" ] }
-
-    UNTAR ( ch_input_untar )
-
-    ch_final_dbs = ch_dbs_for_untar.skip.mix( UNTAR.out.untar )
-    ch_final_dbs
-        .map { db_meta, db ->
-            def corrected_db_params = db_meta.db_params == null ? '' : db_meta.db_params
-            db_meta.db_params = corrected_db_params
-            [ db_meta, db ]
+        .groupTuple(by: 1)
+        .map {
+            meta, dbfile ->
+                def new_meta = [ 'id': dbfile.baseName ] + [ 'meta': meta ]
+                [ new_meta, dbfile ]
         }
+
+    // Untar the databases
+    UNTAR ( ch_inputdb_untar )
     ch_versions = ch_versions.mix( UNTAR.out.versions.first() )

+    // Spread out the untarred and shared databases
+    ch_outputdb_from_untar = UNTAR.out.untar
+        .map {
+            meta, db ->
+                [ meta.meta, db ]
+        }
+        .transpose(by: 0)
+
+    ch_final_dbs = ch_dbs_for_untar.skip
+        .mix( ch_outputdb_from_untar )
+        .map { db_meta, db ->
+            def corrected_db_params = db_meta.db_params ? [ db_params: db_meta.db_params ] : [ db_params: '' ]
+            [ db_meta + corrected_db_params, db ]
+        }
+
     /*
         MODULE: Run FastQC
     */
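To make the channel logic above concrete, here is a toy, runnable sketch (hypothetical metadata and file names, with the UNTAR call itself elided) of what `groupTuple(by: 1)` and `transpose(by: 0)` achieve: two database-sheet rows referencing the same archive with different `db_params` are collapsed so the archive would be untarred only once, and the shared path is then fanned back out to every original metadata map, normalising a `null` `db_params` to an empty string as in the final `.map` above.

```nextflow
workflow {
    // Two sheet rows sharing one archive, differing only in db_params (toy values)
    ch_dbs = Channel.of(
        [ [ tool: 'kraken2', db_name: 'db_quick',   db_params: '--quick' ], file('k2.tar.gz') ],
        [ [ tool: 'kraken2', db_name: 'db_default', db_params: null      ], file('k2.tar.gz') ]
    )

    ch_for_untar = ch_dbs
        .groupTuple(by: 1)                          // -> [ [meta1, meta2], k2.tar.gz ]
        .map { metas, archive ->
            [ [ id: archive.baseName, meta: metas ], archive ]   // stash original metas under 'meta'
        }

    // ...UNTAR would run here, once per archive, carrying the grouped meta through unchanged...

    ch_for_untar
        .map { grouped_meta, db -> [ grouped_meta.meta, db ] }   // recover the list of original metas
        .transpose(by: 0)                           // fan out: [meta1, db], [meta2, db]
        .map { db_meta, db ->
            [ db_meta + [ db_params: db_meta.db_params ?: '' ], db ]   // normalise null -> ''
        }
        .view()
}
```

The net effect is that UNTAR runs once per distinct archive, while every `tool`/`db_params` combination still receives its own `[ meta, db ]` tuple downstream.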