Merge pull request #307 from nf-core/standardisation-database-linking
Fix possible wrong database being assigned in post-taxprofiling steps
jfy133 authored Jul 17, 2023
2 parents 36d3156 + a4ed304 commit 6d81565
Showing 4 changed files with 71 additions and 18 deletions.
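
In short: the post-profiling steps (`CENTRIFUGE_KREPORT`, `KAIJU_KAIJU2TABLE_*`, `MOTUS_MERGE`) previously received their profiles and databases as two independent queue channels, so the pairing relied on emission order, which Nextflow does not guarantee once tasks run in parallel; with more than one database a profile could be handed the wrong database. The fix keys both channels on `db_name`, joins them explicitly with `combine`, and splits the result with `multiMap` so the paired sub-channels emit in lockstep. A minimal sketch of the pattern, with hypothetical channel contents rather than real pipeline code:

```nextflow
// Sketch of the fix pattern (hypothetical values, not pipeline code).
// The two channels arrive in no particular order:
ch_reports = Channel.of(
    [ [db_name: 'db1', tool: 'kaiju'], 'sample_db1.tsv' ],
    [ [db_name: 'db2', tool: 'kaiju'], 'sample_db2.tsv' ]
)
ch_databases = Channel.of(
    [ [db_name: 'db2', tool: 'kaiju'], 'db2/' ],
    [ [db_name: 'db1', tool: 'kaiju'], 'db1/' ]
)

// Key both on db_name, join explicitly, then split with multiMap; the
// resulting sub-channels emit in lockstep, so each report keeps its database.
ch_joined = ch_reports
    .map { meta, report -> [ meta.db_name, meta, report ] }
    .combine( ch_databases.map { meta, db -> [ meta.db_name, meta, db ] }, by: 0 )
    .multiMap { key, meta, report, db_meta, db ->
        report: [ meta, report ]
        db:     db
    }
// ch_joined.report and ch_joined.db can now be passed as paired process inputs.
```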
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -31,6 +31,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#290](https://github.com/nf-core/taxprofiler/pull/290) Minor database input documentation improvements (♥ to @alneberg for reporting, fix by @jfy133)
- [#305](https://github.com/nf-core/taxprofiler/pull/305) Fix docker/podman registry definition for tower compatibility (fix by @adamrtalbot, @jfy133)
- [#304](https://github.com/nf-core/taxprofiler/pull/304) Correct mistake in kaiju2table documentation, only single rank can be supplied (♥ to @artur-matysik for reporting, fix by @jfy133)
+ - [#307](https://github.com/nf-core/taxprofiler/pull/307) Fix databases being sometimes associated with the wrong tool (e.g. Kaiju) (fix by @jfy133)
- [#313](https://github.com/nf-core/taxprofiler/pull/313) Fix pipeline not providing error when database sheet does not have a header (♥ to @noah472 for reporting, fix by @jfy133)

### `Dependencies`
2 changes: 2 additions & 0 deletions conf/modules.config
@@ -676,6 +676,7 @@ process {
}

withName: TAXPASTA_MERGE {
+            tag = { "${meta.tool}|${meta.id}" }
ext.args = {
[
"-p ${meta.tool} -o ${meta.tool}_${meta.id}.${params.standardisation_taxpasta_format}",
@@ -693,6 +694,7 @@ }
}

withName: TAXPASTA_STANDARDISE {
+            tag = { "${meta.tool}|${meta.id}" }
ext.args = {
[
"-p ${meta.tool} -o ${meta.tool}_${meta.id}.${params.standardisation_taxpasta_format}",
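
The two `tag` additions above are small but useful: `tag` is a per-task label, and because it is given as a closure it is re-evaluated for each task from that task's `meta` map, so concurrent taxpasta jobs over different profilers and databases are distinguishable in the execution log. A minimal sketch of the same idiom, with a hypothetical process name:

```nextflow
// Sketch only (hypothetical process name): a dynamic tag closure labels each
// task as tool|id, e.g. "MY_TOOL (kraken2|db1)" in the run log.
process {
    withName: MY_TOOL {
        tag = { "${meta.tool}|${meta.id}" }
    }
}
```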
64 changes: 48 additions & 16 deletions subworkflows/local/profiling.nf
@@ -75,17 +75,17 @@ workflow PROFILING {
// as we don't run on a per-sample basis due to huge databases
// so all samples are in one run and so sample-specific metadata
// unnecessary. Set as database name to prevent `null` job ID and prefix.
-            def temp_meta = [ id: meta['db_name'] ]
+            def temp_meta = [ id: meta.db_name ]

// Extend database parameters to specify whether to save alignments or not
def new_db_meta = db_meta.clone()
def sam_format = params.malt_save_reads ? ' --alignments ./ -za false' : ""
-            new_db_meta['db_params'] = db_meta['db_params'] + sam_format
+            new_db_meta.db_params = db_meta.db_params + sam_format

// Combine reduced sample metadata with updated database parameters metadata,
// make sure id is db_name for publishing purposes.
def new_meta = temp_meta + new_db_meta
-            new_meta['id'] = new_meta['db_name']
+            new_meta.id = new_meta.db_name

[ new_meta, reads, db ]

@@ -165,8 +165,8 @@
def ch_kraken2_output = KRAKEN2_KRAKEN2.out.report
.filter {
meta, report ->
-                if ( meta['instrument_platform'] == 'OXFORD_NANOPORE' ) log.warn "[nf-core/taxprofiler] Bracken has not been evaluated for Nanopore data. Skipping Bracken for sample ${meta.id}."
-                meta['tool'] == 'bracken' && meta['instrument_platform'] != 'OXFORD_NANOPORE'
+                if ( meta.instrument_platform == 'OXFORD_NANOPORE' ) log.warn "[nf-core/taxprofiler] Bracken has not been evaluated for Nanopore data. Skipping Bracken for sample ${meta.id}."
+                meta.tool == 'bracken' && meta.instrument_platform != 'OXFORD_NANOPORE'
}

// If necessary, convert the eight column output to six column output.
@@ -176,12 +176,12 @@

// Extract the database name to combine by.
ch_bracken_databases = databases
-            .filter { meta, db -> meta['tool'] == 'bracken' }
-            .map { meta, db -> [meta['db_name'], meta, db] }
+            .filter { meta, db -> meta.tool == 'bracken' }
+            .map { meta, db -> [meta.db_name, meta, db] }

// Combine back with the reads
ch_input_for_bracken = ch_kraken2_output
-            .map { meta, report -> [meta['db_name'], meta, report] }
+            .map { meta, report -> [meta.db_name, meta, report] }
.combine(ch_bracken_databases, by: 0)
.map {

@@ -190,7 +190,7 @@

// Have to pick second element if using bracken, as first element
// contains kraken parameters
-                if ( db_meta['tool'] == 'bracken' ) {
+                if ( db_meta.tool == 'bracken' ) {

// Only take second element if one exists
def parsed_params = db_meta_new['db_params'].split(";")
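
For context on the `split(";")` above: Bracken databases in the database sheet carry both tools' parameters in a single semicolon-separated `db_params` field (Kraken2's first, Bracken's second), and the code keeps the second element only when one exists. A small sketch with hypothetical values:

```nextflow
// Hypothetical db_params value: "<kraken2 args>;<bracken args>"
def db_params      = '--quick;-r 150'
def parsed_params  = db_params.split(";")
// Keep the Bracken half only if it was actually supplied
def bracken_params = parsed_params.size() == 2 ? parsed_params[1] : ''
assert bracken_params == '-r 150'
```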
@@ -231,9 +231,26 @@
}

CENTRIFUGE_CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.centrifuge_save_reads, params.centrifuge_save_reads, params.centrifuge_save_reads )
-        CENTRIFUGE_KREPORT (CENTRIFUGE_CENTRIFUGE.out.report, ch_input_for_centrifuge.db)
ch_versions = ch_versions.mix( CENTRIFUGE_CENTRIFUGE.out.versions.first() )
ch_raw_classifications = ch_raw_classifications.mix( CENTRIFUGE_CENTRIFUGE.out.results )

+        // Ensure the correct database goes with the generated report for KREPORT
+        ch_database_for_centrifugekreport = databases
+            .filter { meta, db -> meta.tool == 'centrifuge' }
+            .map { meta, db -> [meta.db_name, meta, db] }
+
+        ch_input_for_centrifuge_kreport = CENTRIFUGE_CENTRIFUGE.out.report
+            .map { meta, profile -> [meta.db_name, meta, profile] }
+            .combine(ch_database_for_centrifugekreport, by: 0)
+            .multiMap {
+                key, meta, profile, db_meta, db ->
+                profile: [meta, profile]
+                db: db
+            }
+
+        // Generate profile
+        CENTRIFUGE_KREPORT (ch_input_for_centrifuge_kreport.profile, ch_input_for_centrifuge_kreport.db)
ch_versions = ch_versions.mix( CENTRIFUGE_KREPORT.out.versions.first() )
ch_raw_profiles = ch_raw_profiles.mix( CENTRIFUGE_KREPORT.out.kreport )
ch_multiqc_files = ch_multiqc_files.mix( CENTRIFUGE_KREPORT.out.kreport )
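
A note on the operator choice above: `combine( ..., by: 0 )` is a keyed cross-product rather than a one-to-one `join`, which matters because several samples may have been classified against the same database and each of their reports must pick up that database. A small sketch with hypothetical values:

```nextflow
// One database record pairs with every report that shares its key.
Channel.of( ['db1', 'sampleA.report'], ['db1', 'sampleB.report'] )
    .combine( Channel.of( ['db1', 'db1_dir'] ), by: 0 )
    .view()
// [db1, sampleA.report, db1_dir]
// [db1, sampleB.report, db1_dir]
```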

@@ -267,10 +267,25 @@
ch_versions = ch_versions.mix( KAIJU_KAIJU.out.versions.first() )
ch_raw_classifications = ch_raw_classifications.mix( KAIJU_KAIJU.out.results )

-        KAIJU_KAIJU2TABLE_SINGLE ( KAIJU_KAIJU.out.results, ch_input_for_kaiju.db, params.kaiju_taxon_rank)
+        // Ensure the correct database goes with the generated report for KAIJU2TABLE
+        ch_database_for_kaiju2table = databases
+            .filter { meta, db -> meta.tool == 'kaiju' }
+            .map { meta, db -> [meta.db_name, meta, db] }
+
+        ch_input_for_kaiju2table = KAIJU_KAIJU.out.results
+            .map { meta, profile -> [meta.db_name, meta, profile] }
+            .combine(ch_database_for_kaiju2table, by: 0)
+            .multiMap {
+                key, meta, profile, db_meta, db ->
+                profile: [meta, profile]
+                db: db
+            }
+
+        // Generate profile
+        KAIJU_KAIJU2TABLE_SINGLE ( ch_input_for_kaiju2table.profile, ch_input_for_kaiju2table.db, params.kaiju_taxon_rank)
ch_versions = ch_versions.mix( KAIJU_KAIJU2TABLE_SINGLE.out.versions )
ch_multiqc_files = ch_multiqc_files.mix( KAIJU_KAIJU2TABLE_SINGLE.out.summary )
ch_raw_profiles = ch_raw_profiles.mix( KAIJU_KAIJU2TABLE_SINGLE.out.summary )
}

if ( params.run_diamond ) {
@@ -342,8 +374,8 @@
ch_input_for_ganonclassify = ch_input_for_profiling.ganon
.filter {
meta, reads, meta_db, db ->
-                if ( meta['instrument_platform'] == 'OXFORD_NANOPORE' ) log.warn "[nf-core/taxprofiler] Ganon has not been evaluated for Nanopore data. Skipping Ganon for sample ${meta.id}."
-                meta_db['tool'] == 'ganon' && meta['instrument_platform'] != 'OXFORD_NANOPORE'
+                if ( meta.instrument_platform == 'OXFORD_NANOPORE' ) log.warn "[nf-core/taxprofiler] Ganon has not been evaluated for Nanopore data. Skipping Ganon for sample ${meta.id}."
+                meta_db.tool == 'ganon' && meta.instrument_platform != 'OXFORD_NANOPORE'
}
.multiMap {
it ->
@@ -359,7 +391,7 @@
ch_report_for_ganonreport = GANON_CLASSIFY.out.report
.map{
meta, report ->
-                def meta_db = [ meta['db_name'] ]
+                def meta_db = [ meta.db_name ]

[ meta_db, meta, report ]

@@ -368,7 +400,7 @@
ch_database_for_ganonreport = databases
.map{
meta, database ->
-                def meta_db = [ meta['db_name'] ]
+                def meta_db = [ meta.db_name ]

[ meta_db, meta, database ]

22 changes: 20 additions & 2 deletions subworkflows/local/standardisation_profiles.nf
@@ -117,7 +117,16 @@ workflow STANDARDISATION_PROFILES {
[[id:it[0]], it[1]]
}

-        KAIJU_KAIJU2TABLE_COMBINED ( ch_profiles_for_kaiju, ch_input_databases.kaiju.map{it[1]}, params.kaiju_taxon_rank)
+        ch_input_for_kaiju2tablecombine = ch_profiles_for_kaiju
+            .map { meta, profile -> [meta.id, meta, profile] }
+            .combine(ch_input_databases.kaiju.map{meta, db -> [meta.db_name, meta, db]}, by: 0)
+            .multiMap {
+                key, meta, profile, db_meta, db ->
+                profile: [meta, profile]
+                db: db
+            }
+
+        KAIJU_KAIJU2TABLE_COMBINED ( ch_input_for_kaiju2tablecombine.profile, ch_input_for_kaiju2tablecombine.db, params.kaiju_taxon_rank)
ch_multiqc_files = ch_multiqc_files.mix( KAIJU_KAIJU2TABLE_COMBINED.out.summary )
ch_versions = ch_versions.mix( KAIJU_KAIJU2TABLE_COMBINED.out.versions )

@@ -163,7 +172,16 @@
[[id:it[0]], it[1]]
}

-        MOTUS_MERGE ( ch_profiles_for_motus, ch_input_databases.motus.map{it[1]}, motu_version )
+        ch_input_for_motusmerge = ch_profiles_for_motus
+            .map { meta, profile -> [meta.id, meta, profile] }
+            .combine(ch_input_databases.motus.map{meta, db -> [meta.db_name, meta, db]}, by: 0)
+            .multiMap {
+                key, meta, profile, db_meta, db ->
+                profile: [meta, profile]
+                db: db
+            }
+
+        MOTUS_MERGE ( ch_input_for_motusmerge.profile, ch_input_for_motusmerge.db, motu_version )
ch_versions = ch_versions.mix( MOTUS_MERGE.out.versions )

// Ganon
