Support for Additional Profiles and Clusters #29

Merged on Oct 24, 2024 (24 commits)

Commits
e65a7ae
Enhanced pipeline to support integration of additional profiles and c…
kylacochrane Sep 27, 2024
a451b7f
Fixed linting and EC errors
kylacochrane Sep 27, 2024
c7d1127
Refactored tests to comply with updated processes
kylacochrane Oct 4, 2024
73eb9c6
Created new test and data for appending profiles and clusters
kylacochrane Oct 4, 2024
06952e3
Implemented validation for 'db_profiles' and 'db_clusters' parameters…
kylacochrane Oct 4, 2024
0a91ec6
Updated documentation
kylacochrane Oct 4, 2024
9f11883
Update nextflow_schema
kylacochrane Oct 7, 2024
28c23ae
Update pipeline to append compressed (gz) files
kylacochrane Oct 10, 2024
bb91d0d
Add compressed test files
kylacochrane Oct 10, 2024
e030807
Updated schemas
kylacochrane Oct 10, 2024
f887f1a
Fixed linting issues
kylacochrane Oct 10, 2024
72ab36d
Lint issue resolved
kylacochrane Oct 10, 2024
1978efe
Enhance append_profiles and append_clusters to prevent sample ID over…
kylacochrane Oct 11, 2024
8ae527f
Add tests to verify no sample ID overlap and loci mismatches in appen…
kylacochrane Oct 11, 2024
3002b00
Add support for gzipped files in header check for append modules
kylacochrane Oct 11, 2024
7d54942
Implement XOR logic to enforce both --db_profiles and --db_clusters m…
kylacochrane Oct 11, 2024
e0433c5
Update README for improved comprehension
kylacochrane Oct 11, 2024
2d83257
Fixed liniting issues
kylacochrane Oct 11, 2024
9780166
Fix duplicate removal by adding sort before csvtk uniq on sample_id
kylacochrane Oct 15, 2024
7d2f30a
Updated tests to verify duplicate removal after sorting by sample_id
kylacochrane Oct 15, 2024
0563a30
Enhance append database module to prefix duplicate sample IDs
kylacochrane Oct 22, 2024
79924e2
Updated tests to validate correct prefixing of duplicated sample IDs
kylacochrane Oct 22, 2024
8b5a418
Resolved merge conflict with CHANGELOG.md
kylacochrane Oct 22, 2024
a78a8ad
Fixed EC linting errors
kylacochrane Oct 22, 2024
9 changes: 9 additions & 0 deletions CHANGELOG.md
@@ -3,6 +3,15 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.3.0] - 2024/10/04

### `Added`

- Enhanced the pipeline to integrate _optional_ user-provided reference profiles and cluster addresses for additional samples:
- Added support for `--db_profiles` via the `APPEND_PROFILES` process
- Added support for `--db_clusters` via the `APPEND_CLUSTERS` process
- Added tests to verify the additional databases can be incorporated and that both databases are required together for their respective processes.

## [0.2.3] - 2024/09/25

### `Changed`
22 changes: 22 additions & 0 deletions README.md
@@ -73,6 +73,28 @@
The following can be used to adjust parameters for the [gas call][] tool.
- `--gm_method`: The linkage method to use for clustering. Value should be one of _single_, _average_, or _complete_.
- `--gm_delimiter`: Delimiter desired for nomenclature code. Must be alphanumeric or one of `._-`.

## Optional Profile and Cluster Address Databases (as used by IRIDA-Next)

In addition to the reference samples included in the input samplesheet (which already contain a pre-computed cluster address), users can incorporate additional pre-computed reference profiles and cluster addresses by providing them as parameterized databases.

- `--db_profiles`: Specifies the path to the database containing pre-merged MLST profiles in tab-separated format. The database should follow this structure:

| sample_id | l1 | l2 | l3 |
| --------- | --- | --- | --- |
| sampleA | 1 | 1 | 1 |
| sampleB | 1 | 1 | 2 |
| sampleC | 2 | 1 | 1 |

- `--db_clusters`: Specifies the path to the database containing cluster addresses for additional samples in tab-separated format. The structure of this database should be as follows:

| id | address | level_1 | level_2 | level_3 |
| ------- | ------- | ------- | ------- | ------- |
| sampleA | 1.1.1 | 1 | 1 | 1 |
| sampleB | 1.1.2 | 1 | 1 | 2 |
| sampleC | 2.1.1 | 2 | 1 | 1 |

_Note: To add additional reference samples to the pipeline, both `--db_profiles` and `--db_clusters` must be provided together, and every `sample_id` in `--db_profiles` must match an `id` in `--db_clusters`._
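The matching-ID requirement can be checked before launching the pipeline. Below is a minimal Python sketch (not part of the pipeline; the file contents are the illustrative rows from the tables above) that compares the `sample_id` column of the profiles database against the `id` column of the clusters database:

```python
import csv
import io

# Illustrative contents matching the example tables above.
profiles_tsv = (
    "sample_id\tl1\tl2\tl3\n"
    "sampleA\t1\t1\t1\n"
    "sampleB\t1\t1\t2\n"
    "sampleC\t2\t1\t1\n"
)
clusters_tsv = (
    "id\taddress\tlevel_1\tlevel_2\tlevel_3\n"
    "sampleA\t1.1.1\t1\t1\t1\n"
    "sampleB\t1.1.2\t1\t1\t2\n"
    "sampleC\t2.1.1\t2\t1\t1\n"
)

def ids(tsv_text, column):
    """Collect the values of one column from a TSV table."""
    reader = csv.DictReader(io.StringIO(tsv_text), delimiter="\t")
    return {row[column] for row in reader}

profile_ids = ids(profiles_tsv, "sample_id")
cluster_ids = ids(clusters_tsv, "id")

# The two databases must describe the same set of samples.
if profile_ids != cluster_ids:
    raise SystemExit(f"mismatched IDs: {sorted(profile_ids ^ cluster_ids)}")
print("IDs match:", sorted(profile_ids))
```

The same comparison works on real files by reading them with `csv.DictReader` directly.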

## Other

Other parameters (defaults from nf-core) are defined in [nextflow_schema.json](nextflow_schema.json).
23 changes: 23 additions & 0 deletions docs/output.md
@@ -6,6 +6,7 @@
This document describes the output produced by the pipeline.

The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory.

- append: Contains reference MLST profile and cluster address files if additional databases were provided by the user.
- call: The cluster addresses from the [genomic_address_service](https://github.com/phac-nml/genomic_address_service).
- cluster: The cluster file required by GAS_call.
- distances: Distances between genomes from [profile_dists](https://github.com/phac-nml/profile_dists).
@@ -22,8 +23,10 @@
The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:

- [Input assure](#input-assure) - Performs a validation check on the samplesheet inputs to ensure that the sampleID precisely matches the MLST JSON key and enforces necessary changes where discrepancies are found.
- [Locidex merge](#locidex-merge) - Merges MLST profile JSON files into a single profiles file for reference and query samples.
- [Append profiles](#append-profiles) - Appends additional MLST profile information to the reference samples when provided by the user.
- [Profile dists](#profile-dists) - Computes pairwise distances between genomes using MLST allele differences.
- [Cluster file](#cluster-file) - Generates the expected_clusters.txt file from reference sample addresses for use in GAS_call.
- [Append clusters](#append-clusters) - Appends additional cluster information to the reference samples when provided by the user.
- [GAS call](#gas-call) - Generates hierarchical cluster addresses.
- [Filter query](#filter-query) - Filters and generates a csv file containing only the cluster addresses for query samples.
- [IRIDA Next Output](#irida-next-output) - Generates a JSON output file that is compliant with IRIDA Next
@@ -51,6 +54,16 @@

</details>

### Append Profiles

<details markdown="1">
<summary>Output files</summary>

- `append/`
- profiles: `profiles_ref.tsv`

</details>

### Profile Dists

<details markdown="1">
@@ -75,6 +88,16 @@

</details>

### Append Clusters

<details markdown="1">
<summary>Output files</summary>

- `append/`
- clusters: `reference_clusters.tsv`

</details>

### GAS call

<details markdown="1">
8 changes: 8 additions & 0 deletions main.nf
@@ -22,7 +22,15 @@
if (params.validate_params) {
validateParameters()
}

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Ensure both --db_profiles and --db_clusters are provided together
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

if ((params.db_profiles && !params.db_clusters) || (!params.db_profiles && params.db_clusters)) {
@emarinier (Member) commented on Oct 8, 2024:

You don't have to change this, because it's probably easier to understand the way you have it, but you could do this with an XOR: true only if one of the two is true.

I don't think Python (edit: Nextflow/Groovy) has an actual logical XOR operator, so it might reduce to something like:

if bool(params.db_profiles) != bool(params.db_clusters):

but again, just a comment, not necessary to change.

@kylacochrane (Contributor, Author) replied:

Great suggestion Eric - played around a bit and got it working here: 7d54942

error "Both '--db_profiles' and '--db_clusters' parameters must be provided together."
}
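The boolean-inequality XOR suggested in the review can be sketched in Python as a standalone illustration (`require_both_or_neither` is a hypothetical helper name, not pipeline code):

```python
def require_both_or_neither(db_profiles, db_clusters):
    """Fail when exactly one of the two database parameters is set.

    `bool(a) != bool(b)` is true only when exactly one value is truthy,
    i.e. a logical XOR.
    """
    if bool(db_profiles) != bool(db_clusters):
        raise ValueError(
            "Both '--db_profiles' and '--db_clusters' parameters "
            "must be provided together."
        )

require_both_or_neither("profiles.tsv", "clusters.tsv")  # both set: fine
require_both_or_neither(None, None)                      # neither set: fine
try:
    require_both_or_neither("profiles.tsv", None)        # only one: rejected
except ValueError as err:
    print(err)
```

The equivalent Groovy condition in the merged pipeline truth-tests both parameters and compares them the same way.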

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
21 changes: 21 additions & 0 deletions modules/local/append_clusters/main.nf
@@ -0,0 +1,21 @@
process APPEND_CLUSTERS {
tag "Append additional clusters from database"
label 'process_single'

container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/csvtk:0.22.0--h9ee0642_1' :
'biocontainers/csvtk:0.22.0--h9ee0642_1' }"

input:
path(initial_clusters)
path(additional_clusters)

output:
path("reference_clusters.tsv")

script:
"""
csvtk concat -t ${initial_clusters} ${additional_clusters} > reference_clusters.tsv

"""
}
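`csvtk concat -t` stitches the TSV inputs together while keeping a single header row. As a rough Python stand-in for that behavior (assuming both inputs share identically ordered columns; the real tool also matches columns by header name and can read gzipped files):

```python
def concat_tsv(*tables):
    """Concatenate TSV tables that share one header, keeping the header once
    (a rough stand-in for `csvtk concat -t` on identically structured files)."""
    header = None
    rows = []
    for table in tables:
        lines = table.strip().splitlines()
        if header is None:
            header = lines[0]
        elif lines[0] != header:
            raise ValueError("headers do not match")
        rows.extend(lines[1:])
    return "\n".join([header] + rows) + "\n"

# Hypothetical miniature inputs: initial clusters plus an additional database.
initial = "id\taddress\nsample1\t1.1.1\nsample2\t1.1.2\n"
additional = "id\taddress\nsampleA\t2.1.1\n"
print(concat_tsv(initial, additional), end="")
```

The header appears once in the result, followed by the data rows of every input in order.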
20 changes: 20 additions & 0 deletions modules/local/append_profiles/main.nf
@@ -0,0 +1,20 @@
process APPEND_PROFILES {
tag "Append additional reference profiles"
label 'process_single'

container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/csvtk:0.22.0--h9ee0642_1' :
'biocontainers/csvtk:0.22.0--h9ee0642_1' }"

input:
path(reference_profiles)
path(additional_profiles)

output:
path("*.tsv")

script:
"""
csvtk concat -t ${reference_profiles} ${additional_profiles} > profiles_ref.tsv
"""
}
4 changes: 2 additions & 2 deletions modules/local/cluster_file/main.nf
@@ -6,7 +6,7 @@
process CLUSTER_FILE {
val meta

output:
path("reference_clusters.txt"), emit: text
path("clusters.tsv")

exec:
def outputLines = []
@@ -37,7 +37,7 @@
}

// Write the text file, iterating over each sample
task.workDir.resolve("reference_clusters.txt").withWriter { writer ->
task.workDir.resolve("clusters.tsv").withWriter { writer ->
outputLines.each { line ->
writer.writeLine(line)
}
4 changes: 4 additions & 0 deletions nextflow.config
@@ -59,6 +59,10 @@
gm_method = "average"
gm_delimiter = "."

// Additional Profile and Cluster Databases for Reference Samples
db_profiles = null
db_clusters = null

}

// Load base.config by default for all pipelines
24 changes: 24 additions & 0 deletions nextflow_schema.json
@@ -89,6 +89,27 @@
}
}
},
"databases": {
"title": "Databases",
"type": "object",
"description": "The optional databases of profiles and cluster addresses that can be used.",
"fa_icon": "fas fa-terminal",
"default": "",
"properties": {
"db_profiles": {
"type": "string",
"pattern": "^\\S+\\.tsv$",
"format": "file-path",
"description": "Path to optional tab-separated file containing additional sample profiles"
},
"db_clusters": {
"type": "string",
"pattern": "^\\S+\\.tsv$",
"format": "file-path",
"description": "Path to optional text file containing additional sample cluster addresses"
}
}
},
"input_output_options": {
"title": "Input/output options",
"type": "object",
@@ -286,6 +307,9 @@
{
"$ref": "#/definitions/input_output_options"
},
{
"$ref": "#/definitions/databases"
},
{
"$ref": "#/definitions/institutional_config_options"
},
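The `^\S+\.tsv$` pattern used for both database parameters accepts any whitespace-free path ending in `.tsv`. A quick Python check of what it does and does not match:

```python
import re

# Same pattern the schema applies to db_profiles / db_clusters paths.
TSV_PATH = re.compile(r"^\S+\.tsv$")

assert TSV_PATH.match("tests/data/databases/additional_profiles.tsv")
assert not TSV_PATH.match("profiles.csv")     # wrong extension is rejected
assert not TSV_PATH.match("my profiles.tsv")  # \S+ rejects embedded whitespace
print("pattern checks passed")
```

Note this pattern would also reject a `.tsv.gz` path, so gzipped databases (added later in this PR) may need a broader pattern.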
8 changes: 8 additions & 0 deletions tests/data/called/expected_results_append.txt
@@ -0,0 +1,8 @@
id address level_1 level_2 level_3
sample1 1.1.1 1 1 1
sample2 1.1.1 1 1 1
sample3 1.1.2 1 1 2
sampleA 1.1.1 1 1 1
sampleB 1.1.2 1 1 2
sampleC 2.1.1 2 1 1
sampleQ 1.1.3 1 1 3
7 changes: 7 additions & 0 deletions tests/data/clusters/expected_clusters_append.txt
@@ -0,0 +1,7 @@
id address level_1 level_2 level_3
sample1 1.1.1 1 1 1
sample2 1.1.1 1 1 1
sample3 1.1.2 1 1 2
sampleA 1.1.1 1 1 1
sampleB 1.1.2 1 1 2
sampleC 2.1.1 2 1 1
4 changes: 4 additions & 0 deletions tests/data/databases/additional_clusters.tsv
@@ -0,0 +1,4 @@
id address level_1 level_2 level_3
sampleA 1.1.1 1 1 1
sampleB 1.1.2 1 1 2
sampleC 2.1.1 2 1 1
4 changes: 4 additions & 0 deletions tests/data/databases/additional_profiles.tsv
@@ -0,0 +1,4 @@
sample_id l1 l2 l3
sampleA 1 1 1
sampleB 1 1 2
sampleC 2 1 1
8 changes: 8 additions & 0 deletions tests/data/distances/expected_dists_append.txt
@@ -0,0 +1,8 @@
query_id ref_id dist
sampleQ sampleQ 0
sampleQ sample1 1
sampleQ sample2 1
sampleQ sampleA 1
sampleQ sample3 2
sampleQ sampleB 2
sampleQ sampleC 2
13 changes: 13 additions & 0 deletions tests/data/irida/append_iridanext.output.json
@@ -0,0 +1,13 @@
{
"files": {
"global": [],
"samples": {}
},
"metadata": {
"samples": {
"sampleQ": {
"address": "1.1.3"
}
}
}
}
8 changes: 8 additions & 0 deletions tests/data/profiles/expected_profiles_append.tsv
@@ -0,0 +1,8 @@
sample_id l1 l2 l3
sampleQ 1 2 1
sample1 1 1 1
sample2 1 1 1
sample3 1 1 2
sampleA 1 1 1
sampleB 1 1 2
sampleC 2 1 1
2 changes: 1 addition & 1 deletion tests/modules/cluster_file/main.nf.test
@@ -26,7 +26,7 @@
assert path("$launchDir/cluster_results").exists()

// Check reference_clusters file
def actual_clusters = path("$launchDir/cluster_results/cluster/reference_clusters.txt")
def actual_clusters = path("$launchDir/cluster_results/cluster/clusters.tsv")
def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters.txt")
assert actual_clusters.text == expected_clusters.text
}
4 changes: 2 additions & 2 deletions tests/pipelines/main.nf.test
@@ -33,7 +33,7 @@
assert actual_distances.text == expected_distances.text

// Verify cluster file
def actual_cluster = path("$launchDir/results/cluster/reference_clusters.txt")
def actual_cluster = path("$launchDir/results/cluster/clusters.tsv")
def expected_cluster = path("$baseDir/tests/data/clusters/expected_clusters.txt")
assert actual_cluster.text == expected_cluster.text

@@ -134,7 +134,7 @@
assert actual_distances.text == expected_distances.text

// Verify cluster file
def actual_cluster = path("$launchDir/results/cluster/reference_clusters.txt")
def actual_cluster = path("$launchDir/results/cluster/clusters.tsv")
def expected_cluster = path("$baseDir/tests/data/clusters/expected_clusters.txt")
assert actual_cluster.text == expected_cluster.text
