From be3ba2671b0e9358d3031c917560bfe7539bc5dc Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 6 Jun 2024 16:45:58 -0400 Subject: [PATCH 01/21] Revise 'input_check' to 'input_assure'; enforce JSON key alteration to match the sample ID if a mismatch is detected --- bin/input_check.py | 24 +++++++++++++++---- .../{input_check => input_assure}/main.nf | 2 +- workflows/gas_nomenclature.nf | 19 ++++----------- 3 files changed, 24 insertions(+), 21 deletions(-) rename modules/local/{input_check => input_assure}/main.nf (97%) diff --git a/bin/input_check.py b/bin/input_check.py index a21f0c7..bd7ac70 100755 --- a/bin/input_check.py +++ b/bin/input_check.py @@ -4,11 +4,18 @@ import argparse import sys import csv +import gzip +def open_file(file_path, mode): + # Open a file based on the file extension + if file_path.endswith('.gz'): + return gzip.open(file_path, mode) + else: + return open(file_path, mode) def check_inputs(json_file, sample_id, address, output_match_file, output_error_file): # Define a variable to store the match_status (True or False) - with open(json_file, "r") as f: + with open(json_file, "rt") as f: json_data = json.load(f) match_status = sample_id in json_data @@ -16,23 +23,30 @@ def check_inputs(json_file, sample_id, address, output_match_file, output_error_ with open(output_match_file, "w") as f: f.write(str(match_status)) + # Define the original key in the JSON data + original_key = list(json_data.keys())[0] + # Define error message based on meta.address (query or reference) if address == "null": - error_message = f"Query {sample_id} removed from pipeline" + error_message = f"Query {sample_id} ID did not match the JSON key in {json_file} - User must manually check input files to ensure correctness." else: - error_message = f"Pipeline stopped: Reference {sample_id}'s input ID and MLST JSON file key DO NOT MATCH" + error_message = f"Reference {sample_id}'s sample ID and JSON key in {json_file} DO NOT MATCH: the '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}'." # Write sample ID and JSON key to error report CSV if not matched; include error message if not match_status: with open(output_error_file, "w", newline="") as f: writer = csv.writer(f) writer.writerow(["sample", "JSON_key", "error_message"]) - writer.writerow([sample_id, list(json_data.keys())[0], error_message]) + writer.writerow([sample_id, original_key, error_message]) + # Update the JSON file with the new sample ID + json_data[sample_id] = json_data.pop(original_key) + with open(json_file, "wt") as f: + json.dump(json_data, f, indent=4) if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Check sample inputs and generate an error report." + description="Check sample inputs, force change if ID ≠ KEY, and generate an error report." 
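
A standalone sketch of the re-keying behaviour introduced above. The sample names, toy profile, and temporary path are invented for illustration; only the Python standard library is assumed:

import gzip
import json
import os
import tempfile

# Write a toy gzipped MLST JSON whose only key does not match the sample ID.
path = os.path.join(tempfile.mkdtemp(), "sample2.mlst.json.gz")
with gzip.open(path, "wt") as f:
    json.dump({"sample7": {"l1": "1", "l2": "1"}}, f)

def open_file(file_path, mode):
    # Same dispatch as the patch: gzip for .gz inputs, plain open otherwise.
    if file_path.endswith(".gz"):
        return gzip.open(file_path, mode)
    else:
        return open(file_path, mode)

sample_id = "sample2"
with open_file(path, "rt") as f:
    json_data = json.load(f)

if sample_id not in json_data:
    # Re-key the record under the sample ID, mirroring the forced rename above.
    original_key = list(json_data.keys())[0]
    json_data[sample_id] = json_data.pop(original_key)

print(json_data)  # {'sample2': {'l1': '1', 'l2': '1'}}

The same gzip round trip is what the gzipped-input test added later in the series exercises.
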
) parser.add_argument("--input", help="Path to the mlst.json file.", required=True) parser.add_argument( diff --git a/modules/local/input_check/main.nf b/modules/local/input_assure/main.nf similarity index 97% rename from modules/local/input_check/main.nf rename to modules/local/input_assure/main.nf index 79a2242..90260a2 100644 --- a/modules/local/input_check/main.nf +++ b/modules/local/input_assure/main.nf @@ -1,4 +1,4 @@ -process INPUT_CHECK{ +process INPUT_ASSURE { tag "Check Sample Inputs and Generate Error Report" label 'process_single' diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index b0fb977..dfa029d 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -22,7 +22,7 @@ include { paramsSummaryLog; paramsSummaryMap; fromSamplesheet } from 'plugin/nf // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from "../modules/local/input_check/main" +include { INPUT_ASSURE } from "../modules/local/input_assure/main" include { LOCIDEX_MERGE as LOCIDEX_MERGE_REF } from "../modules/local/locidex/merge/main" include { LOCIDEX_MERGE as LOCIDEX_MERGE_QUERY } from "../modules/local/locidex/merge/main" include { PROFILE_DISTS } from "../modules/local/profile_dists/main" @@ -72,7 +72,7 @@ workflow GAS_NOMENCLATURE { input = Channel.fromSamplesheet("input") // Ensure meta.id and mlst_file keys match; generate error report for samples where id ≠ key - id_key = INPUT_CHECK(input) + id_key = INPUT_ASSURE(input) ch_versions = ch_versions.mix(id_key.versions) // Update metadata to include the id_key.match data @@ -80,21 +80,10 @@ workflow GAS_NOMENCLATURE { def id_match = file.text.trim() [meta + [id_match: id_match == 'True'], json] } - - // If samples have a disparity between meta.id and JSON key: Exclude the queried samples OR halt the pipeline with an error if sample has an associated cluster address (reference) - new_input = match.filter { meta, json -> - if (meta.id_match) { - return true // Keep the sample - } else if (meta.address == null && !meta.id_match) { - return false // Remove the sample - } else if (meta.address != null && !meta.id_match) { - // Exit with error statement - throw new RuntimeException("Pipeline exiting: sample with ID ${meta.id} does not have matching MLST JSON file.") - } - } + match.view() // Prepare reference and query TSV files for LOCIDEX_MERGE - profiles = new_input.branch{ + profiles = match.branch { query: !it[0].address } reference_values = input.collect{ meta, profile -> profile} From 95e40f6979ab60cbe6f8531bb72996c45cf1c56f Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Fri, 7 Jun 2024 08:47:18 -0400 Subject: [PATCH 02/21] Remove id_match from meta --- bin/input_check.py | 11 ++--------- modules/local/input_assure/main.nf | 5 ++--- workflows/gas_nomenclature.nf | 9 +-------- 3 files changed, 5 insertions(+), 20 deletions(-) diff --git a/bin/input_check.py b/bin/input_check.py index bd7ac70..19c099d 100755 --- a/bin/input_check.py +++ b/bin/input_check.py @@ -13,16 +13,12 @@ def open_file(file_path, mode): else: return open(file_path, mode) -def check_inputs(json_file, sample_id, address, output_match_file, output_error_file): +def check_inputs(json_file, sample_id, address, output_error_file): # Define a variable to store the match_status (True or False) with open(json_file, "rt") as f: json_data = json.load(f) match_status = sample_id in json_data - # Write match status to file - with open(output_match_file, "w") as f: - f.write(str(match_status)) - # Define the 
original key in the JSON data original_key = list(json_data.keys())[0] @@ -58,12 +54,9 @@ def check_inputs(json_file, sample_id, address, output_match_file, output_error_ parser.add_argument( "--output_error", help="Path to the error report file.", required=True ) - parser.add_argument( - "--output_match", help="Path to the match status file.", required=True - ) args = parser.parse_args() check_inputs( - args.input, args.sample_id, args.address, args.output_match, args.output_error + args.input, args.sample_id, args.address, args.output_error ) diff --git a/modules/local/input_assure/main.nf b/modules/local/input_assure/main.nf index 90260a2..1b22242 100644 --- a/modules/local/input_assure/main.nf +++ b/modules/local/input_assure/main.nf @@ -10,7 +10,7 @@ process INPUT_ASSURE { tuple val(meta), path(mlst) output: - tuple val(meta), path("${meta.id}_match.txt"), path(mlst), emit: match + tuple val(meta), path(mlst), emit: match tuple val(meta), path("*_error_report.csv"), optional: true, emit: error_report path("versions.yml"), emit: versions @@ -21,8 +21,7 @@ process INPUT_ASSURE { --input ${mlst} \\ --sample_id ${meta.id} \\ --address ${meta.address} \\ - --output_error ${meta.id}_error_report.csv \\ - --output_match ${meta.id}_match.txt + --output_error ${meta.id}_error_report.csv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index dfa029d..d527777 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -75,15 +75,8 @@ workflow GAS_NOMENCLATURE { id_key = INPUT_ASSURE(input) ch_versions = ch_versions.mix(id_key.versions) - // Update metadata to include the id_key.match data - match = id_key.match.map { meta, file, json -> - def id_match = file.text.trim() - [meta + [id_match: id_match == 'True'], json] - } - match.view() - // Prepare reference and query TSV files for LOCIDEX_MERGE - profiles = match.branch { + profiles = id_key.match.branch { query: !it[0].address } reference_values = input.collect{ meta, profile -> profile} From 9e20417c028621a6c18fd8011aef9bfbb1885956 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Fri, 7 Jun 2024 08:49:36 -0400 Subject: [PATCH 03/21] Fix linting --- modules/local/input_assure/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/input_assure/main.nf b/modules/local/input_assure/main.nf index 1b22242..c01c319 100644 --- a/modules/local/input_assure/main.nf +++ b/modules/local/input_assure/main.nf @@ -21,7 +21,7 @@ process INPUT_ASSURE { --input ${mlst} \\ --sample_id ${meta.id} \\ --address ${meta.address} \\ - --output_error ${meta.id}_error_report.csv + --output_error ${meta.id}_error_report.csv cat <<-END_VERSIONS > versions.yml "${task.process}": From deb43495bf65e0e599d4f645f6ca389f6698fcdc Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Fri, 7 Jun 2024 09:26:53 -0400 Subject: [PATCH 04/21] Updated error_message from input_assure --- bin/input_check.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/input_check.py b/bin/input_check.py index 19c099d..5c8365d 100755 --- a/bin/input_check.py +++ b/bin/input_check.py @@ -24,9 +24,9 @@ def check_inputs(json_file, sample_id, address, output_error_file): # Define error message based on meta.address (query or reference) if address == "null": - error_message = f"Query {sample_id} ID did not match the JSON key in {json_file} - User must manually check input files to ensure correctness." 
+ error_message = f"Query {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." else: - error_message = f"Reference {sample_id}'s sample ID and JSON key in {json_file} DO NOT MATCH: the '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}'." + error_message = f"Reference {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." # Write sample ID and JSON key to error report CSV if not matched; include error message if not match_status: From 07fe2c66dee1631279630a3bdda33281f30b4b2b Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 10 Jun 2024 16:52:24 -0400 Subject: [PATCH 05/21] Update python script name to match process: input_assure.py --- bin/{input_check.py => input_assure.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename bin/{input_check.py => input_assure.py} (100%) diff --git a/bin/input_check.py b/bin/input_assure.py similarity index 100% rename from bin/input_check.py rename to bin/input_assure.py From 23c1397efd2c31f8af2588a846809af16a81a0fc Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 10 Jun 2024 16:54:35 -0400 Subject: [PATCH 06/21] Add 'fair = true' to input_assure process in modules.config for reproducibility --- conf/modules.config | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index a898c53..00855c7 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -13,8 +13,6 @@ process { // Publish directory names - assembly_directory_name = "assembly" - summary_directory_name = "summary" profile_dists_directory_name = "distances" gas_call_directory_name = "call" @@ -27,6 +25,10 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] + withName: INPUT_ASSURE { + fair = true + } + withName: LOCIDEX_MERGE_REF { publishDir = [ path: locidex_merge_ref_directory_name, From c7252cfeb77a4f32fce0f0811e711961ce07b19a Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 10 Jun 2024 16:56:49 -0400 Subject: [PATCH 07/21] Update input_assure.py to include additional check for multiple keys --- bin/input_assure.py | 52 +++++++++++++++++++----------- modules/local/input_assure/main.nf | 6 ++-- workflows/gas_nomenclature.nf | 10 +++--- 3 files changed, 42 insertions(+), 26 deletions(-) diff --git a/bin/input_assure.py b/bin/input_assure.py index 5c8365d..2705eae 100755 --- a/bin/input_assure.py +++ b/bin/input_assure.py @@ -2,7 +2,6 @@ import json import argparse -import sys import csv import gzip @@ -14,31 +13,48 @@ def open_file(file_path, mode): return open(file_path, mode) def check_inputs(json_file, sample_id, address, output_error_file): - # Define a variable to store the match_status (True or False) - with open(json_file, "rt") as f: + with open_file(json_file, "rt") as f: json_data = json.load(f) - match_status = sample_id in json_data - # Define the original key in the JSON data - original_key = list(json_data.keys())[0] + # Define a variable to store the match_status (True or False) + match_status = sample_id in json_data + + keys = list (json_data.keys()) + original_key = keys[0] + # Initialize the error message + error_message = None + + # Check for multiple keys in the JSON file and define error message + if len(keys) > 1: + # Check if sample_id matches any key + if not match_status: + error_message = f"No key in the MLST JSON file ({json_file}) matches the specified sample ID '{sample_id}'. The first key '{original_key}' has been forcefully changed to '{sample_id}' and all other keys have been removed." + # Retain only the specified sample ID + json_data = {sample_id: json_data.pop(original_key)} + else: + error_message = f"MLST JSON file ({json_file}) contains multiple keys: {keys}. The MLST JSON file has been modified to retain only the '{sample_id}' entry" + # Remove all keys expect the one matching sample_id + json_data = {sample_id: json_data[sample_id]} + elif not match_status: # Define error message based on meta.address (query or reference) - if address == "null": - error_message = f"Query {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." - else: - error_message = f"Reference {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." - - # Write sample ID and JSON key to error report CSV if not matched; include error message - if not match_status: + if address == "null": + error_message = f"Query {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." + else: + error_message = f"Reference {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." 
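
The multiple-key branch earlier in this hunk reduces a multi-keyed profile to a single entry. A minimal sketch of both outcomes, with invented toy data:

import json

json_data = {"extra_key": {"l1": "1"}, "sample3": {"l1": "1"}}
sample_id = "sample3"
keys = list(json_data.keys())

if len(keys) > 1:
    if sample_id in json_data:
        # A key matches the sample ID: drop everything else.
        json_data = {sample_id: json_data[sample_id]}
    else:
        # No key matches: re-label the first entry and drop the rest.
        json_data = {sample_id: json_data.pop(keys[0])}

print(json.dumps(json_data))  # {"sample3": {"l1": "1"}}
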
+ # Update the JSON file with the new sample ID + json_data[sample_id] = json_data.pop(original_key) + + # Write file containing relevant error messages + if error_message: with open(output_error_file, "w", newline="") as f: writer = csv.writer(f) writer.writerow(["sample", "JSON_key", "error_message"]) - writer.writerow([sample_id, original_key, error_message]) + writer.writerow([sample_id, keys, error_message]) - # Update the JSON file with the new sample ID - json_data[sample_id] = json_data.pop(original_key) - with open(json_file, "wt") as f: - json.dump(json_data, f, indent=4) + # Write the updated JSON data back to the original file + with open_file(json_file, "wt") as f: + json.dump(json_data, f, indent=4) if __name__ == "__main__": parser = argparse.ArgumentParser( diff --git a/modules/local/input_assure/main.nf b/modules/local/input_assure/main.nf index c01c319..e0376ac 100644 --- a/modules/local/input_assure/main.nf +++ b/modules/local/input_assure/main.nf @@ -1,5 +1,5 @@ process INPUT_ASSURE { - tag "Check Sample Inputs and Generate Error Report" + tag "Assures Inputs are Consistent" label 'process_single' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? @@ -10,14 +10,14 @@ process INPUT_ASSURE { tuple val(meta), path(mlst) output: - tuple val(meta), path(mlst), emit: match + tuple val(meta), path(mlst), emit: result tuple val(meta), path("*_error_report.csv"), optional: true, emit: error_report path("versions.yml"), emit: versions script: """ - input_check.py \\ + input_assure.py \\ --input ${mlst} \\ --sample_id ${meta.id} \\ --address ${meta.address} \\ diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index d527777..4531ff1 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -72,15 +72,15 @@ workflow GAS_NOMENCLATURE { input = Channel.fromSamplesheet("input") // Ensure meta.id and mlst_file keys match; generate error report for samples where id ≠ key - id_key = INPUT_ASSURE(input) - ch_versions = ch_versions.mix(id_key.versions) + input_assure = INPUT_ASSURE(input) + ch_versions = ch_versions.mix(input_assure.versions) // Prepare reference and query TSV files for LOCIDEX_MERGE - profiles = id_key.match.branch { + profiles = input_assure.result.branch { query: !it[0].address } - reference_values = input.collect{ meta, profile -> profile} - query_values = profiles.query.collect{ meta, profile -> profile } + reference_values = input_assure.result.collect{ meta, mlst -> mlst} + query_values = profiles.query.collect{ meta, mlst -> mlst } // LOCIDEX modules ref_tag = Channel.value("ref") From f7ed9d3325e78da2a394d9f5d96b70e10f904a33 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 10 Jun 2024 16:59:14 -0400 Subject: [PATCH 08/21] Fixed linting issues --- bin/input_assure.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/bin/input_assure.py b/bin/input_assure.py index 2705eae..779e888 100755 --- a/bin/input_assure.py +++ b/bin/input_assure.py @@ -5,46 +5,48 @@ import csv import gzip + def open_file(file_path, mode): # Open a file based on the file extension - if file_path.endswith('.gz'): + if file_path.endswith(".gz"): return gzip.open(file_path, mode) else: return open(file_path, mode) + def check_inputs(json_file, sample_id, address, output_error_file): with open_file(json_file, "rt") as f: json_data = json.load(f) # Define a variable to store the match_status (True or False) match_status = sample_id in json_data - 
- keys = list (json_data.keys()) + + keys = list(json_data.keys()) original_key = keys[0] # Initialize the error message - error_message = None - + error_message = None + # Check for multiple keys in the JSON file and define error message if len(keys) > 1: # Check if sample_id matches any key if not match_status: error_message = f"No key in the MLST JSON file ({json_file}) matches the specified sample ID '{sample_id}'. The first key '{original_key}' has been forcefully changed to '{sample_id}' and all other keys have been removed." - # Retain only the specified sample ID + # Retain only the specified sample ID json_data = {sample_id: json_data.pop(original_key)} else: error_message = f"MLST JSON file ({json_file}) contains multiple keys: {keys}. The MLST JSON file has been modified to retain only the '{sample_id}' entry" # Remove all keys expect the one matching sample_id - json_data = {sample_id: json_data[sample_id]} + json_data = {sample_id: json_data[sample_id]} elif not match_status: - # Define error message based on meta.address (query or reference) + # Define error message based on meta.address (query or reference) if address == "null": error_message = f"Query {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." else: error_message = f"Reference {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." # Update the JSON file with the new sample ID json_data[sample_id] = json_data.pop(original_key) - + # Write file containing relevant error messages if error_message: with open(output_error_file, "w", newline="") as f: @@ -52,10 +54,11 @@ def check_inputs(json_file, sample_id, address, output_error_file): writer.writerow(["sample", "JSON_key", "error_message"]) writer.writerow([sample_id, keys, error_message]) - # Write the updated JSON data back to the original file + # Write the updated JSON data back to the original file with open_file(json_file, "wt") as f: json.dump(json_data, f, indent=4) + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Check sample inputs, force change if ID ≠ KEY, and generate an error report." @@ -73,6 +76,4 @@ def check_inputs(json_file, sample_id, address, output_error_file): args = parser.parse_args() - check_inputs( - args.input, args.sample_id, args.address, args.output_error - ) + check_inputs(args.input, args.sample_id, args.address, args.output_error) From 7592bd3bea310522d45de6725804a6f3093050c5 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 12 Jun 2024 15:13:49 -0400 Subject: [PATCH 09/21] Resolve conflicts between dev and input_assure --- modules/local/input_assure/main.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/local/input_assure/main.nf b/modules/local/input_assure/main.nf index 30dfdaa..e0376ac 100644 --- a/modules/local/input_assure/main.nf +++ b/modules/local/input_assure/main.nf @@ -1,7 +1,6 @@ process INPUT_ASSURE { tag "Assures Inputs are Consistent" label 'process_single' - fair true container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/python:3.8.3' : From 32663302aa75ba71c3b606c08b2ec62e3dea3c03 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 12 Jun 2024 16:08:44 -0400 Subject: [PATCH 10/21] Add test with gzipped MLST JSON file --- tests/data/reports/sample1.mlst.json.gz | Bin 0 -> 84 bytes tests/data/samplesheets/samplesheet_gzip.csv | 5 +++ tests/pipelines/main.nf.test | 35 +++++++++++++++++++ 3 files changed, 40 insertions(+) create mode 100644 tests/data/reports/sample1.mlst.json.gz create mode 100644 tests/data/samplesheets/samplesheet_gzip.csv diff --git a/tests/data/reports/sample1.mlst.json.gz b/tests/data/reports/sample1.mlst.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..94f25c81407bfee01777e59fb4af80f5998dbf30 GIT binary patch literal 84 zcmb2|=HNK`KQoQ#e{o`NK~AcnUT#itiC$K5ejdZBXF&{1OCKG%(=;c{ Date: Wed, 12 Jun 2024 16:56:07 -0400 Subject: [PATCH 11/21] Added test for mismatched IDs --- .../irida/mismatched_iridanext.output.json | 29 ++++++++ .../samplesheet-mismatched_IDs.csv | 7 ++ tests/pipelines/main.nf.test | 66 +++++++------------ 3 files changed, 61 insertions(+), 41 deletions(-) create mode 100644 tests/data/irida/mismatched_iridanext.output.json create mode 100644 tests/data/samplesheets/samplesheet-mismatched_IDs.csv diff --git a/tests/data/irida/mismatched_iridanext.output.json b/tests/data/irida/mismatched_iridanext.output.json new file mode 100644 index 0000000..ec418dc --- /dev/null +++ b/tests/data/irida/mismatched_iridanext.output.json @@ -0,0 +1,29 @@ +{ + "files": { + "global": [ + + ], + "samples": { + "sampleR": [ + { + "path": "input/sampleR_error_report.csv" + } + ], + "sample2": [ + { + "path": "input/sample2_error_report.csv" + } + ] + } + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "2.2.3" + }, + "sampleR": { + "address": "2.2.3" + } + } + } +} \ No newline at end of file diff --git a/tests/data/samplesheets/samplesheet-mismatched_IDs.csv b/tests/data/samplesheets/samplesheet-mismatched_IDs.csv new file mode 100644 index 0000000..73230d4 --- /dev/null +++ b/tests/data/samplesheets/samplesheet-mismatched_IDs.csv @@ -0,0 +1,7 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sampleR,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleF.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample7.mlst.json,1.1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 + diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index d47b7ed..9892b40 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -105,12 +105,12 @@ nextflow_pipeline { assert iridanext_metadata.containsKey("sampleN") assert iridanext_metadata.sampleQ."address" == "2.2.3" - assert iridanext_metadata.sampleN.address == "2.2.3" + assert iridanext_metadata.sampleN."address" == "2.2.3" } } test("Small-scale test of full pipeline with gzipped MLST JSON"){ - tag "pipeline_success_gzipped_JSON" + tag "Gzipped_MLST_JSON" when{ params { @@ -144,37 +144,15 @@ nextflow_pipeline { } } - test("Integration test where input contains reference sample with mismatched MLST JSON file"){ - tag "pipeline_failure" - - when { - params { - input = 
"$baseDir/tests/data/samplesheets/samplesheet_test1.csv" - outdir = "results" - } - } - - then { - assert workflow.failed - assert (workflow.stdout =~ /Pipeline exiting: sample with ID sample2 does not have matching MLST JSON file./).find() - - assert path("$launchDir/results").exists() - assert path("$launchDir/results/input").exists() - - // Ensure that despite pipeline failure, error_reports are generated for all samples added to pipeline (i.e. sampleQ query) - def lines = [] - - lines = path("$launchDir/results/input/sample2_error_report.csv").readLines() - assert lines.contains("sample2,sample7,Pipeline stopped: Reference sample2's input ID and MLST JSON file key DO NOT MATCH") - } - } - - test("Integration test where input contains a single query sample with mismatched MLST JSON file"){ - tag "pipeline_success_after_query_removal" + test("Testing when query and reference sample IDs are mismatched with MLST JSON file keys"){ + // IDs in the sample sheet and IDs in the individual MLST JSON files will not match. + // This tests the pipelines ability to handle and correct for this problem. + + tag "mismatched_IDs" when{ params { - input = "$baseDir/tests/data/samplesheets/samplesheet_test2.csv" + input = "$baseDir/tests/data/samplesheets/samplesheet-mismatched_IDs.csv" outdir = "results" } } @@ -182,29 +160,35 @@ nextflow_pipeline { then { assert workflow.success assert path("$launchDir/results").exists() - assert path("$launchDir/results/input").exists() - assert path("$launchDir/results/filter").exists() - + // Check outputs def lines = [] - // Ensure that the error_report is generated for removed query sampleR + // Ensure that the error_reports are generated for query and reference samples + lines = path("$launchDir/results/input/sample2_error_report.csv").readLines() + assert lines.contains("sample2,[\'sample7\'],Reference sample2 ID and JSON key in sample7.mlst.json DO NOT MATCH. The 'sample7' key in sample7.mlst.json has been forcefully changed to 'sample2': User should manually check input files to ensure correctness.") + lines = path("$launchDir/results/input/sampleR_error_report.csv").readLines() - assert lines.contains("sampleR,sampleF,Query sampleR removed from pipeline") + assert lines.contains("sampleR,[\'sampleF\'],Query sampleR ID and JSON key in sampleF.mlst.json DO NOT MATCH. 
The 'sampleF' key in sampleF.mlst.json has been forcefully changed to 'sampleR': User should manually check input files to ensure correctness.") - // Check query output csv + // Check filter_query csv file lines = path("$launchDir/results/filter/new_addresses.csv").readLines() - assert lines.contains("sampleQ,1.1.3") + assert lines.contains("sampleQ,2.2.3") + assert lines.contains("sampleR,2.2.3") - // Check IRIDA Next JSON output - assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/test2_iridanext.output.json").json + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/mismatched_iridanext.output.json").json def iridanext_json = path("$launchDir/results/iridanext.output.json").json def iridanext_samples = iridanext_json.files.samples def iridanext_metadata = iridanext_json.metadata.samples - assert iridanext_samples.sampleR.findAll { it.path == "input/sampleR_error_report.csv" }.size() == 1 - assert iridanext_metadata.sampleQ."address" == "1.1.3" + assert iridanext_metadata.size() == 2 + assert iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.containsKey("sampleR") + + assert iridanext_metadata.sampleQ."address" == "2.2.3" + assert iridanext_metadata.sampleR."address" == "2.2.3" } } From 001709087fdbb07d23aa25193d2379dea17dcbe3 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 12 Jun 2024 16:59:27 -0400 Subject: [PATCH 12/21] Update paths in samplesheet --- tests/data/samplesheets/samplesheet_gzip.csv | 2 +- tests/pipelines/main.nf.test | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/data/samplesheets/samplesheet_gzip.csv b/tests/data/samplesheets/samplesheet_gzip.csv index a4b5bad..2337c78 100644 --- a/tests/data/samplesheets/samplesheet_gzip.csv +++ b/tests/data/samplesheets/samplesheet_gzip.csv @@ -1,5 +1,5 @@ sample,mlst_alleles,address sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, -sample1,/root/working_directory/gas/gasnomenclature/tests/data/reports/sample1.mlst.json.gz,1.1.1 +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/input_assure/tests/data/reports/sample1.mlst.json.gz,1.1.1 sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index 9892b40..77c2672 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -166,7 +166,7 @@ nextflow_pipeline { // Ensure that the error_reports are generated for query and reference samples lines = path("$launchDir/results/input/sample2_error_report.csv").readLines() - assert lines.contains("sample2,[\'sample7\'],Reference sample2 ID and JSON key in sample7.mlst.json DO NOT MATCH. The 'sample7' key in sample7.mlst.json has been forcefully changed to 'sample2': User should manually check input files to ensure correctness.") + #assert lines.contains("sample2,[\'sample7\'],Reference sample2 ID and JSON key in sample7.mlst.json DO NOT MATCH. 
The 'sample7' key in sample7.mlst.json has been forcefully changed to 'sample2': User should manually check input files to ensure correctness.") lines = path("$launchDir/results/input/sampleR_error_report.csv").readLines() assert lines.contains("sampleR,[\'sampleF\'],Query sampleR ID and JSON key in sampleF.mlst.json DO NOT MATCH. The 'sampleF' key in sampleF.mlst.json has been forcefully changed to 'sampleR': User should manually check input files to ensure correctness.") From 3f181eb99c1e458977b03abdbdc83dad2ba2610e Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 12 Jun 2024 17:00:48 -0400 Subject: [PATCH 13/21] Fix EC issues --- tests/pipelines/main.nf.test | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index 77c2672..43449ea 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -126,7 +126,7 @@ nextflow_pipeline { // Check is sample1.mlst.json.gz exists and is gzipped def gzipped_json = path("$launchDir/results/input/sample1.mlst.json.gz") assert gzipped_json.exists() - + // Check called clusters def actual_calls = path("$launchDir/results/call/Called/results.text") def expected_calls = path("$baseDir/tests/data/called/expected_results.txt") @@ -147,7 +147,7 @@ nextflow_pipeline { test("Testing when query and reference sample IDs are mismatched with MLST JSON file keys"){ // IDs in the sample sheet and IDs in the individual MLST JSON files will not match. // This tests the pipelines ability to handle and correct for this problem. - + tag "mismatched_IDs" when{ @@ -160,14 +160,14 @@ nextflow_pipeline { then { assert workflow.success assert path("$launchDir/results").exists() - + // Check outputs def lines = [] // Ensure that the error_reports are generated for query and reference samples lines = path("$launchDir/results/input/sample2_error_report.csv").readLines() #assert lines.contains("sample2,[\'sample7\'],Reference sample2 ID and JSON key in sample7.mlst.json DO NOT MATCH. The 'sample7' key in sample7.mlst.json has been forcefully changed to 'sample2': User should manually check input files to ensure correctness.") - + lines = path("$launchDir/results/input/sampleR_error_report.csv").readLines() assert lines.contains("sampleR,[\'sampleF\'],Query sampleR ID and JSON key in sampleF.mlst.json DO NOT MATCH. 
The 'sampleF' key in sampleF.mlst.json has been forcefully changed to 'sampleR': User should manually check input files to ensure correctness.") @@ -176,7 +176,7 @@ nextflow_pipeline { assert lines.contains("sampleQ,2.2.3") assert lines.contains("sampleR,2.2.3") - // Check IRIDA Next JSON output + // Check IRIDA Next JSON output assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/mismatched_iridanext.output.json").json def iridanext_json = path("$launchDir/results/iridanext.output.json").json From 1f525294704b6b8a7c9110e2bdd35085578e8ddf Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 12 Jun 2024 17:02:29 -0400 Subject: [PATCH 14/21] Fix EC issues --- tests/data/irida/mismatched_iridanext.output.json | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/data/irida/mismatched_iridanext.output.json b/tests/data/irida/mismatched_iridanext.output.json index ec418dc..750523b 100644 --- a/tests/data/irida/mismatched_iridanext.output.json +++ b/tests/data/irida/mismatched_iridanext.output.json @@ -1,8 +1,6 @@ { "files": { - "global": [ - - ], + "global": [], "samples": { "sampleR": [ { @@ -26,4 +24,4 @@ } } } -} \ No newline at end of file +} From ec347e46b808810f5ebfd6fc532aed832f4e91d3 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 12 Jun 2024 17:08:30 -0400 Subject: [PATCH 15/21] Removed unexpected character (#) in main.nf.test --- tests/pipelines/main.nf.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index 43449ea..f1df721 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -166,7 +166,7 @@ nextflow_pipeline { // Ensure that the error_reports are generated for query and reference samples lines = path("$launchDir/results/input/sample2_error_report.csv").readLines() - #assert lines.contains("sample2,[\'sample7\'],Reference sample2 ID and JSON key in sample7.mlst.json DO NOT MATCH. The 'sample7' key in sample7.mlst.json has been forcefully changed to 'sample2': User should manually check input files to ensure correctness.") + assert lines.contains("sample2,[\'sample7\'],Reference sample2 ID and JSON key in sample7.mlst.json DO NOT MATCH. The 'sample7' key in sample7.mlst.json has been forcefully changed to 'sample2': User should manually check input files to ensure correctness.") lines = path("$launchDir/results/input/sampleR_error_report.csv").readLines() assert lines.contains("sampleR,[\'sampleF\'],Query sampleR ID and JSON key in sampleF.mlst.json DO NOT MATCH. 
The 'sampleF' key in sampleF.mlst.json has been forcefully changed to 'sampleR': User should manually check input files to ensure correctness.") From 7c1b5dc31536243a5870ed050f5b90286b33ff67 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 13 Jun 2024 10:13:08 -0400 Subject: [PATCH 16/21] Add test data for multiple keyed JSON file --- tests/data/reports/sample3_multiplekeys.mlst.json | 12 ++++++++++++ .../reports/sample3_multiplekeys_nomatch.mlst.json | 12 ++++++++++++ .../data/samplesheets/samplesheet-multiple_keys.csv | 5 +++++ .../samplesheet-multiplekeys_nomatch.csv | 5 +++++ 4 files changed, 34 insertions(+) create mode 100644 tests/data/reports/sample3_multiplekeys.mlst.json create mode 100644 tests/data/reports/sample3_multiplekeys_nomatch.mlst.json create mode 100644 tests/data/samplesheets/samplesheet-multiple_keys.csv create mode 100644 tests/data/samplesheets/samplesheet-multiplekeys_nomatch.csv diff --git a/tests/data/reports/sample3_multiplekeys.mlst.json b/tests/data/reports/sample3_multiplekeys.mlst.json new file mode 100644 index 0000000..5d85e65 --- /dev/null +++ b/tests/data/reports/sample3_multiplekeys.mlst.json @@ -0,0 +1,12 @@ +{ + "extra_key": { + "l1": "1", + "l2": "1", + "l3": "2" + }, + "sample3": { + "l1": "1", + "l2": "1", + "l3": "2" + } +} diff --git a/tests/data/reports/sample3_multiplekeys_nomatch.mlst.json b/tests/data/reports/sample3_multiplekeys_nomatch.mlst.json new file mode 100644 index 0000000..6d7878d --- /dev/null +++ b/tests/data/reports/sample3_multiplekeys_nomatch.mlst.json @@ -0,0 +1,12 @@ +{ + "sample4": { + "l1": "1", + "l2": "1", + "l3": "2" + }, + "extra_key": { + "l1": "1", + "l2": "1", + "l3": "2" + } +} diff --git a/tests/data/samplesheets/samplesheet-multiple_keys.csv b/tests/data/samplesheets/samplesheet-multiple_keys.csv new file mode 100644 index 0000000..867d7d6 --- /dev/null +++ b/tests/data/samplesheets/samplesheet-multiple_keys.csv @@ -0,0 +1,5 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/input_assure/tests/data/reports/sample3_multiplekeys.mlst.json,1.1.2 diff --git a/tests/data/samplesheets/samplesheet-multiplekeys_nomatch.csv b/tests/data/samplesheets/samplesheet-multiplekeys_nomatch.csv new file mode 100644 index 0000000..cdd0bf0 --- /dev/null +++ b/tests/data/samplesheets/samplesheet-multiplekeys_nomatch.csv @@ -0,0 +1,5 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/input_assure/tests/data/reports/sample3_multiplekeys_nomatch.mlst.json,1.1.2 From 8e8ffa446b816d973e5aa04888cc26d75b9c18b4 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 13 Jun 2024 11:21:17 -0400 Subject: [PATCH 17/21] Tests added to handle when there are multiple sample entries (keys) in provided MLST JSON file(s) --- .../irida/multiplekeys_iridanext.output.json | 19 ++++ 
tests/pipelines/main.nf.test | 97 +++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 tests/data/irida/multiplekeys_iridanext.output.json diff --git a/tests/data/irida/multiplekeys_iridanext.output.json b/tests/data/irida/multiplekeys_iridanext.output.json new file mode 100644 index 0000000..f7b872f --- /dev/null +++ b/tests/data/irida/multiplekeys_iridanext.output.json @@ -0,0 +1,19 @@ +{ + "files": { + "global": [], + "samples": { + "sample3": [ + { + "path": "input/sample3_error_report.csv" + } + ] + } + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "1.1.3" + } + } + } +} diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index f1df721..4cec606 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -192,4 +192,101 @@ nextflow_pipeline { } } + test("Testing data removal in MLST JSON with a matching sampleID key."){ + // There are multiple sample entries (keys) in the MLST JSON and one of them matches the sampleID. + // This test evaluates the pipeline's ability to address this issue by removing keys that do not match the sampleID. + + tag "multiple_keys_with_matching_ID" + + when{ + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-multiple_keys.csv" + outdir = "results" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results.txt") + assert actual_calls.text == expected_calls.text + + // Check outputs + def lines = [] + + // Ensure that the error_reports are generated for query and reference samples + lines = path("$launchDir/results/input/sample3_error_report.csv").readLines() + assert lines.contains('sample3,"[\'extra_key\', \'sample3\']","MLST JSON file (sample3_multiplekeys.mlst.json) contains multiple keys: [\'extra_key\', \'sample3\']. The MLST JSON file has been modified to retain only the \'sample3\' entry"') + + // Check filtered query csv results + lines = path("$launchDir/results/filter/new_addresses.csv").readLines() + assert lines.contains("sampleQ,1.1.3") + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/multiplekeys_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_samples.sample3.size() == 1 + assert iridanext_samples.sample3[0].path == 'input/sample3_error_report.csv' + + assert iridanext_metadata.size() == 1 + assert iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.sampleQ."address" == "1.1.3" + } + } + + test("Testing the removal of data in MLST JSON with no sampleID match."){ + // There are multiple sample entries (keys) in the MLST JSON and none of them match the sampleID.. + // This test ensures the pipeline can handle and resolve this issue by retaining only the first JSON key entry and renaming it to match the sampleID. 
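
For reference, the quoting in the assertions of these two tests comes straight from Python's csv module: input_assure.py writes the raw keys list into the JSON_key column, so the field is stringified and, once it contains a comma, wrapped in double quotes. A minimal sketch with invented values:

import csv
import sys

sample_id = "sample3"
keys = ["extra_key", "sample3"]
error_message = "MLST JSON file contains multiple keys."

writer = csv.writer(sys.stdout)
writer.writerow(["sample", "JSON_key", "error_message"])
# Prints: sample3,"['extra_key', 'sample3']",MLST JSON file contains multiple keys.
writer.writerow([sample_id, keys, error_message])
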
+ + tag "multiple_keys_without_matching_ID" + + when{ + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-multiplekeys_nomatch.csv" + outdir = "results" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results.txt") + assert actual_calls.text == expected_calls.text + + // Check outputs + def lines = [] + + // Ensure that the error_reports are generated for query and reference samples + lines = path("$launchDir/results/input/sample3_error_report.csv").readLines() + assert lines.contains("sample3,\"[\'sample4\', \'extra_key\']\",No key in the MLST JSON file (sample3_multiplekeys_nomatch.mlst.json) matches the specified sample ID \'sample3\'. The first key \'sample4\' has been forcefully changed to \'sample3\' and all other keys have been removed.") + + // Check filtered query csv results + lines = path("$launchDir/results/filter/new_addresses.csv").readLines() + assert lines.contains("sampleQ,1.1.3") + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/multiplekeys_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_samples.sample3.size() == 1 + assert iridanext_samples.sample3[0].path == 'input/sample3_error_report.csv' + + assert iridanext_metadata.size() == 1 + assert iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.sampleQ."address" == "1.1.3" + } + } } From 79096738d4e792ceddf399c53d356d10f463bb55 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 13 Jun 2024 16:10:21 -0400 Subject: [PATCH 18/21] Updated input_assure to identify when MLST JSON is empty. Added corresponding test --- bin/input_assure.py | 12 ++++++++---- tests/data/reports/sample2_empty.mlst.json | 1 + .../data/samplesheets/samplesheet_emptyJSON.csv | 6 ++++++ tests/pipelines/main.nf.test | 16 ++++++++++++++++ 4 files changed, 31 insertions(+), 4 deletions(-) create mode 100644 tests/data/reports/sample2_empty.mlst.json create mode 100644 tests/data/samplesheets/samplesheet_emptyJSON.csv diff --git a/bin/input_assure.py b/bin/input_assure.py index 779e888..5e749b5 100755 --- a/bin/input_assure.py +++ b/bin/input_assure.py @@ -4,6 +4,7 @@ import argparse import csv import gzip +import sys def open_file(file_path, mode): @@ -13,7 +14,6 @@ def open_file(file_path, mode): else: return open(file_path, mode) - def check_inputs(json_file, sample_id, address, output_error_file): with open_file(json_file, "rt") as f: json_data = json.load(f) @@ -22,13 +22,17 @@ def check_inputs(json_file, sample_id, address, output_error_file): match_status = sample_id in json_data keys = list(json_data.keys()) - original_key = keys[0] + original_key = keys[0] if keys else None # Initialize the error message error_message = None # Check for multiple keys in the JSON file and define error message - if len(keys) > 1: + if len(keys) == 0: + error_message = f"{json_file} is completely empty!" + print(error_message) + sys.exit(1) + elif len(keys) > 1: # Check if sample_id matches any key if not match_status: error_message = f"No key in the MLST JSON file ({json_file}) matches the specified sample ID '{sample_id}'. 
The first key '{original_key}' has been forcefully changed to '{sample_id}' and all other keys have been removed." @@ -76,4 +80,4 @@ def check_inputs(json_file, sample_id, address, output_error_file): args = parser.parse_args() - check_inputs(args.input, args.sample_id, args.address, args.output_error) + check_inputs(args.input, args.sample_id, args.address, args.output_error) \ No newline at end of file diff --git a/tests/data/reports/sample2_empty.mlst.json b/tests/data/reports/sample2_empty.mlst.json new file mode 100644 index 0000000..0967ef4 --- /dev/null +++ b/tests/data/reports/sample2_empty.mlst.json @@ -0,0 +1 @@ +{} diff --git a/tests/data/samplesheets/samplesheet_emptyJSON.csv b/tests/data/samplesheets/samplesheet_emptyJSON.csv new file mode 100644 index 0000000..efcb1bb --- /dev/null +++ b/tests/data/samplesheets/samplesheet_emptyJSON.csv @@ -0,0 +1,6 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/input_assure/tests/data/reports/sample2_empty.mlst.json,1.1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 + diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index 4cec606..d292d1d 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -289,4 +289,20 @@ nextflow_pipeline { assert iridanext_metadata.sampleQ."address" == "1.1.3" } } + + test("Testing when provided MLST JSON file(s) are empty."){ + tag "empty_JSON" + + when{ + params { + input = "$baseDir/tests/data/samplesheets/samplesheet_emptyJSON.csv" + outdir = "results" + } + } + + then { + assert workflow.failed + assert (workflow.stdout =~ /sample2_empty.mlst.json is completely empty!/).find() + } + } } From da8c82992277aa2903727b5730b95544f92b1097 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 13 Jun 2024 16:12:28 -0400 Subject: [PATCH 19/21] EC issue fix --- bin/input_assure.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/input_assure.py b/bin/input_assure.py index 5e749b5..5fabad4 100755 --- a/bin/input_assure.py +++ b/bin/input_assure.py @@ -14,6 +14,7 @@ def open_file(file_path, mode): else: return open(file_path, mode) + def check_inputs(json_file, sample_id, address, output_error_file): with open_file(json_file, "rt") as f: json_data = json.load(f) @@ -80,4 +81,4 @@ def check_inputs(json_file, sample_id, address, output_error_file): args = parser.parse_args() - check_inputs(args.input, args.sample_id, args.address, args.output_error) \ No newline at end of file + check_inputs(args.input, args.sample_id, args.address, args.output_error) From 6642b72ad0132805bf23083cf9979a8dde965941 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 13 Jun 2024 17:07:04 -0400 Subject: [PATCH 20/21] Create a new JSON output file in input_assure --- bin/input_assure.py | 17 +++++++++++------ modules/local/input_assure/main.nf | 5 +++-- tests/pipelines/main.nf.test | 5 ++--- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/bin/input_assure.py b/bin/input_assure.py index 5fabad4..7926cab 100755 --- a/bin/input_assure.py +++ b/bin/input_assure.py @@ -15,20 +15,20 @@ def open_file(file_path, mode): return open(file_path, mode) -def check_inputs(json_file, sample_id, address, output_error_file): +def 
check_inputs(json_file, sample_id, address, output_error_file, output_json_file): with open_file(json_file, "rt") as f: json_data = json.load(f) # Define a variable to store the match_status (True or False) match_status = sample_id in json_data - keys = list(json_data.keys()) - original_key = keys[0] if keys else None - # Initialize the error message error_message = None # Check for multiple keys in the JSON file and define error message + keys = list(json_data.keys()) + original_key = keys[0] if keys else None + if len(keys) == 0: error_message = f"{json_file} is completely empty!" print(error_message) @@ -60,7 +60,7 @@ def check_inputs(json_file, sample_id, address, output_error_file): writer.writerow([sample_id, keys, error_message]) # Write the updated JSON data back to the original file - with open_file(json_file, "wt") as f: + with open_file(output_json_file, "wt") as f: json.dump(json_data, f, indent=4) @@ -78,7 +78,12 @@ def check_inputs(json_file, sample_id, address, output_error_file): parser.add_argument( "--output_error", help="Path to the error report file.", required=True ) + parser.add_argument( + "--output_json", help="Path to the MLST JSON file.", required=True + ) args = parser.parse_args() - check_inputs(args.input, args.sample_id, args.address, args.output_error) + check_inputs( + args.input, args.sample_id, args.address, args.output_error, args.output_json + ) diff --git a/modules/local/input_assure/main.nf b/modules/local/input_assure/main.nf index e0376ac..dd72bb1 100644 --- a/modules/local/input_assure/main.nf +++ b/modules/local/input_assure/main.nf @@ -10,7 +10,7 @@ process INPUT_ASSURE { tuple val(meta), path(mlst) output: - tuple val(meta), path(mlst), emit: result + tuple val(meta), path("${meta.id}.mlst.json"), emit: result tuple val(meta), path("*_error_report.csv"), optional: true, emit: error_report path("versions.yml"), emit: versions @@ -21,7 +21,8 @@ process INPUT_ASSURE { --input ${mlst} \\ --sample_id ${meta.id} \\ --address ${meta.address} \\ - --output_error ${meta.id}_error_report.csv + --output_error ${meta.id}_error_report.csv \\ + --output_json ${meta.id}.mlst.json cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index d292d1d..b6a5ab8 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -123,9 +123,8 @@ nextflow_pipeline { assert workflow.success assert path("$launchDir/results").exists() - // Check is sample1.mlst.json.gz exists and is gzipped - def gzipped_json = path("$launchDir/results/input/sample1.mlst.json.gz") - assert gzipped_json.exists() + // Check that sample1.mlst.json.gz has been open, read, and that a new file has been generated + assert path("$launchDir/results/input/sample1.mlst.json").exists() // Check called clusters def actual_calls = path("$launchDir/results/call/Called/results.text") From 348fe9558c27e1ef76ba387c4abc31632656c1aa Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Fri, 14 Jun 2024 12:01:48 -0400 Subject: [PATCH 21/21] Ensure MLST JSON files from input_assure are gzipped --- bin/input_assure.py | 4 ++-- modules/local/input_assure/main.nf | 4 ++-- tests/data/irida/test2_iridanext.output.json | 19 ------------------ tests/data/reports/sample1.mlst.json.gz | Bin 84 -> 84 bytes tests/data/samplesheets/samplesheet_test1.csv | 5 ----- tests/data/samplesheets/samplesheet_test2.csv | 7 ------- tests/pipelines/main.nf.test | 4 ++-- 7 files changed, 6 insertions(+), 37 deletions(-) delete mode 100644 
tests/data/irida/test2_iridanext.output.json delete mode 100644 tests/data/samplesheets/samplesheet_test1.csv delete mode 100644 tests/data/samplesheets/samplesheet_test2.csv diff --git a/bin/input_assure.py b/bin/input_assure.py index 7926cab..d99bf2a 100755 --- a/bin/input_assure.py +++ b/bin/input_assure.py @@ -60,7 +60,7 @@ def check_inputs(json_file, sample_id, address, output_error_file, output_json_f writer.writerow([sample_id, keys, error_message]) # Write the updated JSON data back to the original file - with open_file(output_json_file, "wt") as f: + with gzip.open(output_json_file, "wt") as f: json.dump(json_data, f, indent=4) @@ -79,7 +79,7 @@ def check_inputs(json_file, sample_id, address, output_error_file, output_json_f "--output_error", help="Path to the error report file.", required=True ) parser.add_argument( - "--output_json", help="Path to the MLST JSON file.", required=True + "--output_json", help="Path to the MLST JSON file (gzipped).", required=True ) args = parser.parse_args() diff --git a/modules/local/input_assure/main.nf b/modules/local/input_assure/main.nf index dd72bb1..43b7462 100644 --- a/modules/local/input_assure/main.nf +++ b/modules/local/input_assure/main.nf @@ -10,7 +10,7 @@ process INPUT_ASSURE { tuple val(meta), path(mlst) output: - tuple val(meta), path("${meta.id}.mlst.json"), emit: result + tuple val(meta), path("${meta.id}.mlst.json.gz"), emit: result tuple val(meta), path("*_error_report.csv"), optional: true, emit: error_report path("versions.yml"), emit: versions @@ -22,7 +22,7 @@ process INPUT_ASSURE { --sample_id ${meta.id} \\ --address ${meta.address} \\ --output_error ${meta.id}_error_report.csv \\ - --output_json ${meta.id}.mlst.json + --output_json ${meta.id}.mlst.json.gz cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/tests/data/irida/test2_iridanext.output.json b/tests/data/irida/test2_iridanext.output.json deleted file mode 100644 index 2882954..0000000 --- a/tests/data/irida/test2_iridanext.output.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "files": { - "global": [], - "samples": { - "sampleR": [ - { - "path": "input/sampleR_error_report.csv" - } - ] - } - }, - "metadata": { - "samples": { - "sampleQ": { - "address": "1.1.3" - } - } - } -} diff --git a/tests/data/reports/sample1.mlst.json.gz b/tests/data/reports/sample1.mlst.json.gz index 94f25c81407bfee01777e59fb4af80f5998dbf30..735e1082b5193673c4844e4f4558af8e8206a12f 100644 GIT binary patch delta 13 UcmWFuVVCdb;5f#WHIdyH02iwRi2wiq delta 13 UcmWFuVVCdb;5hm}b0WJh032!rd;kCd diff --git a/tests/data/samplesheets/samplesheet_test1.csv b/tests/data/samplesheets/samplesheet_test1.csv deleted file mode 100644 index cf87b26..0000000 --- a/tests/data/samplesheets/samplesheet_test1.csv +++ /dev/null @@ -1,5 +0,0 @@ -sample,mlst_alleles,address -sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleF.mlst.json, -sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 -sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample7.mlst.json,1.1.1 -sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 diff --git a/tests/data/samplesheets/samplesheet_test2.csv b/tests/data/samplesheets/samplesheet_test2.csv deleted file mode 100644 index 036c317..0000000 --- a/tests/data/samplesheets/samplesheet_test2.csv +++ /dev/null @@ -1,7 +0,0 @@ -sample,mlst_alleles,address 
-sampleR,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleF.mlst.json,
-sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json,
-sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1
-sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1
-sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2
-
diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test
index b6a5ab8..6716dae 100644
--- a/tests/pipelines/main.nf.test
+++ b/tests/pipelines/main.nf.test
@@ -123,8 +123,8 @@ nextflow_pipeline {
         assert workflow.success
         assert path("$launchDir/results").exists()

-        // Check that sample1.mlst.json.gz has been open, read, and that a new file has been generated
-        assert path("$launchDir/results/input/sample1.mlst.json").exists()
+        // Check that sample1.mlst.json.gz has been opened and read, and that a new gzipped file has been generated
+        assert path("$launchDir/results/input/sample1.mlst.json.gz").exists()
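
Taken together, the series leaves input_assure.py with roughly the following contract. This is a condensed sketch, not the shipped code; the function names are invented and the error-report CSV writing is omitted:

import gzip
import json
import sys

def load_and_reconcile(path, sample_id):
    # Gzip-aware read, as introduced in PATCH 01.
    opener = gzip.open if path.endswith(".gz") else open
    with opener(path, "rt") as f:
        data = json.load(f)
    # Fail fast on an empty MLST JSON, as introduced in PATCH 18.
    if not data:
        print(f"{path} is completely empty!")
        sys.exit(1)
    # Reconcile JSON keys with the sample ID, per PATCH 01 and PATCH 07.
    if sample_id in data:
        data = {sample_id: data[sample_id]}
    else:
        data = {sample_id: data.pop(list(data.keys())[0])}
    return data

def write_result(data, out_path):
    # Always emit a gzipped JSON named after the sample, per PATCH 20 and PATCH 21.
    with gzip.open(out_path, "wt") as f:
        json.dump(data, f, indent=4)

Under this contract the emitted file name always matches meta.id, which is what the updated process output path("${meta.id}.mlst.json.gz") captures.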