From be3ba2671b0e9358d3031c917560bfe7539bc5dc Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 6 Jun 2024 16:45:58 -0400 Subject: [PATCH 01/21] Revise 'input_check' to 'input_assure'; enforce JSON key alteration to match the sample ID if a mismatch is detected --- bin/input_check.py | 24 +++++++++++++++---- .../{input_check => input_assure}/main.nf | 2 +- workflows/gas_nomenclature.nf | 19 ++++----------- 3 files changed, 24 insertions(+), 21 deletions(-) rename modules/local/{input_check => input_assure}/main.nf (97%) diff --git a/bin/input_check.py b/bin/input_check.py index a21f0c7..bd7ac70 100755 --- a/bin/input_check.py +++ b/bin/input_check.py @@ -4,11 +4,18 @@ import argparse import sys import csv +import gzip +def open_file(file_path, mode): + # Open a file based on the file extension + if file_path.endswith('.gz'): + return gzip.open(file_path, mode) + else: + return open(file_path, mode) def check_inputs(json_file, sample_id, address, output_match_file, output_error_file): # Define a variable to store the match_status (True or False) - with open(json_file, "r") as f: + with open(json_file, "rt") as f: json_data = json.load(f) match_status = sample_id in json_data @@ -16,23 +23,30 @@ def check_inputs(json_file, sample_id, address, output_match_file, output_error_ with open(output_match_file, "w") as f: f.write(str(match_status)) + # Define the original key in the JSON data + original_key = list(json_data.keys())[0] + # Define error message based on meta.address (query or reference) if address == "null": - error_message = f"Query {sample_id} removed from pipeline" + error_message = f"Query {sample_id} ID did not match the JSON key in {json_file} - User must manually check input files to ensure correctness." else: - error_message = f"Pipeline stopped: Reference {sample_id}'s input ID and MLST JSON file key DO NOT MATCH" + error_message = f"Reference {sample_id}'s sample ID and JSON key in {json_file} DO NOT MATCH: the '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}'." # Write sample ID and JSON key to error report CSV if not matched; include error message if not match_status: with open(output_error_file, "w", newline="") as f: writer = csv.writer(f) writer.writerow(["sample", "JSON_key", "error_message"]) - writer.writerow([sample_id, list(json_data.keys())[0], error_message]) + writer.writerow([sample_id, original_key, error_message]) + # Update the JSON file with the new sample ID + json_data[sample_id] = json_data.pop(original_key) + with open(json_file, "wt") as f: + json.dump(json_data, f, indent=4) if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Check sample inputs and generate an error report." + description="Check sample inputs, force change if ID ≠ KEY, and generate an error report." 
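
A standalone sketch of the re-keying behaviour introduced above. The sample names, toy profile, and temporary path are invented for illustration; only the Python standard library is assumed:

import gzip
import json
import os
import tempfile

# Write a toy gzipped MLST JSON whose only key does not match the sample ID.
path = os.path.join(tempfile.mkdtemp(), "sample2.mlst.json.gz")
with gzip.open(path, "wt") as f:
    json.dump({"sample7": {"l1": "1", "l2": "1"}}, f)

def open_file(file_path, mode):
    # Same dispatch as the patch: gzip for .gz inputs, plain open otherwise.
    if file_path.endswith(".gz"):
        return gzip.open(file_path, mode)
    else:
        return open(file_path, mode)

sample_id = "sample2"
with open_file(path, "rt") as f:
    json_data = json.load(f)

if sample_id not in json_data:
    # Re-key the record under the sample ID, mirroring the forced rename above.
    original_key = list(json_data.keys())[0]
    json_data[sample_id] = json_data.pop(original_key)

print(json_data)  # {'sample2': {'l1': '1', 'l2': '1'}}

The same gzip round trip is what the gzipped-input test added later in the series exercises.
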
) parser.add_argument("--input", help="Path to the mlst.json file.", required=True) parser.add_argument( diff --git a/modules/local/input_check/main.nf b/modules/local/input_assure/main.nf similarity index 97% rename from modules/local/input_check/main.nf rename to modules/local/input_assure/main.nf index 79a2242..90260a2 100644 --- a/modules/local/input_check/main.nf +++ b/modules/local/input_assure/main.nf @@ -1,4 +1,4 @@ -process INPUT_CHECK{ +process INPUT_ASSURE { tag "Check Sample Inputs and Generate Error Report" label 'process_single' diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index b0fb977..dfa029d 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -22,7 +22,7 @@ include { paramsSummaryLog; paramsSummaryMap; fromSamplesheet } from 'plugin/nf // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from "../modules/local/input_check/main" +include { INPUT_ASSURE } from "../modules/local/input_assure/main" include { LOCIDEX_MERGE as LOCIDEX_MERGE_REF } from "../modules/local/locidex/merge/main" include { LOCIDEX_MERGE as LOCIDEX_MERGE_QUERY } from "../modules/local/locidex/merge/main" include { PROFILE_DISTS } from "../modules/local/profile_dists/main" @@ -72,7 +72,7 @@ workflow GAS_NOMENCLATURE { input = Channel.fromSamplesheet("input") // Ensure meta.id and mlst_file keys match; generate error report for samples where id ≠ key - id_key = INPUT_CHECK(input) + id_key = INPUT_ASSURE(input) ch_versions = ch_versions.mix(id_key.versions) // Update metadata to include the id_key.match data @@ -80,21 +80,10 @@ workflow GAS_NOMENCLATURE { def id_match = file.text.trim() [meta + [id_match: id_match == 'True'], json] } - - // If samples have a disparity between meta.id and JSON key: Exclude the queried samples OR halt the pipeline with an error if sample has an associated cluster address (reference) - new_input = match.filter { meta, json -> - if (meta.id_match) { - return true // Keep the sample - } else if (meta.address == null && !meta.id_match) { - return false // Remove the sample - } else if (meta.address != null && !meta.id_match) { - // Exit with error statement - throw new RuntimeException("Pipeline exiting: sample with ID ${meta.id} does not have matching MLST JSON file.") - } - } + match.view() // Prepare reference and query TSV files for LOCIDEX_MERGE - profiles = new_input.branch{ + profiles = match.branch { query: !it[0].address } reference_values = input.collect{ meta, profile -> profile} From 95e40f6979ab60cbe6f8531bb72996c45cf1c56f Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Fri, 7 Jun 2024 08:47:18 -0400 Subject: [PATCH 02/21] Remove id_match from meta --- bin/input_check.py | 11 ++--------- modules/local/input_assure/main.nf | 5 ++--- workflows/gas_nomenclature.nf | 9 +-------- 3 files changed, 5 insertions(+), 20 deletions(-) diff --git a/bin/input_check.py b/bin/input_check.py index bd7ac70..19c099d 100755 --- a/bin/input_check.py +++ b/bin/input_check.py @@ -13,16 +13,12 @@ def open_file(file_path, mode): else: return open(file_path, mode) -def check_inputs(json_file, sample_id, address, output_match_file, output_error_file): +def check_inputs(json_file, sample_id, address, output_error_file): # Define a variable to store the match_status (True or False) with open(json_file, "rt") as f: json_data = json.load(f) match_status = sample_id in json_data - # Write match status to file - with open(output_match_file, "w") as f: - f.write(str(match_status)) - # Define the 
original key in the JSON data original_key = list(json_data.keys())[0] @@ -58,12 +54,9 @@ def check_inputs(json_file, sample_id, address, output_match_file, output_error_ parser.add_argument( "--output_error", help="Path to the error report file.", required=True ) - parser.add_argument( - "--output_match", help="Path to the match status file.", required=True - ) args = parser.parse_args() check_inputs( - args.input, args.sample_id, args.address, args.output_match, args.output_error + args.input, args.sample_id, args.address, args.output_error ) diff --git a/modules/local/input_assure/main.nf b/modules/local/input_assure/main.nf index 90260a2..1b22242 100644 --- a/modules/local/input_assure/main.nf +++ b/modules/local/input_assure/main.nf @@ -10,7 +10,7 @@ process INPUT_ASSURE { tuple val(meta), path(mlst) output: - tuple val(meta), path("${meta.id}_match.txt"), path(mlst), emit: match + tuple val(meta), path(mlst), emit: match tuple val(meta), path("*_error_report.csv"), optional: true, emit: error_report path("versions.yml"), emit: versions @@ -21,8 +21,7 @@ process INPUT_ASSURE { --input ${mlst} \\ --sample_id ${meta.id} \\ --address ${meta.address} \\ - --output_error ${meta.id}_error_report.csv \\ - --output_match ${meta.id}_match.txt + --output_error ${meta.id}_error_report.csv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index dfa029d..d527777 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -75,15 +75,8 @@ workflow GAS_NOMENCLATURE { id_key = INPUT_ASSURE(input) ch_versions = ch_versions.mix(id_key.versions) - // Update metadata to include the id_key.match data - match = id_key.match.map { meta, file, json -> - def id_match = file.text.trim() - [meta + [id_match: id_match == 'True'], json] - } - match.view() - // Prepare reference and query TSV files for LOCIDEX_MERGE - profiles = match.branch { + profiles = id_key.match.branch { query: !it[0].address } reference_values = input.collect{ meta, profile -> profile} From 9e20417c028621a6c18fd8011aef9bfbb1885956 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Fri, 7 Jun 2024 08:49:36 -0400 Subject: [PATCH 03/21] Fix linting --- modules/local/input_assure/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/input_assure/main.nf b/modules/local/input_assure/main.nf index 1b22242..c01c319 100644 --- a/modules/local/input_assure/main.nf +++ b/modules/local/input_assure/main.nf @@ -21,7 +21,7 @@ process INPUT_ASSURE { --input ${mlst} \\ --sample_id ${meta.id} \\ --address ${meta.address} \\ - --output_error ${meta.id}_error_report.csv + --output_error ${meta.id}_error_report.csv cat <<-END_VERSIONS > versions.yml "${task.process}": From deb43495bf65e0e599d4f645f6ca389f6698fcdc Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Fri, 7 Jun 2024 09:26:53 -0400 Subject: [PATCH 04/21] Updated error_message from input_assure --- bin/input_check.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/input_check.py b/bin/input_check.py index 19c099d..5c8365d 100755 --- a/bin/input_check.py +++ b/bin/input_check.py @@ -24,9 +24,9 @@ def check_inputs(json_file, sample_id, address, output_error_file): # Define error message based on meta.address (query or reference) if address == "null": - error_message = f"Query {sample_id} ID did not match the JSON key in {json_file} - User must manually check input files to ensure correctness." 
+ error_message = f"Query {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." else: - error_message = f"Reference {sample_id}'s sample ID and JSON key in {json_file} DO NOT MATCH: the '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}'." + error_message = f"Reference {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." # Write sample ID and JSON key to error report CSV if not matched; include error message if not match_status: From 07fe2c66dee1631279630a3bdda33281f30b4b2b Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 10 Jun 2024 16:52:24 -0400 Subject: [PATCH 05/21] Update python script name to match process: input_assure.py --- bin/{input_check.py => input_assure.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename bin/{input_check.py => input_assure.py} (100%) diff --git a/bin/input_check.py b/bin/input_assure.py similarity index 100% rename from bin/input_check.py rename to bin/input_assure.py From 23c1397efd2c31f8af2588a846809af16a81a0fc Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 10 Jun 2024 16:54:35 -0400 Subject: [PATCH 06/21] Add 'fair = true' to input_assure process in modules.config for reproducibility --- conf/modules.config | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index a898c53..00855c7 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -13,8 +13,6 @@ process { // Publish directory names - assembly_directory_name = "assembly" - summary_directory_name = "summary" profile_dists_directory_name = "distances" gas_call_directory_name = "call" @@ -27,6 +25,10 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] + withName: INPUT_ASSURE { + fair = true + } + withName: LOCIDEX_MERGE_REF { publishDir = [ path: locidex_merge_ref_directory_name, From c7252cfeb77a4f32fce0f0811e711961ce07b19a Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 10 Jun 2024 16:56:49 -0400 Subject: [PATCH 07/21] Update input_assure.py to include additional check for multiple keys --- bin/input_assure.py | 52 +++++++++++++++++++----------- modules/local/input_assure/main.nf | 6 ++-- workflows/gas_nomenclature.nf | 10 +++--- 3 files changed, 42 insertions(+), 26 deletions(-) diff --git a/bin/input_assure.py b/bin/input_assure.py index 5c8365d..2705eae 100755 --- a/bin/input_assure.py +++ b/bin/input_assure.py @@ -2,7 +2,6 @@ import json import argparse -import sys import csv import gzip @@ -14,31 +13,48 @@ def open_file(file_path, mode): return open(file_path, mode) def check_inputs(json_file, sample_id, address, output_error_file): - # Define a variable to store the match_status (True or False) - with open(json_file, "rt") as f: + with open_file(json_file, "rt") as f: json_data = json.load(f) - match_status = sample_id in json_data - # Define the original key in the JSON data - original_key = list(json_data.keys())[0] + # Define a variable to store the match_status (True or False) + match_status = sample_id in json_data + + keys = list (json_data.keys()) + original_key = keys[0] + # Initialize the error message + error_message = None + + # Check for multiple keys in the JSON file and define error message + if len(keys) > 1: + # Check if sample_id matches any key + if not match_status: + error_message = f"No key in the MLST JSON file ({json_file}) matches the specified sample ID '{sample_id}'. The first key '{original_key}' has been forcefully changed to '{sample_id}' and all other keys have been removed." + # Retain only the specified sample ID + json_data = {sample_id: json_data.pop(original_key)} + else: + error_message = f"MLST JSON file ({json_file}) contains multiple keys: {keys}. The MLST JSON file has been modified to retain only the '{sample_id}' entry" + # Remove all keys expect the one matching sample_id + json_data = {sample_id: json_data[sample_id]} + elif not match_status: # Define error message based on meta.address (query or reference) - if address == "null": - error_message = f"Query {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." - else: - error_message = f"Reference {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." - - # Write sample ID and JSON key to error report CSV if not matched; include error message - if not match_status: + if address == "null": + error_message = f"Query {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." + else: + error_message = f"Reference {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." 
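
The multiple-key branch earlier in this hunk reduces a multi-keyed profile to a single entry. A minimal sketch of both outcomes, with invented toy data:

import json

json_data = {"extra_key": {"l1": "1"}, "sample3": {"l1": "1"}}
sample_id = "sample3"
keys = list(json_data.keys())

if len(keys) > 1:
    if sample_id in json_data:
        # A key matches the sample ID: drop everything else.
        json_data = {sample_id: json_data[sample_id]}
    else:
        # No key matches: re-label the first entry and drop the rest.
        json_data = {sample_id: json_data.pop(keys[0])}

print(json.dumps(json_data))  # {"sample3": {"l1": "1"}}
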
+ # Update the JSON file with the new sample ID + json_data[sample_id] = json_data.pop(original_key) + + # Write file containing relevant error messages + if error_message: with open(output_error_file, "w", newline="") as f: writer = csv.writer(f) writer.writerow(["sample", "JSON_key", "error_message"]) - writer.writerow([sample_id, original_key, error_message]) + writer.writerow([sample_id, keys, error_message]) - # Update the JSON file with the new sample ID - json_data[sample_id] = json_data.pop(original_key) - with open(json_file, "wt") as f: - json.dump(json_data, f, indent=4) + # Write the updated JSON data back to the original file + with open_file(json_file, "wt") as f: + json.dump(json_data, f, indent=4) if __name__ == "__main__": parser = argparse.ArgumentParser( diff --git a/modules/local/input_assure/main.nf b/modules/local/input_assure/main.nf index c01c319..e0376ac 100644 --- a/modules/local/input_assure/main.nf +++ b/modules/local/input_assure/main.nf @@ -1,5 +1,5 @@ process INPUT_ASSURE { - tag "Check Sample Inputs and Generate Error Report" + tag "Assures Inputs are Consistent" label 'process_single' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? @@ -10,14 +10,14 @@ process INPUT_ASSURE { tuple val(meta), path(mlst) output: - tuple val(meta), path(mlst), emit: match + tuple val(meta), path(mlst), emit: result tuple val(meta), path("*_error_report.csv"), optional: true, emit: error_report path("versions.yml"), emit: versions script: """ - input_check.py \\ + input_assure.py \\ --input ${mlst} \\ --sample_id ${meta.id} \\ --address ${meta.address} \\ diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index d527777..4531ff1 100644 --- a/workflows/gas_nomenclature.nf +++ b/workflows/gas_nomenclature.nf @@ -72,15 +72,15 @@ workflow GAS_NOMENCLATURE { input = Channel.fromSamplesheet("input") // Ensure meta.id and mlst_file keys match; generate error report for samples where id ≠ key - id_key = INPUT_ASSURE(input) - ch_versions = ch_versions.mix(id_key.versions) + input_assure = INPUT_ASSURE(input) + ch_versions = ch_versions.mix(input_assure.versions) // Prepare reference and query TSV files for LOCIDEX_MERGE - profiles = id_key.match.branch { + profiles = input_assure.result.branch { query: !it[0].address } - reference_values = input.collect{ meta, profile -> profile} - query_values = profiles.query.collect{ meta, profile -> profile } + reference_values = input_assure.result.collect{ meta, mlst -> mlst} + query_values = profiles.query.collect{ meta, mlst -> mlst } // LOCIDEX modules ref_tag = Channel.value("ref") From f7ed9d3325e78da2a394d9f5d96b70e10f904a33 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Mon, 10 Jun 2024 16:59:14 -0400 Subject: [PATCH 08/21] Fixed linting issues --- bin/input_assure.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/bin/input_assure.py b/bin/input_assure.py index 2705eae..779e888 100755 --- a/bin/input_assure.py +++ b/bin/input_assure.py @@ -5,46 +5,48 @@ import csv import gzip + def open_file(file_path, mode): # Open a file based on the file extension - if file_path.endswith('.gz'): + if file_path.endswith(".gz"): return gzip.open(file_path, mode) else: return open(file_path, mode) + def check_inputs(json_file, sample_id, address, output_error_file): with open_file(json_file, "rt") as f: json_data = json.load(f) # Define a variable to store the match_status (True or False) match_status = sample_id in json_data - 
- keys = list (json_data.keys()) + + keys = list(json_data.keys()) original_key = keys[0] # Initialize the error message - error_message = None - + error_message = None + # Check for multiple keys in the JSON file and define error message if len(keys) > 1: # Check if sample_id matches any key if not match_status: error_message = f"No key in the MLST JSON file ({json_file}) matches the specified sample ID '{sample_id}'. The first key '{original_key}' has been forcefully changed to '{sample_id}' and all other keys have been removed." - # Retain only the specified sample ID + # Retain only the specified sample ID json_data = {sample_id: json_data.pop(original_key)} else: error_message = f"MLST JSON file ({json_file}) contains multiple keys: {keys}. The MLST JSON file has been modified to retain only the '{sample_id}' entry" # Remove all keys expect the one matching sample_id - json_data = {sample_id: json_data[sample_id]} + json_data = {sample_id: json_data[sample_id]} elif not match_status: - # Define error message based on meta.address (query or reference) + # Define error message based on meta.address (query or reference) if address == "null": error_message = f"Query {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." else: error_message = f"Reference {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." # Update the JSON file with the new sample ID json_data[sample_id] = json_data.pop(original_key) - + # Write file containing relevant error messages if error_message: with open(output_error_file, "w", newline="") as f: @@ -52,10 +54,11 @@ def check_inputs(json_file, sample_id, address, output_error_file): writer.writerow(["sample", "JSON_key", "error_message"]) writer.writerow([sample_id, keys, error_message]) - # Write the updated JSON data back to the original file + # Write the updated JSON data back to the original file with open_file(json_file, "wt") as f: json.dump(json_data, f, indent=4) + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Check sample inputs, force change if ID ≠ KEY, and generate an error report." @@ -73,6 +76,4 @@ def check_inputs(json_file, sample_id, address, output_error_file): args = parser.parse_args() - check_inputs( - args.input, args.sample_id, args.address, args.output_error - ) + check_inputs(args.input, args.sample_id, args.address, args.output_error) From 7592bd3bea310522d45de6725804a6f3093050c5 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 12 Jun 2024 15:13:49 -0400 Subject: [PATCH 09/21] Resolve conflicts between dev and input_assure --- modules/local/input_assure/main.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/local/input_assure/main.nf b/modules/local/input_assure/main.nf index 30dfdaa..e0376ac 100644 --- a/modules/local/input_assure/main.nf +++ b/modules/local/input_assure/main.nf @@ -1,7 +1,6 @@ process INPUT_ASSURE { tag "Assures Inputs are Consistent" label 'process_single' - fair true container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/python:3.8.3' : From 32663302aa75ba71c3b606c08b2ec62e3dea3c03 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 12 Jun 2024 16:08:44 -0400 Subject: [PATCH 10/21] Add test with gzipped MLST JSON file --- tests/data/reports/sample1.mlst.json.gz | Bin 0 -> 84 bytes tests/data/samplesheets/samplesheet_gzip.csv | 5 +++ tests/pipelines/main.nf.test | 35 +++++++++++++++++++ 3 files changed, 40 insertions(+) create mode 100644 tests/data/reports/sample1.mlst.json.gz create mode 100644 tests/data/samplesheets/samplesheet_gzip.csv diff --git a/tests/data/reports/sample1.mlst.json.gz b/tests/data/reports/sample1.mlst.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..94f25c81407bfee01777e59fb4af80f5998dbf30 GIT binary patch literal 84 zcmb2|=HNK`KQoQ#e{o`NK~AcnUT#itiC$K5ejdZBXF&{1OCKG%(=;c{ Date: Wed, 12 Jun 2024 16:56:07 -0400 Subject: [PATCH 11/21] Added test for mismatched IDs --- .../irida/mismatched_iridanext.output.json | 29 ++++++++ .../samplesheet-mismatched_IDs.csv | 7 ++ tests/pipelines/main.nf.test | 66 +++++++------------ 3 files changed, 61 insertions(+), 41 deletions(-) create mode 100644 tests/data/irida/mismatched_iridanext.output.json create mode 100644 tests/data/samplesheets/samplesheet-mismatched_IDs.csv diff --git a/tests/data/irida/mismatched_iridanext.output.json b/tests/data/irida/mismatched_iridanext.output.json new file mode 100644 index 0000000..ec418dc --- /dev/null +++ b/tests/data/irida/mismatched_iridanext.output.json @@ -0,0 +1,29 @@ +{ + "files": { + "global": [ + + ], + "samples": { + "sampleR": [ + { + "path": "input/sampleR_error_report.csv" + } + ], + "sample2": [ + { + "path": "input/sample2_error_report.csv" + } + ] + } + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "2.2.3" + }, + "sampleR": { + "address": "2.2.3" + } + } + } +} \ No newline at end of file diff --git a/tests/data/samplesheets/samplesheet-mismatched_IDs.csv b/tests/data/samplesheets/samplesheet-mismatched_IDs.csv new file mode 100644 index 0000000..73230d4 --- /dev/null +++ b/tests/data/samplesheets/samplesheet-mismatched_IDs.csv @@ -0,0 +1,7 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sampleR,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleF.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample7.mlst.json,1.1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 + diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index d47b7ed..9892b40 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -105,12 +105,12 @@ nextflow_pipeline { assert iridanext_metadata.containsKey("sampleN") assert iridanext_metadata.sampleQ."address" == "2.2.3" - assert iridanext_metadata.sampleN.address == "2.2.3" + assert iridanext_metadata.sampleN."address" == "2.2.3" } } test("Small-scale test of full pipeline with gzipped MLST JSON"){ - tag "pipeline_success_gzipped_JSON" + tag "Gzipped_MLST_JSON" when{ params { @@ -144,37 +144,15 @@ nextflow_pipeline { } } - test("Integration test where input contains reference sample with mismatched MLST JSON file"){ - tag "pipeline_failure" - - when { - params { - input = 
"$baseDir/tests/data/samplesheets/samplesheet_test1.csv" - outdir = "results" - } - } - - then { - assert workflow.failed - assert (workflow.stdout =~ /Pipeline exiting: sample with ID sample2 does not have matching MLST JSON file./).find() - - assert path("$launchDir/results").exists() - assert path("$launchDir/results/input").exists() - - // Ensure that despite pipeline failure, error_reports are generated for all samples added to pipeline (i.e. sampleQ query) - def lines = [] - - lines = path("$launchDir/results/input/sample2_error_report.csv").readLines() - assert lines.contains("sample2,sample7,Pipeline stopped: Reference sample2's input ID and MLST JSON file key DO NOT MATCH") - } - } - - test("Integration test where input contains a single query sample with mismatched MLST JSON file"){ - tag "pipeline_success_after_query_removal" + test("Testing when query and reference sample IDs are mismatched with MLST JSON file keys"){ + // IDs in the sample sheet and IDs in the individual MLST JSON files will not match. + // This tests the pipelines ability to handle and correct for this problem. + + tag "mismatched_IDs" when{ params { - input = "$baseDir/tests/data/samplesheets/samplesheet_test2.csv" + input = "$baseDir/tests/data/samplesheets/samplesheet-mismatched_IDs.csv" outdir = "results" } } @@ -182,29 +160,35 @@ nextflow_pipeline { then { assert workflow.success assert path("$launchDir/results").exists() - assert path("$launchDir/results/input").exists() - assert path("$launchDir/results/filter").exists() - + // Check outputs def lines = [] - // Ensure that the error_report is generated for removed query sampleR + // Ensure that the error_reports are generated for query and reference samples + lines = path("$launchDir/results/input/sample2_error_report.csv").readLines() + assert lines.contains("sample2,[\'sample7\'],Reference sample2 ID and JSON key in sample7.mlst.json DO NOT MATCH. The 'sample7' key in sample7.mlst.json has been forcefully changed to 'sample2': User should manually check input files to ensure correctness.") + lines = path("$launchDir/results/input/sampleR_error_report.csv").readLines() - assert lines.contains("sampleR,sampleF,Query sampleR removed from pipeline") + assert lines.contains("sampleR,[\'sampleF\'],Query sampleR ID and JSON key in sampleF.mlst.json DO NOT MATCH. 
The 'sampleF' key in sampleF.mlst.json has been forcefully changed to 'sampleR': User should manually check input files to ensure correctness.") - // Check query output csv + // Check filter_query csv file lines = path("$launchDir/results/filter/new_addresses.csv").readLines() - assert lines.contains("sampleQ,1.1.3") + assert lines.contains("sampleQ,2.2.3") + assert lines.contains("sampleR,2.2.3") - // Check IRIDA Next JSON output - assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/test2_iridanext.output.json").json + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/mismatched_iridanext.output.json").json def iridanext_json = path("$launchDir/results/iridanext.output.json").json def iridanext_samples = iridanext_json.files.samples def iridanext_metadata = iridanext_json.metadata.samples - assert iridanext_samples.sampleR.findAll { it.path == "input/sampleR_error_report.csv" }.size() == 1 - assert iridanext_metadata.sampleQ."address" == "1.1.3" + assert iridanext_metadata.size() == 2 + assert iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.containsKey("sampleR") + + assert iridanext_metadata.sampleQ."address" == "2.2.3" + assert iridanext_metadata.sampleR."address" == "2.2.3" } } From 001709087fdbb07d23aa25193d2379dea17dcbe3 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 12 Jun 2024 16:59:27 -0400 Subject: [PATCH 12/21] Update paths in samplesheet --- tests/data/samplesheets/samplesheet_gzip.csv | 2 +- tests/pipelines/main.nf.test | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/data/samplesheets/samplesheet_gzip.csv b/tests/data/samplesheets/samplesheet_gzip.csv index a4b5bad..2337c78 100644 --- a/tests/data/samplesheets/samplesheet_gzip.csv +++ b/tests/data/samplesheets/samplesheet_gzip.csv @@ -1,5 +1,5 @@ sample,mlst_alleles,address sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, -sample1,/root/working_directory/gas/gasnomenclature/tests/data/reports/sample1.mlst.json.gz,1.1.1 +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/input_assure/tests/data/reports/sample1.mlst.json.gz,1.1.1 sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index 9892b40..77c2672 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -166,7 +166,7 @@ nextflow_pipeline { // Ensure that the error_reports are generated for query and reference samples lines = path("$launchDir/results/input/sample2_error_report.csv").readLines() - assert lines.contains("sample2,[\'sample7\'],Reference sample2 ID and JSON key in sample7.mlst.json DO NOT MATCH. The 'sample7' key in sample7.mlst.json has been forcefully changed to 'sample2': User should manually check input files to ensure correctness.") + #assert lines.contains("sample2,[\'sample7\'],Reference sample2 ID and JSON key in sample7.mlst.json DO NOT MATCH. 
The 'sample7' key in sample7.mlst.json has been forcefully changed to 'sample2': User should manually check input files to ensure correctness.") lines = path("$launchDir/results/input/sampleR_error_report.csv").readLines() assert lines.contains("sampleR,[\'sampleF\'],Query sampleR ID and JSON key in sampleF.mlst.json DO NOT MATCH. The 'sampleF' key in sampleF.mlst.json has been forcefully changed to 'sampleR': User should manually check input files to ensure correctness.") From 3f181eb99c1e458977b03abdbdc83dad2ba2610e Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 12 Jun 2024 17:00:48 -0400 Subject: [PATCH 13/21] Fix EC issues --- tests/pipelines/main.nf.test | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index 77c2672..43449ea 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -126,7 +126,7 @@ nextflow_pipeline { // Check is sample1.mlst.json.gz exists and is gzipped def gzipped_json = path("$launchDir/results/input/sample1.mlst.json.gz") assert gzipped_json.exists() - + // Check called clusters def actual_calls = path("$launchDir/results/call/Called/results.text") def expected_calls = path("$baseDir/tests/data/called/expected_results.txt") @@ -147,7 +147,7 @@ nextflow_pipeline { test("Testing when query and reference sample IDs are mismatched with MLST JSON file keys"){ // IDs in the sample sheet and IDs in the individual MLST JSON files will not match. // This tests the pipelines ability to handle and correct for this problem. - + tag "mismatched_IDs" when{ @@ -160,14 +160,14 @@ nextflow_pipeline { then { assert workflow.success assert path("$launchDir/results").exists() - + // Check outputs def lines = [] // Ensure that the error_reports are generated for query and reference samples lines = path("$launchDir/results/input/sample2_error_report.csv").readLines() #assert lines.contains("sample2,[\'sample7\'],Reference sample2 ID and JSON key in sample7.mlst.json DO NOT MATCH. The 'sample7' key in sample7.mlst.json has been forcefully changed to 'sample2': User should manually check input files to ensure correctness.") - + lines = path("$launchDir/results/input/sampleR_error_report.csv").readLines() assert lines.contains("sampleR,[\'sampleF\'],Query sampleR ID and JSON key in sampleF.mlst.json DO NOT MATCH. 
The 'sampleF' key in sampleF.mlst.json has been forcefully changed to 'sampleR': User should manually check input files to ensure correctness.") @@ -176,7 +176,7 @@ nextflow_pipeline { assert lines.contains("sampleQ,2.2.3") assert lines.contains("sampleR,2.2.3") - // Check IRIDA Next JSON output + // Check IRIDA Next JSON output assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/mismatched_iridanext.output.json").json def iridanext_json = path("$launchDir/results/iridanext.output.json").json From 1f525294704b6b8a7c9110e2bdd35085578e8ddf Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 12 Jun 2024 17:02:29 -0400 Subject: [PATCH 14/21] Fix EC issues --- tests/data/irida/mismatched_iridanext.output.json | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/data/irida/mismatched_iridanext.output.json b/tests/data/irida/mismatched_iridanext.output.json index ec418dc..750523b 100644 --- a/tests/data/irida/mismatched_iridanext.output.json +++ b/tests/data/irida/mismatched_iridanext.output.json @@ -1,8 +1,6 @@ { "files": { - "global": [ - - ], + "global": [], "samples": { "sampleR": [ { @@ -26,4 +24,4 @@ } } } -} \ No newline at end of file +} From ec347e46b808810f5ebfd6fc532aed832f4e91d3 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Wed, 12 Jun 2024 17:08:30 -0400 Subject: [PATCH 15/21] Removed unexpected character (#) in main.nf.test --- tests/pipelines/main.nf.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index 43449ea..f1df721 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -166,7 +166,7 @@ nextflow_pipeline { // Ensure that the error_reports are generated for query and reference samples lines = path("$launchDir/results/input/sample2_error_report.csv").readLines() - #assert lines.contains("sample2,[\'sample7\'],Reference sample2 ID and JSON key in sample7.mlst.json DO NOT MATCH. The 'sample7' key in sample7.mlst.json has been forcefully changed to 'sample2': User should manually check input files to ensure correctness.") + assert lines.contains("sample2,[\'sample7\'],Reference sample2 ID and JSON key in sample7.mlst.json DO NOT MATCH. The 'sample7' key in sample7.mlst.json has been forcefully changed to 'sample2': User should manually check input files to ensure correctness.") lines = path("$launchDir/results/input/sampleR_error_report.csv").readLines() assert lines.contains("sampleR,[\'sampleF\'],Query sampleR ID and JSON key in sampleF.mlst.json DO NOT MATCH. 
The 'sampleF' key in sampleF.mlst.json has been forcefully changed to 'sampleR': User should manually check input files to ensure correctness.") From 7c1b5dc31536243a5870ed050f5b90286b33ff67 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 13 Jun 2024 10:13:08 -0400 Subject: [PATCH 16/21] Add test data for multiple keyed JSON file --- tests/data/reports/sample3_multiplekeys.mlst.json | 12 ++++++++++++ .../reports/sample3_multiplekeys_nomatch.mlst.json | 12 ++++++++++++ .../data/samplesheets/samplesheet-multiple_keys.csv | 5 +++++ .../samplesheet-multiplekeys_nomatch.csv | 5 +++++ 4 files changed, 34 insertions(+) create mode 100644 tests/data/reports/sample3_multiplekeys.mlst.json create mode 100644 tests/data/reports/sample3_multiplekeys_nomatch.mlst.json create mode 100644 tests/data/samplesheets/samplesheet-multiple_keys.csv create mode 100644 tests/data/samplesheets/samplesheet-multiplekeys_nomatch.csv diff --git a/tests/data/reports/sample3_multiplekeys.mlst.json b/tests/data/reports/sample3_multiplekeys.mlst.json new file mode 100644 index 0000000..5d85e65 --- /dev/null +++ b/tests/data/reports/sample3_multiplekeys.mlst.json @@ -0,0 +1,12 @@ +{ + "extra_key": { + "l1": "1", + "l2": "1", + "l3": "2" + }, + "sample3": { + "l1": "1", + "l2": "1", + "l3": "2" + } +} diff --git a/tests/data/reports/sample3_multiplekeys_nomatch.mlst.json b/tests/data/reports/sample3_multiplekeys_nomatch.mlst.json new file mode 100644 index 0000000..6d7878d --- /dev/null +++ b/tests/data/reports/sample3_multiplekeys_nomatch.mlst.json @@ -0,0 +1,12 @@ +{ + "sample4": { + "l1": "1", + "l2": "1", + "l3": "2" + }, + "extra_key": { + "l1": "1", + "l2": "1", + "l3": "2" + } +} diff --git a/tests/data/samplesheets/samplesheet-multiple_keys.csv b/tests/data/samplesheets/samplesheet-multiple_keys.csv new file mode 100644 index 0000000..867d7d6 --- /dev/null +++ b/tests/data/samplesheets/samplesheet-multiple_keys.csv @@ -0,0 +1,5 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/input_assure/tests/data/reports/sample3_multiplekeys.mlst.json,1.1.2 diff --git a/tests/data/samplesheets/samplesheet-multiplekeys_nomatch.csv b/tests/data/samplesheets/samplesheet-multiplekeys_nomatch.csv new file mode 100644 index 0000000..cdd0bf0 --- /dev/null +++ b/tests/data/samplesheets/samplesheet-multiplekeys_nomatch.csv @@ -0,0 +1,5 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/input_assure/tests/data/reports/sample3_multiplekeys_nomatch.mlst.json,1.1.2 From 8e8ffa446b816d973e5aa04888cc26d75b9c18b4 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 13 Jun 2024 11:21:17 -0400 Subject: [PATCH 17/21] Tests added to handle when there are multiple sample entries (keys) in provided MLST JSON file(s) --- .../irida/multiplekeys_iridanext.output.json | 19 ++++ 
tests/pipelines/main.nf.test | 97 +++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 tests/data/irida/multiplekeys_iridanext.output.json diff --git a/tests/data/irida/multiplekeys_iridanext.output.json b/tests/data/irida/multiplekeys_iridanext.output.json new file mode 100644 index 0000000..f7b872f --- /dev/null +++ b/tests/data/irida/multiplekeys_iridanext.output.json @@ -0,0 +1,19 @@ +{ + "files": { + "global": [], + "samples": { + "sample3": [ + { + "path": "input/sample3_error_report.csv" + } + ] + } + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "1.1.3" + } + } + } +} diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index f1df721..4cec606 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -192,4 +192,101 @@ nextflow_pipeline { } } + test("Testing data removal in MLST JSON with a matching sampleID key."){ + // There are multiple sample entries (keys) in the MLST JSON and one of them matches the sampleID. + // This test evaluates the pipeline's ability to address this issue by removing keys that do not match the sampleID. + + tag "multiple_keys_with_matching_ID" + + when{ + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-multiple_keys.csv" + outdir = "results" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results.txt") + assert actual_calls.text == expected_calls.text + + // Check outputs + def lines = [] + + // Ensure that the error_reports are generated for query and reference samples + lines = path("$launchDir/results/input/sample3_error_report.csv").readLines() + assert lines.contains('sample3,"[\'extra_key\', \'sample3\']","MLST JSON file (sample3_multiplekeys.mlst.json) contains multiple keys: [\'extra_key\', \'sample3\']. The MLST JSON file has been modified to retain only the \'sample3\' entry"') + + // Check filtered query csv results + lines = path("$launchDir/results/filter/new_addresses.csv").readLines() + assert lines.contains("sampleQ,1.1.3") + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/multiplekeys_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_samples.sample3.size() == 1 + assert iridanext_samples.sample3[0].path == 'input/sample3_error_report.csv' + + assert iridanext_metadata.size() == 1 + assert iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.sampleQ."address" == "1.1.3" + } + } + + test("Testing the removal of data in MLST JSON with no sampleID match."){ + // There are multiple sample entries (keys) in the MLST JSON and none of them match the sampleID.. + // This test ensures the pipeline can handle and resolve this issue by retaining only the first JSON key entry and renaming it to match the sampleID. 
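
For reference, the quoting in the assertions of these two tests comes straight from Python's csv module: input_assure.py writes the raw keys list into the JSON_key column, so the field is stringified and, once it contains a comma, wrapped in double quotes. A minimal sketch with invented values:

import csv
import sys

sample_id = "sample3"
keys = ["extra_key", "sample3"]
error_message = "MLST JSON file contains multiple keys."

writer = csv.writer(sys.stdout)
writer.writerow(["sample", "JSON_key", "error_message"])
# Prints: sample3,"['extra_key', 'sample3']",MLST JSON file contains multiple keys.
writer.writerow([sample_id, keys, error_message])
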
+ + tag "multiple_keys_without_matching_ID" + + when{ + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-multiplekeys_nomatch.csv" + outdir = "results" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results.txt") + assert actual_calls.text == expected_calls.text + + // Check outputs + def lines = [] + + // Ensure that the error_reports are generated for query and reference samples + lines = path("$launchDir/results/input/sample3_error_report.csv").readLines() + assert lines.contains("sample3,\"[\'sample4\', \'extra_key\']\",No key in the MLST JSON file (sample3_multiplekeys_nomatch.mlst.json) matches the specified sample ID \'sample3\'. The first key \'sample4\' has been forcefully changed to \'sample3\' and all other keys have been removed.") + + // Check filtered query csv results + lines = path("$launchDir/results/filter/new_addresses.csv").readLines() + assert lines.contains("sampleQ,1.1.3") + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/multiplekeys_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_samples.sample3.size() == 1 + assert iridanext_samples.sample3[0].path == 'input/sample3_error_report.csv' + + assert iridanext_metadata.size() == 1 + assert iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.sampleQ."address" == "1.1.3" + } + } } From 79096738d4e792ceddf399c53d356d10f463bb55 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 13 Jun 2024 16:10:21 -0400 Subject: [PATCH 18/21] Updated input_assure to identify when MLST JSON is empty. Added corresponding test --- bin/input_assure.py | 12 ++++++++---- tests/data/reports/sample2_empty.mlst.json | 1 + .../data/samplesheets/samplesheet_emptyJSON.csv | 6 ++++++ tests/pipelines/main.nf.test | 16 ++++++++++++++++ 4 files changed, 31 insertions(+), 4 deletions(-) create mode 100644 tests/data/reports/sample2_empty.mlst.json create mode 100644 tests/data/samplesheets/samplesheet_emptyJSON.csv diff --git a/bin/input_assure.py b/bin/input_assure.py index 779e888..5e749b5 100755 --- a/bin/input_assure.py +++ b/bin/input_assure.py @@ -4,6 +4,7 @@ import argparse import csv import gzip +import sys def open_file(file_path, mode): @@ -13,7 +14,6 @@ def open_file(file_path, mode): else: return open(file_path, mode) - def check_inputs(json_file, sample_id, address, output_error_file): with open_file(json_file, "rt") as f: json_data = json.load(f) @@ -22,13 +22,17 @@ def check_inputs(json_file, sample_id, address, output_error_file): match_status = sample_id in json_data keys = list(json_data.keys()) - original_key = keys[0] + original_key = keys[0] if keys else None # Initialize the error message error_message = None # Check for multiple keys in the JSON file and define error message - if len(keys) > 1: + if len(keys) == 0: + error_message = f"{json_file} is completely empty!" + print(error_message) + sys.exit(1) + elif len(keys) > 1: # Check if sample_id matches any key if not match_status: error_message = f"No key in the MLST JSON file ({json_file}) matches the specified sample ID '{sample_id}'. 
The first key '{original_key}' has been forcefully changed to '{sample_id}' and all other keys have been removed." @@ -76,4 +80,4 @@ def check_inputs(json_file, sample_id, address, output_error_file): args = parser.parse_args() - check_inputs(args.input, args.sample_id, args.address, args.output_error) + check_inputs(args.input, args.sample_id, args.address, args.output_error) \ No newline at end of file diff --git a/tests/data/reports/sample2_empty.mlst.json b/tests/data/reports/sample2_empty.mlst.json new file mode 100644 index 0000000..0967ef4 --- /dev/null +++ b/tests/data/reports/sample2_empty.mlst.json @@ -0,0 +1 @@ +{} diff --git a/tests/data/samplesheets/samplesheet_emptyJSON.csv b/tests/data/samplesheets/samplesheet_emptyJSON.csv new file mode 100644 index 0000000..efcb1bb --- /dev/null +++ b/tests/data/samplesheets/samplesheet_emptyJSON.csv @@ -0,0 +1,6 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/input_assure/tests/data/reports/sample2_empty.mlst.json,1.1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 + diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index 4cec606..d292d1d 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -289,4 +289,20 @@ nextflow_pipeline { assert iridanext_metadata.sampleQ."address" == "1.1.3" } } + + test("Testing when provided MLST JSON file(s) are empty."){ + tag "empty_JSON" + + when{ + params { + input = "$baseDir/tests/data/samplesheets/samplesheet_emptyJSON.csv" + outdir = "results" + } + } + + then { + assert workflow.failed + assert (workflow.stdout =~ /sample2_empty.mlst.json is completely empty!/).find() + } + } } From da8c82992277aa2903727b5730b95544f92b1097 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 13 Jun 2024 16:12:28 -0400 Subject: [PATCH 19/21] EC issue fix --- bin/input_assure.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/input_assure.py b/bin/input_assure.py index 5e749b5..5fabad4 100755 --- a/bin/input_assure.py +++ b/bin/input_assure.py @@ -14,6 +14,7 @@ def open_file(file_path, mode): else: return open(file_path, mode) + def check_inputs(json_file, sample_id, address, output_error_file): with open_file(json_file, "rt") as f: json_data = json.load(f) @@ -80,4 +81,4 @@ def check_inputs(json_file, sample_id, address, output_error_file): args = parser.parse_args() - check_inputs(args.input, args.sample_id, args.address, args.output_error) \ No newline at end of file + check_inputs(args.input, args.sample_id, args.address, args.output_error) From 6642b72ad0132805bf23083cf9979a8dde965941 Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Thu, 13 Jun 2024 17:07:04 -0400 Subject: [PATCH 20/21] Create a new JSON output file in input_assure --- bin/input_assure.py | 17 +++++++++++------ modules/local/input_assure/main.nf | 5 +++-- tests/pipelines/main.nf.test | 5 ++--- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/bin/input_assure.py b/bin/input_assure.py index 5fabad4..7926cab 100755 --- a/bin/input_assure.py +++ b/bin/input_assure.py @@ -15,20 +15,20 @@ def open_file(file_path, mode): return open(file_path, mode) -def check_inputs(json_file, sample_id, address, output_error_file): +def 
check_inputs(json_file, sample_id, address, output_error_file, output_json_file): with open_file(json_file, "rt") as f: json_data = json.load(f) # Define a variable to store the match_status (True or False) match_status = sample_id in json_data - keys = list(json_data.keys()) - original_key = keys[0] if keys else None - # Initialize the error message error_message = None # Check for multiple keys in the JSON file and define error message + keys = list(json_data.keys()) + original_key = keys[0] if keys else None + if len(keys) == 0: error_message = f"{json_file} is completely empty!" print(error_message) @@ -60,7 +60,7 @@ def check_inputs(json_file, sample_id, address, output_error_file): writer.writerow([sample_id, keys, error_message]) # Write the updated JSON data back to the original file - with open_file(json_file, "wt") as f: + with open_file(output_json_file, "wt") as f: json.dump(json_data, f, indent=4) @@ -78,7 +78,12 @@ def check_inputs(json_file, sample_id, address, output_error_file): parser.add_argument( "--output_error", help="Path to the error report file.", required=True ) + parser.add_argument( + "--output_json", help="Path to the MLST JSON file.", required=True + ) args = parser.parse_args() - check_inputs(args.input, args.sample_id, args.address, args.output_error) + check_inputs( + args.input, args.sample_id, args.address, args.output_error, args.output_json + ) diff --git a/modules/local/input_assure/main.nf b/modules/local/input_assure/main.nf index e0376ac..dd72bb1 100644 --- a/modules/local/input_assure/main.nf +++ b/modules/local/input_assure/main.nf @@ -10,7 +10,7 @@ process INPUT_ASSURE { tuple val(meta), path(mlst) output: - tuple val(meta), path(mlst), emit: result + tuple val(meta), path("${meta.id}.mlst.json"), emit: result tuple val(meta), path("*_error_report.csv"), optional: true, emit: error_report path("versions.yml"), emit: versions @@ -21,7 +21,8 @@ process INPUT_ASSURE { --input ${mlst} \\ --sample_id ${meta.id} \\ --address ${meta.address} \\ - --output_error ${meta.id}_error_report.csv + --output_error ${meta.id}_error_report.csv \\ + --output_json ${meta.id}.mlst.json cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test index d292d1d..b6a5ab8 100644 --- a/tests/pipelines/main.nf.test +++ b/tests/pipelines/main.nf.test @@ -123,9 +123,8 @@ nextflow_pipeline { assert workflow.success assert path("$launchDir/results").exists() - // Check is sample1.mlst.json.gz exists and is gzipped - def gzipped_json = path("$launchDir/results/input/sample1.mlst.json.gz") - assert gzipped_json.exists() + // Check that sample1.mlst.json.gz has been open, read, and that a new file has been generated + assert path("$launchDir/results/input/sample1.mlst.json").exists() // Check called clusters def actual_calls = path("$launchDir/results/call/Called/results.text") From 348fe9558c27e1ef76ba387c4abc31632656c1aa Mon Sep 17 00:00:00 2001 From: kylacochrane Date: Fri, 14 Jun 2024 12:01:48 -0400 Subject: [PATCH 21/21] Ensure MLST JSON files from input_assure are gzipped --- bin/input_assure.py | 4 ++-- modules/local/input_assure/main.nf | 4 ++-- tests/data/irida/test2_iridanext.output.json | 19 ------------------ tests/data/reports/sample1.mlst.json.gz | Bin 84 -> 84 bytes tests/data/samplesheets/samplesheet_test1.csv | 5 ----- tests/data/samplesheets/samplesheet_test2.csv | 7 ------- tests/pipelines/main.nf.test | 4 ++-- 7 files changed, 6 insertions(+), 37 deletions(-) delete mode 100644 
tests/data/irida/test2_iridanext.output.json delete mode 100644 tests/data/samplesheets/samplesheet_test1.csv delete mode 100644 tests/data/samplesheets/samplesheet_test2.csv diff --git a/bin/input_assure.py b/bin/input_assure.py index 7926cab..d99bf2a 100755 --- a/bin/input_assure.py +++ b/bin/input_assure.py @@ -60,7 +60,7 @@ def check_inputs(json_file, sample_id, address, output_error_file, output_json_f writer.writerow([sample_id, keys, error_message]) # Write the updated JSON data back to the original file - with open_file(output_json_file, "wt") as f: + with gzip.open(output_json_file, "wt") as f: json.dump(json_data, f, indent=4) @@ -79,7 +79,7 @@ def check_inputs(json_file, sample_id, address, output_error_file, output_json_f "--output_error", help="Path to the error report file.", required=True ) parser.add_argument( - "--output_json", help="Path to the MLST JSON file.", required=True + "--output_json", help="Path to the MLST JSON file (gzipped).", required=True ) args = parser.parse_args() diff --git a/modules/local/input_assure/main.nf b/modules/local/input_assure/main.nf index dd72bb1..43b7462 100644 --- a/modules/local/input_assure/main.nf +++ b/modules/local/input_assure/main.nf @@ -10,7 +10,7 @@ process INPUT_ASSURE { tuple val(meta), path(mlst) output: - tuple val(meta), path("${meta.id}.mlst.json"), emit: result + tuple val(meta), path("${meta.id}.mlst.json.gz"), emit: result tuple val(meta), path("*_error_report.csv"), optional: true, emit: error_report path("versions.yml"), emit: versions @@ -22,7 +22,7 @@ process INPUT_ASSURE { --sample_id ${meta.id} \\ --address ${meta.address} \\ --output_error ${meta.id}_error_report.csv \\ - --output_json ${meta.id}.mlst.json + --output_json ${meta.id}.mlst.json.gz cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/tests/data/irida/test2_iridanext.output.json b/tests/data/irida/test2_iridanext.output.json deleted file mode 100644 index 2882954..0000000 --- a/tests/data/irida/test2_iridanext.output.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "files": { - "global": [], - "samples": { - "sampleR": [ - { - "path": "input/sampleR_error_report.csv" - } - ] - } - }, - "metadata": { - "samples": { - "sampleQ": { - "address": "1.1.3" - } - } - } -} diff --git a/tests/data/reports/sample1.mlst.json.gz b/tests/data/reports/sample1.mlst.json.gz index 94f25c81407bfee01777e59fb4af80f5998dbf30..735e1082b5193673c4844e4f4558af8e8206a12f 100644 GIT binary patch delta 13 UcmWFuVVCdb;5f#WHIdyH02iwRi2wiq delta 13 UcmWFuVVCdb;5hm}b0WJh032!rd;kCd diff --git a/tests/data/samplesheets/samplesheet_test1.csv b/tests/data/samplesheets/samplesheet_test1.csv deleted file mode 100644 index cf87b26..0000000 --- a/tests/data/samplesheets/samplesheet_test1.csv +++ /dev/null @@ -1,5 +0,0 @@ -sample,mlst_alleles,address -sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleF.mlst.json, -sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 -sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample7.mlst.json,1.1.1 -sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 diff --git a/tests/data/samplesheets/samplesheet_test2.csv b/tests/data/samplesheets/samplesheet_test2.csv deleted file mode 100644 index 036c317..0000000 --- a/tests/data/samplesheets/samplesheet_test2.csv +++ /dev/null @@ -1,7 +0,0 @@ -sample,mlst_alleles,address 
-sampleR,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleF.mlst.json,
-sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json,
-sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1
-sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1
-sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2
-
diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test
index b6a5ab8..6716dae 100644
--- a/tests/pipelines/main.nf.test
+++ b/tests/pipelines/main.nf.test
@@ -123,8 +123,8 @@ nextflow_pipeline {
         assert workflow.success
         assert path("$launchDir/results").exists()

-        // Check that sample1.mlst.json.gz has been open, read, and that a new file has been generated
-        assert path("$launchDir/results/input/sample1.mlst.json").exists()
+        // Check that sample1.mlst.json.gz has been opened and read, and that a new gzipped file has been generated
+        assert path("$launchDir/results/input/sample1.mlst.json.gz").exists()
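
Taken together, the series leaves input_assure.py with roughly the following contract. This is a condensed sketch, not the shipped code; the function names are invented and the error-report CSV writing is omitted:

import gzip
import json
import sys

def load_and_reconcile(path, sample_id):
    # Gzip-aware read, as introduced in PATCH 01.
    opener = gzip.open if path.endswith(".gz") else open
    with opener(path, "rt") as f:
        data = json.load(f)
    # Fail fast on an empty MLST JSON, as introduced in PATCH 18.
    if not data:
        print(f"{path} is completely empty!")
        sys.exit(1)
    # Reconcile JSON keys with the sample ID, per PATCH 01 and PATCH 07.
    if sample_id in data:
        data = {sample_id: data[sample_id]}
    else:
        data = {sample_id: data.pop(list(data.keys())[0])}
    return data

def write_result(data, out_path):
    # Always emit a gzipped JSON named after the sample, per PATCH 20 and PATCH 21.
    with gzip.open(out_path, "wt") as f:
        json.dump(data, f, indent=4)

Under this contract the emitted file name always matches meta.id, which is what the updated process output path("${meta.id}.mlst.json.gz") captures.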