Merge pull request #140 from phac-nml/inx_id
Added IRIDA Next sample field option
apetkau authored Nov 26, 2024
2 parents 9ed27b8 + ae296a3 commit f853ce8
Showing 19 changed files with 479 additions and 44 deletions.
13 changes: 12 additions & 1 deletion CHANGELOG.md
@@ -5,10 +5,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased

### `Changed`
### `Added`

- Added RASUSA for down sampling of Nanopore or PacBio data. [PR 125](https://github.com/phac-nml/mikrokondo/pull/125)

- Added a new `sample_name` field to the `schema_input.json` file: [PR 140](https://github.com/phac-nml/mikrokondo/pull/140)

- Incorporated a `--skip_read_merging` parameter to prevent read merging [PR 140](https://github.com/phac-nml/mikrokondo/pull/140)

### `Changed`

- Added a `sample_name` field; `sample` still exists but is now used to incorporate additional names/identifiers in IRIDA Next [PR 140](https://github.com/phac-nml/mikrokondo/pull/140)

- RASUSA now used for down sampling of Nanopore or PacBio data. [PR 125](https://github.com/phac-nml/mikrokondo/pull/125)


### `Updated`

- Documentation and workflow diagram has been updated. [PR 123](https://github.com/phac-nml/mikrokondo/pull/123)
7 changes: 6 additions & 1 deletion assets/schema_input.json
@@ -10,7 +10,12 @@
"sample": {
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "Sample name must be provided and cannot contain spaces",
"meta": ["external_id"],
"errorMessage": "Sample name to be used in report generation. Valid characters include alphanumeric and -. All other characters will be replaced by underscores."
},
"sample_name": {
"type": "string",
"errorMessage": "Optional. Used to override sample when used in tools like IRIDA-Next. Valid characters include alphanumeric and -. All other characters will be replaced by underscores.",
"meta": ["id"]
},
"fastq_1": {
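With these schema changes, a minimal sample sheet might look like the sketch below (hypothetical sample names and paths). Per the `meta` tags above, the `sample` column is carried as the IRIDA Next `external_id`, while the optional `sample_name` column becomes the internal `id`:

```csv
sample,sample_name,fastq_1,fastq_2
ABC-123,isolate-1,/data/isolate-1_R1.fastq.gz,/data/isolate-1_R2.fastq.gz
ABC-124,isolate-2,/data/isolate-2_R1.fastq.gz,/data/isolate-2_R2.fastq.gz
```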
33 changes: 23 additions & 10 deletions bin/report_summaries.py
@@ -37,9 +37,10 @@ class JsonImport:
__keep_keys = frozenset(__key_order.keys())
__delimiter = "\t"
__key_delimiter = "."
__inx_irida_key = "meta.external_id"

def __init__(self, report_fp, output_name, sample_suffix):
self.tool_data = None # TODO set this in output of group tool fields
self.tool_data = None
self.output_name = output_name
self.output_transposed = os.path.splitext(os.path.basename(self.output_name))[0] + "_transposed.tsv"
self.output_dir = os.path.dirname(self.output_name)
@@ -49,7 +50,7 @@ def __init__(self, report_fp, output_name, sample_suffix):
self.flat_sample_string = sample_suffix
self.data = self.ingest_report(self.report_fp)
self.flat_data, self.common_fields, self.tool_fields, self.table = self.flatten_json(self.data)
self.output_indv_json(self.flat_data)
self.flat_data = self.output_indv_json(self.flat_data)
self.output_flat_json(self.flat_data)
self.write_table(self.table)

@@ -64,7 +65,6 @@ def write_table(self, table_data: Dict[str, Dict[str, str]]):
"""
keys = set([k for k in table_data])
ordered_keys = []

# Get the wanted information to the top of the page
poisoned_keys = set()
for option in self.__key_order:
@@ -79,7 +79,6 @@ def write_table(self, table_data: Dict[str, Dict[str, str]]):
ordered_keys.extend(scalar_keys)
ordered_keys.extend(sorted([i for i in keys if i not in ordered_keys and i not in poisoned_keys]))
row_labels = sorted([i for i in next(iter(table_data.values()))])

self.write_tsv(table_data, row_labels, ordered_keys)
self.write_transposed_tsv(table_data, row_labels, ordered_keys)

@@ -233,7 +232,6 @@ def remove_prefix_id_fields(self, flattened_dict):
top_level_keys.add(item_key)
temp[item_key] = v

#self.tool_data = tool_data
return reformatted_data, top_level_keys, tool_keys


@@ -242,7 +240,7 @@ def ingest_report(self, report_fp):
report_fp: File path to the json report to be read in
"""
data = None
with open(report_fp, "r", encoding="utf8") as report:
with open(report_fp, "r") as report:
data = json.load(report)
return data

@@ -262,11 +260,27 @@ def output_indv_json(self, flattened_data):
Args:
flattened_data (json: Dict[sample_id: Dict[tool_info: value]]):
"""
updated_items = dict()
for k, v in flattened_data.items():
with open(os.path.join(self.output_dir, k + self.flat_sample_string), "w") as output:
out_key = k
sample_dir = k
dir_name = v.get(self.__inx_irida_key)
if k != dir_name:
sample_dir = dir_name
#! this field affects the identification of the irida next id being passed out of the pipeline
out_key = sample_dir # this field must be overwritten for iridanext to identify the correct metadata field
out_dir = os.path.join(self.output_dir, sample_dir)
out_path = os.path.join(out_dir, k + self.flat_sample_string)
if not os.path.isdir(out_dir): # Check for directory existence, as it will still exist on pipeline resumes
os.mkdir(out_dir)

with open(out_path, "w") as output:
json_data = json.dumps({k: v}, indent=2)
output.write(json_data)
updated_items[out_key] = v

flattened_data = updated_items
return flattened_data

def to_file(self):
with open(self.output_name, "w") as out_file:
@@ -282,7 +296,6 @@ def to_file(self):
out_file.write(f'"{val_write}"')
else:
out_file.write(val_write)
# out_file.write(str(ii[1][i]).replace('\n', ' \\'))
out_file.write(self.__delimiter)
out_file.write("\n")

@@ -291,7 +304,7 @@ def to_file(self):



def main_(args_in):
def main(args_in):
default_samp_suffix = "_flat_sample.json"
parser = argparse.ArgumentParser("Table Summary")
parser.add_argument("-f", "--file-in", help="Path to the mikrokondo json summary")
@@ -307,4 +320,4 @@ def main_(args_in):

if __name__ == "__main__":
# pass json file to program to parse it
main_(sys.argv[1:])
main(sys.argv[1:])
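In short, `output_indv_json` now writes each sample's flattened JSON into a per-sample directory named after its IRIDA Next external ID, and re-keys the returned dictionary by that ID. A self-contained Python sketch of that logic (simplified from the diff above; the `meta.external_id` key and file suffix match the constants in the script):

```python
import json
import os


def write_indv_json(output_dir, sample_key, fields, suffix="_flat_sample.json"):
    """Sketch: nest each flattened sample JSON in a directory named after the
    IRIDA Next external id, falling back to the sample key when absent."""
    external_id = fields.get("meta.external_id") or sample_key
    out_dir = os.path.join(output_dir, external_id)
    os.makedirs(out_dir, exist_ok=True)  # may already exist on pipeline resume
    with open(os.path.join(out_dir, sample_key + suffix), "w") as output:
        output.write(json.dumps({sample_key: fields}, indent=2))
    return external_id  # key under which IRIDA Next picks up this sample's metadata
```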
2 changes: 1 addition & 1 deletion conf/irida_next.config
@@ -11,7 +11,7 @@ iridanext {
overwrite = true
validate = false
files {
idkey = "sample"
idkey = 'external_id' // Previously sample
global = [
"**/FinalReports/Aggregated/Json/final_report.json",
"**/FinalReports/Aggregated/Tables/final_report.tsv"
11 changes: 7 additions & 4 deletions docs/usage/usage.md
@@ -23,32 +23,33 @@ Mikrokondo requires a sample sheet to be run. This FOFN (file of file names) con
- long_reads
- assembly

> **Note:** Illegal characters (i.e. characters matching the expression `[^A-Za-z0-9_\-]`) in the sample name will be replaced with underscores.
Example layouts for different sample-sheets include:

_Illumina paired-end data_

|sample|fastq_1|fastq_2|
|------|-------|-------|
|sample_name|path_to_forward_reads|path_to_reversed_reads|
|sample|path_to_forward_reads|path_to_reversed_reads|

_Nanopore_

|sample|long_reads|
|------|----------|
|sample_name|path_to_reads|
|sample|path_to_reads|

_Hybrid Assembly_

|sample|fastq_1|fastq_2|long_reads|
|-------|-------|------|----------|
|sample_name|path_to_forward_reads|path_to_reversed_reads|path_to_long_reads|
|sample|path_to_forward_reads|path_to_reversed_reads|path_to_long_reads|

_Starting with assembly only_

|sample|assembly|
|------|--------|
|sample_name|path_to_assembly|
|sample|path_to_assembly|

_Example merging paired-end data_

@@ -96,6 +97,8 @@ _Example merging paired-end data_
Numerous steps within mikrokondo can be turned off without compromising the stability of the pipeline. These skip options can reduce the run time of the pipeline or allow it to complete despite errors.
**All of the following options can be turned on by adding `--{skip_option} true` to the pipeline's command-line arguments; see the example after this list.**


- `--skip_read_merging`: Do not merge reads; if duplicate sample names are present, the names will be made unique.
- `--skip_abricate`: turn off Abricate AMR detection.
- `--skip_bakta`: turn off the Bakta annotation step (generally slow, and requires a database to be specified).
- `--skip_checkm`: turn off CheckM, which is used as part of contamination detection within mikrokondo; its run time and resource usage can be substantial.
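For example, a run that opts out of read merging and CheckM might look like the following (hypothetical input/output paths and profile):

```bash
nextflow run phac-nml/mikrokondo \
    --input samplesheet.csv \
    --outdir results \
    -profile docker \
    --skip_read_merging true \
    --skip_checkm true
```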
13 changes: 6 additions & 7 deletions main.nf
@@ -42,9 +42,6 @@ if (params.help) {
if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }





/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
NAMED WORKFLOW FOR PIPELINE
@@ -111,15 +108,17 @@ workflow MIKROKONDO {
REPORT_AGGREGATE(REPORT.out.final_report)
ch_versions = ch_versions.mix(REPORT_AGGREGATE.out.versions)


updated_samples = REPORT_AGGREGATE.out.flat_samples.flatten().map{
sample ->
def name_trim = sample.getName()
def trimmed_name = name_trim.substring(0, name_trim.length() - params.report_aggregate.sample_flat_suffix.length())
tuple([
def external_id_name = sample.getParent().getBaseName()
def output_map = [
"id": trimmed_name,
"sample": trimmed_name],
sample)
"sample": trimmed_name,
"external_id": external_id_name]

tuple(output_map, sample)
}

GZIP_FILES(updated_samples)
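As a worked example of the mapping above, using a hypothetical flattened-sample path and plain Groovy (`File` standing in for Nextflow's `Path`):

```groovy
// REPORT_AGGREGATE now emits each flattened sample JSON inside a directory
// named after the IRIDA Next external id, e.g. Reports/ABC-123/sample1_flat_sample.json
def sample = new File("Reports/ABC-123/sample1_flat_sample.json")
def suffix = "_flat_sample.json"

def name = sample.getName()                                          // sample1_flat_sample.json
def trimmedName = name.substring(0, name.length() - suffix.length()) // sample1
def externalId = sample.getParentFile().getName()                    // ABC-123

assert [id: trimmedName, sample: trimmedName, external_id: externalId] ==
       [id: 'sample1', sample: 'sample1', external_id: 'ABC-123']
```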
8 changes: 4 additions & 4 deletions modules/local/combine_data.nf
@@ -20,16 +20,16 @@ process COMBINE_DATA{
def fields_merge = meta.fields_merge

if(fastq_1){
cmd_ << "cat ${meta.fastq_1.join(' ')} > out/${prefix}_R1.merged.fastq.gz;"
cmd_ << "cat ${fastq_1.join(' ')} > out/${prefix}_R1.merged.fastq.gz;"
}
if(fastq_2){
cmd_ << "cat ${meta.fastq_2.join(' ')} > out/${prefix}_R2.merged.fastq.gz;"
cmd_ << "cat ${fastq_2.join(' ')} > out/${prefix}_R2.merged.fastq.gz;"
}
if(long_reads){
cmd_ << "cat ${meta.fastq_2.join(' ')} > out/${prefix}.merged.fastq.gz;"
cmd_ << "cat ${long_reads.join(' ')} > out/${prefix}.merged.fastq.gz;"
}
if(assembly){
cmd_ << "cat ${meta.fastq_2.join(' ')} > out/${prefix}.merged.fastq.gz;"
cmd_ << "cat ${assembly.join(' ')} > out/${prefix}.merged.fastq.gz;"
}
def cmd = cmd_.join("\n")
// creating dummy outputs so that all outputs exist for any scenario
5 changes: 3 additions & 2 deletions modules/local/report.nf
@@ -43,11 +43,13 @@ process REPORT{

if(!sample_data.containsKey(meta_data.sample)){
sample_data[meta_data.sample] = [:]
// TODO add strings to constants file
sample_data[meta_data.sample]["meta"] = [:]
}

update_map_values(sample_data, meta_data, "metagenomic")
update_map_values(sample_data, meta_data, "id")
update_map_values(sample_data, meta_data, "sample")
update_map_values(sample_data, meta_data, "external_id")
update_map_values(sample_data, meta_data, "assembly")
update_map_values(sample_data, meta_data, "hybrid")
update_map_values(sample_data, meta_data, "single_end")
@@ -63,7 +65,6 @@
if(!check_file_params(report_tag, extension)){
continue
}
// TODO pass in report metadata
def output_data = parse_data(report_value, extension, report_tag, headers_list)
if(output_data){
report_value = output_data
2 changes: 1 addition & 1 deletion modules/local/report_aggregate.nf
@@ -14,7 +14,7 @@ process REPORT_AGGREGATE{
path("final_report.tsv"), emit: final_report
path("final_report_transposed.tsv"), emit: final_report_transposed
path("final_report_flattened.json"), emit: flattened_files
path("*${sample_flat_suffix}"), emit: flat_samples
path("*/*${sample_flat_suffix}"), emit: flat_samples
path "versions.yml", emit: versions

script:
9 changes: 4 additions & 5 deletions nextflow.config
@@ -7,6 +7,7 @@
*/



// Global default params, used in configs
params {
// Input options
@@ -43,12 +44,11 @@ params {
show_hidden_params = false
validationS3PathCheck = true
validationShowHiddenParams = false
validationSchemaIgnoreParams = 'locidex_summary,allele_scheme_selected,top_hit_method,abricate,locidex,assembly_status,bakta,bandage,checkm,chopper,contigs_too_short,coreutils,coverage_calc_fields,ectyper,fastp,fastqc,filtered_reads,flye,kat,kleborate,kraken,kraken_bin,kraken_species,lissero,mash,mash_meta,medaka,minimap2,mlst,mobsuite_recon,opt_platforms,pilon,pilon_iterative,pointfinder_db_tag,python3,QCReport,QCReport-fields,QCReportFields,quast,racon,raw_reads,report_aggregate,r_contaminants,samtools,seqkit,seqtk,seqtk_size,shigeifinder,sistr,spades,spatyper,staramr,subtyping_report,top_hit_species,unicycler'
validationSchemaIgnoreParams = 'rasusa,locidex_summary,allele_scheme_selected,top_hit_method,abricate,locidex,assembly_status,bakta,bandage,checkm,chopper,contigs_too_short,coreutils,coverage_calc_fields,ectyper,fastp,fastqc,filtered_reads,flye,kat,kleborate,kraken,kraken_bin,kraken_species,lissero,mash,mash_meta,medaka,minimap2,mlst,mobsuite_recon,opt_platforms,pilon,pilon_iterative,pointfinder_db_tag,python3,QCReport,QCReport-fields,QCReportFields,quast,racon,raw_reads,report_aggregate,r_contaminants,samtools,seqkit,seqtk,seqtk_size,shigeifinder,sistr,spades,spatyper,staramr,subtyping_report,top_hit_species,unicycler'
validationFailUnrecognisedParams = false // for the qcreport fields

// SKIP options
// TODO need to add constants section
// TODO investigate usage of template scripts to replace mash modules
skip_read_merging = true
skip_report = false
skip_raw_read_metrics = false
skip_version_gathering = false
Expand All @@ -58,7 +58,7 @@ params {
skip_checkm = false
skip_depth_sampling = false // TODO have it mentioned that this should be turned off for metagenomic runs
skip_ont_header_cleaning = true // TODO an awk script can likely replace this and be much faster at what it does...
skip_polishing = false // TODO make it clear this does not apply to Hybrid assembly
skip_polishing = false
skip_species_classification = false
skip_mlst = false
skip_mobrecon = false
@@ -146,7 +146,6 @@ params {
version = false



// If a param in camel case is present nextflow automatically creates a kebab case parameter as well

stage_in_mode = 'symlink'
6 changes: 6 additions & 0 deletions nextflow_schema.json
@@ -380,6 +380,12 @@
"type": "boolean",
"description": "Do not enter the subtyping workflow, e.g. ECTyper, SISTR etc will not be ran."
},
"skip_read_merging": {
"type": "boolean",
"default": true,
"description": "Do not merge reads",
"hidden": true
},
"skip_bakta": {
"type": "boolean",
"default": true,
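Note the `default: true` above: read merging stays off unless explicitly requested. Re-enabling merging through a Nextflow params file (passed with `-params-file`) would presumably look like:

```json
{
    "skip_read_merging": false
}
```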