add tool metadata file #747

Merged: 6 commits merged on Aug 17, 2024

8 changes: 4 additions & 4 deletions src/loaders/common/loader_common_names.py
@@ -91,11 +91,11 @@
OBJECTS_NAME_ASSEMBLY = "KBaseGenomeAnnotations.Assembly"
OBJECTS_NAME_GENOME = "KBaseGenomes.Genome"

# The metadata file name created during the Mash run
MASH_METADATA = 'mash_run_metadata.json'
# The metadata file name created during the tool's execution
TOOL_METADATA = 'tool_metadata.json'

# The metadata file name created during the Eggnog run
EGGNOG_METADATA = 'eggnog_run_metadata.json'
# Tool metadata file required keys
TOOL_METADATA_REQUIRED_KEYS = ['tool_name', 'version', 'command']

# The fatal error file created if a data file cannot be successfully processed
FATAL_ERROR_FILE = "fatal_error.json"
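For orientation, here is a minimal sketch (not part of the diff) of a metadata payload that satisfies TOOL_METADATA_REQUIRED_KEYS; the tool name, version, command, and output directory are purely illustrative.

    import json
    from pathlib import Path

    TOOL_METADATA = 'tool_metadata.json'
    TOOL_METADATA_REQUIRED_KEYS = ['tool_name', 'version', 'command']

    metadata = {
        'tool_name': 'example_tool',                           # required
        'version': '1.0.0',                                    # required
        'command': ['example_tool', '--input', 'genome.fa'],   # required
        'reference_db': {'version': None},                     # optional, tool-specific extras are allowed
    }

    # the same check create_tool_metadata performs before writing the file
    assert all(key in metadata for key in TOOL_METADATA_REQUIRED_KEYS)

    output_dir = Path('example_output')                        # hypothetical output directory
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / TOOL_METADATA, 'w') as f:
        json.dump(metadata, f, indent=4)
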
1 change: 1 addition & 0 deletions src/loaders/compute_tools/checkm2/Dockerfile
@@ -1,5 +1,6 @@
FROM continuumio/miniconda3:22.11.1

# NOTE: If the tool version changes, ensure the metadata information saved after running the tool in the _run_checkm2 method is updated
ARG CHECKM2_VER=1.0.1
ENV CONDA_ENV checkm2-$CHECKM2_VER

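The NOTE above relies on a manual reminder to keep the version string hard-coded in the script in sync with the Dockerfile ARG. As a hedged alternative sketch (not what this PR does), the image could export the version and the script could read it from the environment, falling back to the hard-coded value:

    import os

    # Assumes the Dockerfile additionally declares `ENV CHECKM2_VER=$CHECKM2_VER`
    # (a hypothetical addition); ARG values alone are not visible at runtime.
    checkm2_version = os.environ.get('CHECKM2_VER', '1.0.1')

    metadata = {
        'tool_name': 'checkm2',
        'version': checkm2_version,
        'command': ['checkm2', 'predict'],  # illustrative command only
    }
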
11 changes: 11 additions & 0 deletions src/loaders/compute_tools/checkm2/checkm2.py
@@ -12,6 +12,7 @@
run_command,
write_fatal_tuples_to_dict,
create_fatal_tuple,
create_tool_metadata,
)
from src.loaders.compute_tools.tool_result_parser import (
process_genome_attri_result,
@@ -68,6 +69,16 @@ def _run_checkm2(
fatal_tuples.append(fatal_tuple)
write_fatal_tuples_to_dict(fatal_tuples, output_dir)

metadata = {'tool_name': 'checkm2',
'version': '1.0.1',
'command': command,
"reference_db": {
"version": None,
"comment": "diamond_db, ver unknown",
},
'ids_to_files': ids_to_files}
create_tool_metadata(output_dir, metadata)


def main():
runner = ToolRunner("checkm2", tool_data_id_from_filename=True)
10 changes: 9 additions & 1 deletion src/loaders/compute_tools/checkm2/versions.yaml
@@ -31,4 +31,12 @@
date: 2024-06-25
notes: |
- Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names
reference_db_version: 1.0.1
reference_db_version: 1.0.1
- version: 0.1.6
date: 2024-08-16
notes: |
- Create metadata file after running CheckM2
reference_db_version: 1.0.1

#Please keep this reminder at the end of this file
#NOTE: If the db version changes, ensure the metadata information saved after running the tool in the _run_checkm2 method is updated
1 change: 1 addition & 0 deletions src/loaders/compute_tools/eggnog/Dockerfile
@@ -1,5 +1,6 @@
FROM continuumio/miniconda3:24.1.2-0

# NOTE: If the tool version changes, ensure the metadata information saved after running the tool in the _run_eggnog_single method is updated
ENV EGGNOG_VER 2.1.12
ENV CONDA_ENV eggnog-$EGGNOG_VER
ENV PYTHON_VER 3.11
20 changes: 13 additions & 7 deletions src/loaders/compute_tools/eggnog/eggnog.py
@@ -5,11 +5,10 @@
Therefore, the parser program is not compatible with data generated by this tool.

"""
import json
from pathlib import Path

from src.loaders.common.loader_common_names import EGGNOG_METADATA
from src.loaders.compute_tools.tool_common import ToolRunner, run_command
from src.loaders.common.loader_common_names import TOOL_METADATA
from src.loaders.compute_tools.tool_common import ToolRunner, run_command, create_tool_metadata

INPUT_TYPE = 'proteins'

@@ -22,7 +21,7 @@ def _run_eggnog_single(
threads_per_tool_run: int,
debug: bool) -> None:

metadata_file = output_dir / EGGNOG_METADATA
metadata_file = output_dir / TOOL_METADATA
if metadata_file.exists():
print(f"Skipping {source_file} as it has already been processed.")
return
@@ -44,9 +43,16 @@

# Save run info to a metadata file in the output directory for parsing later
metadata = {'source_file': str(source_file),
'input_type': INPUT_TYPE}
with open(metadata_file, 'w') as f:
json.dump(metadata, f, indent=4)
'input_type': INPUT_TYPE,
'data_id': data_id,
'tool_name': 'eggnog',
'version': '2.1.12',
'command': command,
"reference_db": {
"version": "5.0.2",
},
}
create_tool_metadata(output_dir, metadata)


def main():
11 changes: 10 additions & 1 deletion src/loaders/compute_tools/eggnog/versions.yaml
@@ -21,4 +21,13 @@ versions:
date: 2024-06-25
notes: |
- Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names
reference_db_version: 5.0.2
reference_db_version: 5.0.2

- version: 0.1.4
date: 2024-08-16
notes: |
- Create metadata file after running Eggnog
reference_db_version: 5.0.2

#Please keep this reminder at the end of this file
#NOTE: If the db version changes, ensure the metadata information saved after running the tool in the _run_eggnog_single method is updated
1 change: 1 addition & 0 deletions src/loaders/compute_tools/gtdb_tk/Dockerfile
@@ -1,5 +1,6 @@
FROM continuumio/miniconda3:22.11.1

# NOTE: If the tool version changes, ensure the metadata information saved after running the tool in the _run_gtdb_tk method is updated
ENV GTDB_VER 2.3.2
ENV CONDA_ENV gtdbtk-$GTDB_VER
ENV PYTHON_VER 3.8
10 changes: 10 additions & 0 deletions src/loaders/compute_tools/gtdb_tk/gtdb_tk.py
@@ -19,6 +19,7 @@
find_gtdbtk_summary_files,
run_command,
write_fatal_tuples_to_dict,
create_tool_metadata,
)
from src.loaders.compute_tools.tool_result_parser import (
process_genome_attri_result,
@@ -150,6 +151,15 @@ def _run_gtdb_tk(
summary_files,
)

metadata = {'tool_name': 'gtdb_tk',
'version': '2.3.2',
'command': command,
"reference_db": {
"version": "release214",
},
'ids_to_files': ids_to_files}
create_tool_metadata(output_dir, metadata)


def main():
runner = ToolRunner("gtdb_tk", suffix_ids=True)
10 changes: 9 additions & 1 deletion src/loaders/compute_tools/gtdb_tk/versions.yaml
@@ -40,4 +40,12 @@ versions:
date: 2024-06-25
notes: |
- Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names
reference_db_version: release214
reference_db_version: release214
- version: 0.1.7
date: 2024-08-16
notes: |
- Create metadata file after running GTDB-Tk
reference_db_version: release214

#Please keep this reminder at the end of this file
#NOTE: If the db version changes, ensure the metadata information saved after running the tool in the _run_gtdb_tk method is updated
1 change: 1 addition & 0 deletions src/loaders/compute_tools/mash/Dockerfile
@@ -5,6 +5,7 @@ RUN apt-get update && \

# Set the Mash version argument
# using version 2.0 to maintain compatibility with the homology service
# NOTE: If the tool version changes, ensure the metadata information saved after running the tool in the _run_mash_single method is updated
ARG MASH_VER=2.0
ENV CONDA_ENV mash-$MASH_VER

14 changes: 7 additions & 7 deletions src/loaders/compute_tools/mash/mash.py
@@ -1,11 +1,9 @@
"""
Run Mash on a set of assemblies.
"""
import json
from pathlib import Path

from src.loaders.common.loader_common_names import MASH_METADATA
from src.loaders.compute_tools.tool_common import ToolRunner, run_command
from src.loaders.compute_tools.tool_common import ToolRunner, run_command, create_tool_metadata

KMER_SIZE = 19
SKETCH_SIZE = 10000
@@ -32,14 +30,16 @@ def _run_mash_single(
run_command(command, output_dir if debug else None)

# Save run info to a metadata file in the output directory for parsing later
metadata_file = output_dir / MASH_METADATA
metadata = {'source_file': str(source_file),
# Append '.msh' to the source file name to generate the sketch file name (default by Mash sketch)
'sketch_file': str(source_file) + '.msh',
'kmer_size': kmer_size,
'sketch_size': sketch_size}
with open(metadata_file, 'w') as f:
json.dump(metadata, f, indent=4)
'sketch_size': sketch_size,
'data_id': data_id,
'tool_name': 'mash',
'version': '2.0',
'command': command}
create_tool_metadata(output_dir, metadata)


def main():
6 changes: 5 additions & 1 deletion src/loaders/compute_tools/mash/versions.yaml
@@ -16,4 +16,8 @@ versions:
- version: 0.1.4
date: 2024-06-25
notes: |
- Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names
- Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names
- version: 0.1.5
date: 2024-08-16
notes: |
- Create metadata file after running Mash
26 changes: 24 additions & 2 deletions src/loaders/compute_tools/tool_common.py
@@ -28,6 +28,7 @@
import pandas as pd

from src.loaders.common import loader_common_names
from src.loaders.common.loader_common_names import TOOL_METADATA

# TODO CODE add a common module for saving and loading the metadata shared between the compute
# and parser
@@ -211,7 +212,8 @@
with self._data_id_file:
df = pd.read_csv(self._data_id_file, sep='\t')
try:
data_ids = df[loader_common_names.DATA_ID_COLUMN_HEADER].astype(str).tolist() # convert to string in case of int directory names
data_ids = df[loader_common_names.DATA_ID_COLUMN_HEADER].astype(
    str).tolist()  # convert to string in case of int directory names
except KeyError:
raise ValueError(
f"Please ensure {loader_common_names.DATA_ID_COLUMN_HEADER} column exists in the "
@@ -344,7 +346,8 @@
def _execute(
self,
tool_callable: Callable[..., None],
args: Union[List[Tuple[Dict[str, GenomeTuple], Path, int, bool]], List[Tuple[str, str, Path, Path, int, bool]]],
args: Union[
List[Tuple[Dict[str, GenomeTuple], Path, int, bool]], List[Tuple[str, str, Path, Path, int, bool]]],
start: datetime.datetime,
total: bool,
):
@@ -595,6 +598,25 @@
return fatal_tuple


def create_tool_metadata(output_dir: Path, metadata: Dict[str, str]):
    """
    Save the metadata as a JSON file to the specified output directory.

    Args:
        output_dir (Path): The directory where the metadata file will be saved.
        metadata (Dict[str, str]): A dictionary containing metadata key-value pairs.
    """
    required_keys = loader_common_names.TOOL_METADATA_REQUIRED_KEYS

    if not all(key in metadata for key in required_keys):
        missing_keys = [key for key in required_keys if key not in metadata]
        raise ValueError(f"Missing required keys in metadata: {missing_keys}")

    metadata_file = output_dir / TOOL_METADATA
    with open(metadata_file, 'w') as f:
        json.dump(metadata, f, indent=4)


if __name__ == "__main__":
# mostly just here to allow easily getting the help info with --help:
ToolRunner("fake_tool")
2 changes: 1 addition & 1 deletion src/loaders/genome_collection/parse_tool_results.py
@@ -244,7 +244,7 @@
if data_id in fatal_ids:
continue
data_dir = Path(result_dir, batch_dir, data_id)
with open(data_dir / loader_common_names.MASH_METADATA, 'r') as file:
with open(data_dir / loader_common_names.TOOL_METADATA, 'r') as file:

Check warning on line 247 in src/loaders/genome_collection/parse_tool_results.py

View check run for this annotation

Codecov / codecov/patch

src/loaders/genome_collection/parse_tool_results.py#L247

Added line #L247 was not covered by tests
metadata = json.load(file)

sketch_file = metadata['sketch_file']
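For context, a parser-side sketch (not part of the diff) of how a result directory's tool_metadata.json could be loaded and validated before tool-specific fields such as Mash's 'sketch_file' are used; the result directory path is hypothetical:

    import json
    from pathlib import Path

    TOOL_METADATA = 'tool_metadata.json'
    TOOL_METADATA_REQUIRED_KEYS = ['tool_name', 'version', 'command']


    def load_tool_metadata(data_dir: Path) -> dict:
        # read the metadata file written by create_tool_metadata after a tool run
        with open(data_dir / TOOL_METADATA) as f:
            metadata = json.load(f)
        missing = [key for key in TOOL_METADATA_REQUIRED_KEYS if key not in metadata]
        if missing:
            raise ValueError(f'{data_dir}: tool metadata missing required keys {missing}')
        return metadata


    # hypothetical Mash result directory
    # metadata = load_tool_metadata(Path('result_dir/batch_0/12345'))
    # sketch_file = metadata['sketch_file']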