From 885d546290ad406ede45b6d179317c51f9f8901f Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Fri, 16 Aug 2024 14:40:22 -0500 Subject: [PATCH 1/6] add tool metadata file --- src/loaders/common/loader_common_names.py | 8 ++++---- src/loaders/compute_tools/checkm2/checkm2.py | 7 +++++++ .../compute_tools/checkm2/versions.yaml | 5 +++++ src/loaders/compute_tools/eggnog/eggnog.py | 17 +++++++++------- .../compute_tools/eggnog/versions.yaml | 6 ++++++ src/loaders/compute_tools/gtdb_tk/gtdb_tk.py | 7 +++++++ .../compute_tools/gtdb_tk/versions.yaml | 5 +++++ src/loaders/compute_tools/mash/mash.py | 14 ++++++------- src/loaders/compute_tools/mash/versions.yaml | 6 +++++- src/loaders/compute_tools/tool_common.py | 20 +++++++++++++++++-- .../genome_collection/parse_tool_results.py | 2 +- 11 files changed, 75 insertions(+), 22 deletions(-) diff --git a/src/loaders/common/loader_common_names.py b/src/loaders/common/loader_common_names.py index 809cac97..7ef49f41 100644 --- a/src/loaders/common/loader_common_names.py +++ b/src/loaders/common/loader_common_names.py @@ -91,11 +91,11 @@ OBJECTS_NAME_ASSEMBLY = "KBaseGenomeAnnotations.Assembly" OBJECTS_NAME_GENOME = "KBaseGenomes.Genome" -# The metadata file name created during the Mash run -MASH_METADATA = 'mash_run_metadata.json' +# The metadata file name created during the tool's execution +TOOL_METADATA = 'tool_metadata.json' -# The metadata file name created during the Eggnog run -EGGNOG_METADATA = 'eggnog_run_metadata.json' +# Tool metadata file required keys +TOOL_METADATA_REQUIRED_KEYS = ['tool_name', 'version', 'command'] # The fatal error file created if a data file cannot be successfully processed FATAL_ERROR_FILE = "fatal_error.json" diff --git a/src/loaders/compute_tools/checkm2/checkm2.py b/src/loaders/compute_tools/checkm2/checkm2.py index e6b7e22b..a8c4765d 100644 --- a/src/loaders/compute_tools/checkm2/checkm2.py +++ b/src/loaders/compute_tools/checkm2/checkm2.py @@ -12,6 +12,7 @@ run_command, write_fatal_tuples_to_dict, create_fatal_tuple, + create_tool_metadata, ) from src.loaders.compute_tools.tool_result_parser import ( process_genome_attri_result, @@ -68,6 +69,12 @@ def _run_checkm2( fatal_tuples.append(fatal_tuple) write_fatal_tuples_to_dict(fatal_tuples, output_dir) + metadata = {'tool': 'checkm2', + 'version': '1.0.1', + 'command': command, + 'reference_db': 'diamond_db ver unknown - saved copy in NERSC at global/cfs/cdirs/kbase/collections/libraries/checkm2/1.0.1/uniref100.KO.1.dmnd',} + create_tool_metadata(output_dir, metadata) + def main(): runner = ToolRunner("checkm2", tool_data_id_from_filename=True) diff --git a/src/loaders/compute_tools/checkm2/versions.yaml b/src/loaders/compute_tools/checkm2/versions.yaml index 7c880d10..ac970175 100644 --- a/src/loaders/compute_tools/checkm2/versions.yaml +++ b/src/loaders/compute_tools/checkm2/versions.yaml @@ -31,4 +31,9 @@ versions: date: 2024-06-25 notes: | - Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names + reference_db_version: 1.0.1 + - version: 0.1.6 + date: 2024-08-16 + notes: | + - Create metadata file after running CheckM2 reference_db_version: 1.0.1 \ No newline at end of file diff --git a/src/loaders/compute_tools/eggnog/eggnog.py b/src/loaders/compute_tools/eggnog/eggnog.py index d57b9a82..0309dc7f 100644 --- a/src/loaders/compute_tools/eggnog/eggnog.py +++ b/src/loaders/compute_tools/eggnog/eggnog.py @@ -5,11 +5,10 @@ Therefore, the parser program is not compatible with data generated by this tool. """ -import json from pathlib import Path -from src.loaders.common.loader_common_names import EGGNOG_METADATA -from src.loaders.compute_tools.tool_common import ToolRunner, run_command +from src.loaders.common.loader_common_names import TOOL_METADATA +from src.loaders.compute_tools.tool_common import ToolRunner, run_command, create_tool_metadata INPUT_TYPE = 'proteins' @@ -22,7 +21,7 @@ def _run_eggnog_single( threads_per_tool_run: int, debug: bool) -> None: - metadata_file = output_dir / EGGNOG_METADATA + metadata_file = output_dir / TOOL_METADATA if metadata_file.exists(): print(f"Skipping {source_file} as it has already been processed.") return @@ -44,9 +43,13 @@ def _run_eggnog_single( # Save run info to a metadata file in the output directory for parsing later metadata = {'source_file': str(source_file), - 'input_type': INPUT_TYPE} - with open(metadata_file, 'w') as f: - json.dump(metadata, f, indent=4) + 'input_type': INPUT_TYPE, + 'data_id': data_id, + 'tool_name': 'eggnog', + 'version': '2.1.12', + 'command': command, + 'reference_db': '5.0.2',} + create_tool_metadata(output_dir, metadata) def main(): diff --git a/src/loaders/compute_tools/eggnog/versions.yaml b/src/loaders/compute_tools/eggnog/versions.yaml index d044b7fc..4325e32e 100644 --- a/src/loaders/compute_tools/eggnog/versions.yaml +++ b/src/loaders/compute_tools/eggnog/versions.yaml @@ -21,4 +21,10 @@ versions: date: 2024-06-25 notes: | - Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names + reference_db_version: 5.0.2 + + - version: 0.1.4 + date: 2024-08-16 + notes: | + - Create metadata file after running Eggnog reference_db_version: 5.0.2 \ No newline at end of file diff --git a/src/loaders/compute_tools/gtdb_tk/gtdb_tk.py b/src/loaders/compute_tools/gtdb_tk/gtdb_tk.py index e5c99abf..a0c41bf8 100644 --- a/src/loaders/compute_tools/gtdb_tk/gtdb_tk.py +++ b/src/loaders/compute_tools/gtdb_tk/gtdb_tk.py @@ -19,6 +19,7 @@ find_gtdbtk_summary_files, run_command, write_fatal_tuples_to_dict, + create_tool_metadata, ) from src.loaders.compute_tools.tool_result_parser import ( process_genome_attri_result, @@ -150,6 +151,12 @@ def _run_gtdb_tk( summary_files, ) + metadata = {'tool': 'gtdb_tk', + 'version': '2.3.2', + 'command': command, + 'reference_db': 'release214',} + create_tool_metadata(output_dir, metadata) + def main(): runner = ToolRunner("gtdb_tk", suffix_ids=True) diff --git a/src/loaders/compute_tools/gtdb_tk/versions.yaml b/src/loaders/compute_tools/gtdb_tk/versions.yaml index 5acca88d..26f5e6b6 100644 --- a/src/loaders/compute_tools/gtdb_tk/versions.yaml +++ b/src/loaders/compute_tools/gtdb_tk/versions.yaml @@ -40,4 +40,9 @@ versions: date: 2024-06-25 notes: | - Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names + reference_db_version: release214 + - version: 0.1.7 + date: 2024-08-16 + notes: | + - Create metadata file after running GTDB-Tk reference_db_version: release214 \ No newline at end of file diff --git a/src/loaders/compute_tools/mash/mash.py b/src/loaders/compute_tools/mash/mash.py index e4429ecc..a2987f29 100644 --- a/src/loaders/compute_tools/mash/mash.py +++ b/src/loaders/compute_tools/mash/mash.py @@ -1,11 +1,9 @@ """ Run Mash on a set of assemblies. """ -import json from pathlib import Path -from src.loaders.common.loader_common_names import MASH_METADATA -from src.loaders.compute_tools.tool_common import ToolRunner, run_command +from src.loaders.compute_tools.tool_common import ToolRunner, run_command, create_tool_metadata KMER_SIZE = 19 SKETCH_SIZE = 10000 @@ -32,14 +30,16 @@ def _run_mash_single( run_command(command, output_dir if debug else None) # Save run info to a metadata file in the output directory for parsing later - metadata_file = output_dir / MASH_METADATA metadata = {'source_file': str(source_file), # Append '.msh' to the source file name to generate the sketch file name (default by Mash sketch) 'sketch_file': str(source_file) + '.msh', 'kmer_size': kmer_size, - 'sketch_size': sketch_size} - with open(metadata_file, 'w') as f: - json.dump(metadata, f, indent=4) + 'sketch_size': sketch_size, + 'data_id': data_id, + 'tool_name': 'mash', + 'version': '2.0', + 'command': command} + create_tool_metadata(output_dir, metadata) def main(): diff --git a/src/loaders/compute_tools/mash/versions.yaml b/src/loaders/compute_tools/mash/versions.yaml index 96226408..f9b41945 100644 --- a/src/loaders/compute_tools/mash/versions.yaml +++ b/src/loaders/compute_tools/mash/versions.yaml @@ -16,4 +16,8 @@ versions: - version: 0.1.4 date: 2024-06-25 notes: | - - Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names \ No newline at end of file + - Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names + - version: 0.1.5 + date: 2024-08-16 + notes: | + - Create metadata file after running Mash \ No newline at end of file diff --git a/src/loaders/compute_tools/tool_common.py b/src/loaders/compute_tools/tool_common.py index 9769e2eb..6ab0ae91 100644 --- a/src/loaders/compute_tools/tool_common.py +++ b/src/loaders/compute_tools/tool_common.py @@ -28,6 +28,7 @@ import pandas as pd from src.loaders.common import loader_common_names +from src.loaders.common.loader_common_names import TOOL_METADATA # TODO CODE add a common module for saving and loading the metadata shared between the compute # and parser @@ -211,7 +212,8 @@ def _get_data_ids(self): with self._data_id_file: df = pd.read_csv(self._data_id_file, sep='\t') try: - data_ids = df[loader_common_names.DATA_ID_COLUMN_HEADER].astype(str).tolist() # convert to string in case of int directory names + data_ids = df[loader_common_names.DATA_ID_COLUMN_HEADER].astype( + str).tolist() # convert to string in case of int directory names except KeyError: raise ValueError( f"Please ensure {loader_common_names.DATA_ID_COLUMN_HEADER} column exists in the " @@ -344,7 +346,8 @@ def parallel_batch_execution(self, tool_callable: Callable[[Dict[str, GenomeTupl def _execute( self, tool_callable: Callable[..., None], - args: Union[List[Tuple[Dict[str, GenomeTuple], Path, int, bool]], List[Tuple[str, str, Path, Path, int, bool]]], + args: Union[ + List[Tuple[Dict[str, GenomeTuple], Path, int, bool]], List[Tuple[str, str, Path, Path, int, bool]]], start: datetime.datetime, total: bool, ): @@ -595,6 +598,19 @@ def create_fatal_tuple( return fatal_tuple +def create_tool_metadata(output_dir: Path, metadata: Dict[str, str]): + """ + Create a metadata file for a tool run. + """ + required_keys = loader_common_names.TOOL_METADATA_REQUIRED_KEYS + + if not all(key in metadata for key in required_keys): + raise ValueError(f"Missing required keys in metadata: {required_keys}") + metadata_file = output_dir / TOOL_METADATA + with open(metadata_file, 'w') as f: + json.dump(metadata, f, indent=4) + + if __name__ == "__main__": # mostly just here to allow easily getting the help info with --help: ToolRunner("fake_tool") diff --git a/src/loaders/genome_collection/parse_tool_results.py b/src/loaders/genome_collection/parse_tool_results.py index 2d952923..0cbc0168 100644 --- a/src/loaders/genome_collection/parse_tool_results.py +++ b/src/loaders/genome_collection/parse_tool_results.py @@ -244,7 +244,7 @@ def _process_mash_tool(root_dir: str, if data_id in fatal_ids: continue data_dir = Path(result_dir, batch_dir, data_id) - with open(data_dir / loader_common_names.MASH_METADATA, 'r') as file: + with open(data_dir / loader_common_names.TOOL_METADATA, 'r') as file: metadata = json.load(file) sketch_file = metadata['sketch_file'] From b75a166948122db34faec6b13edbdedefee16d34 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Fri, 16 Aug 2024 14:47:08 -0500 Subject: [PATCH 2/6] add some comments --- src/loaders/compute_tools/tool_common.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/loaders/compute_tools/tool_common.py b/src/loaders/compute_tools/tool_common.py index 6ab0ae91..7cd98d41 100644 --- a/src/loaders/compute_tools/tool_common.py +++ b/src/loaders/compute_tools/tool_common.py @@ -600,12 +600,18 @@ def create_fatal_tuple( def create_tool_metadata(output_dir: Path, metadata: Dict[str, str]): """ - Create a metadata file for a tool run. + Save the metadata as a JSON file to the specified output directory. + + Args: + output_dir (Path): The directory where the metadata file will be saved. + metadata (Dict[str, str]): A dictionary containing metadata key-value pairs. """ required_keys = loader_common_names.TOOL_METADATA_REQUIRED_KEYS if not all(key in metadata for key in required_keys): - raise ValueError(f"Missing required keys in metadata: {required_keys}") + missing_keys = [key for key in required_keys if key not in metadata] + raise ValueError(f"Missing required keys in metadata: {missing_keys}") + metadata_file = output_dir / TOOL_METADATA with open(metadata_file, 'w') as f: json.dump(metadata, f, indent=4) From 1e8a74d881e408003fe3fd9f9bce43571be1a3d0 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Fri, 16 Aug 2024 14:53:06 -0500 Subject: [PATCH 3/6] include ids_to_files --- src/loaders/compute_tools/checkm2/checkm2.py | 3 ++- src/loaders/compute_tools/gtdb_tk/gtdb_tk.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/loaders/compute_tools/checkm2/checkm2.py b/src/loaders/compute_tools/checkm2/checkm2.py index a8c4765d..eeb14466 100644 --- a/src/loaders/compute_tools/checkm2/checkm2.py +++ b/src/loaders/compute_tools/checkm2/checkm2.py @@ -72,7 +72,8 @@ def _run_checkm2( metadata = {'tool': 'checkm2', 'version': '1.0.1', 'command': command, - 'reference_db': 'diamond_db ver unknown - saved copy in NERSC at global/cfs/cdirs/kbase/collections/libraries/checkm2/1.0.1/uniref100.KO.1.dmnd',} + 'reference_db': 'diamond_db ver unknown - saved copy in NERSC at global/cfs/cdirs/kbase/collections/libraries/checkm2/1.0.1/uniref100.KO.1.dmnd', + 'ids_to_files': ids_to_files} create_tool_metadata(output_dir, metadata) diff --git a/src/loaders/compute_tools/gtdb_tk/gtdb_tk.py b/src/loaders/compute_tools/gtdb_tk/gtdb_tk.py index a0c41bf8..dd857197 100644 --- a/src/loaders/compute_tools/gtdb_tk/gtdb_tk.py +++ b/src/loaders/compute_tools/gtdb_tk/gtdb_tk.py @@ -154,7 +154,8 @@ def _run_gtdb_tk( metadata = {'tool': 'gtdb_tk', 'version': '2.3.2', 'command': command, - 'reference_db': 'release214',} + 'reference_db': 'release214', + 'ids_to_files': ids_to_files} create_tool_metadata(output_dir, metadata) From aeb87628028b2605670c7eb61deac80e82fbdd2e Mon Sep 17 00:00:00 2001 From: Tianhao Gu Date: Fri, 16 Aug 2024 15:38:14 -0500 Subject: [PATCH 4/6] Update src/loaders/compute_tools/checkm2/checkm2.py Co-authored-by: MrCreosote --- src/loaders/compute_tools/checkm2/checkm2.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/loaders/compute_tools/checkm2/checkm2.py b/src/loaders/compute_tools/checkm2/checkm2.py index eeb14466..88de2709 100644 --- a/src/loaders/compute_tools/checkm2/checkm2.py +++ b/src/loaders/compute_tools/checkm2/checkm2.py @@ -72,7 +72,12 @@ def _run_checkm2( metadata = {'tool': 'checkm2', 'version': '1.0.1', 'command': command, - 'reference_db': 'diamond_db ver unknown - saved copy in NERSC at global/cfs/cdirs/kbase/collections/libraries/checkm2/1.0.1/uniref100.KO.1.dmnd', + "reference_db": { + "path": "/global/cfs/cdirs/kbase/collections/libraries/checkm2/1.0.1/uniref100.KO.1.dmnd", + "location": "NERSC", + "version": None, + "comment": "diamond_db, ver unknown", + }, 'ids_to_files': ids_to_files} create_tool_metadata(output_dir, metadata) From e274c45c87b52f2aa3d050b89819c1392f73501b Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Fri, 16 Aug 2024 16:22:42 -0500 Subject: [PATCH 5/6] add reminder notes --- src/loaders/compute_tools/checkm2/Dockerfile | 1 + src/loaders/compute_tools/checkm2/checkm2.py | 12 ++++++------ src/loaders/compute_tools/checkm2/versions.yaml | 5 ++++- src/loaders/compute_tools/eggnog/Dockerfile | 1 + src/loaders/compute_tools/eggnog/eggnog.py | 7 ++++++- src/loaders/compute_tools/eggnog/versions.yaml | 5 ++++- src/loaders/compute_tools/gtdb_tk/Dockerfile | 1 + src/loaders/compute_tools/gtdb_tk/gtdb_tk.py | 6 +++++- src/loaders/compute_tools/gtdb_tk/versions.yaml | 5 ++++- src/loaders/compute_tools/mash/Dockerfile | 1 + 10 files changed, 33 insertions(+), 11 deletions(-) diff --git a/src/loaders/compute_tools/checkm2/Dockerfile b/src/loaders/compute_tools/checkm2/Dockerfile index e4d22074..29ecf88a 100644 --- a/src/loaders/compute_tools/checkm2/Dockerfile +++ b/src/loaders/compute_tools/checkm2/Dockerfile @@ -1,5 +1,6 @@ FROM continuumio/miniconda3:22.11.1 +# NOTE: If the tool version changes, ensure the metadata information saved after running the tool in the _run_checkm2 method is updated ARG CHECKM2_VER=1.0.1 ENV CONDA_ENV checkm2-$CHECKM2_VER diff --git a/src/loaders/compute_tools/checkm2/checkm2.py b/src/loaders/compute_tools/checkm2/checkm2.py index 88de2709..5b3552b8 100644 --- a/src/loaders/compute_tools/checkm2/checkm2.py +++ b/src/loaders/compute_tools/checkm2/checkm2.py @@ -72,12 +72,12 @@ def _run_checkm2( metadata = {'tool': 'checkm2', 'version': '1.0.1', 'command': command, - "reference_db": { - "path": "/global/cfs/cdirs/kbase/collections/libraries/checkm2/1.0.1/uniref100.KO.1.dmnd", - "location": "NERSC", - "version": None, - "comment": "diamond_db, ver unknown", - }, + "reference_db": { + "path": "/global/cfs/cdirs/kbase/collections/libraries/checkm2/1.0.1/uniref100.KO.1.dmnd", + "location": "NERSC", + "version": None, + "comment": "diamond_db, ver unknown", + }, 'ids_to_files': ids_to_files} create_tool_metadata(output_dir, metadata) diff --git a/src/loaders/compute_tools/checkm2/versions.yaml b/src/loaders/compute_tools/checkm2/versions.yaml index ac970175..8fd8c1ac 100644 --- a/src/loaders/compute_tools/checkm2/versions.yaml +++ b/src/loaders/compute_tools/checkm2/versions.yaml @@ -36,4 +36,7 @@ versions: date: 2024-08-16 notes: | - Create metadata file after running CheckM2 - reference_db_version: 1.0.1 \ No newline at end of file + reference_db_version: 1.0.1 + +#Please keep this reminder at the end of this file +#NOTE: If the db version changes, ensure the metadata information saved after running the tool in the _run_checkm2 method is updated \ No newline at end of file diff --git a/src/loaders/compute_tools/eggnog/Dockerfile b/src/loaders/compute_tools/eggnog/Dockerfile index 54b0d633..ad2b2930 100644 --- a/src/loaders/compute_tools/eggnog/Dockerfile +++ b/src/loaders/compute_tools/eggnog/Dockerfile @@ -1,5 +1,6 @@ FROM continuumio/miniconda3:24.1.2-0 +# NOTE: If the tool version changes, ensure the metadata information saved after running the tool in the _run_eggnog_single method is updated ENV EGGNOG_VER 2.1.12 ENV CONDA_ENV eggnog-$EGGNOG_VER ENV PYTHON_VER 3.11 diff --git a/src/loaders/compute_tools/eggnog/eggnog.py b/src/loaders/compute_tools/eggnog/eggnog.py index 0309dc7f..6d510eb6 100644 --- a/src/loaders/compute_tools/eggnog/eggnog.py +++ b/src/loaders/compute_tools/eggnog/eggnog.py @@ -48,7 +48,12 @@ def _run_eggnog_single( 'tool_name': 'eggnog', 'version': '2.1.12', 'command': command, - 'reference_db': '5.0.2',} + "reference_db": { + "path": "global/cfs/cdirs/kbase/collections/libraries/eggnog/5.0.2", + "location": "NERSC", + "version": "5.0.2", + }, + } create_tool_metadata(output_dir, metadata) diff --git a/src/loaders/compute_tools/eggnog/versions.yaml b/src/loaders/compute_tools/eggnog/versions.yaml index 4325e32e..67c8b179 100644 --- a/src/loaders/compute_tools/eggnog/versions.yaml +++ b/src/loaders/compute_tools/eggnog/versions.yaml @@ -27,4 +27,7 @@ versions: date: 2024-08-16 notes: | - Create metadata file after running Eggnog - reference_db_version: 5.0.2 \ No newline at end of file + reference_db_version: 5.0.2 + +#Please keep this reminder at the end of this file +#NOTE: If the db version changes, ensure the metadata information saved after running the tool in the _run_eggnog_single method is updated \ No newline at end of file diff --git a/src/loaders/compute_tools/gtdb_tk/Dockerfile b/src/loaders/compute_tools/gtdb_tk/Dockerfile index a7ba3ef0..526f8b0c 100644 --- a/src/loaders/compute_tools/gtdb_tk/Dockerfile +++ b/src/loaders/compute_tools/gtdb_tk/Dockerfile @@ -1,5 +1,6 @@ FROM continuumio/miniconda3:22.11.1 +# NOTE: If the tool version changes, ensure the metadata information saved after running the tool in the _run_gtdb_tk method is updated ENV GTDB_VER 2.3.2 ENV CONDA_ENV gtdbtk-$GTDB_VER ENV PYTHON_VER 3.8 diff --git a/src/loaders/compute_tools/gtdb_tk/gtdb_tk.py b/src/loaders/compute_tools/gtdb_tk/gtdb_tk.py index dd857197..70c13307 100644 --- a/src/loaders/compute_tools/gtdb_tk/gtdb_tk.py +++ b/src/loaders/compute_tools/gtdb_tk/gtdb_tk.py @@ -154,7 +154,11 @@ def _run_gtdb_tk( metadata = {'tool': 'gtdb_tk', 'version': '2.3.2', 'command': command, - 'reference_db': 'release214', + "reference_db": { + "path": "global/cfs/cdirs/kbase/collections/libraries/gtdb_tk/release214", + "location": "NERSC", + "version": "release214", + }, 'ids_to_files': ids_to_files} create_tool_metadata(output_dir, metadata) diff --git a/src/loaders/compute_tools/gtdb_tk/versions.yaml b/src/loaders/compute_tools/gtdb_tk/versions.yaml index 26f5e6b6..32a1c523 100644 --- a/src/loaders/compute_tools/gtdb_tk/versions.yaml +++ b/src/loaders/compute_tools/gtdb_tk/versions.yaml @@ -45,4 +45,7 @@ versions: date: 2024-08-16 notes: | - Create metadata file after running GTDB-Tk - reference_db_version: release214 \ No newline at end of file + reference_db_version: release214 + +#Please keep this reminder at the end of this file +#NOTE: If the db version changes, ensure the metadata information saved after running the tool in the _run_gtdb_tk method is updated \ No newline at end of file diff --git a/src/loaders/compute_tools/mash/Dockerfile b/src/loaders/compute_tools/mash/Dockerfile index bdbfa837..3dc8afed 100644 --- a/src/loaders/compute_tools/mash/Dockerfile +++ b/src/loaders/compute_tools/mash/Dockerfile @@ -5,6 +5,7 @@ RUN apt-get update && \ # Set the Mash version argument # using version 2.0 to maintain compatibility with the homology service +# NOTE: If the tool version changes, ensure the metadata information saved after running the tool in the _run_mash_single method is updated ARG MASH_VER=2.0 ENV CONDA_ENV mash-$MASH_VER From 8ceb7e7ce6ce413d279dff182143105df5baf8fd Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Fri, 16 Aug 2024 17:04:38 -0500 Subject: [PATCH 6/6] remove location and path for db --- src/loaders/compute_tools/checkm2/checkm2.py | 4 +--- src/loaders/compute_tools/eggnog/eggnog.py | 4 +--- src/loaders/compute_tools/gtdb_tk/gtdb_tk.py | 4 +--- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/src/loaders/compute_tools/checkm2/checkm2.py b/src/loaders/compute_tools/checkm2/checkm2.py index 5b3552b8..67571515 100644 --- a/src/loaders/compute_tools/checkm2/checkm2.py +++ b/src/loaders/compute_tools/checkm2/checkm2.py @@ -73,11 +73,9 @@ def _run_checkm2( 'version': '1.0.1', 'command': command, "reference_db": { - "path": "/global/cfs/cdirs/kbase/collections/libraries/checkm2/1.0.1/uniref100.KO.1.dmnd", - "location": "NERSC", "version": None, "comment": "diamond_db, ver unknown", - }, + }, 'ids_to_files': ids_to_files} create_tool_metadata(output_dir, metadata) diff --git a/src/loaders/compute_tools/eggnog/eggnog.py b/src/loaders/compute_tools/eggnog/eggnog.py index 6d510eb6..b133b20d 100644 --- a/src/loaders/compute_tools/eggnog/eggnog.py +++ b/src/loaders/compute_tools/eggnog/eggnog.py @@ -49,10 +49,8 @@ def _run_eggnog_single( 'version': '2.1.12', 'command': command, "reference_db": { - "path": "global/cfs/cdirs/kbase/collections/libraries/eggnog/5.0.2", - "location": "NERSC", "version": "5.0.2", - }, + }, } create_tool_metadata(output_dir, metadata) diff --git a/src/loaders/compute_tools/gtdb_tk/gtdb_tk.py b/src/loaders/compute_tools/gtdb_tk/gtdb_tk.py index 70c13307..74ad8e36 100644 --- a/src/loaders/compute_tools/gtdb_tk/gtdb_tk.py +++ b/src/loaders/compute_tools/gtdb_tk/gtdb_tk.py @@ -155,10 +155,8 @@ def _run_gtdb_tk( 'version': '2.3.2', 'command': command, "reference_db": { - "path": "global/cfs/cdirs/kbase/collections/libraries/gtdb_tk/release214", - "location": "NERSC", "version": "release214", - }, + }, 'ids_to_files': ids_to_files} create_tool_metadata(output_dir, metadata)