add tool metadata file #747

Merged: 6 commits merged on Aug 17, 2024

8 changes: 4 additions & 4 deletions src/loaders/common/loader_common_names.py
@@ -91,11 +91,11 @@
OBJECTS_NAME_ASSEMBLY = "KBaseGenomeAnnotations.Assembly"
OBJECTS_NAME_GENOME = "KBaseGenomes.Genome"

# The metadata file name created during the Mash run
MASH_METADATA = 'mash_run_metadata.json'
# The metadata file name created during the tool's execution
TOOL_METADATA = 'tool_metadata.json'

# The metadata file name created during the Eggnog run
EGGNOG_METADATA = 'eggnog_run_metadata.json'
# Tool metadata file required keys
TOOL_METADATA_REQUIRED_KEYS = ['tool_name', 'version', 'command']

# The fatal error file created if a data file cannot be successfully processed
FATAL_ERROR_FILE = "fatal_error.json"
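For orientation, here is a minimal sketch (not part of the diff) of a metadata payload that satisfies TOOL_METADATA_REQUIRED_KEYS; the tool name, version, command, and output directory are purely illustrative.

    import json
    from pathlib import Path

    TOOL_METADATA = 'tool_metadata.json'
    TOOL_METADATA_REQUIRED_KEYS = ['tool_name', 'version', 'command']

    metadata = {
        'tool_name': 'example_tool',                           # required
        'version': '1.0.0',                                    # required
        'command': ['example_tool', '--input', 'genome.fa'],   # required
        'reference_db': {'version': None},                     # optional, tool-specific extras are allowed
    }

    # the same check create_tool_metadata performs before writing the file
    assert all(key in metadata for key in TOOL_METADATA_REQUIRED_KEYS)

    output_dir = Path('example_output')                        # hypothetical output directory
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / TOOL_METADATA, 'w') as f:
        json.dump(metadata, f, indent=4)
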
1 change: 1 addition & 0 deletions src/loaders/compute_tools/checkm2/Dockerfile
@@ -1,5 +1,6 @@
FROM continuumio/miniconda3:22.11.1

# NOTE: If the tool version changes, ensure the metadata information saved after running the tool in the _run_checkm2 method is updated
ARG CHECKM2_VER=1.0.1
ENV CONDA_ENV checkm2-$CHECKM2_VER

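The NOTE above relies on a manual reminder to keep the version string hard-coded in the script in sync with the Dockerfile ARG. As a hedged alternative sketch (not what this PR does), the image could export the version and the script could read it from the environment, falling back to the hard-coded value:

    import os

    # Assumes the Dockerfile additionally declares `ENV CHECKM2_VER=$CHECKM2_VER`
    # (a hypothetical addition); ARG values alone are not visible at runtime.
    checkm2_version = os.environ.get('CHECKM2_VER', '1.0.1')

    metadata = {
        'tool_name': 'checkm2',
        'version': checkm2_version,
        'command': ['checkm2', 'predict'],  # illustrative command only
    }
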
11 changes: 11 additions & 0 deletions src/loaders/compute_tools/checkm2/checkm2.py
@@ -12,6 +12,7 @@
run_command,
write_fatal_tuples_to_dict,
create_fatal_tuple,
create_tool_metadata,
)
from src.loaders.compute_tools.tool_result_parser import (
process_genome_attri_result,
@@ -68,6 +69,16 @@ def _run_checkm2(
fatal_tuples.append(fatal_tuple)
write_fatal_tuples_to_dict(fatal_tuples, output_dir)

metadata = {'tool_name': 'checkm2',
'version': '1.0.1',
'command': command,
"reference_db": {
"version": None,
"comment": "diamond_db, ver unknown",
},
'ids_to_files': ids_to_files}
create_tool_metadata(output_dir, metadata)


def main():
runner = ToolRunner("checkm2", tool_data_id_from_filename=True)
10 changes: 9 additions & 1 deletion src/loaders/compute_tools/checkm2/versions.yaml
@@ -31,4 +31,12 @@
date: 2024-06-25
notes: |
- Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names
reference_db_version: 1.0.1
reference_db_version: 1.0.1
- version: 0.1.6
date: 2024-08-16
notes: |
- Create metadata file after running CheckM2
reference_db_version: 1.0.1

#Please keep this reminder at the end of this file
#NOTE: If the db version changes, ensure the metadata information saved after running the tool in the _run_checkm2 method is updated
1 change: 1 addition & 0 deletions src/loaders/compute_tools/eggnog/Dockerfile
@@ -1,5 +1,6 @@
FROM continuumio/miniconda3:24.1.2-0

# NOTE: If the tool version changes, ensure the metadata information saved after running the tool in the _run_eggnog_single method is updated
ENV EGGNOG_VER 2.1.12
ENV CONDA_ENV eggnog-$EGGNOG_VER
ENV PYTHON_VER 3.11
20 changes: 13 additions & 7 deletions src/loaders/compute_tools/eggnog/eggnog.py
@@ -5,11 +5,10 @@
Therefore, the parser program is not compatible with data generated by this tool.

"""
import json
from pathlib import Path

from src.loaders.common.loader_common_names import EGGNOG_METADATA
from src.loaders.compute_tools.tool_common import ToolRunner, run_command
from src.loaders.common.loader_common_names import TOOL_METADATA
from src.loaders.compute_tools.tool_common import ToolRunner, run_command, create_tool_metadata

INPUT_TYPE = 'proteins'

@@ -22,7 +21,7 @@ def _run_eggnog_single(
threads_per_tool_run: int,
debug: bool) -> None:

metadata_file = output_dir / EGGNOG_METADATA
metadata_file = output_dir / TOOL_METADATA
if metadata_file.exists():
print(f"Skipping {source_file} as it has already been processed.")
return
@@ -44,9 +43,16 @@

# Save run info to a metadata file in the output directory for parsing later
metadata = {'source_file': str(source_file),
'input_type': INPUT_TYPE}
with open(metadata_file, 'w') as f:
json.dump(metadata, f, indent=4)
'input_type': INPUT_TYPE,
'data_id': data_id,
'tool_name': 'eggnog',
'version': '2.1.12',
'command': command,
"reference_db": {
"version": "5.0.2",
},
}
create_tool_metadata(output_dir, metadata)


def main():
11 changes: 10 additions & 1 deletion src/loaders/compute_tools/eggnog/versions.yaml
@@ -21,4 +21,13 @@ versions:
date: 2024-06-25
notes: |
- Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names
reference_db_version: 5.0.2
reference_db_version: 5.0.2

- version: 0.1.4
date: 2024-08-16
notes: |
- Create metadata file after running Eggnog
reference_db_version: 5.0.2

#Please keep this reminder at the end of this file
#NOTE: If the db version changes, ensure the metadata information saved after running the tool in the _run_eggnog_single method is updated
1 change: 1 addition & 0 deletions src/loaders/compute_tools/gtdb_tk/Dockerfile
@@ -1,5 +1,6 @@
FROM continuumio/miniconda3:22.11.1

# NOTE: If the tool version changes, ensure the metadata information saved after running the tool in the _run_gtdb_tk method is updated
ENV GTDB_VER 2.3.2
ENV CONDA_ENV gtdbtk-$GTDB_VER
ENV PYTHON_VER 3.8
10 changes: 10 additions & 0 deletions src/loaders/compute_tools/gtdb_tk/gtdb_tk.py
@@ -19,6 +19,7 @@
find_gtdbtk_summary_files,
run_command,
write_fatal_tuples_to_dict,
create_tool_metadata,
)
from src.loaders.compute_tools.tool_result_parser import (
process_genome_attri_result,
@@ -150,6 +151,15 @@ def _run_gtdb_tk(
summary_files,
)

metadata = {'tool_name': 'gtdb_tk',
'version': '2.3.2',
'command': command,
"reference_db": {
"version": "release214",
},
'ids_to_files': ids_to_files}
create_tool_metadata(output_dir, metadata)


def main():
runner = ToolRunner("gtdb_tk", suffix_ids=True)
10 changes: 9 additions & 1 deletion src/loaders/compute_tools/gtdb_tk/versions.yaml
@@ -40,4 +40,12 @@ versions:
date: 2024-06-25
notes: |
- Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names
reference_db_version: release214
reference_db_version: release214
- version: 0.1.7
date: 2024-08-16
notes: |
- Create metadata file after running GTDB-Tk
reference_db_version: release214

#Please keep this reminder at the end of this file
#NOTE: If the db version changes, ensure the metadata information saved after running the tool in the _run_gtdb_tk method is updated
1 change: 1 addition & 0 deletions src/loaders/compute_tools/mash/Dockerfile
@@ -5,6 +5,7 @@ RUN apt-get update && \

# Set the Mash version argument
# using version 2.0 to maintain compatibility with the homology service
# NOTE: If the tool version changes, ensure the metadata information saved after running the tool in the _run_mash_single method is updated
ARG MASH_VER=2.0
ENV CONDA_ENV mash-$MASH_VER

14 changes: 7 additions & 7 deletions src/loaders/compute_tools/mash/mash.py
@@ -1,11 +1,9 @@
"""
Run Mash on a set of assemblies.
"""
import json
from pathlib import Path

from src.loaders.common.loader_common_names import MASH_METADATA
from src.loaders.compute_tools.tool_common import ToolRunner, run_command
from src.loaders.compute_tools.tool_common import ToolRunner, run_command, create_tool_metadata

KMER_SIZE = 19
SKETCH_SIZE = 10000
@@ -32,14 +30,16 @@ def _run_mash_single(
run_command(command, output_dir if debug else None)

# Save run info to a metadata file in the output directory for parsing later
metadata_file = output_dir / MASH_METADATA
metadata = {'source_file': str(source_file),
# Append '.msh' to the source file name to generate the sketch file name (default by Mash sketch)
'sketch_file': str(source_file) + '.msh',
'kmer_size': kmer_size,
'sketch_size': sketch_size}
with open(metadata_file, 'w') as f:
json.dump(metadata, f, indent=4)
'sketch_size': sketch_size,
'data_id': data_id,
'tool_name': 'mash',
'version': '2.0',
'command': command}
create_tool_metadata(output_dir, metadata)


def main():
6 changes: 5 additions & 1 deletion src/loaders/compute_tools/mash/versions.yaml
@@ -16,4 +16,8 @@ versions:
- version: 0.1.4
date: 2024-06-25
notes: |
- Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names
- Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names
- version: 0.1.5
date: 2024-08-16
notes: |
- Create metadata file after running Mash
26 changes: 24 additions & 2 deletions src/loaders/compute_tools/tool_common.py
@@ -28,6 +28,7 @@
import pandas as pd

from src.loaders.common import loader_common_names
from src.loaders.common.loader_common_names import TOOL_METADATA

# TODO CODE add a common module for saving and loading the metadata shared between the compute
# and parser
@@ -211,7 +212,8 @@
with self._data_id_file:
df = pd.read_csv(self._data_id_file, sep='\t')
try:
data_ids = df[loader_common_names.DATA_ID_COLUMN_HEADER].astype(str).tolist() # convert to string in case of int directory names
data_ids = df[loader_common_names.DATA_ID_COLUMN_HEADER].astype(
    str).tolist()  # convert to string in case of int directory names
except KeyError:
raise ValueError(
f"Please ensure {loader_common_names.DATA_ID_COLUMN_HEADER} column exists in the "
@@ -344,7 +346,8 @@
def _execute(
self,
tool_callable: Callable[..., None],
args: Union[List[Tuple[Dict[str, GenomeTuple], Path, int, bool]], List[Tuple[str, str, Path, Path, int, bool]]],
args: Union[
List[Tuple[Dict[str, GenomeTuple], Path, int, bool]], List[Tuple[str, str, Path, Path, int, bool]]],
start: datetime.datetime,
total: bool,
):
@@ -595,6 +598,25 @@
return fatal_tuple


def create_tool_metadata(output_dir: Path, metadata: Dict[str, str]):
    """
    Save the metadata as a JSON file to the specified output directory.

    Args:
        output_dir (Path): The directory where the metadata file will be saved.
        metadata (Dict[str, str]): A dictionary containing metadata key-value pairs.
    """
    required_keys = loader_common_names.TOOL_METADATA_REQUIRED_KEYS

    if not all(key in metadata for key in required_keys):
        missing_keys = [key for key in required_keys if key not in metadata]
        raise ValueError(f"Missing required keys in metadata: {missing_keys}")

    metadata_file = output_dir / TOOL_METADATA
    with open(metadata_file, 'w') as f:
        json.dump(metadata, f, indent=4)


if __name__ == "__main__":
# mostly just here to allow easily getting the help info with --help:
ToolRunner("fake_tool")
2 changes: 1 addition & 1 deletion src/loaders/genome_collection/parse_tool_results.py
@@ -244,7 +244,7 @@
if data_id in fatal_ids:
continue
data_dir = Path(result_dir, batch_dir, data_id)
with open(data_dir / loader_common_names.MASH_METADATA, 'r') as file:
with open(data_dir / loader_common_names.TOOL_METADATA, 'r') as file:

Check warning on line 247 in src/loaders/genome_collection/parse_tool_results.py

View check run for this annotation

Codecov / codecov/patch

src/loaders/genome_collection/parse_tool_results.py#L247

Added line #L247 was not covered by tests
metadata = json.load(file)

sketch_file = metadata['sketch_file']
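For context, a parser-side sketch (not part of the diff) of how a result directory's tool_metadata.json could be loaded and validated before tool-specific fields such as Mash's 'sketch_file' are used; the result directory path is hypothetical:

    import json
    from pathlib import Path

    TOOL_METADATA = 'tool_metadata.json'
    TOOL_METADATA_REQUIRED_KEYS = ['tool_name', 'version', 'command']


    def load_tool_metadata(data_dir: Path) -> dict:
        # read the metadata file written by create_tool_metadata after a tool run
        with open(data_dir / TOOL_METADATA) as f:
            metadata = json.load(f)
        missing = [key for key in TOOL_METADATA_REQUIRED_KEYS if key not in metadata]
        if missing:
            raise ValueError(f'{data_dir}: tool metadata missing required keys {missing}')
        return metadata


    # hypothetical Mash result directory
    # metadata = load_tool_metadata(Path('result_dir/batch_0/12345'))
    # sketch_file = metadata['sketch_file']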