Skip to content

Commit

Permalink
fixed up to index by any annotation value
Browse files Browse the repository at this point in the history
  • Loading branch information
sierra-moxon committed Jan 24, 2025
1 parent c4dc88c commit 2edab99
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 76 deletions.
126 changes: 51 additions & 75 deletions scripts/gen-model-meta.py
Original file line number Diff line number Diff line change
@@ -1,51 +1,74 @@
"""
Script to generate model indicies. to run:
% cd scripts
% poetry install
% poetry run python gen-model-meta.py --keys-to-index contributor,term_id,reference_id --output-dir /tmp/output
"""
import os
import glob
import json
import sys
import logging
from pathlib import Path
import click
from pprint import pprint

import click

# Logger basic setup
logging.basicConfig(level=logging.INFO)
LOG = logging.getLogger('gen-model-meta')
LOG = logging.getLogger("gen-model-meta")
LOG.setLevel(logging.WARNING)
json_path = Path("/tmp/gocams/")

def die_screaming(instr):
"""Exit in a way that will get attention."""
LOG.error(instr)
sys.exit(1)


def create_indices(read_data, model_id, keys_to_index):
def process_json_files(keys_to_index, output_dir, path_to_json=json_path):
"""
Process the JSON data structure once and generate indices for the specified keys.
:param read_data: The JSON data structure
:param model_id: The model ID to associate with indexed values
:param keys_to_index: List of keys to index
:return: A dictionary of indices for the specified keys
Process all JSON files, generate indices for the specified keys, and save them.
"""
# Get the list of files from path_to_json
json_files = [f for f in os.listdir(path_to_json) if f.endswith(".json")]
indices = {key: {} for key in keys_to_index}

individuals = read_data.get("individuals", [])

for individual in individuals:
annotations = individual.get("annotations", [])
for annotation in annotations:
key = annotation.get("key")
value = annotation.get("value")
if key in keys_to_index:
if value not in indices[key]:
indices[key][value] = []
if model_id not in indices[key][value]:
indices[key][value].append(model_id)
# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

return indices
# Process each JSON file
for file_name in json_files:
model_id = Path(file_name).stem

# Open, parse, and save the JSON in a dictionary file
file_url = os.path.join(path_to_json, file_name)
with open(file_url, "r") as f:
print(model_id)
read_data = json.load(f)

if not read_data:
die_screaming(f"ERROR: No data in file: {file_url}")

# Create indices for the current file
individuals = read_data.get("individuals", [])
for individual in individuals:
annotations = individual.get("annotations", [])
for annotation in annotations:
key = annotation.get("key")
value = annotation.get("value")
if key in keys_to_index:
if value not in indices[key]:
indices[key][value] = []
if model_id not in indices[key][value]:
indices[key][value].append(model_id)
# for each top level key in the indicies dictionary, write out the JSON to a file
for key, value in indices.items():
with open(os.path.join(output_dir, f"{key}.json"), "w") as f:
json.dump(value, f, indent=4)


# CLI using Click
@click.command()
@click.option(
"--keys-to-index",
Expand All @@ -54,63 +77,16 @@ def create_indices(read_data, model_id, keys_to_index):
required=True,
help="List of keys to index (e.g., contributor, term_id, reference_id).",
)
@click.option(
"--input-dir",
"-i",
type=click.Path(file_okay=False, dir_okay=True, readable=True, path_type=str),
required=True,
help="Directory containing the JSON input files.",
)
@click.option(
"--output-dir",
"-o",
type=click.Path(file_okay=False, dir_okay=True, writable=True, path_type=str),
required=True,
help="Directory where the JSON index files will be saved.",
)
def main(keys_to_index, input_dir, output_dir):
"""
Process all JSON files in the input directory, generate indices for the specified keys, and save them.
"""
# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Process each JSON file in the input directory
for file_path in glob.glob(f"{input_dir}/*.json"):
model_id = Path(file_path).stem

with open(file_path) as fhandle:
try:
read_data = json.load(fhandle)
except json.JSONDecodeError:
die_screaming(f"ERROR decoding JSON in file: {file_path}")

if not read_data:
die_screaming(f"ERROR: No data in file: {file_path}")

# Create indices for the current file
indices = create_indices(read_data, model_id, keys_to_index)

# Save each index as a JSON file
for key, index in indices.items():
output_file = os.path.join(output_dir, f"{key}_index.json")
# Append to the existing file if it exists
if os.path.exists(output_file):
with open(output_file, "r") as f:
existing_data = json.load(f)
for k, v in index.items():
if k in existing_data:
existing_data[k].extend(v)
existing_data[k] = list(set(existing_data[k])) # Ensure unique entries
else:
existing_data[k] = v
index = existing_data

with open(output_file, "w") as f:
json.dump(index, f, indent=4)

click.echo(f"Saved {key} index to {output_file}")

def main(keys_to_index, output_dir):
process_json_files(keys_to_index, output_dir)

# Ensure the script runs only when executed directly
if __name__ == "__main__":
main()
5 changes: 4 additions & 1 deletion scripts/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,7 @@ docutils = "0.21.2"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
build-backend = "poetry.core.masonry.api"

[scripts]
gen_model_meta = "gen_model_meta:main"

0 comments on commit 2edab99

Please sign in to comment.