Skip to content

Commit

Permalink
IDS-328: Submit ABI data to MyTardis (#356)
Browse files Browse the repository at this point in the history
* Initial implementation of submitting metadata to MyTardis for ABI data

* Use diskcache for temporary md5 caching

* Isolate the temporary caching logic so it can be removed more easily

* Fix inconsistent prefixing of dataset names, and add some logging

* Initial implementation of command-line interface for ABI ingestion

* Add optional sorting of file/directory entries when iterating over them

* Fix an outdated docstring

* Delete some temporary code (caching and comments)

* Remove a dependency which was used temporarily during development

* Move the Timer class to a separate module

* Fix tests that were broken by the addition of the sorting option to DirectoryNode

* Move the command-line ingestion runner code to a new script

* Replace print statements with logger calls
  • Loading branch information
andrew-uoa authored Dec 19, 2023
1 parent 80e0b65 commit f0bf669
Show file tree
Hide file tree
Showing 8 changed files with 419 additions and 338 deletions.
576 changes: 299 additions & 277 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ python-slugify = "^8.0.1"
dependency-injector = "^4.41.0"
pydantic-settings = "^2.0.3"
validators = "^0.22.0"
typer = "^0.9.0"

[tool.poetry.group.dev.dependencies]
wily = "^1.25.0"
Expand Down
4 changes: 0 additions & 4 deletions src/forges/forge.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,10 +323,6 @@ def forge_datafile(
Args:
object_dict: The object dictionary containing the metadata to create
the datafile in MyTardis
Returns:
a tuple containing the URI of the forged project and boolean flags indicating the
status of the object creation.
"""
# No URI is yielded when forging a datafile
_ = self.forge_object(refined_object)
62 changes: 62 additions & 0 deletions src/profiles/abi_music/ingest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""
Ingestion runner for the ABI MuSIC data
"""

import io
import logging
from pathlib import Path

import typer

from src.config.config import ConfigFromEnv
from src.ingestion_factory.factory import IngestionFactory
from src.profiles.abi_music import parsing
from src.utils import log_utils
from src.utils.filesystem.filesystem_nodes import DirectoryNode
from src.utils.timing import Timer


def main(data_root: Path, log_file: Path = Path("abi_ingestion.log")) -> None:
    """
    Run an ingestion for the ABI MuSIC data.

    Parses the data directory into the PEDD hierarchy, then submits the
    resulting dataclasses to MyTardis.

    Args:
        data_root: Root directory containing the ABI MuSIC data to ingest.
        log_file: Destination file for ingestion logs.

    Raises:
        ValueError: If ``data_root`` is empty (e.g. the volume is not mounted).
    """
    log_utils.init_logging(file_name=str(log_file), level=logging.DEBUG)
    config = ConfigFromEnv()
    timer = Timer(start=True)

    root_dir = DirectoryNode(data_root)
    if root_dir.empty():
        raise ValueError("Data root directory is empty. May not be mounted.")

    # Renamed from "dataclasses" to avoid shadowing the stdlib module name.
    ingestible = parsing.parse_data(root_dir)

    logging.info("Number of datafiles: %d", len(ingestible.get_datafiles()))

    # Dump a summary of the parsed hierarchy into the log.
    # NOTE(review): consider whether logging the full hierarchy at INFO level
    # is still useful once ingestion is routine.
    stream = io.StringIO()
    ingestible.print(stream)
    logging.info(stream.getvalue())

    elapsed = timer.stop()
    logging.info("Finished parsing data directory into PEDD hierarchy")
    logging.info("Total time (s): %.2f", elapsed)

    logging.info("Submitting to MyTardis")
    timer.start()

    ingestion_agent = IngestionFactory(config=config)

    ingestion_agent.ingest(
        ingestible.get_projects(),
        ingestible.get_experiments(),
        ingestible.get_datasets(),
        ingestible.get_datafiles(),
    )

    elapsed = timer.stop()
    logging.info("Finished submitting dataclasses to MyTardis")
    logging.info("Total time (s): %.2f", elapsed)


if __name__ == "__main__":
    # Typer builds the CLI (positional data_root, optional --log-file)
    # from main's signature and type annotations.
    typer.run(main)
48 changes: 7 additions & 41 deletions src/profiles/abi_music/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,10 @@
Parsing logic for generating PEDD dataclasses from ABI Music files
"""

import io
import json
import logging
import mimetypes
import re
import time
from datetime import datetime
from pathlib import Path
from typing import Any, Mapping
Expand All @@ -27,7 +25,6 @@
ABI_MUSIC_POSTPROCESSING_INSTRUMENT,
DEFAULT_INSTITUTION,
)
from src.utils import log_utils
from src.utils.filesystem import checksums, filters
from src.utils.filesystem.filesystem_nodes import DirectoryNode, FileNode

Expand Down Expand Up @@ -149,7 +146,7 @@ def parse_raw_dataset(directory: DirectoryNode) -> tuple[RawDataset, str]:
"sqrt-offset": json_data["Offsets"]["SQRT Offset"],
}

main_id = json_data["Basename"]["Sequence"]
main_id = "raw-" + json_data["Basename"]["Sequence"]

dataset = RawDataset(
description="Raw:" + json_data["Description"],
Expand All @@ -160,7 +157,7 @@ def parse_raw_dataset(directory: DirectoryNode) -> tuple[RawDataset, str]:
immutable=False,
identifiers=[
main_id,
str(json_data["SequenceID"]),
"raw-" + str(json_data["SequenceID"]),
],
experiments=[
json_data["Basename"]["Sample"],
Expand Down Expand Up @@ -196,7 +193,7 @@ def parse_zarr_dataset(directory: DirectoryNode) -> tuple[RawDataset, str]:
"sqrt-offset": json_data["config"]["Offsets"]["SQRT Offset"],
}

main_id = json_data["config"]["Basename"]["Sequence"]
main_id = "zarr-" + json_data["config"]["Basename"]["Sequence"]

dataset = RawDataset(
description="Zarr:" + json_data["config"]["Description"],
Expand All @@ -207,7 +204,7 @@ def parse_zarr_dataset(directory: DirectoryNode) -> tuple[RawDataset, str]:
immutable=False,
identifiers=[
main_id,
str(json_data["config"]["SequenceID"]),
"zarr-" + str(json_data["config"]["SequenceID"]),
],
experiments=[
json_data["config"]["Basename"]["Sample"],
Expand Down Expand Up @@ -262,7 +259,7 @@ def parse_raw_data(
]

for project_dir in project_dirs:
print(f"Project directory: {project_dir.name()}")
logging.info("Project directory: %s", project_dir.name())

pedd_builder.add_project(parse_project_info(project_dir))

Expand All @@ -273,7 +270,7 @@ def parse_raw_data(
]

for experiment_dir in experiment_dirs:
print(f"Experiment directory: {experiment_dir.name()}")
logging.info("Experiment directory: %s", experiment_dir.name())

pedd_builder.add_experiment(parse_experiment_info(experiment_dir))

Expand All @@ -284,7 +281,7 @@ def parse_raw_data(
]

for dataset_dir in dataset_dirs:
print(f"Dataset directory: {dataset_dir.name()}")
logging.info("Dataset directory: %s", dataset_dir.name())

dataset, dataset_id = parse_raw_dataset(dataset_dir)

Expand Down Expand Up @@ -367,8 +364,6 @@ def parse_data(root: DirectoryNode) -> IngestibleDataclasses:
Parse/validate the data directory to extract the files to be ingested
"""

assert not root.empty(), "Data root directory is empty. May not be mounted."

raw_dir = root.dir("Vault").dir("Raw")
zarr_dir = root.dir("Zarr")

Expand All @@ -386,32 +381,3 @@ def parse_data(root: DirectoryNode) -> IngestibleDataclasses:
datasets=dc_raw.get_datasets() + dc_zarr.get_datasets(),
datafiles=dc_raw.get_datafiles() + dc_zarr.get_datafiles(),
)


def main() -> None:
"""
main function - this is just for testing - a proper ingestion runner is yet to be written.
"""
log_utils.init_logging(file_name="abi_ingest.log", level=logging.DEBUG)

# This path will come from command-line args or a config file
data_root = Path("/mnt/abi_test_data")

root_node = DirectoryNode(data_root)

start = time.perf_counter(), time.process_time()

dataclasses = parse_data(root_node)

# For now just log the dataclass contents. In a future PR we will submit
# them to MyTardis.
stream = io.StringIO()
dataclasses.print(stream)
logging.info(stream.getvalue())

end = time.perf_counter(), time.process_time()
print(f"Total time: {end[0] - start[0]}\nCPU Time: {end[1] - start[1]}")


if __name__ == "__main__":
main()
8 changes: 8 additions & 0 deletions src/utils/filesystem/filesystem_nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,13 @@ def __init__(
path: Path,
parent: DirectoryNode | None = None,
check_exists: bool = True,
sort_entries: bool = True,
):
self._path = path
self._parent = parent
self._dirs: list[DirectoryNode] | None = None
self._files: list[FileNode] | None = None
self._sort_entries = sort_entries

if check_exists and not path.is_dir():
raise NotADirectoryError(f"'{path}' is not a valid directory")
Expand Down Expand Up @@ -173,12 +175,18 @@ def files(self) -> list[FileNode]:
"""Get a list of all the files in this directory"""
if self._files is None:
self._files, self._dirs = collect_children(self)
if self._sort_entries:
self._files.sort(key=lambda fn: fn.name())
self._dirs.sort(key=lambda dn: dn.path())
return self._files

def directories(self) -> list[DirectoryNode]:
"""Get a list of all the directories in this directory"""
if self._dirs is None:
self._files, self._dirs = collect_children(self)
if self._sort_entries:
self._files.sort(key=lambda fn: fn.name())
self._dirs.sort(key=lambda dn: dn.path())
return self._dirs

def iter_files(self, recursive: bool = False) -> Iterator[FileNode]:
Expand Down
26 changes: 26 additions & 0 deletions src/utils/timing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Helpers related to time measurements
"""

import time


class Timer:
    """Minimal stopwatch measuring the wall-clock duration of an operation."""

    def __init__(self, start: bool = True) -> None:
        """Create the timer; begins timing immediately unless ``start`` is False."""
        self._start_time: float | None = None
        if start:
            self.start()

    def start(self) -> None:
        """Begin (or restart) timing from the current moment."""
        self._start_time = time.perf_counter()

    def stop(self) -> float:
        """Halt the timer and return the seconds elapsed since ``start()``.

        Raises:
            RuntimeError: If the timer is not currently running.
        """
        started_at = self._start_time
        if started_at is None:
            raise RuntimeError("Attempted to stop Timer which was never started.")

        self._start_time = None
        return time.perf_counter() - started_at
32 changes: 16 additions & 16 deletions tests/test_utils_filesystem_nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,12 +109,12 @@ def test_directory_node_query_files(_fake_filesystem: FakeFilesystem):
assert len(iter_files_recursive) == 8
assert iter_files_recursive[0].path() == Path("/test/a.txt")
assert iter_files_recursive[1].path() == Path("/test/b.jpg")
assert iter_files_recursive[2].path() == Path("/test/foo/b.txt")
assert iter_files_recursive[3].path() == Path("/test/foo/c.png")
assert iter_files_recursive[4].path() == Path("/test/foo/baz/f.mov")
assert iter_files_recursive[5].path() == Path("/test/foo/baz/g.json")
assert iter_files_recursive[6].path() == Path("/test/bar/d.pdf")
assert iter_files_recursive[7].path() == Path("/test/bar/e.py")
assert iter_files_recursive[2].path() == Path("/test/bar/d.pdf")
assert iter_files_recursive[3].path() == Path("/test/bar/e.py")
assert iter_files_recursive[4].path() == Path("/test/foo/b.txt")
assert iter_files_recursive[5].path() == Path("/test/foo/c.png")
assert iter_files_recursive[6].path() == Path("/test/foo/baz/f.mov")
assert iter_files_recursive[7].path() == Path("/test/foo/baz/g.json")

empty_dir = DirectoryNode(Path("/test/foo/empty"))
assert len(empty_dir.files()) == 0
Expand All @@ -134,18 +134,18 @@ def test_directory_node_query_directories(_fake_filesystem: FakeFilesystem):

dirs = test_dir.directories()
assert len(dirs) == 2
assert dirs[0].name() == "foo"
assert dirs[1].name() == "bar"
assert dirs[0].name() == "bar"
assert dirs[1].name() == "foo"

iter_dirs = list(test_dir.iter_dirs(recursive=False))
assert len(iter_dirs) == 2
assert iter_dirs[0].name() == "foo"
assert iter_dirs[1].name() == "bar"
assert iter_dirs[0].name() == "bar"
assert iter_dirs[1].name() == "foo"

iter_dirs_recursive = list(test_dir.iter_dirs(recursive=True))
assert len(iter_dirs_recursive) == 4
assert iter_dirs_recursive[0].path() == Path("/test/foo")
assert iter_dirs_recursive[1].path() == Path("/test/bar")
assert iter_dirs_recursive[0].path() == Path("/test/bar")
assert iter_dirs_recursive[1].path() == Path("/test/foo")
assert iter_dirs_recursive[2].path() == Path("/test/foo/baz")
assert iter_dirs_recursive[3].path() == Path("/test/foo/empty")

Expand Down Expand Up @@ -174,28 +174,28 @@ def stash_path(file_node: FileNode | DirectoryNode) -> None:
assert arg_paths == [
Path("/test/a.txt"),
Path("/test/b.jpg"),
Path("/test/bar/d.pdf"),
Path("/test/bar/e.py"),
Path("/test/foo/b.txt"),
Path("/test/foo/c.png"),
Path("/test/foo/baz/f.mov"),
Path("/test/foo/baz/g.json"),
Path("/test/bar/d.pdf"),
Path("/test/bar/e.py"),
]

arg_paths.clear()

test_dir.visit_directories(stash_path, recursive=False)
assert arg_paths == [
Path("/test/foo"),
Path("/test/bar"),
Path("/test/foo"),
]

arg_paths.clear()

test_dir.visit_directories(stash_path, recursive=True)
assert arg_paths == [
Path("/test/foo"),
Path("/test/bar"),
Path("/test/foo"),
Path("/test/foo/baz"),
Path("/test/foo/empty"),
]
Expand Down

0 comments on commit f0bf669

Please sign in to comment.