Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

IDS-328: Submit ABI data to MyTardis #356

Merged
merged 13 commits into from
Dec 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
576 changes: 299 additions & 277 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ python-slugify = "^8.0.1"
dependency-injector = "^4.41.0"
pydantic-settings = "^2.0.3"
validators = "^0.22.0"
typer = "^0.9.0"

[tool.poetry.group.dev.dependencies]
wily = "^1.25.0"
Expand Down
4 changes: 0 additions & 4 deletions src/forges/forge.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,10 +323,6 @@ def forge_datafile(
Args:
object_dict: The object dictionary containing the metadata to create
the datafile in MyTardis

Returns:
a tuple containing the URI of the forged project and boolean flags indicating the
status of the object creation.
"""
# No URI is yielded when forging a datafile
_ = self.forge_object(refined_object)
62 changes: 62 additions & 0 deletions src/profiles/abi_music/ingest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""
Ingestion runner for the ABI MuSIC data
"""

import io
import logging
from pathlib import Path

import typer

from src.config.config import ConfigFromEnv
from src.ingestion_factory.factory import IngestionFactory
from src.profiles.abi_music import parsing
from src.utils import log_utils
from src.utils.filesystem.filesystem_nodes import DirectoryNode
from src.utils.timing import Timer


def main(data_root: Path, log_file: Path = Path("abi_ingestion.log")) -> None:
    """Parse the ABI MuSIC data under ``data_root`` and submit it to MyTardis.

    Args:
        data_root: Root directory of the mounted ABI MuSIC data.
        log_file: Destination for the ingestion log (defaults to
            ``abi_ingestion.log`` in the working directory).

    Raises:
        ValueError: if ``data_root`` contains no entries (likely unmounted).
    """
    log_utils.init_logging(file_name=str(log_file), level=logging.DEBUG)
    settings = ConfigFromEnv()
    stopwatch = Timer(start=True)

    data_dir = DirectoryNode(data_root)
    if data_dir.empty():
        raise ValueError("Data root directory is empty. May not be mounted.")

    ingestible = parsing.parse_data(data_dir)

    logging.info("Number of datafiles: %d", len(ingestible.get_datafiles()))

    # TODO(review): revisit whether dumping the full hierarchy to the log
    # is still useful once ingestion is routine.
    buffer = io.StringIO()
    ingestible.print(buffer)
    logging.info(buffer.getvalue())

    seconds = stopwatch.stop()
    logging.info("Finished parsing data directory into PEDD hierarchy")
    logging.info("Total time (s): %.2f", seconds)

    logging.info("Submitting to MyTardis")
    stopwatch.start()

    # The factory wires up the MyTardis API client from config and performs
    # the actual submission of each dataclass tier.
    IngestionFactory(config=settings).ingest(
        ingestible.get_projects(),
        ingestible.get_experiments(),
        ingestible.get_datasets(),
        ingestible.get_datafiles(),
    )

    seconds = stopwatch.stop()
    logging.info("Finished submitting dataclasses to MyTardis")
    logging.info("Total time (s): %.2f", seconds)


if __name__ == "__main__":
    typer.run(main)
48 changes: 7 additions & 41 deletions src/profiles/abi_music/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,10 @@
Parsing logic for generating PEDD dataclasses from ABI Music files
"""

import io
import json
import logging
import mimetypes
import re
import time
from datetime import datetime
from pathlib import Path
from typing import Any, Mapping
Expand All @@ -27,7 +25,6 @@
ABI_MUSIC_POSTPROCESSING_INSTRUMENT,
DEFAULT_INSTITUTION,
)
from src.utils import log_utils
from src.utils.filesystem import checksums, filters
from src.utils.filesystem.filesystem_nodes import DirectoryNode, FileNode

Expand Down Expand Up @@ -149,7 +146,7 @@ def parse_raw_dataset(directory: DirectoryNode) -> tuple[RawDataset, str]:
"sqrt-offset": json_data["Offsets"]["SQRT Offset"],
}

main_id = json_data["Basename"]["Sequence"]
main_id = "raw-" + json_data["Basename"]["Sequence"]

dataset = RawDataset(
description="Raw:" + json_data["Description"],
Expand All @@ -160,7 +157,7 @@ def parse_raw_dataset(directory: DirectoryNode) -> tuple[RawDataset, str]:
immutable=False,
identifiers=[
main_id,
str(json_data["SequenceID"]),
"raw-" + str(json_data["SequenceID"]),
],
experiments=[
json_data["Basename"]["Sample"],
Expand Down Expand Up @@ -196,7 +193,7 @@ def parse_zarr_dataset(directory: DirectoryNode) -> tuple[RawDataset, str]:
"sqrt-offset": json_data["config"]["Offsets"]["SQRT Offset"],
}

main_id = json_data["config"]["Basename"]["Sequence"]
main_id = "zarr-" + json_data["config"]["Basename"]["Sequence"]

dataset = RawDataset(
description="Zarr:" + json_data["config"]["Description"],
Expand All @@ -207,7 +204,7 @@ def parse_zarr_dataset(directory: DirectoryNode) -> tuple[RawDataset, str]:
immutable=False,
identifiers=[
main_id,
str(json_data["config"]["SequenceID"]),
"zarr-" + str(json_data["config"]["SequenceID"]),
],
experiments=[
json_data["config"]["Basename"]["Sample"],
Expand Down Expand Up @@ -262,7 +259,7 @@ def parse_raw_data(
]

for project_dir in project_dirs:
print(f"Project directory: {project_dir.name()}")
logging.info("Project directory: %s", project_dir.name())

pedd_builder.add_project(parse_project_info(project_dir))

Expand All @@ -273,7 +270,7 @@ def parse_raw_data(
]

for experiment_dir in experiment_dirs:
print(f"Experiment directory: {experiment_dir.name()}")
logging.info("Experiment directory: %s", experiment_dir.name())

pedd_builder.add_experiment(parse_experiment_info(experiment_dir))

Expand All @@ -284,7 +281,7 @@ def parse_raw_data(
]

for dataset_dir in dataset_dirs:
print(f"Dataset directory: {dataset_dir.name()}")
logging.info("Dataset directory: %s", dataset_dir.name())

dataset, dataset_id = parse_raw_dataset(dataset_dir)

Expand Down Expand Up @@ -367,8 +364,6 @@ def parse_data(root: DirectoryNode) -> IngestibleDataclasses:
Parse/validate the data directory to extract the files to be ingested
"""

assert not root.empty(), "Data root directory is empty. May not be mounted."

raw_dir = root.dir("Vault").dir("Raw")
zarr_dir = root.dir("Zarr")

Expand All @@ -386,32 +381,3 @@ def parse_data(root: DirectoryNode) -> IngestibleDataclasses:
datasets=dc_raw.get_datasets() + dc_zarr.get_datasets(),
datafiles=dc_raw.get_datafiles() + dc_zarr.get_datafiles(),
)


def main() -> None:
    """Ad-hoc test driver: parse the test data mount and log the result.

    A proper ingestion runner is yet to be written; this only exercises
    :func:`parse_data` and reports timing to stdout.
    """
    log_utils.init_logging(file_name="abi_ingest.log", level=logging.DEBUG)

    # Hard-coded test mount; will come from CLI args or config later.
    root_node = DirectoryNode(Path("/mnt/abi_test_data"))

    wall_start = time.perf_counter()
    cpu_start = time.process_time()

    dataclasses = parse_data(root_node)

    # For now just log the dataclass contents. In a future PR we will submit
    # them to MyTardis.
    buffer = io.StringIO()
    dataclasses.print(buffer)
    logging.info(buffer.getvalue())

    wall_end = time.perf_counter()
    cpu_end = time.process_time()
    print(f"Total time: {wall_end - wall_start}\nCPU Time: {cpu_end - cpu_start}")


if __name__ == "__main__":
    main()
8 changes: 8 additions & 0 deletions src/utils/filesystem/filesystem_nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,13 @@ def __init__(
path: Path,
parent: DirectoryNode | None = None,
check_exists: bool = True,
sort_entries: bool = True,
):
self._path = path
self._parent = parent
self._dirs: list[DirectoryNode] | None = None
self._files: list[FileNode] | None = None
self._sort_entries = sort_entries

if check_exists and not path.is_dir():
raise NotADirectoryError(f"'{path}' is not a valid directory")
Expand Down Expand Up @@ -173,12 +175,18 @@ def files(self) -> list[FileNode]:
"""Get a list of all the files in this directory"""
if self._files is None:
self._files, self._dirs = collect_children(self)
if self._sort_entries:
self._files.sort(key=lambda fn: fn.name())
self._dirs.sort(key=lambda dn: dn.path())
return self._files

def directories(self) -> list[DirectoryNode]:
    """Get a list of all the directories in this directory.

    Children are scanned lazily: the first call to either ``files()`` or
    ``directories()`` populates both caches via ``collect_children``.
    When ``sort_entries`` was enabled at construction, files are ordered
    by name and directories by full path so traversal order is
    deterministic across filesystems.
    """
    if self._dirs is None:
        # One scan fills both caches; sort for stable, reproducible order.
        self._files, self._dirs = collect_children(self)
        if self._sort_entries:
            self._files.sort(key=lambda fn: fn.name())
            self._dirs.sort(key=lambda dn: dn.path())
    return self._dirs

def iter_files(self, recursive: bool = False) -> Iterator[FileNode]:
Expand Down
26 changes: 26 additions & 0 deletions src/utils/timing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Helpers related to time measurements
"""

import time


class Timer:
    """Minimal stopwatch for measuring the elapsed wall time of an operation.

    Uses ``time.perf_counter`` so the measurement is monotonic and
    high-resolution. A timer may be restarted after ``stop()``.
    """

    def __init__(self, start: bool = True) -> None:
        """Create the timer, optionally starting it immediately.

        Args:
            start: when True (the default), begin timing right away.
        """
        # None means "not currently running".
        self._started_at: float | None = None
        if start:
            self.start()

    def start(self) -> None:
        """Begin (or restart) timing from now."""
        self._started_at = time.perf_counter()

    def stop(self) -> float:
        """Stop the timer and return the elapsed time in seconds.

        Raises:
            RuntimeError: if the timer is not currently running.
        """
        started = self._started_at
        if started is None:
            raise RuntimeError("Attempted to stop Timer which was never started.")

        self._started_at = None
        return time.perf_counter() - started
32 changes: 16 additions & 16 deletions tests/test_utils_filesystem_nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,12 +109,12 @@ def test_directory_node_query_files(_fake_filesystem: FakeFilesystem):
assert len(iter_files_recursive) == 8
assert iter_files_recursive[0].path() == Path("/test/a.txt")
assert iter_files_recursive[1].path() == Path("/test/b.jpg")
assert iter_files_recursive[2].path() == Path("/test/foo/b.txt")
assert iter_files_recursive[3].path() == Path("/test/foo/c.png")
assert iter_files_recursive[4].path() == Path("/test/foo/baz/f.mov")
assert iter_files_recursive[5].path() == Path("/test/foo/baz/g.json")
assert iter_files_recursive[6].path() == Path("/test/bar/d.pdf")
assert iter_files_recursive[7].path() == Path("/test/bar/e.py")
assert iter_files_recursive[2].path() == Path("/test/bar/d.pdf")
assert iter_files_recursive[3].path() == Path("/test/bar/e.py")
assert iter_files_recursive[4].path() == Path("/test/foo/b.txt")
assert iter_files_recursive[5].path() == Path("/test/foo/c.png")
assert iter_files_recursive[6].path() == Path("/test/foo/baz/f.mov")
assert iter_files_recursive[7].path() == Path("/test/foo/baz/g.json")

empty_dir = DirectoryNode(Path("/test/foo/empty"))
assert len(empty_dir.files()) == 0
Expand All @@ -134,18 +134,18 @@ def test_directory_node_query_directories(_fake_filesystem: FakeFilesystem):

dirs = test_dir.directories()
assert len(dirs) == 2
assert dirs[0].name() == "foo"
assert dirs[1].name() == "bar"
assert dirs[0].name() == "bar"
assert dirs[1].name() == "foo"

iter_dirs = list(test_dir.iter_dirs(recursive=False))
assert len(iter_dirs) == 2
assert iter_dirs[0].name() == "foo"
assert iter_dirs[1].name() == "bar"
assert iter_dirs[0].name() == "bar"
assert iter_dirs[1].name() == "foo"

iter_dirs_recursive = list(test_dir.iter_dirs(recursive=True))
assert len(iter_dirs_recursive) == 4
assert iter_dirs_recursive[0].path() == Path("/test/foo")
assert iter_dirs_recursive[1].path() == Path("/test/bar")
assert iter_dirs_recursive[0].path() == Path("/test/bar")
assert iter_dirs_recursive[1].path() == Path("/test/foo")
assert iter_dirs_recursive[2].path() == Path("/test/foo/baz")
assert iter_dirs_recursive[3].path() == Path("/test/foo/empty")

Expand Down Expand Up @@ -174,28 +174,28 @@ def stash_path(file_node: FileNode | DirectoryNode) -> None:
assert arg_paths == [
Path("/test/a.txt"),
Path("/test/b.jpg"),
Path("/test/bar/d.pdf"),
Path("/test/bar/e.py"),
Path("/test/foo/b.txt"),
Path("/test/foo/c.png"),
Path("/test/foo/baz/f.mov"),
Path("/test/foo/baz/g.json"),
Path("/test/bar/d.pdf"),
Path("/test/bar/e.py"),
]

arg_paths.clear()

test_dir.visit_directories(stash_path, recursive=False)
assert arg_paths == [
Path("/test/foo"),
Path("/test/bar"),
Path("/test/foo"),
]

arg_paths.clear()

test_dir.visit_directories(stash_path, recursive=True)
assert arg_paths == [
Path("/test/foo"),
Path("/test/bar"),
Path("/test/foo"),
Path("/test/foo/baz"),
Path("/test/foo/empty"),
]
Expand Down
Loading