Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft: Split catalogue reader #76

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 12 additions & 21 deletions velociraptor/catalogue/catalogue.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from velociraptor.catalogue.derived import DerivedQuantities
from velociraptor.catalogue.registration import global_registration_functions
from velociraptor.exceptions import RegistrationDoesNotMatchError
from velociraptor.catalogue.reader import VelociraptorCatalogueReader


class VelociraptorFieldMetadata(object):
Expand Down Expand Up @@ -88,7 +89,7 @@ def register_field_properties(self):
return


def generate_getter(filename, name: str, field: str, full_name: str, unit):
def generate_getter(reader, name: str, field: str, full_name: str, unit):
"""
Generates a function that:

Expand All @@ -113,14 +114,9 @@ def getter(self):
if current_value is not None:
return current_value
else:
with h5py.File(filename, "r") as handle:
try:
setattr(self, f"_{name}", unyt.unyt_array(handle[field][...], unit))
getattr(self, f"_{name}").name = full_name
getattr(self, f"_{name}").file = filename
except KeyError:
print(f"Could not read {field}")
return None
setattr(self, f"_{name}", unyt.unyt_array(reader.read_field(field), unit))
getattr(self, f"_{name}").name = full_name
getattr(self, f"_{name}").file = reader.filename

return getattr(self, f"_{name}")

Expand Down Expand Up @@ -156,7 +152,7 @@ def deleter(self):


def generate_sub_catalogue(
filename,
reader,
registration_name: str,
registration_function: Callable,
units: VelociraptorUnits,
Expand All @@ -173,10 +169,7 @@ def generate_sub_catalogue(
"""

# This creates a _copy_ of the _class_, not object.
this_sub_catalogue_bases = (
__VelociraptorSubCatalogue,
object,
)
this_sub_catalogue_bases = (__VelociraptorSubCatalogue, object)
this_sub_catalogue_dict = {}

valid_sub_paths = []
Expand All @@ -186,11 +179,7 @@ def generate_sub_catalogue(

this_sub_catalogue_dict[metadata.snake_case] = property(
generate_getter(
filename,
metadata.snake_case,
metadata.path,
metadata.name,
metadata.unit,
reader, metadata.snake_case, metadata.path, metadata.name, metadata.unit
),
generate_setter(metadata.snake_case),
generate_deleter(metadata.snake_case),
Expand All @@ -205,7 +194,7 @@ def generate_sub_catalogue(
)

# Finally, we can actually create an instance of our new class.
catalogue = ThisSubCatalogue(filename=filename)
catalogue = ThisSubCatalogue(filename=reader.filename)
catalogue.valid_sub_paths = valid_sub_paths

return catalogue
Expand Down Expand Up @@ -375,6 +364,8 @@ def __create_sub_catalogues(self):
else:
self.invalid_field_paths.append(path)

reader = VelociraptorCatalogueReader(self.filename)

# For each registration function, we create a dynamic sub-class that
# contains only that information - otherwise the namespace of the
# VelociraptorCatalogue is way too crowded.
Expand All @@ -383,7 +374,7 @@ def __create_sub_catalogues(self):
self,
attribute_name,
generate_sub_catalogue(
filename=self.filename,
reader=reader,
registration_name=attribute_name, # This ensures each class has a unique name
registration_function=self.registration_functions[attribute_name],
units=self.units,
Expand Down
103 changes: 103 additions & 0 deletions velociraptor/catalogue/reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
"""
Main objects for the velociraptor reading library.

This is based upon the reading routines in the SWIFTsimIO library.
"""

import h5py
import re
import numpy as np

from typing import List


class VelociraptorCatalogueReader(object):
    """
    VELOCIraptor catalogue reader. Pass it the name of a catalogue file and it
    will detect whether this catalogue is self-contained or part of a larger
    split catalogue consisting of multiple files.

    When a split catalogue is used, any of the catalogue.properties.X files can
    be passed on to the constructor, where X is a counter ranging from 0 to
    properties_file["Num_of_files"]-1. When a dataset is extracted from such a
    catalogue, the elements in the resulting dataset will be ordered in blocks
    of increasing X.

    For split catalogues, this class's read_field() method handles reading the
    distributed datasets. For unsplit catalogues, it behaves exactly the same
    as a direct read from the HDF5 file.
    """

    # List of files that make up the catalogue
    filenames: List[str]

    def __init__(self, filename: str):
        """
        I take in:

        + filename of (one of) the velociraptor properties file(s)

        Raises a ValueError if the file reports more than one part but its
        name does not look like ``<something>properties.<number>``.
        """
        with h5py.File(filename, "r") as handle:
            # Convert to a plain Python int so that the comparison and the
            # range() used to build the file names do not depend on the
            # scalar type stored in the file.
            num_files = int(handle["Num_of_files"][0])

        if num_files == 1:
            self.filenames = [filename]
        else:
            self.filenames = self._split_filenames(filename, num_files)

    @staticmethod
    def _split_filenames(filename, num_files: int) -> List[str]:
        """
        Compose the names of all the files of a split catalogue.

        + filename: name of any one of the parts, ending in
          ``properties.<number>`` (a str or anything str() can convert,
          e.g. a pathlib.Path)
        + num_files: total number of parts

        Returns the list ``<basename>.0`` ... ``<basename>.<num_files - 1>``.
        Raises a ValueError if filename does not follow the expected pattern.
        """
        # we cast to str() because filename could be a pathlib.Path
        # ".+" (rather than "\S+") so that paths containing spaces also work
        match = re.match(r"(.+properties)\.\d+\Z", str(filename))
        if match is None:
            raise ValueError(
                f"Cannot determine the other files of the split catalogue: "
                f"{filename} does not end in 'properties.<number>'"
            )
        basename = match.group(1)
        return [f"{basename}.{idx}" for idx in range(num_files)]

    @property
    def filename(self):
        """
        Returns the velociraptor properties file name or the first file name
        if the catalogue is split
        """
        return self.filenames[0]

    def read_field(self, field: str):
        """
        Read the given field from the catalogue file(s)

        For a split catalogue the per-file datasets are stacked along their
        first axis, in the order of self.filenames. If the field is missing
        from (any of) the file(s), a message is printed and None is returned.
        """
        if len(self.filenames) == 1:
            with h5py.File(self.filenames[0], "r") as handle:
                try:
                    return handle[field][...]
                except KeyError:
                    print(f"Could not read {field}")
                    return None

        # First pass: figure out the shape and dtype of the return value, so
        # that we can create the appropriate array. The dtype and trailing
        # dimensions are taken from the first file; only the first dimension
        # is accumulated over the files.
        dtype = None
        shape = None
        for filename in self.filenames:
            with h5py.File(filename, "r") as handle:
                try:
                    ds = handle[field]
                except KeyError:
                    print(f"Could not read {field}")
                    return None
                if dtype is None:
                    dtype = ds.dtype
                    shape = ds.shape
                else:
                    # tuples are immutable, so build a new one
                    shape = (shape[0] + ds.shape[0], *shape[1:])

        # Every element is overwritten below, so there is no need to
        # zero-initialise the array.
        value = np.empty(shape, dtype=dtype)

        # Second pass: read the data (no need to check for existence again,
        # this was done when getting the shape and type)
        offset = 0
        for filename in self.filenames:
            with h5py.File(filename, "r") as handle:
                ds = handle[field]
                size = ds.shape[0]
                value[offset : offset + size] = ds[...]
            offset += size
        return value