SWIFTSIM · bwvdnbro · Feb 4, 2022 · Feb 9, 2022 · Feb 17, 2022 · Mar 11, 2022
diff --git a/velociraptor/catalogue/catalogue.py b/velociraptor/catalogue/catalogue.py
@@ -15,6 +15,7 @@
 from velociraptor.catalogue.derived import DerivedQuantities
 from velociraptor.catalogue.registration import global_registration_functions
 from velociraptor.exceptions import RegistrationDoesNotMatchError
+from velociraptor.catalogue.reader import VelociraptorCatalogueReader
 
 
 class VelociraptorFieldMetadata(object):
@@ -88,7 +89,7 @@ def register_field_properties(self):
         return
 
 
-def generate_getter(filename, name: str, field: str, full_name: str, unit):
+def generate_getter(reader, name: str, field: str, full_name: str, unit):
     """
     Generates a function that:
 
@@ -113,14 +114,9 @@ def getter(self):
         if current_value is not None:
             return current_value
         else:
-            with h5py.File(filename, "r") as handle:
-                try:
-                    setattr(self, f"_{name}", unyt.unyt_array(handle[field][...], unit))
-                    getattr(self, f"_{name}").name = full_name
-                    getattr(self, f"_{name}").file = filename
-                except KeyError:
-                    print(f"Could not read {field}")
-                    return None
+            setattr(self, f"_{name}", unyt.unyt_array(reader.read_field(field), unit))
+            getattr(self, f"_{name}").name = full_name
+            getattr(self, f"_{name}").file = reader.filename
 
         return getattr(self, f"_{name}")
 
@@ -156,7 +152,7 @@ def deleter(self):
 
 
 def generate_sub_catalogue(
-    filename,
+    reader,
     registration_name: str,
     registration_function: Callable,
     units: VelociraptorUnits,
@@ -173,10 +169,7 @@ def generate_sub_catalogue(
     """
 
     # This creates a _copy_ of the _class_, not object.
-    this_sub_catalogue_bases = (
-        __VelociraptorSubCatalogue,
-        object,
-    )
+    this_sub_catalogue_bases = (__VelociraptorSubCatalogue, object)
     this_sub_catalogue_dict = {}
 
     valid_sub_paths = []
@@ -186,11 +179,7 @@ def generate_sub_catalogue(
 
         this_sub_catalogue_dict[metadata.snake_case] = property(
             generate_getter(
-                filename,
-                metadata.snake_case,
-                metadata.path,
-                metadata.name,
-                metadata.unit,
+                reader, metadata.snake_case, metadata.path, metadata.name, metadata.unit
             ),
             generate_setter(metadata.snake_case),
             generate_deleter(metadata.snake_case),
@@ -205,7 +194,7 @@ def generate_sub_catalogue(
     )
 
     # Finally, we can actually create an instance of our new class.
-    catalogue = ThisSubCatalogue(filename=filename)
+    catalogue = ThisSubCatalogue(filename=reader.filename)
     catalogue.valid_sub_paths = valid_sub_paths
 
     return catalogue
@@ -375,6 +364,8 @@ def __create_sub_catalogues(self):
             else:
                 self.invalid_field_paths.append(path)
 
+        reader = VelociraptorCatalogueReader(self.filename)
+
         # For each registration function, we create a dynamic sub-class that
         # contains only that information - otherwise the namespace of the
         # VelociraptorCatalogue is way too crowded.
@@ -383,7 +374,7 @@ def __create_sub_catalogues(self):
                 self,
                 attribute_name,
                 generate_sub_catalogue(
-                    filename=self.filename,
+                    reader=reader,
                     registration_name=attribute_name,  # This ensures each class has a unique name
                     registration_function=self.registration_functions[attribute_name],
                     units=self.units,

diff --git a/velociraptor/catalogue/reader.py b/velociraptor/catalogue/reader.py
@@ -0,0 +1,103 @@
+"""
+Main objects for the velociraptor reading library.
+
+This is based upon the reading routines in the SWIFTsimIO library.
+"""
+
+import h5py
+import re
+import numpy as np
+
+from typing import List
+
+
+class VelociraptorCatalogueReader(object):
+    """
+    VELOCIraptor catalogue reader. Pass it the name of a catalogue file and it
+    will detect whether this catalogue is self-contained or part of a larger
+    split catalogue consisting of multiple files.
+
+    When a split catalogue is used, any of the catalogue.properties.X files can
+    be passed on to the constructor, where X is a counter ranging from 0 to
+    properties_file["Num_of_files"]-1. When a dataset is extracted from such a
+    catalogue, the elements in the resulting dataset will be ordered in blocks
+    of increasing X.
+
+    For split catalogues, this class's read_field() method handles reading the
+    distributed datasets. For unsplit catalogues, it behaves exactly the same
+    as a direct read from the HDF5 file.
+    """
+
+    # List of files that make up the catalogue
+    filenames: List[str]
+
+    def __init__(self, filename: str):
+        """
+        I take in:
+
+        + filename of (one of) the velociraptor properties file(s)
+        """
+        with h5py.File(filename, "r") as handle:
+            num_files = handle["Num_of_files"][0]
+        if num_files == 1:
+            self.filenames = [filename]
+        else:
+            # compose the other file names
+            # we cast to str() because filename could be a pathlib.Path
+            basename = re.match("(\S+properties)\.\d+\Z", str(filename)).groups()[0]
+            self.filenames = [f"{basename}.{idx}" for idx in range(num_files)]
+
+    @property
+    def filename(self):
+        """
+        Returns the velociraptor properties file name or the first file name
+        if the catalogue is split
+        """
+        return self.filenames[0]
+
+    def read_field(self, field: str):
+        """
+        Read the given field from the catalogue file(s)
+        """
+        if len(self.filenames) == 1:
+            with h5py.File(self.filenames[0], "r") as handle:
+                try:
+                    value = handle[field][...]
+                except KeyError:
+                    print(f"Could not read {field}")
+                    return None
+            return value
+        else:
+            # figure out the shape and dtype of the return value, so that we can
+            # create the appropriate array
+            dtype = None
+            shape = None
+            for filename in self.filenames:
+                with h5py.File(filename, "r") as handle:
+                    try:
+                        ds = handle[field]
+                    except KeyError:
+                        print(f"Could not read {field}")
+                        return None
+                    if dtype is None:
+                        dtype = ds.dtype
+                        shape = ds.shape
+                    else:
+                        # tuples are immutable, so instead of
+                        # shape[0]+= ds.shape[0], we have to unpack, sum and
+                        # then pack again
+                        shape0, *shaperest = shape
+                        shape0 += ds.shape[0]
+                        shape = (shape0, *shaperest)
+
+            # create an empty array to store the return value
+            value = np.zeros(shape, dtype=dtype)
+            # now read the data (no need to check for existence again, this was
+            # done when getting the shape and type)
+            offset = 0
+            for filename in self.filenames:
+                with h5py.File(filename, "r") as handle:
+                    size = handle[field].shape[0]
+                    value[offset : offset + size] = handle[field][...]
+                    offset += size
+            return value