From b61fa0e21f8093d6c237e0b35c54cb1dba9bbd8c Mon Sep 17 00:00:00 2001
From: mattjala <124107509+mattjala@users.noreply.github.com>
Date: Tue, 26 Mar 2024 14:32:22 -0500
Subject: [PATCH] Field selection for simple compound types (#173)

* Field selection for simple compound types

* Add logging to test_datatype
---
 h5pyd/_hl/dataset.py     |  37 ++++-----
 h5pyd/_hl/h5type.py      |  13 ++--
 test/hl/test_datatype.py | 157 +++++++++++++++++++++++++++++++++++++++
 testall.py               |   3 +
 4 files changed, 181 insertions(+), 29 deletions(-)
 create mode 100644 test/hl/test_datatype.py

diff --git a/h5pyd/_hl/dataset.py b/h5pyd/_hl/dataset.py
index 7f8038ba..6aea38a9 100644
--- a/h5pyd/_hl/dataset.py
+++ b/h5pyd/_hl/dataset.py
@@ -1049,6 +1049,9 @@ def __getitem__(self, args, new_dtype=None):
         req = "/datasets/" + self.id.uuid + "/value"
 
         params = {}
+        if len(names) > 0:
+            params["fields"] = ":".join(names)
+
         if self.id._http_conn.mode == "r" and self.id._http_conn.cache_on:
             # enables lambda to be used on server
             self.log.debug("setting nonstrict parameter")
@@ -1483,41 +1486,23 @@ def __setitem__(self, args, val):
                 last N dimensions have to match (got %s, but should be %s)" % (valshp, shp,))
             mtype = h5t.py_create(numpy.dtype((val.dtype, shp)))
             mshape = val.shape[0:len(val.shape)-len(shp)]
+        """
 
-
-        # Make a compound memory type if field-name slicing is required
-        elif len(names) != 0:
-
-            mshape = val.shape
-
+        # Check for field selection
+        if len(names) != 0:
             # Catch common errors
             if self.dtype.fields is None:
                 raise TypeError("Illegal slicing argument (not a compound dataset)")
             mismatch = [x for x in names if x not in self.dtype.fields]
             if len(mismatch) != 0:
-                mismatch = ", ".join('"%s"'%x for x in mismatch)
+                mismatch = ", ".join('"%s"' % x for x in mismatch)
                 raise ValueError("Illegal slicing argument (fields %s not in dataset type)" % mismatch)
 
-            # Write non-compound source into a single dataset field
-            if len(names) == 1 and val.dtype.fields is None:
-                subtype = h5y.py_create(val.dtype)
-                mtype = h5t.create(h5t.COMPOUND, subtype.get_size())
-                mtype.insert(self._e(names[0]), 0, subtype)
-
-            # Make a new source type keeping only the requested fields
-            else:
-                fieldnames = [x for x in val.dtype.names if x in names]  # Keep source order
-                mtype = h5t.create(h5t.COMPOUND, val.dtype.itemsize)
-                for fieldname in fieldnames:
-                    subtype = h5t.py_create(val.dtype.fields[fieldname][0])
-                    offset = val.dtype.fields[fieldname][1]
-                    mtype.insert(self._e(fieldname), offset, subtype)
-
         # Use mtype derived from array (let DatasetID.write figure it out)
         else:
             mshape = val.shape
-            #mtype = None
-        """
+            # mtype = None
+        mshape = val.shape
 
         self.log.debug(f"mshape: {mshape}")
         self.log.debug(f"data dtype: {val.dtype}")
@@ -1582,6 +1567,10 @@
             self.log.debug(f"got select query param: {select_param}")
             params["select"] = select_param
 
+        # Perform write to subset of named fields within compound datatype, if any
+        if len(names) > 0:
+            params["fields"] = ":".join(names)
+
         self.PUT(req, body=body, format=format, params=params)
 
         """
         mspace = h5s.create_simple(mshape_pad, (h5s.UNLIMITED,)*len(mshape_pad))
diff --git a/h5pyd/_hl/h5type.py b/h5pyd/_hl/h5type.py
index 81b0ed42..fed3da71 100644
--- a/h5pyd/_hl/h5type.py
+++ b/h5pyd/_hl/h5type.py
@@ -441,10 +441,14 @@ def getTypeItem(dt):
             type_info['length'] = 'H5T_VARIABLE'
             type_info['charSet'] = 'H5T_CSET_UTF8'
             type_info['strPad'] = 'H5T_STR_NULLTERM'
-        elif vlen_check == int:
+        elif vlen_check in (int, np.int64):
             type_info['class'] = 'H5T_VLEN'
             type_info['size'] = 'H5T_VARIABLE'
             type_info['base'] = 'H5T_STD_I64'
+        elif vlen_check == np.int32:
+            type_info['class'] = 'H5T_VLEN'
+            type_info['size'] = 'H5T_VARIABLE'
+            type_info['base'] = 'H5T_STD_I32'
         elif vlen_check in (float, np.float64):
             type_info['class'] = 'H5T_VLEN'
             type_info['size'] = 'H5T_VARIABLE'
@@ -456,7 +460,7 @@
             type_info['base'] = getTypeItem(vlen_check)
         elif vlen_check is not None:
             # unknown vlen type
-            raise TypeError("Unknown h5py vlen type: " + str(vlen_check))
+            raise TypeError("Unknown h5pyd vlen type: " + str(vlen_check))
         elif ref_check is not None:
             # a reference type
             type_info['class'] = 'H5T_REFERENCE'
@@ -781,7 +785,7 @@ def createBaseDataType(typeItem):
            raise TypeError("ArrayType is not supported for variable len types")
         if 'base' not in typeItem:
             raise KeyError("'base' not provided")
-        baseType = createBaseDataType(typeItem['base'])
+        baseType = createDataType(typeItem['base'])
         dtRet = special_dtype(vlen=np.dtype(baseType))
     elif typeClass == 'H5T_OPAQUE':
         if dims:
@@ -842,9 +846,8 @@
             else:
                 # not a boolean enum, use h5py special dtype
                 dtRet = special_dtype(enum=(dt, mapping))
-
     else:
-        raise TypeError("Invalid type class")
+        raise TypeError(f"Invalid base type class: {typeClass}")
 
     return dtRet
 
diff --git a/test/hl/test_datatype.py b/test/hl/test_datatype.py
new file mode 100644
index 00000000..00c0c507
--- /dev/null
+++ b/test/hl/test_datatype.py
@@ -0,0 +1,157 @@
+##############################################################################
+# Copyright by The HDF Group.                                                #
+# All rights reserved.                                                       #
+#                                                                            #
+# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and     #
+# Utilities.  The full HDF5 REST Server copyright notice, including         #
+# terms governing use, modification, and redistribution, is contained in    #
+# the file COPYING, which can be found at the root of the source code       #
+# distribution tree.  If you do not have access to this file, you may       #
+# request a copy from help@hdfgroup.org.                                    #
+##############################################################################
+
+import numpy as np
+import math
+import logging
+import config
+
+if config.get("use_h5py"):
+    import h5py
+else:
+    import h5pyd as h5py
+
+from common import ut, TestCase
+
+
+class TestScalarCompound(TestCase):
+
+    def setUp(self):
+        filename = self.getFileName("scalar_compound_dset")
+        print("filename:", filename)
+        self.f = h5py.File(filename, "w")
+        self.data = np.array((42.5, -118, "Hello"), dtype=[('a', 'f'), ('b', 'i'), ('c', '|S10')])
+        self.dset = self.f.create_dataset('x', data=self.data)
+
+    def test_ndim(self):
+        """ Verify number of dimensions """
+        self.assertEqual(self.dset.ndim, 0)
+
+    def test_shape(self):
+        """ Verify shape """
+        self.assertEqual(self.dset.shape, tuple())
+
+    def test_size(self):
+        """ Verify size """
+        self.assertEqual(self.dset.size, 1)
+
+    def test_ellipsis(self):
+        """ Ellipsis -> scalar ndarray """
+        out = self.dset[...]
+        # assertArrayEqual doesn't work with compounds; do manually
+        self.assertIsInstance(out, np.ndarray)
+        self.assertEqual(out.shape, self.data.shape)
+        self.assertEqual(out.dtype, self.data.dtype)
+
+    def test_tuple(self):
+        """ () -> np.void instance """
+        out = self.dset[()]
+        self.assertIsInstance(out, np.void)
+        self.assertEqual(out.dtype, self.data.dtype)
+
+    def test_slice(self):
+        """ slice -> ValueError """
+        with self.assertRaises(ValueError):
+            self.dset[0:4]
+
+    def test_index(self):
+        """ index -> ValueError """
+        with self.assertRaises(ValueError):
+            self.dset[0]
+
+    def test_rt(self):
+        """ Compound types are read back in correct order (h5py issue 236)"""
+
+        dt = np.dtype([('weight', np.float64),
+                       ('cputime', np.float64),
+                       ('walltime', np.float64),
+                       ('parents_offset', np.uint32),
+                       ('n_parents', np.uint32),
+                       ('status', np.uint8),
+                       ('endpoint_type', np.uint8),])
+
+        testdata = np.ndarray((16,), dtype=dt)
+        for key in dt.fields:
+            testdata[key] = np.random.random((16,)) * 100
+
+        self.f['test'] = testdata
+        outdata = self.f['test'][...]
+        self.assertTrue(np.all(outdata == testdata))
+        self.assertEqual(outdata.dtype, testdata.dtype)
+
+    def test_assign(self):
+        dt = np.dtype([('weight', (np.float64)),
+                       ('endpoint_type', np.uint8),])
+
+        testdata = np.ndarray((16,), dtype=dt)
+        for key in dt.fields:
+            testdata[key] = np.random.random(size=testdata[key].shape) * 100
+
+        ds = self.f.create_dataset('test', (16,), dtype=dt)
+        for key in dt.fields:
+            ds[key] = testdata[key]
+
+        outdata = self.f['test'][...]
+
+        self.assertTrue(np.all(outdata == testdata))
+        self.assertEqual(outdata.dtype, testdata.dtype)
+
+    def test_read(self):
+        dt = np.dtype([('weight', (np.float64)),
+                       ('endpoint_type', np.uint8),])
+
+        testdata = np.ndarray((16,), dtype=dt)
+        for key in dt.fields:
+            testdata[key] = np.random.random(size=testdata[key].shape) * 100
+
+        ds = self.f.create_dataset('test', (16,), dtype=dt)
+
+        # Write to all fields
+        ds[...] = testdata
+
+        for key in dt.fields:
+            outdata = self.f['test'][key]
+            np.testing.assert_array_equal(outdata, testdata[key])
+            self.assertEqual(outdata.dtype, testdata[key].dtype)
+
+    """
+    TBD
+    def test_nested_compound_vlen(self):
+        dt_inner = np.dtype([('a', h5py.vlen_dtype(np.int32)),
+                             ('b', h5py.vlen_dtype(np.int32))])
+
+        dt = np.dtype([('f1', h5py.vlen_dtype(dt_inner)),
+                       ('f2', np.int64)])
+
+        inner1 = (np.array(range(1, 3), dtype=np.int32),
+                  np.array(range(6, 9), dtype=np.int32))
+
+        inner2 = (np.array(range(10, 14), dtype=np.int32),
+                  np.array(range(16, 21), dtype=np.int32))
+
+        data = np.array([(np.array([inner1, inner2], dtype=dt_inner), 2),
+                         (np.array([inner1], dtype=dt_inner), 3)],
+                        dtype=dt)
+
+        self.f["ds"] = data
+        out = self.f["ds"]
+
+        # Specifying check_alignment=False because vlen fields have 8 bytes of padding
+        # because the vlen datatype in hdf5 occupies 16 bytes
+        self.assertArrayEqual(out, data, check_alignment=False)
+    """
+
+
+if __name__ == '__main__':
+    loglevel = logging.ERROR
+    logging.basicConfig(format='%(asctime)s %(message)s', level=loglevel)
+    ut.main()
diff --git a/testall.py b/testall.py
index 14f3a209..4914bd5f 100755
--- a/testall.py
+++ b/testall.py
@@ -14,6 +14,7 @@
 import os
 import sys
 
+
 hl_tests = ('test_attribute',
             'test_committedtype',
             'test_complex_numbers',
@@ -26,6 +27,7 @@
             'test_dataset_pointselect',
             'test_dataset_scalar',
             'test_dataset_setitem',
+            'test_datatype',
             'test_dimscale',
             'test_file',
             'test_group',
@@ -34,6 +36,7 @@
             'test_vlentype',
             'test_folder')
 
+
 app_tests = ('test_hsinfo',
              'test_tall_inspect',
              'test_diamond_inspect',
              'test_shuffle_inspect')
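
As a usage illustration of the field-selection behavior this patch enables (it mirrors the test_assign and test_read cases above), the sketch below writes and reads individual fields of a compound dataset through h5pyd. The domain path and the dataset/field names are hypothetical, and a reachable HSDS endpoint is assumed:

    import numpy as np
    import h5pyd

    dt = np.dtype([('weight', np.float64), ('status', np.uint8)])

    # Hypothetical domain path; adjust for your HSDS configuration.
    with h5pyd.File("/home/myuser/field_select_example.h5", "w") as f:
        ds = f.create_dataset("compound", (16,), dtype=dt)

        # Each per-field write is sent with the "fields" query parameter
        # introduced by this patch, so only that member is updated.
        ds['weight'] = np.random.random(16) * 100
        ds['status'] = np.ones(16, dtype=np.uint8)

        # Reading a single field likewise requests only that member
        # from the server rather than the full compound element.
        weights = ds['weight']
        assert weights.dtype == np.float64

Since __getitem__ and __setitem__ both join the selected names with ":" into a single "fields" parameter, requests naming several fields travel the same code path; the tests above exercise the single-field case.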