From b61fa0e21f8093d6c237e0b35c54cb1dba9bbd8c Mon Sep 17 00:00:00 2001
From: mattjala <124107509+mattjala@users.noreply.github.com>
Date: Tue, 26 Mar 2024 14:32:22 -0500
Subject: [PATCH] Field selection for simple compound types (#173)

* Field selection for simple compound types

* Add logging to test_datatype
---
 h5pyd/_hl/dataset.py     |  37 ++++-----
 h5pyd/_hl/h5type.py      |  13 ++--
 test/hl/test_datatype.py | 157 +++++++++++++++++++++++++++++++++++++++
 testall.py               |   3 +
 4 files changed, 181 insertions(+), 29 deletions(-)
 create mode 100644 test/hl/test_datatype.py

diff --git a/h5pyd/_hl/dataset.py b/h5pyd/_hl/dataset.py
index 7f8038ba..6aea38a9 100644
--- a/h5pyd/_hl/dataset.py
+++ b/h5pyd/_hl/dataset.py
@@ -1049,6 +1049,9 @@ def __getitem__(self, args, new_dtype=None):
         req = "/datasets/" + self.id.uuid + "/value"
 
         params = {}
+        if len(names) > 0:
+            params["fields"] = ":".join(names)
+
         if self.id._http_conn.mode == "r" and self.id._http_conn.cache_on:
             # enables lambda to be used on server
             self.log.debug("setting nonstrict parameter")
@@ -1483,41 +1486,23 @@ def __setitem__(self, args, val):
                 last N dimensions have to match (got %s, but should be %s)" % (valshp, shp,))
             mtype = h5t.py_create(numpy.dtype((val.dtype, shp)))
             mshape = val.shape[0:len(val.shape)-len(shp)]
+        """
 
-
-        # Make a compound memory type if field-name slicing is required
-        elif len(names) != 0:
-
-            mshape = val.shape
-
+        # Check for field selection
+        if len(names) != 0:
             # Catch common errors
             if self.dtype.fields is None:
                 raise TypeError("Illegal slicing argument (not a compound dataset)")
             mismatch = [x for x in names if x not in self.dtype.fields]
             if len(mismatch) != 0:
-                mismatch = ", ".join('"%s"'%x for x in mismatch)
+                mismatch = ", ".join('"%s"' % x for x in mismatch)
                 raise ValueError("Illegal slicing argument (fields %s not in dataset type)" % mismatch)
 
-            # Write non-compound source into a single dataset field
-            if len(names) == 1 and val.dtype.fields is None:
-                subtype = h5y.py_create(val.dtype)
-                mtype = h5t.create(h5t.COMPOUND, subtype.get_size())
-                mtype.insert(self._e(names[0]), 0, subtype)
-
-            # Make a new source type keeping only the requested fields
-            else:
-                fieldnames = [x for x in val.dtype.names if x in names]  # Keep source order
-                mtype = h5t.create(h5t.COMPOUND, val.dtype.itemsize)
-                for fieldname in fieldnames:
-                    subtype = h5t.py_create(val.dtype.fields[fieldname][0])
-                    offset = val.dtype.fields[fieldname][1]
-                    mtype.insert(self._e(fieldname), offset, subtype)
-
         # Use mtype derived from array (let DatasetID.write figure it out)
         else:
             mshape = val.shape
-            #mtype = None
-        """
+            # mtype = None
+        mshape = val.shape
 
         self.log.debug(f"mshape: {mshape}")
         self.log.debug(f"data dtype: {val.dtype}")
@@ -1582,6 +1567,10 @@
             self.log.debug(f"got select query param: {select_param}")
             params["select"] = select_param
 
+        # Perform write to subset of named fields within compound datatype, if any
+        if len(names) > 0:
+            params["fields"] = ":".join(names)
+
         self.PUT(req, body=body, format=format, params=params)
 
         """
         mspace = h5s.create_simple(mshape_pad, (h5s.UNLIMITED,)*len(mshape_pad))
diff --git a/h5pyd/_hl/h5type.py b/h5pyd/_hl/h5type.py
index 81b0ed42..fed3da71 100644
--- a/h5pyd/_hl/h5type.py
+++ b/h5pyd/_hl/h5type.py
@@ -441,10 +441,14 @@ def getTypeItem(dt):
             type_info['length'] = 'H5T_VARIABLE'
             type_info['charSet'] = 'H5T_CSET_UTF8'
             type_info['strPad'] = 'H5T_STR_NULLTERM'
-        elif vlen_check == int:
+        elif vlen_check in (int, np.int64):
             type_info['class'] = 'H5T_VLEN'
             type_info['size'] = 'H5T_VARIABLE'
             type_info['base'] = 'H5T_STD_I64'
+        elif vlen_check == np.int32:
+            type_info['class'] = 'H5T_VLEN'
+            type_info['size'] = 'H5T_VARIABLE'
+            type_info['base'] = 'H5T_STD_I32'
         elif vlen_check in (float, np.float64):
             type_info['class'] = 'H5T_VLEN'
             type_info['size'] = 'H5T_VARIABLE'
@@ -456,7 +460,7 @@
             type_info['base'] = getTypeItem(vlen_check)
         elif vlen_check is not None:
             # unknown vlen type
-            raise TypeError("Unknown h5py vlen type: " + str(vlen_check))
+            raise TypeError("Unknown h5pyd vlen type: " + str(vlen_check))
         elif ref_check is not None:
             # a reference type
             type_info['class'] = 'H5T_REFERENCE'
@@ -781,7 +785,7 @@ def createBaseDataType(typeItem):
            raise TypeError("ArrayType is not supported for variable len types")
         if 'base' not in typeItem:
             raise KeyError("'base' not provided")
-        baseType = createBaseDataType(typeItem['base'])
+        baseType = createDataType(typeItem['base'])
         dtRet = special_dtype(vlen=np.dtype(baseType))
     elif typeClass == 'H5T_OPAQUE':
         if dims:
@@ -842,9 +846,8 @@
             else:
                 # not a boolean enum, use h5py special dtype
                 dtRet = special_dtype(enum=(dt, mapping))
-
     else:
-        raise TypeError("Invalid type class")
+        raise TypeError(f"Invalid base type class: {typeClass}")
 
     return dtRet
 
diff --git a/test/hl/test_datatype.py b/test/hl/test_datatype.py
new file mode 100644
index 00000000..00c0c507
--- /dev/null
+++ b/test/hl/test_datatype.py
@@ -0,0 +1,157 @@
+##############################################################################
+# Copyright by The HDF Group.                                                #
+# All rights reserved.                                                       #
+#                                                                            #
+# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and     #
+# Utilities.  The full HDF5 REST Server copyright notice, including         #
+# terms governing use, modification, and redistribution, is contained in    #
+# the file COPYING, which can be found at the root of the source code       #
+# distribution tree.  If you do not have access to this file, you may       #
+# request a copy from help@hdfgroup.org.                                    #
+##############################################################################
+
+import numpy as np
+import math
+import logging
+import config
+
+if config.get("use_h5py"):
+    import h5py
+else:
+    import h5pyd as h5py
+
+from common import ut, TestCase
+
+
+class TestScalarCompound(TestCase):
+
+    def setUp(self):
+        filename = self.getFileName("scalar_compound_dset")
+        print("filename:", filename)
+        self.f = h5py.File(filename, "w")
+        self.data = np.array((42.5, -118, "Hello"), dtype=[('a', 'f'), ('b', 'i'), ('c', '|S10')])
+        self.dset = self.f.create_dataset('x', data=self.data)
+
+    def test_ndim(self):
+        """ Verify number of dimensions """
+        self.assertEqual(self.dset.ndim, 0)
+
+    def test_shape(self):
+        """ Verify shape """
+        self.assertEqual(self.dset.shape, tuple())
+
+    def test_size(self):
+        """ Verify size """
+        self.assertEqual(self.dset.size, 1)
+
+    def test_ellipsis(self):
+        """ Ellipsis -> scalar ndarray """
+        out = self.dset[...]
+        # assertArrayEqual doesn't work with compounds; do manually
+        self.assertIsInstance(out, np.ndarray)
+        self.assertEqual(out.shape, self.data.shape)
+        self.assertEqual(out.dtype, self.data.dtype)
+
+    def test_tuple(self):
+        """ () -> np.void instance """
+        out = self.dset[()]
+        self.assertIsInstance(out, np.void)
+        self.assertEqual(out.dtype, self.data.dtype)
+
+    def test_slice(self):
+        """ slice -> ValueError """
+        with self.assertRaises(ValueError):
+            self.dset[0:4]
+
+    def test_index(self):
+        """ index -> ValueError """
+        with self.assertRaises(ValueError):
+            self.dset[0]
+
+    def test_rt(self):
+        """ Compound types are read back in correct order (h5py issue 236)"""
+
+        dt = np.dtype([('weight', np.float64),
+                       ('cputime', np.float64),
+                       ('walltime', np.float64),
+                       ('parents_offset', np.uint32),
+                       ('n_parents', np.uint32),
+                       ('status', np.uint8),
+                       ('endpoint_type', np.uint8),])
+
+        testdata = np.ndarray((16,), dtype=dt)
+        for key in dt.fields:
+            testdata[key] = np.random.random((16,)) * 100
+
+        self.f['test'] = testdata
+        outdata = self.f['test'][...]
+        self.assertTrue(np.all(outdata == testdata))
+        self.assertEqual(outdata.dtype, testdata.dtype)
+
+    def test_assign(self):
+        dt = np.dtype([('weight', (np.float64)),
+                       ('endpoint_type', np.uint8),])
+
+        testdata = np.ndarray((16,), dtype=dt)
+        for key in dt.fields:
+            testdata[key] = np.random.random(size=testdata[key].shape) * 100
+
+        ds = self.f.create_dataset('test', (16,), dtype=dt)
+        for key in dt.fields:
+            ds[key] = testdata[key]
+
+        outdata = self.f['test'][...]
+
+        self.assertTrue(np.all(outdata == testdata))
+        self.assertEqual(outdata.dtype, testdata.dtype)
+
+    def test_read(self):
+        dt = np.dtype([('weight', (np.float64)),
+                       ('endpoint_type', np.uint8),])
+
+        testdata = np.ndarray((16,), dtype=dt)
+        for key in dt.fields:
+            testdata[key] = np.random.random(size=testdata[key].shape) * 100
+
+        ds = self.f.create_dataset('test', (16,), dtype=dt)
+
+        # Write to all fields
+        ds[...] = testdata
+
+        for key in dt.fields:
+            outdata = self.f['test'][key]
+            np.testing.assert_array_equal(outdata, testdata[key])
+            self.assertEqual(outdata.dtype, testdata[key].dtype)
+
+    """
+    TBD
+    def test_nested_compound_vlen(self):
+        dt_inner = np.dtype([('a', h5py.vlen_dtype(np.int32)),
+                             ('b', h5py.vlen_dtype(np.int32))])
+
+        dt = np.dtype([('f1', h5py.vlen_dtype(dt_inner)),
+                       ('f2', np.int64)])
+
+        inner1 = (np.array(range(1, 3), dtype=np.int32),
+                  np.array(range(6, 9), dtype=np.int32))
+
+        inner2 = (np.array(range(10, 14), dtype=np.int32),
+                  np.array(range(16, 21), dtype=np.int32))
+
+        data = np.array([(np.array([inner1, inner2], dtype=dt_inner), 2),
+                         (np.array([inner1], dtype=dt_inner), 3)],
+                        dtype=dt)
+
+        self.f["ds"] = data
+        out = self.f["ds"]
+
+        # Specifying check_alignment=False because vlen fields have 8 bytes of padding
+        # because the vlen datatype in hdf5 occupies 16 bytes
+        self.assertArrayEqual(out, data, check_alignment=False)
+    """
+
+
+if __name__ == '__main__':
+    loglevel = logging.ERROR
+    logging.basicConfig(format='%(asctime)s %(message)s', level=loglevel)
+    ut.main()
diff --git a/testall.py b/testall.py
index 14f3a209..4914bd5f 100755
--- a/testall.py
+++ b/testall.py
@@ -14,6 +14,7 @@
 import os
 import sys
 
+
 hl_tests = ('test_attribute',
             'test_committedtype',
             'test_complex_numbers',
@@ -26,6 +27,7 @@
             'test_dataset_pointselect',
             'test_dataset_scalar',
             'test_dataset_setitem',
+            'test_datatype',
             'test_dimscale',
             'test_file',
             'test_group',
@@ -34,6 +36,7 @@
             'test_vlentype',
             'test_folder')
 
+
 app_tests = ('test_hsinfo',
              'test_tall_inspect',
              'test_diamond_inspect',
              'test_shuffle_inspect')
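
As a usage illustration of the field-selection behavior this patch enables (it mirrors the test_assign and test_read cases above), the sketch below writes and reads individual fields of a compound dataset through h5pyd. The domain path and the dataset/field names are hypothetical, and a reachable HSDS endpoint is assumed:

    import numpy as np
    import h5pyd

    dt = np.dtype([('weight', np.float64), ('status', np.uint8)])

    # Hypothetical domain path; adjust for your HSDS configuration.
    with h5pyd.File("/home/myuser/field_select_example.h5", "w") as f:
        ds = f.create_dataset("compound", (16,), dtype=dt)

        # Each per-field write is sent with the "fields" query parameter
        # introduced by this patch, so only that member is updated.
        ds['weight'] = np.random.random(16) * 100
        ds['status'] = np.ones(16, dtype=np.uint8)

        # Reading a single field likewise requests only that member
        # from the server rather than the full compound element.
        weights = ds['weight']
        assert weights.dtype == np.float64

Since __getitem__ and __setitem__ both join the selected names with ":" into a single "fields" parameter, requests naming several fields travel the same code path; the tests above exercise the single-field case.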