diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index 26919178..46546b3f 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -30,7 +30,7 @@ from .util.authUtil import validateUserPassword from .util.domainUtil import getDomainFromRequest, getPathForDomain, isValidDomain from .util.domainUtil import getBucketForDomain, verifyRoot -from .util.storUtil import getFilters +from .util.storUtil import getSupportedFilters from .util.hdf5dtype import validateTypeItem, createDataType, getBaseTypeJson from .util.hdf5dtype import getItemSize from .servicenode_lib import getDomainJson, getObjectJson, getDsetJson, getPathForObjectId @@ -1092,7 +1092,7 @@ async def POST_Dataset(request): # refer to https://hdf5-json.readthedocs.io/en/latest/bnf/\ # filters.html#grammar-token-filter_list f_in = creationProperties["filters"] - supported_filters = getFilters(include_compressors=True) + supported_filters = getSupportedFilters(include_compressors=True) log.debug(f"supported_compressors: {supported_filters}") log.debug(f"filters provided in creationProperties: {f_in}") diff --git a/hsds/util/hdf5dtype.py b/hsds/util/hdf5dtype.py index 67119491..a1cf1361 100644 --- a/hsds/util/hdf5dtype.py +++ b/hsds/util/hdf5dtype.py @@ -339,12 +339,25 @@ def getTypeItem(dt, metadata=None): else: # Fixed length string type type_info["class"] = "H5T_STRING" - type_info["charSet"] = "H5T_CSET_ASCII" type_info["length"] = dt.itemsize + type_info["charSet"] = "H5T_CSET_ASCII" type_info["strPad"] = "H5T_STR_NULLPAD" elif dt.base.kind == "U": # Fixed length unicode type - raise TypeError("Fixed length unicode type is not supported") + ref_check = check_dtype(ref=dt.base) + if ref_check is not None: + raise TypeError("unexpected reference type") + + # Fixed length UTF8 string type + type_info["class"] = "H5T_STRING" + + # this can be problematic if the encoding of the string is not valid, + # or reqires too many bytes. Use unicode sting length * 4 to handle all + # UTF8 strings correctly + type_info["charSet"] = "H5T_CSET_UTF8" + # convert from UTF32 length to a fixed length + type_info["length"] = dt.itemsize // 4 + type_info["strPad"] = "H5T_STR_NULLPAD" elif dt.kind == "b": # boolean type - h5py stores as enum @@ -614,8 +627,7 @@ def createBaseDataType(typeItem): if typeItem["charSet"] == "H5T_CSET_ASCII": type_code = "S" elif typeItem["charSet"] == "H5T_CSET_UTF8": - msg = "fixed-width unicode strings are not supported" - raise TypeError(msg) + type_code = "U" else: raise TypeError("unexpected 'charSet' value") # a fixed size string diff --git a/hsds/util/storUtil.py b/hsds/util/storUtil.py index 704c2980..5b55fa19 100644 --- a/hsds/util/storUtil.py +++ b/hsds/util/storUtil.py @@ -62,10 +62,11 @@ def getCompressors(): return compressors -def getFilters(include_compressors=True): +def getSupportedFilters(include_compressors=True): """return list of other supported filters""" filters = [ "shuffle", + "fletcher32" ] if include_compressors: filters.extend(getCompressors()) diff --git a/tests/integ/attr_test.py b/tests/integ/attr_test.py index 072d25f4..2dbc1e57 100644 --- a/tests/integ/attr_test.py +++ b/tests/integ/attr_test.py @@ -517,7 +517,7 @@ def testPutFixedStringNullTerm(self): def testPutVLenUTF8String(self): # Test PUT value for 1d attribute with fixed length UTF-8 string - print("testPutFixedUTF8String", self.base_domain) + print("testPutVLenUTF8String", self.base_domain) headers = helper.getRequestHeaders(domain=self.base_domain) req = self.endpoint + "/" @@ -531,13 +531,7 @@ def testPutVLenUTF8String(self): # create attr text = "I'm an UTF-8 null terminated string" - text_length = len(text) + 1 - fixed_str_type = { - "charSet": "H5T_CSET_UTF8", - "class": "H5T_STRING", - "length": text_length, - "strPad": "H5T_STR_NULLTERM", - } + variable_str_type = { "charSet": "H5T_CSET_UTF8", "class": "H5T_STRING", @@ -545,14 +539,58 @@ def testPutVLenUTF8String(self): "strPad": "H5T_STR_NULLTERM", } scalar_shape = {"class": "H5S_SCALAR"} - data = {"type": fixed_str_type, "shape": scalar_shape, "value": text} + + data = {"type": variable_str_type, "shape": scalar_shape, "value": text} attr_name = "str_attr" req = self.endpoint + "/groups/" + root_uuid + "/attributes/" + attr_name - # Should fail since UTF8 with fixed width is not supported rsp = self.session.put(req, data=json.dumps(data), headers=headers) - self.assertEqual(rsp.status_code, 400) + self.assertEqual(rsp.status_code, 201) - data = {"type": variable_str_type, "shape": scalar_shape, "value": text} + # read attr + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("hrefs" in rspJson) + self.assertTrue("value" in rspJson) + self.assertEqual(rspJson["value"], text) + self.assertTrue("type" in rspJson) + type_json = rspJson["type"] + self.assertTrue("class" in type_json) + self.assertEqual(type_json["class"], "H5T_STRING") + self.assertTrue("length" in type_json) + self.assertEqual(type_json["length"], "H5T_VARIABLE") + self.assertTrue("strPad" in type_json) + self.assertEqual(type_json["strPad"], "H5T_STR_NULLTERM") + self.assertTrue("charSet" in type_json) + self.assertEqual(type_json["charSet"], "H5T_CSET_UTF8") + + def testPutFixedUTF8String(self): + # Test PUT value for 1d attribute with fixed length UTF-8 string + print("testPutFixedUTF8String", self.base_domain) + + headers = helper.getRequestHeaders(domain=self.base_domain) + req = self.endpoint + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # create attr + text = "this is the chinese character for the number eight: \u516b" + + text_length = len(text) + 1 + fixed_str_type = { + "charSet": "H5T_CSET_UTF8", + "class": "H5T_STRING", + "length": text_length, + "strPad": "H5T_STR_NULLTERM", + } + + scalar_shape = {"class": "H5S_SCALAR"} + data = {"type": fixed_str_type, "shape": scalar_shape, "value": text} attr_name = "str_attr" req = self.endpoint + "/groups/" + root_uuid + "/attributes/" + attr_name rsp = self.session.put(req, data=json.dumps(data), headers=headers) @@ -570,7 +608,7 @@ def testPutVLenUTF8String(self): self.assertTrue("class" in type_json) self.assertEqual(type_json["class"], "H5T_STRING") self.assertTrue("length" in type_json) - self.assertEqual(type_json["length"], "H5T_VARIABLE") + self.assertEqual(type_json["length"], text_length) self.assertTrue("strPad" in type_json) self.assertEqual(type_json["strPad"], "H5T_STR_NULLTERM") self.assertTrue("charSet" in type_json) diff --git a/tests/integ/dataset_test.py b/tests/integ/dataset_test.py index c9e1c18d..18c08e53 100755 --- a/tests/integ/dataset_test.py +++ b/tests/integ/dataset_test.py @@ -1055,10 +1055,16 @@ def testCreationPropertiesLayoutDataset(self): "level": 9, "name": "deflate", } + fletcher32_filter = { + "class": "H5Z_FILTER_FLETCHER32", + "id": 3, + "name": "fletcher32" + } payload["creationProperties"] = { "layout": {"class": "H5D_CHUNKED", "dims": [1, 390, 512]}, "filters": [ gzip_filter, + fletcher32_filter, ], } req = self.endpoint + "/datasets" @@ -1094,7 +1100,7 @@ def testCreationPropertiesLayoutDataset(self): cpl = rspJson["creationProperties"] self.assertTrue("filters") in cpl filters = cpl["filters"] - self.assertEqual(len(filters), 1) + self.assertEqual(len(filters), 2) filter = filters[0] self.assertTrue("class") in filter self.assertEqual(filter["class"], "H5Z_FILTER_DEFLATE") @@ -1103,6 +1109,12 @@ def testCreationPropertiesLayoutDataset(self): self.assertTrue("id" in filter) self.assertEqual(filter["id"], 1) + filter = filters[1] + self.assertTrue("class") in filter + self.assertEqual(filter["class"], "H5Z_FILTER_FLETCHER32") + self.assertTrue("id" in filter) + self.assertEqual(filter["id"], 3) + def testCreationPropertiesContiguousDataset(self): # test Dataset with creation property list domain = self.base_domain + "/testCreationPropertiesContigousDataset.h5" diff --git a/tests/unit/hdf5_dtype_test.py b/tests/unit/hdf5_dtype_test.py index 63da67dc..9497defc 100755 --- a/tests/unit/hdf5_dtype_test.py +++ b/tests/unit/hdf5_dtype_test.py @@ -91,12 +91,12 @@ def testBaseStringTypeItem(self): def testBaseStringUTFTypeItem(self): dt = np.dtype("U3") - try: - typeItem = hdf5dtype.getTypeItem(dt) - self.assertTrue(typeItem is not None) # avoid pyflakes error - self.assertTrue(False) # expected exception - except TypeError: - pass # expected + typeItem = hdf5dtype.getTypeItem(dt) + self.assertEqual(typeItem["class"], "H5T_STRING") + # type item length in bytes (may no actual be enough space for some UTF strings) + self.assertEqual(typeItem["length"], 3) + self.assertEqual(typeItem["strPad"], "H5T_STR_NULLPAD") + self.assertEqual(typeItem["charSet"], "H5T_CSET_UTF8") def testBaseVLenAsciiTypeItem(self): dt = special_dtype(vlen=bytes) @@ -388,13 +388,14 @@ def testCreateBaseStringType(self): self.assertEqual(typeSize, 6) def testCreateBaseUnicodeType(self): - typeItem = {"class": "H5T_STRING", "charSet": "H5T_CSET_UTF8", "length": 32} - try: - dt = hdf5dtype.createDataType(typeItem) - self.assertTrue(dt is not None) - self.assertTrue(False) # expected exception - except TypeError: - pass + typeItem = {"class": "H5T_STRING", "charSet": "H5T_CSET_UTF8", "length": 6} + + dt = hdf5dtype.createDataType(typeItem) + typeSize = hdf5dtype.getItemSize(typeItem) + self.assertTrue(dt is not None) + self.assertEqual(dt.name, "str192") + self.assertEqual(dt.kind, "U") + self.assertEqual(typeSize, 6) def testCreateNullTermStringType(self): typeItem = {