Commit 8d907a7

Reinstate decode on load, now in-Iris coded.
1 parent 1bcc78d commit 8d907a7

File tree

6 files changed: +179 -21 lines changed

lib/iris/fileformats/_nc_load_rules/helpers.py

Lines changed: 7 additions & 3 deletions
@@ -1643,9 +1643,13 @@ def _add_auxiliary_coordinate(

     # Determine the name of the dimension/s shared between the CF-netCDF data variable
     # and the coordinate being built.
-    common_dims = [
-        dim for dim in cf_coord_var.dimensions if dim in engine.cf_var.dimensions
-    ]
+    coord_dims = cf_coord_var.dimensions
+    if cf._is_str_dtype(cf_coord_var):
+        coord_dims = coord_dims[:-1]
+    datavar_dims = engine.cf_var.dimensions
+    if cf._is_str_dtype(engine.cf_var):
+        datavar_dims = datavar_dims[:-1]
+    common_dims = [dim for dim in coord_dims if dim in datavar_dims]
     data_dims = None
     if common_dims:
         # Calculate the offset of each common dimension.

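Note: the dimension-matching change above can be illustrated standalone. The sketch below uses plain tuples in place of the CF variable objects; the function and dimension names are illustrative only, not part of the commit. A char-typed variable's trailing string-length dimension must be ignored when pairing coordinate dims with data-variable dims.

def common_dims(coord_dims, datavar_dims, coord_is_str=False, datavar_is_str=False):
    # A char-typed variable's last dimension is its string length, not a data dim.
    if coord_is_str:
        coord_dims = coord_dims[:-1]
    if datavar_is_str:
        datavar_dims = datavar_dims[:-1]
    return [dim for dim in coord_dims if dim in datavar_dims]


# e.g. a char data variable v(x, strlen64) with a char aux-coord v_co(x, strlen8):
print(common_dims(("x", "strlen8"), ("x", "strlen64"), True, True))  # -> ['x']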
lib/iris/fileformats/cf.py

Lines changed: 15 additions & 3 deletions
@@ -796,15 +796,27 @@ def cf_label_data(self, cf_data_var):

         # Determine the name of the label string (or length) dimension by
         # finding the dimension name that doesn't exist within the data dimensions.
-        str_dim_name = list(set(self.dimensions) - set(cf_data_var.dimensions))
+        str_dim_names = list(set(self.dimensions) - set(cf_data_var.dimensions))
+        n_nondata_dims = len(str_dim_names)
+
+        if n_nondata_dims == 0:
+            # *All* dims are shared with the data-variable.
+            # This is only ok if the data-var is *also* a string type.
+            dim_ok = _is_str_dtype(cf_data_var)
+            # In this case, we must just *assume* that the last dimension is "the"
+            # string dimension
+            str_dim_name = self.dimensions[-1]
+        else:
+            # If there is exactly one non-data dim, that is the one we want
+            dim_ok = len(str_dim_names) == 1
+            (str_dim_name,) = str_dim_names

-        if len(str_dim_name) != 1:
+        if not dim_ok:
             raise ValueError(
                 "Invalid string dimensions for CF-netCDF label variable %r"
                 % self.cf_name
             )

-        str_dim_name = str_dim_name[0]
         label_data = self[:]

         if ma.isMaskedArray(label_data):

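Note: the relaxed string-dimension selection can be sketched in isolation. The helper below and its tuple arguments are illustrative, not the commit's code: normally exactly one label-variable dim is absent from the data variable (the string-length dim); when the data variable is itself char-typed and shares that dim, the last label dim is assumed to be the string dim.

def pick_string_dim(label_dims, data_dims, data_is_str):
    # Dims of the label variable that are not dims of the data variable.
    non_data = list(set(label_dims) - set(data_dims))
    if len(non_data) == 0:
        # All dims shared: only valid when the data variable is also char-typed;
        # assume the *last* label dim is the string-length dim.
        if not data_is_str:
            raise ValueError("Invalid string dimensions for label variable")
        return label_dims[-1]
    if len(non_data) != 1:
        raise ValueError("Invalid string dimensions for label variable")
    return non_data[0]


print(pick_string_dim(("x", "strlen"), ("x",), data_is_str=False))          # -> strlen
print(pick_string_dim(("x", "strlen"), ("x", "strlen"), data_is_str=True))  # -> strlen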
lib/iris/fileformats/netcdf/_thread_safe_nc.py

Lines changed: 40 additions & 5 deletions
@@ -310,14 +310,39 @@ def fromcdl(cls, *args, **kwargs):
 class NetCDFDataProxy:
     """A reference to the data payload of a single NetCDF file variable."""

-    __slots__ = ("shape", "dtype", "path", "variable_name", "fill_value")
-
-    def __init__(self, shape, dtype, path, variable_name, fill_value):
+    __slots__ = (
+        "shape",
+        "dtype",
+        "path",
+        "variable_name",
+        "fill_value",
+        "is_bytes",
+        "encoding",
+        "string_length",
+    )
+
+    def __init__(
+        self,
+        shape,
+        dtype,
+        path,
+        variable_name,
+        fill_value,
+        encoding: str | None = None,
+        string_length: int = 0,
+    ):
         self.shape = shape
         self.dtype = dtype
         self.path = path
         self.variable_name = variable_name
         self.fill_value = fill_value
+        self.is_bytes = dtype.kind == "S" and dtype.itemsize == 1
+        if self.is_bytes:
+            # We will be returning a different shape : the last dim is the byte-length
+            self.shape = self.shape[:-1]
+            self.dtype = np.dtype(f"U{string_length}")
+        self.encoding = encoding
+        self.string_length = string_length

     @property
     def ndim(self):
@@ -342,10 +367,20 @@ def __getitem__(self, keys):
             variable.set_auto_chartostring(False)

             # Get the NetCDF variable data and slice.
-            var = variable[keys]
+            data = variable[keys]
+
+            # If bytes, decode to strings
+            if self.is_bytes:
+                from iris.util import convert_bytesarray_to_strings
+
+                data = convert_bytesarray_to_strings(
+                    data,
+                    encoding=self.encoding,
+                    string_length=self.string_length,
+                )
         finally:
             dataset.close()
-        return np.asanyarray(var)
+        return np.asanyarray(data)

     def __repr__(self):
         fmt = (

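Note: a small standalone sketch (assumed example values, not part of the commit) of how the proxy now advertises a char variable: the trailing byte-length dimension is dropped from the shape and the dtype becomes a fixed-width unicode type, so Dask and downstream code never see raw bytes.

import numpy as np

# Assumed example: a (3, 64) netCDF char variable ("S1" elements), strings up to 64 bytes.
raw_shape, raw_dtype, string_length = (3, 64), np.dtype("S1"), 64

is_bytes = raw_dtype.kind == "S" and raw_dtype.itemsize == 1
if is_bytes:
    shape = raw_shape[:-1]                 # the last dim was the byte length
    dtype = np.dtype(f"U{string_length}")  # present fixed-width unicode instead
else:
    shape, dtype = raw_shape, raw_dtype

print(shape, dtype)  # -> (3,) <U64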
lib/iris/fileformats/netcdf/loader.py

Lines changed: 37 additions & 1 deletion
@@ -11,6 +11,7 @@

 """

+import codecs
 from collections.abc import Iterable, Iterator, Mapping
 from contextlib import contextmanager
 from copy import deepcopy
@@ -269,10 +270,36 @@ def _get_cf_var_data(cf_var):
         # Normal NCVariable type:
         total_bytes = cf_var.size * cf_var.dtype.itemsize

+        default_encoding = "utf-8"
+        encoding = getattr(cf_var, "_Encoding", None)
+        if encoding is None:
+            # utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data
+            encoding = default_encoding
+        else:
+            try:
+                # Accept + normalise naming of encodings
+                encoding = codecs.lookup(encoding).name
+                # NOTE: if encoding does not suit data, errors can occur.
+                # For example, _Encoding = "ascii", with non-ascii content.
+            except LookupError:
+                # Replace some invalid setting with "safe"(ish) fallback.
+                encoding = default_encoding
+
+        string_length = getattr(cf_var, "iris_string_length", None)
+
         if total_bytes < _LAZYVAR_MIN_BYTES:
             # Don't make a lazy array, as it will cost more memory AND more time to access.
             result = cf_var[:]

+            if result.dtype.kind == "S":
+                from iris.util import convert_bytesarray_to_strings
+
+                result = convert_bytesarray_to_strings(
+                    result,
+                    encoding=encoding,
+                    string_length=string_length,
+                )
+
             # Special handling of masked scalar value; this will be returned as
             # an `np.ma.masked` instance which will lose the original dtype.
             # Workaround for this it return a 1-element masked array of the
@@ -295,8 +322,17 @@ def _get_cf_var_data(cf_var):
                 "_FillValue",
                 _thread_safe_nc.default_fillvals[fill_dtype],
             )
+
+            # NOTE: if the data is bytes which need to be converted to strings on read,
+            # the data-proxy will do that (and it modifies its shape + dtype).
             proxy = NetCDFDataProxy(
-                cf_var.shape, dtype, cf_var.filename, cf_var.cf_name, fill_value
+                cf_var.shape,
+                dtype,
+                cf_var.filename,
+                cf_var.cf_name,
+                fill_value,
+                encoding=encoding,
+                string_length=string_length,
             )
             # Get the chunking specified for the variable : this is either a shape, or
             # maybe the string "contiguous".

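Note: the encoding normalisation added here can be tried standalone. The wrapper below is illustrative only (not a function added by the commit); it exercises the same codecs.lookup call and fallback behaviour.

import codecs

def normalise_encoding(encoding, default="utf-8"):
    """Resolve a variable's _Encoding attribute to a canonical codec name.

    Falls back to utf-8 when the attribute is missing or unrecognised.
    """
    if encoding is None:
        return default
    try:
        return codecs.lookup(encoding).name
    except LookupError:
        return default


print(normalise_encoding(None))       # -> 'utf-8'
print(normalise_encoding("UTF8"))     # -> 'utf-8'  (aliases are normalised)
print(normalise_encoding("latin-1"))  # -> 'iso8859-1'
print(normalise_encoding("bogus"))    # -> 'utf-8'  (invalid setting, safe fallback)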
lib/iris/tests/integration/netcdf/test_chararrays.py

Lines changed: 59 additions & 9 deletions
@@ -3,13 +3,20 @@
 import pytest

 import iris
+
+iris.FUTURE.save_split_attrs = True
 from iris.coords import AuxCoord, DimCoord
 from iris.cube import Cube

 NX, N_STRLEN = 3, 64
 TEST_STRINGS = ["Münster", "London", "Amsterdam"]
 TEST_COORD_VALS = ["bun", "éclair", "sandwich"]

+# VARS_COORDS_SHARE_STRING_DIM = True
+VARS_COORDS_SHARE_STRING_DIM = False
+if VARS_COORDS_SHARE_STRING_DIM:
+    TEST_COORD_VALS[-1] = "Xsandwich"  # makes the max coord strlen same as data one
+

 def convert_strings_to_chararray(string_array_1d, maxlen, encoding="utf-8"):
     bbytes = [text.encode(encoding) for text in string_array_1d]
@@ -19,18 +26,33 @@ def convert_strings_to_chararray(string_array_1d, maxlen, encoding="utf-8"):
     return chararray


-# def convert_chararray_to_strings(char_array_2d, maxlen: int | None =0, encoding="utf-8"):
-#     strings = [bytes.decode(encoding) for bytes in char_array_2d]
-#     if not maxlen:
-#         maxlen = max(len(string) for string in strings)
-#     dtype_str = f"S{maxlen}"
-#     string_array = np.array(strings, dtype=dtype_str)
-#     return string_array
+def convert_bytesarray_to_strings(
+    byte_array, encoding="utf-8", string_length: int | None = None
+):
+    """Convert bytes to strings.
+
+    N.B. for now at least, we assume the string dim is **always the last one**.
+    """
+    bytes_shape = byte_array.shape
+    var_shape = bytes_shape[:-1]
+    if string_length is None:
+        string_length = bytes_shape[-1]
+    string_dtype = f"U{string_length}"
+    result = np.empty(var_shape, dtype=string_dtype)
+    for ndindex in np.ndindex(var_shape):
+        element_bytes = byte_array[ndindex]
+        bytes = b"".join([b if b else b"\0" for b in element_bytes])
+        string = bytes.decode(encoding)
+        result[ndindex] = string
+    return result


 INCLUDE_COORD = True
 # INCLUDE_COORD = False

+INCLUDE_NUMERIC_AUXCOORD = True
+# INCLUDE_NUMERIC_AUXCOORD = False
+

 def make_testfile(filepath, chararray, coordarray, encoding_str=None):
     with nc.Dataset(filepath, "w") as ds:
@@ -51,6 +73,13 @@ def make_testfile(filepath, chararray, coordarray, encoding_str=None):
             v_co[:] = coordarray
             if encoding_str is not None:
                 v_co._Encoding = encoding_str
+        if INCLUDE_NUMERIC_AUXCOORD:
+            v_num = ds.createVariable(
+                "v_num",
+                float,
+                dimensions=("x",),
+            )
+            v_num[:] = np.arange(NX)
         v = ds.createVariable(
             "v",
             "S1",
@@ -63,7 +92,10 @@ def make_testfile(filepath, chararray, coordarray, encoding_str=None):
         if encoding_str is not None:
            v._Encoding = encoding_str
        if INCLUDE_COORD:
-            v.coordinates = "v_co"
+            coords_str = "v_co"
+            if INCLUDE_NUMERIC_AUXCOORD:
+                coords_str += " v_num"
+            v.coordinates = coords_str


@@ -111,8 +143,10 @@ def show_result(filepath):
     )
     print("-data-")
     print(repr(cube.data))
+    print("-numeric auxcoord data-")
+    print(repr(cube.coord("x").points))
     if INCLUDE_COORD:
-        print("-coord data-")
+        print("-string auxcoord data-")
         try:
             print(repr(cube.coord("v_co").points))
         except Exception as err2:
@@ -160,3 +194,19 @@ def test_save_encodings(encoding):
     filepath = f"tmp_save_{str(encoding)}.nc"
     iris.save(cube, filepath)
     show_result(filepath)
+
+
+# @pytest.mark.parametrize("ndim", [1, 2])
+# def test_convert_bytes_to_strings(ndim: int):
+#     if ndim == 1:
+#         source = convert_strings_to_chararray(TEST_STRINGS, 16)
+#     elif ndim == 2:
+#         source = np.stack([
+#             convert_strings_to_chararray(TEST_STRINGS, 16),
+#             convert_strings_to_chararray(TEST_COORD_VALS, 16),
+#         ])
+#     else:
+#         raise ValueError(f"Unexpected param ndim={ndim}.")
+#     # convert the strings to bytes
+#     result = convert_bytesarray_to_strings(source)
+#     print(result)

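Note: for reference, the shape of the test data as it sits on disk. Each string is encoded and padded into one row of a fixed-width "S1" char array, which is what a netCDF char variable stores. This is a standalone sketch, not part of the test module.

import numpy as np

strings = ["Münster", "London", "Amsterdam"]
maxlen = 16

# Pad each encoded string into one row of a fixed-width "S1" char array.
chararray = np.zeros((len(strings), maxlen), dtype="S1")
for i, text in enumerate(strings):
    encoded = text.encode("utf-8")
    chararray[i, : len(encoded)] = np.frombuffer(encoded, dtype="S1")

print(chararray.shape, chararray.dtype)  # -> (3, 16) |S1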
lib/iris/util.py

Lines changed: 21 additions & 0 deletions
@@ -2999,3 +2999,24 @@ def set(

 # Global CML settings object for use as context manager
 CML_SETTINGS: CMLSettings = CMLSettings()
+
+
+def convert_bytesarray_to_strings(
+    byte_array, encoding="utf-8", string_length: int | None = None
+):
+    """Convert bytes to strings.
+
+    N.B. for now at least, we assume the string dim is **always the last one**.
+    """
+    bytes_shape = byte_array.shape
+    var_shape = bytes_shape[:-1]
+    if string_length is None:
+        string_length = bytes_shape[-1]
+    string_dtype = f"U{string_length}"
+    result = np.empty(var_shape, dtype=string_dtype)
+    for ndindex in np.ndindex(var_shape):
+        element_bytes = byte_array[ndindex]
+        bytes = b"".join([b if b else b"\0" for b in element_bytes])
+        string = bytes.decode(encoding)
+        result[ndindex] = string
+    return result

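Note: a usage sketch for the new helper, assuming this commit's iris.util is importable. The trailing dimension of the byte array is taken as the string length, and empty byte slots act as padding in the decoded strings.

import numpy as np

from iris.util import convert_bytesarray_to_strings

# A (2, 4) char array whose last dim is the byte length.
chars = np.array(
    [[b"b", b"u", b"n", b""], [b"p", b"i", b"e", b"s"]], dtype="S1"
)
strings = convert_bytesarray_to_strings(chars, encoding="utf-8")
print(strings)        # -> ['bun' 'pies']
print(strings.dtype)  # -> <U4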