Commit 8d907a7

Reinstate decode on load, now in-Iris coded.
1 parent 1bcc78d commit 8d907a7

File tree

6 files changed: +179 -21 lines changed

lib/iris/fileformats/_nc_load_rules/helpers.py

Lines changed: 7 additions & 3 deletions
@@ -1643,9 +1643,13 @@ def _add_auxiliary_coordinate(

     # Determine the name of the dimension/s shared between the CF-netCDF data variable
     # and the coordinate being built.
-    common_dims = [
-        dim for dim in cf_coord_var.dimensions if dim in engine.cf_var.dimensions
-    ]
+    coord_dims = cf_coord_var.dimensions
+    if cf._is_str_dtype(cf_coord_var):
+        coord_dims = coord_dims[:-1]
+    datavar_dims = engine.cf_var.dimensions
+    if cf._is_str_dtype(engine.cf_var):
+        datavar_dims = datavar_dims[:-1]
+    common_dims = [dim for dim in coord_dims if dim in datavar_dims]
     data_dims = None
     if common_dims:
         # Calculate the offset of each common dimension.

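Note: the dimension-matching change above can be illustrated standalone. The sketch below uses plain tuples in place of the CF variable objects; the function and dimension names are illustrative only, not part of the commit. A char-typed variable's trailing string-length dimension must be ignored when pairing coordinate dims with data-variable dims.

def common_dims(coord_dims, datavar_dims, coord_is_str=False, datavar_is_str=False):
    # A char-typed variable's last dimension is its string length, not a data dim.
    if coord_is_str:
        coord_dims = coord_dims[:-1]
    if datavar_is_str:
        datavar_dims = datavar_dims[:-1]
    return [dim for dim in coord_dims if dim in datavar_dims]


# e.g. a char data variable v(x, strlen64) with a char aux-coord v_co(x, strlen8):
print(common_dims(("x", "strlen8"), ("x", "strlen64"), True, True))  # -> ['x']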
lib/iris/fileformats/cf.py

Lines changed: 15 additions & 3 deletions
@@ -796,15 +796,27 @@ def cf_label_data(self, cf_data_var):

         # Determine the name of the label string (or length) dimension by
         # finding the dimension name that doesn't exist within the data dimensions.
-        str_dim_name = list(set(self.dimensions) - set(cf_data_var.dimensions))
+        str_dim_names = list(set(self.dimensions) - set(cf_data_var.dimensions))
+        n_nondata_dims = len(str_dim_names)
+
+        if n_nondata_dims == 0:
+            # *All* dims are shared with the data-variable.
+            # This is only ok if the data-var is *also* a string type.
+            dim_ok = _is_str_dtype(cf_data_var)
+            # In this case, we must just *assume* that the last dimension is "the"
+            # string dimension
+            str_dim_name = self.dimensions[-1]
+        else:
+            # If there is exactly one non-data dim, that is the one we want
+            dim_ok = len(str_dim_names) == 1
+            (str_dim_name,) = str_dim_names

-        if len(str_dim_name) != 1:
+        if not dim_ok:
             raise ValueError(
                 "Invalid string dimensions for CF-netCDF label variable %r"
                 % self.cf_name
             )

-        str_dim_name = str_dim_name[0]
         label_data = self[:]

         if ma.isMaskedArray(label_data):

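Note: the relaxed string-dimension selection can be sketched in isolation. The helper below and its tuple arguments are illustrative, not the commit's code: normally exactly one label-variable dim is absent from the data variable (the string-length dim); when the data variable is itself char-typed and shares that dim, the last label dim is assumed to be the string dim.

def pick_string_dim(label_dims, data_dims, data_is_str):
    # Dims of the label variable that are not dims of the data variable.
    non_data = list(set(label_dims) - set(data_dims))
    if len(non_data) == 0:
        # All dims shared: only valid when the data variable is also char-typed;
        # assume the *last* label dim is the string-length dim.
        if not data_is_str:
            raise ValueError("Invalid string dimensions for label variable")
        return label_dims[-1]
    if len(non_data) != 1:
        raise ValueError("Invalid string dimensions for label variable")
    return non_data[0]


print(pick_string_dim(("x", "strlen"), ("x",), data_is_str=False))          # -> strlen
print(pick_string_dim(("x", "strlen"), ("x", "strlen"), data_is_str=True))  # -> strlen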
lib/iris/fileformats/netcdf/_thread_safe_nc.py

Lines changed: 40 additions & 5 deletions
@@ -310,14 +310,39 @@ def fromcdl(cls, *args, **kwargs):
 class NetCDFDataProxy:
     """A reference to the data payload of a single NetCDF file variable."""

-    __slots__ = ("shape", "dtype", "path", "variable_name", "fill_value")
-
-    def __init__(self, shape, dtype, path, variable_name, fill_value):
+    __slots__ = (
+        "shape",
+        "dtype",
+        "path",
+        "variable_name",
+        "fill_value",
+        "is_bytes",
+        "encoding",
+        "string_length",
+    )
+
+    def __init__(
+        self,
+        shape,
+        dtype,
+        path,
+        variable_name,
+        fill_value,
+        encoding: str | None = None,
+        string_length: int = 0,
+    ):
         self.shape = shape
         self.dtype = dtype
         self.path = path
         self.variable_name = variable_name
         self.fill_value = fill_value
+        self.is_bytes = dtype.kind == "S" and dtype.itemsize == 1
+        if self.is_bytes:
+            # We will be returning a different shape : the last dim is the byte-length
+            self.shape = self.shape[:-1]
+            self.dtype = np.dtype(f"U{string_length}")
+        self.encoding = encoding
+        self.string_length = string_length

     @property
     def ndim(self):
@@ -342,10 +367,20 @@ def __getitem__(self, keys):
             variable.set_auto_chartostring(False)

             # Get the NetCDF variable data and slice.
-            var = variable[keys]
+            data = variable[keys]
+
+            # If bytes, decode to strings
+            if self.is_bytes:
+                from iris.util import convert_bytesarray_to_strings
+
+                data = convert_bytesarray_to_strings(
+                    data,
+                    encoding=self.encoding,
+                    string_length=self.string_length,
+                )
         finally:
             dataset.close()
-        return np.asanyarray(var)
+        return np.asanyarray(data)

     def __repr__(self):
         fmt = (

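Note: a small standalone sketch (assumed example values, not part of the commit) of how the proxy now advertises a char variable: the trailing byte-length dimension is dropped from the shape and the dtype becomes a fixed-width unicode type, so Dask and downstream code never see raw bytes.

import numpy as np

# Assumed example: a (3, 64) netCDF char variable ("S1" elements), strings up to 64 bytes.
raw_shape, raw_dtype, string_length = (3, 64), np.dtype("S1"), 64

is_bytes = raw_dtype.kind == "S" and raw_dtype.itemsize == 1
if is_bytes:
    shape = raw_shape[:-1]                 # the last dim was the byte length
    dtype = np.dtype(f"U{string_length}")  # present fixed-width unicode instead
else:
    shape, dtype = raw_shape, raw_dtype

print(shape, dtype)  # -> (3,) <U64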
lib/iris/fileformats/netcdf/loader.py

Lines changed: 37 additions & 1 deletion
@@ -11,6 +11,7 @@

 """

+import codecs
 from collections.abc import Iterable, Iterator, Mapping
 from contextlib import contextmanager
 from copy import deepcopy
@@ -269,10 +270,36 @@ def _get_cf_var_data(cf_var):
         # Normal NCVariable type:
         total_bytes = cf_var.size * cf_var.dtype.itemsize

+        default_encoding = "utf-8"
+        encoding = getattr(cf_var, "_Encoding", None)
+        if encoding is None:
+            # utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data
+            encoding = default_encoding
+        else:
+            try:
+                # Accept + normalise naming of encodings
+                encoding = codecs.lookup(encoding).name
+                # NOTE: if encoding does not suit data, errors can occur.
+                # For example, _Encoding = "ascii", with non-ascii content.
+            except LookupError:
+                # Replace some invalid setting with "safe"(ish) fallback.
+                encoding = default_encoding
+
+        string_length = getattr(cf_var, "iris_string_length", None)
+
         if total_bytes < _LAZYVAR_MIN_BYTES:
             # Don't make a lazy array, as it will cost more memory AND more time to access.
             result = cf_var[:]

+            if result.dtype.kind == "S":
+                from iris.util import convert_bytesarray_to_strings
+
+                result = convert_bytesarray_to_strings(
+                    result,
+                    encoding=encoding,
+                    string_length=string_length,
+                )
+
             # Special handling of masked scalar value; this will be returned as
             # an `np.ma.masked` instance which will lose the original dtype.
             # Workaround for this it return a 1-element masked array of the
@@ -295,8 +322,17 @@ def _get_cf_var_data(cf_var):
                 "_FillValue",
                 _thread_safe_nc.default_fillvals[fill_dtype],
             )
+
+            # NOTE: if the data is bytes which need to be converted to strings on read,
+            # the data-proxy will do that (and it modifies its shape + dtype).
             proxy = NetCDFDataProxy(
-                cf_var.shape, dtype, cf_var.filename, cf_var.cf_name, fill_value
+                cf_var.shape,
+                dtype,
+                cf_var.filename,
+                cf_var.cf_name,
+                fill_value,
+                encoding=encoding,
+                string_length=string_length,
             )
             # Get the chunking specified for the variable : this is either a shape, or
             # maybe the string "contiguous".

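Note: the encoding normalisation added here can be tried standalone. The wrapper below is illustrative only (not a function added by the commit); it exercises the same codecs.lookup call and fallback behaviour.

import codecs

def normalise_encoding(encoding, default="utf-8"):
    """Resolve a variable's _Encoding attribute to a canonical codec name.

    Falls back to utf-8 when the attribute is missing or unrecognised.
    """
    if encoding is None:
        return default
    try:
        return codecs.lookup(encoding).name
    except LookupError:
        return default


print(normalise_encoding(None))       # -> 'utf-8'
print(normalise_encoding("UTF8"))     # -> 'utf-8'  (aliases are normalised)
print(normalise_encoding("latin-1"))  # -> 'iso8859-1'
print(normalise_encoding("bogus"))    # -> 'utf-8'  (invalid setting, safe fallback)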
lib/iris/tests/integration/netcdf/test_chararrays.py

Lines changed: 59 additions & 9 deletions
@@ -3,13 +3,20 @@
 import pytest

 import iris
+
+iris.FUTURE.save_split_attrs = True
 from iris.coords import AuxCoord, DimCoord
 from iris.cube import Cube

 NX, N_STRLEN = 3, 64
 TEST_STRINGS = ["Münster", "London", "Amsterdam"]
 TEST_COORD_VALS = ["bun", "éclair", "sandwich"]

+# VARS_COORDS_SHARE_STRING_DIM = True
+VARS_COORDS_SHARE_STRING_DIM = False
+if VARS_COORDS_SHARE_STRING_DIM:
+    TEST_COORD_VALS[-1] = "Xsandwich"  # makes the max coord strlen same as data one
+

 def convert_strings_to_chararray(string_array_1d, maxlen, encoding="utf-8"):
     bbytes = [text.encode(encoding) for text in string_array_1d]
@@ -19,18 +26,33 @@ def convert_strings_to_chararray(string_array_1d, maxlen, encoding="utf-8"):
     return chararray


-# def convert_chararray_to_strings(char_array_2d, maxlen: int | None =0, encoding="utf-8"):
-#     strings = [bytes.decode(encoding) for bytes in char_array_2d]
-#     if not maxlen:
-#         maxlen = max(len(string) for string in strings)
-#     dtype_str = f"S{maxlen}"
-#     string_array = np.array(strings, dtype=dtype_str)
-#     return string_array
+def convert_bytesarray_to_strings(
+    byte_array, encoding="utf-8", string_length: int | None = None
+):
+    """Convert bytes to strings.
+
+    N.B. for now at least, we assume the string dim is **always the last one**.
+    """
+    bytes_shape = byte_array.shape
+    var_shape = bytes_shape[:-1]
+    if string_length is None:
+        string_length = bytes_shape[-1]
+    string_dtype = f"U{string_length}"
+    result = np.empty(var_shape, dtype=string_dtype)
+    for ndindex in np.ndindex(var_shape):
+        element_bytes = byte_array[ndindex]
+        bytes = b"".join([b if b else b"\0" for b in element_bytes])
+        string = bytes.decode(encoding)
+        result[ndindex] = string
+    return result


 INCLUDE_COORD = True
 # INCLUDE_COORD = False

+INCLUDE_NUMERIC_AUXCOORD = True
+# INCLUDE_NUMERIC_AUXCOORD = False
+

 def make_testfile(filepath, chararray, coordarray, encoding_str=None):
     with nc.Dataset(filepath, "w") as ds:
@@ -51,6 +73,13 @@ def make_testfile(filepath, chararray, coordarray, encoding_str=None):
             v_co[:] = coordarray
             if encoding_str is not None:
                 v_co._Encoding = encoding_str
+        if INCLUDE_NUMERIC_AUXCOORD:
+            v_num = ds.createVariable(
+                "v_num",
+                float,
+                dimensions=("x",),
+            )
+            v_num[:] = np.arange(NX)
         v = ds.createVariable(
             "v",
             "S1",
@@ -63,7 +92,10 @@ def make_testfile(filepath, chararray, coordarray, encoding_str=None):
         if encoding_str is not None:
            v._Encoding = encoding_str
        if INCLUDE_COORD:
-            v.coordinates = "v_co"
+            coords_str = "v_co"
+            if INCLUDE_NUMERIC_AUXCOORD:
+                coords_str += " v_num"
+            v.coordinates = coords_str


@@ -111,8 +143,10 @@ def show_result(filepath):
     )
     print("-data-")
     print(repr(cube.data))
+    print("-numeric auxcoord data-")
+    print(repr(cube.coord("x").points))
     if INCLUDE_COORD:
-        print("-coord data-")
+        print("-string auxcoord data-")
         try:
             print(repr(cube.coord("v_co").points))
         except Exception as err2:
@@ -160,3 +194,19 @@ def test_save_encodings(encoding):
     filepath = f"tmp_save_{str(encoding)}.nc"
     iris.save(cube, filepath)
     show_result(filepath)
+
+
+# @pytest.mark.parametrize("ndim", [1, 2])
+# def test_convert_bytes_to_strings(ndim: int):
+#     if ndim == 1:
+#         source = convert_strings_to_chararray(TEST_STRINGS, 16)
+#     elif ndim == 2:
+#         source = np.stack([
+#             convert_strings_to_chararray(TEST_STRINGS, 16),
+#             convert_strings_to_chararray(TEST_COORD_VALS, 16),
+#         ])
+#     else:
+#         raise ValueError(f"Unexpected param ndim={ndim}.")
+#     # convert the strings to bytes
+#     result = convert_bytesarray_to_strings(source)
+#     print(result)

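Note: for reference, the shape of the test data as it sits on disk. Each string is encoded and padded into one row of a fixed-width "S1" char array, which is what a netCDF char variable stores. This is a standalone sketch, not part of the test module.

import numpy as np

strings = ["Münster", "London", "Amsterdam"]
maxlen = 16

# Pad each encoded string into one row of a fixed-width "S1" char array.
chararray = np.zeros((len(strings), maxlen), dtype="S1")
for i, text in enumerate(strings):
    encoded = text.encode("utf-8")
    chararray[i, : len(encoded)] = np.frombuffer(encoded, dtype="S1")

print(chararray.shape, chararray.dtype)  # -> (3, 16) |S1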
lib/iris/util.py

Lines changed: 21 additions & 0 deletions
@@ -2999,3 +2999,24 @@ def set(

 # Global CML settings object for use as context manager
 CML_SETTINGS: CMLSettings = CMLSettings()
+
+
+def convert_bytesarray_to_strings(
+    byte_array, encoding="utf-8", string_length: int | None = None
+):
+    """Convert bytes to strings.
+
+    N.B. for now at least, we assume the string dim is **always the last one**.
+    """
+    bytes_shape = byte_array.shape
+    var_shape = bytes_shape[:-1]
+    if string_length is None:
+        string_length = bytes_shape[-1]
+    string_dtype = f"U{string_length}"
+    result = np.empty(var_shape, dtype=string_dtype)
+    for ndindex in np.ndindex(var_shape):
+        element_bytes = byte_array[ndindex]
+        bytes = b"".join([b if b else b"\0" for b in element_bytes])
+        string = bytes.decode(encoding)
+        result[ndindex] = string
+    return result

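Note: a usage sketch for the new helper, assuming this commit's iris.util is importable. The trailing dimension of the byte array is taken as the string length, and empty byte slots act as padding in the decoded strings.

import numpy as np

from iris.util import convert_bytesarray_to_strings

# A (2, 4) char array whose last dim is the byte length.
chars = np.array(
    [[b"b", b"u", b"n", b""], [b"p", b"i", b"e", b"s"]], dtype="S1"
)
strings = convert_bytesarray_to_strings(chars, encoding="utf-8")
print(strings)        # -> ['bun' 'pies']
print(strings.dtype)  # -> <U4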