Skip to content

Commit 7f1325d

Browse files
Extract endianness from Bytes codec in V2 metadata conversion (#769)
* Extract endianness from Bytes codec in V2 metadata conversion
* Format
* release note

---------

Co-authored-by: Tom Nicholas <[email protected]>
1 parent c9ed9ce commit 7f1325d

File tree

3 files changed

+38
-11
lines changed

3 files changed

+38
-11
lines changed

docs/releases.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
By [Tom Nicholas](https://github.com/TomNicholas).
2121
- Fix handling of big-endian data in Icechunk by making sure that non-default zarr serializers are included in the zarr array metadata [#766](https://github.com/zarr-developers/VirtualiZarr/issues/766).
2222
By [Max Jones](https://github.com/maxrjones).
23+
- Fix handling of big-endian data in Kerchunk references [#769](https://github.com/zarr-developers/VirtualiZarr/issues/769).
24+
By [Max Jones](https://github.com/maxrjones).
2325

2426
### Documentation
2527

virtualizarr/tests/test_writers/test_kerchunk.py

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
from xarray import Dataset
66
from zarr.core.metadata.v2 import ArrayV2Metadata
77

8-
from conftest import ARRAYBYTES_CODEC
98
from virtualizarr.manifests import ChunkManifest, ManifestArray
109
from virtualizarr.tests import requires_fastparquet, requires_kerchunk
1110
from virtualizarr.utils import JSON, convert_v3_to_v2_metadata, kerchunk_refs_as_json
@@ -194,12 +193,18 @@ def test_accessor_to_kerchunk_parquet(self, tmp_path, array_v3_metadata):
194193
}
195194

196195

197-
def testconvert_v3_to_v2_metadata(array_v3_metadata):
196+
@pytest.mark.parametrize("endian,expected_dtype_char", [("little", "<"), ("big", ">")])
197+
def test_convert_v3_to_v2_metadata(
198+
array_v3_metadata, endian: str, expected_dtype_char: str
199+
):
198200
shape = (5, 20)
199201
chunks = (5, 10)
200202
codecs = [
201-
ARRAYBYTES_CODEC,
202-
{"name": "numcodecs.delta", "configuration": {"dtype": "<i8"}},
203+
{"name": "bytes", "configuration": {"endian": endian}},
204+
{
205+
"name": "numcodecs.delta",
206+
"configuration": {"dtype": f"{expected_dtype_char}i8"},
207+
},
203208
{
204209
"name": "numcodecs.blosc",
205210
"configuration": {"cname": "zstd", "clevel": 5, "shuffle": 1},
@@ -211,19 +216,25 @@ def testconvert_v3_to_v2_metadata(array_v3_metadata):
211216

212217
assert isinstance(v2_metadata, ArrayV2Metadata)
213218
assert v2_metadata.shape == shape
214-
assert v2_metadata.dtype.to_native_dtype() == np.dtype("int32")
219+
expected_dtype = np.dtype(f"{expected_dtype_char}i4") # assuming int32
220+
assert v2_metadata.dtype.to_native_dtype() == expected_dtype
215221
assert v2_metadata.chunks == chunks
216222
assert v2_metadata.fill_value == 0
217-
compressor_config = v2_metadata.filters[1].get_config()
223+
224+
assert v2_metadata.filters
225+
filter_codec, compressor_codec = v2_metadata.filters
226+
compressor_config = compressor_codec.get_config()
218227
assert compressor_config["id"] == "blosc"
219228
assert compressor_config["cname"] == "zstd"
220229
assert compressor_config["clevel"] == 5
221230
assert compressor_config["shuffle"] == 1
222231
assert compressor_config["blocksize"] == 0
223-
filters_config = v2_metadata.filters[0].get_config()
232+
233+
filters_config = filter_codec.get_config()
224234
assert filters_config["id"] == "delta"
225-
assert filters_config["dtype"] == "<i8"
226-
assert filters_config["astype"] == "<i8"
235+
expected_delta_dtype = f"{expected_dtype_char}i8"
236+
assert filters_config["dtype"] == expected_delta_dtype
237+
assert filters_config["astype"] == expected_delta_dtype
227238
assert v2_metadata.attributes == {}
228239

229240

virtualizarr/utils.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import obstore as obs
1010
from zarr.abc.codec import ArrayBytesCodec
1111
from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata
12+
from zarr.dtype import data_type_registry
1213

1314
from virtualizarr.codecs import get_codec_config, zarr_codec_config_to_v2
1415
from virtualizarr.types.kerchunk import KerchunkStoreRefs
@@ -134,15 +135,28 @@ def convert_v3_to_v2_metadata(
134135
"""
135136

136137
# TODO: Check that all ArrayBytesCodecs should in fact be excluded for V2 metadata storage.
137-
# TODO: Test round-tripping big endian since that is stored in the bytes codec in V3; it should be included in data type instead for V2.
138138
v2_codecs = [
139139
zarr_codec_config_to_v2(get_codec_config(codec))
140140
for codec in v3_metadata.codecs
141141
if not isinstance(codec, ArrayBytesCodec)
142142
]
143+
# TODO: Remove convert_v3_to_v2_metadata and always encode V3 metadata.
144+
# This logic is based on the (default) Bytes codec's endian property,
145+
# but other codec pipelines could store endianness elsewhere.
146+
big_endian = any(
147+
isinstance(codec, ArrayBytesCodec)
148+
and hasattr(codec, "endian")
149+
and codec.endian.value == "big"
150+
for codec in v3_metadata.codecs
151+
)
152+
if big_endian:
153+
na_dtype = v3_metadata.data_type.to_native_dtype().newbyteorder(">")
154+
dtype = data_type_registry.match_dtype(dtype=na_dtype)
155+
else:
156+
dtype = v3_metadata.data_type
143157
v2_metadata = ArrayV2Metadata(
144158
shape=v3_metadata.shape,
145-
dtype=v3_metadata.data_type,
159+
dtype=dtype,
146160
chunks=v3_metadata.chunks,
147161
fill_value=fill_value or v3_metadata.fill_value,
148162
filters=v2_codecs

0 commit comments

Comments
 (0)