Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/releases.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
By [Tom Nicholas](https://github.com/TomNicholas).
- Fix handling of big-endian data in Icechunk by making sure that non-default zarr serializers are included in the zarr array metadata [#766](https://github.com/zarr-developers/VirtualiZarr/issues/766).
By [Max Jones](https://github.com/maxrjones).
- Fix handling of big-endian data in Kerchunk references [#769](https://github.com/zarr-developers/VirtualiZarr/issues/769).
By [Max Jones](https://github.com/maxrjones).

### Documentation

Expand Down
29 changes: 20 additions & 9 deletions virtualizarr/tests/test_writers/test_kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from xarray import Dataset
from zarr.core.metadata.v2 import ArrayV2Metadata

from conftest import ARRAYBYTES_CODEC
from virtualizarr.manifests import ChunkManifest, ManifestArray
from virtualizarr.tests import requires_fastparquet, requires_kerchunk
from virtualizarr.utils import JSON, convert_v3_to_v2_metadata, kerchunk_refs_as_json
Expand Down Expand Up @@ -194,12 +193,18 @@ def test_accessor_to_kerchunk_parquet(self, tmp_path, array_v3_metadata):
}


def testconvert_v3_to_v2_metadata(array_v3_metadata):
@pytest.mark.parametrize("endian,expected_dtype_char", [("little", "<"), ("big", ">")])
def test_convert_v3_to_v2_metadata(
array_v3_metadata, endian: str, expected_dtype_char: str
):
shape = (5, 20)
chunks = (5, 10)
codecs = [
ARRAYBYTES_CODEC,
{"name": "numcodecs.delta", "configuration": {"dtype": "<i8"}},
{"name": "bytes", "configuration": {"endian": endian}},
{
"name": "numcodecs.delta",
"configuration": {"dtype": f"{expected_dtype_char}i8"},
},
{
"name": "numcodecs.blosc",
"configuration": {"cname": "zstd", "clevel": 5, "shuffle": 1},
Expand All @@ -211,19 +216,25 @@ def testconvert_v3_to_v2_metadata(array_v3_metadata):

assert isinstance(v2_metadata, ArrayV2Metadata)
assert v2_metadata.shape == shape
assert v2_metadata.dtype.to_native_dtype() == np.dtype("int32")
expected_dtype = np.dtype(f"{expected_dtype_char}i4") # assuming int32
assert v2_metadata.dtype.to_native_dtype() == expected_dtype
assert v2_metadata.chunks == chunks
assert v2_metadata.fill_value == 0
compressor_config = v2_metadata.filters[1].get_config()

assert v2_metadata.filters
filter_codec, compressor_codec = v2_metadata.filters
compressor_config = compressor_codec.get_config()
assert compressor_config["id"] == "blosc"
assert compressor_config["cname"] == "zstd"
assert compressor_config["clevel"] == 5
assert compressor_config["shuffle"] == 1
assert compressor_config["blocksize"] == 0
filters_config = v2_metadata.filters[0].get_config()

filters_config = filter_codec.get_config()
assert filters_config["id"] == "delta"
assert filters_config["dtype"] == "<i8"
assert filters_config["astype"] == "<i8"
expected_delta_dtype = f"{expected_dtype_char}i8"
assert filters_config["dtype"] == expected_delta_dtype
assert filters_config["astype"] == expected_delta_dtype
assert v2_metadata.attributes == {}


Expand Down
18 changes: 16 additions & 2 deletions virtualizarr/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import obstore as obs
from zarr.abc.codec import ArrayBytesCodec
from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata
from zarr.dtype import data_type_registry

from virtualizarr.codecs import get_codec_config, zarr_codec_config_to_v2
from virtualizarr.types.kerchunk import KerchunkStoreRefs
Expand Down Expand Up @@ -134,15 +135,28 @@ def convert_v3_to_v2_metadata(
"""

# TODO: Check that all ArrayBytesCodecs should in fact be excluded for V2 metadata storage.
# TODO: Test round-tripping big endian since that is stored in the bytes codec in V3; it should be included in data type instead for V2.
v2_codecs = [
zarr_codec_config_to_v2(get_codec_config(codec))
for codec in v3_metadata.codecs
if not isinstance(codec, ArrayBytesCodec)
]
# TODO: Remove convert_v3_to_v2_metadata and always encode V3 metadata.
# This logic is based on the (default) Bytes codec's endian property,
# but other codec pipelines could store endianness elsewhere.
big_endian = any(
isinstance(codec, ArrayBytesCodec)
and hasattr(codec, "endian")
and codec.endian.value == "big"
for codec in v3_metadata.codecs
)
if big_endian:
na_dtype = v3_metadata.data_type.to_native_dtype().newbyteorder(">")
dtype = data_type_registry.match_dtype(dtype=na_dtype)
else:
dtype = v3_metadata.data_type
v2_metadata = ArrayV2Metadata(
shape=v3_metadata.shape,
dtype=v3_metadata.data_type,
dtype=dtype,
chunks=v3_metadata.chunks,
fill_value=fill_value or v3_metadata.fill_value,
filters=v2_codecs
Expand Down