zarr-developers · ghidalgo3 · Jul 2, 2024 · Jul 8, 2024 · Jul 9, 2024 · Jul 10, 2024
diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py
@@ -9,11 +9,11 @@
 import ujson  # type: ignore
 import xarray as xr
 from xarray.coding.times import CFDatetimeCoder
-from zarr.array import Array
+from zarr.array import ArrayMetadata, ArrayV2Metadata
 
 from virtualizarr.manifests.manifest import join
 from virtualizarr.utils import _fsspec_openfile_from_filepath
-from virtualizarr.zarr import ZAttrs
+from virtualizarr.zarr import ZAttrs, from_kerchunk_refs, to_kerchunk_json
 
 # Distinguishing these via type hints makes it a lot easier to mentally keep track of what the opaque kerchunk "reference dicts" actually mean
 # (idea from https://kobzol.github.io/rust/python/2023/05/20/writing-python-like-its-rust.html)
@@ -196,8 +196,8 @@ def extract_array_refs(
 
 def parse_array_refs(
     arr_refs: KerchunkArrRefs,
-) -> tuple[dict, Array, ZAttrs]:
-    zarray = Array.from_kerchunk_refs(arr_refs.pop(".zarray"))
+) -> tuple[dict, ArrayMetadata, ZAttrs]:
+    zarray = from_kerchunk_refs(arr_refs.pop(".zarray"))
     zattrs = arr_refs.pop(".zattrs", {})
     chunk_dict = arr_refs
 
@@ -297,15 +297,15 @@ def variable_to_kerchunk_arr_refs(var: xr.Variable, var_name: str) -> KerchunkAr
         # TODO can this be generalized to save individual chunks of a dask array?
         # TODO will this fail for a scalar?
         arr_refs = {join(0 for _ in np_arr.shape): inlined_data}
-
-        zarray = Array.create(
-            store=None,  # type: ignore
+        zarray = ArrayV2Metadata(
+            chunks=np_arr.shape,
             shape=np_arr.shape,
             dtype=np_arr.dtype,
-            chunk_shape=np_arr.shape,
+            order="C",
+            fill_value=np.nan,
         )
 
-    zarray_dict = ujson.dumps(zarray)
+    zarray_dict = to_kerchunk_json(zarray)
     arr_refs[".zarray"] = zarray_dict
 
     zattrs = {**var.attrs, **var.encoding}

diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py
@@ -2,7 +2,7 @@
 from typing import Any, Callable, Union
 
 import numpy as np
-from zarr.array import Array as ZArray
+from zarr.array import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata, RegularChunkGrid
 
 from ..kerchunk import KerchunkArrRefs
 from .array_api import MANIFESTARRAY_HANDLED_ARRAY_FUNCTIONS
@@ -22,27 +22,32 @@ class ManifestArray:
     """
 
     _manifest: ChunkManifest
-    _zarray: ZArray
+    _zarray: ArrayMetadata
 
     def __init__(
         self,
-        zarray: ZArray | dict,
+        zarray: ArrayMetadata | dict,
         chunkmanifest: dict | ChunkManifest,
     ) -> None:
         """
         Create a ManifestArray directly from the .zarray information of a zarr array and the manifest of chunks.
 
         Parameters
         ----------
-        zarray : dict or ZArray
+        zarray : dict or zarr.array.ArrayMetadata
         chunkmanifest : dict or ChunkManifest
         """
 
-        if isinstance(zarray, ZArray):
-            _zarray = zarray
-        else:
-            # try unpacking the dict
-            _zarray = ZArray(**zarray)
+        match zarray:
+            case ArrayMetadata():
+                _zarray = zarray
+            case dict():
+                zarray = zarray.copy()
+                zarr_format = zarray.pop("zarr_format", None)
+                if zarr_format == 3:
+                    _zarray = ArrayV3Metadata(**zarray)
+                else:
+                    _zarray = ArrayV2Metadata(**zarray)
 
         if isinstance(chunkmanifest, ChunkManifest):
             _chunkmanifest = chunkmanifest
@@ -79,12 +84,20 @@ def manifest(self) -> ChunkManifest:
         return self._manifest
 
     @property
-    def zarray(self) -> ZArray:
+    def zarray(self) -> ArrayMetadata:
         return self._zarray
 
     @property
     def chunks(self) -> tuple[int, ...]:
-        return tuple(self.zarray.chunks)
+        """
+        Individual chunk size by number of elements.
+        """
+        if isinstance(self._zarray.chunk_grid, RegularChunkGrid):
+            return self._zarray.chunk_grid.chunk_shape
+        else:
+            raise NotImplementedError(
+                "Only RegularChunkGrid is currently supported for chunk size"
+            )
 
     @property
     def dtype(self) -> np.dtype:
@@ -93,6 +106,9 @@ def dtype(self) -> np.dtype:
 
     @property
     def shape(self) -> tuple[int, ...]:
+        """
+        Array shape by number of elements along each dimension.
+        """
         return tuple(int(length) for length in list(self.zarray.shape))
 
     @property

diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py
@@ -1,8 +1,9 @@
 from dataclasses import replace
-from typing import TYPE_CHECKING, Callable, Iterable
+from typing import TYPE_CHECKING, Callable, Iterable, Union
 
 import numpy as np
-from zarr.metadata import ArrayV3Metadata
+from zarr.abc.codec import Codec as ZCodec
+from zarr.array import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata, RegularChunkGrid
 
 from virtualizarr.zarr import Codec, ceildiv
 
@@ -36,15 +37,7 @@ def _check_combineable_zarr_arrays(arrays: Iterable["ManifestArray"]) -> None:
 
     # Can't combine different codecs in one manifest
     # see https://github.com/zarr-developers/zarr-specs/issues/288
-    # If we want to support Zarr's v2 and v3 metadata, we have to branch here
-    # based on the type of arr.zarray.metadata
-    _check_same_codecs(
-        [
-            arr.zarray.metadata.codecs  # type: ignore
-            for arr in arrays
-            if isinstance(arr.zarray.metadata, ArrayV3Metadata)
-        ]
-    )
+    _check_same_codecs([arr.zarray for arr in arrays])
 
     # Would require variable-length chunks ZEP
     _check_same_chunk_shapes([arr.chunks for arr in arrays])
@@ -61,7 +54,20 @@ def _check_same_dtypes(dtypes: list[np.dtype]) -> None:
             )
 
 
-def _check_same_codecs(codecs: list[Codec]) -> None:
+def _check_same_codecs(zarrays: list[ArrayMetadata]) -> None:
+    if len({zarry.zarr_format for zarry in zarrays}) > 1:
+        raise ValueError("Cannot concatenate arrays with different zarr formats.")
+
+    def to_codec(zarray: ArrayMetadata) -> Union[Codec | tuple[ZCodec, ...]]:
+        match zarray:
+            case ArrayV2Metadata(compressor=compressor, filters=filters):
+                return Codec(compressor=compressor, filters=filters)
+            case ArrayV3Metadata(codecs=codecs):
+                return codecs
+            case _:
+                raise ValueError("Unknown ArrayMetadata type")
+
+    codecs = [to_codec(zarray) for zarray in zarrays]
     first_codec, *other_codecs = codecs
     for codec in other_codecs:
         if codec != first_codec:
@@ -155,7 +161,6 @@ def concatenate(
 
     # chunk shape has not changed, there are just now more chunks along the concatenation axis
     new_zarray = replace(first_arr.zarray, shape=tuple(new_shape))
-
     return ManifestArray(chunkmanifest=concatenated_manifest, zarray=new_zarray)
 
 
@@ -247,11 +252,10 @@ def stack(
     old_chunks = first_arr.chunks
     new_chunks = list(old_chunks)
     new_chunks.insert(axis, 1)
-
     new_zarray = replace(
         first_arr.zarray,
-        chunk_shape=tuple(new_chunks),
         shape=tuple(new_shape),
+        chunk_grid=RegularChunkGrid(chunk_shape=tuple(new_chunks)),
     )
 
     return ManifestArray(chunkmanifest=stacked_manifest, zarray=new_zarray)
@@ -325,8 +329,8 @@ def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArra
 
     new_zarray = replace(
         x.zarray,
-        chunk_shape=new_chunk_shape,
-        shape=new_shape,
+        shape=tuple(new_shape),
+        chunk_grid=RegularChunkGrid(chunk_shape=tuple(new_chunk_shape)),
     )
 
     return ManifestArray(chunkmanifest=broadcasted_manifest, zarray=new_zarray)

diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py
@@ -4,10 +4,11 @@
 import numpy as np
 import pytest
 from packaging.version import Version
+from zarr.array import ArrayV2Metadata
 
 from virtualizarr.manifests import ChunkManifest, ManifestArray
 from virtualizarr.manifests.manifest import join
-from virtualizarr.zarr import ZArray, ceildiv
+from virtualizarr.zarr import ceildiv
 
 network = pytest.mark.network
 
@@ -46,15 +47,14 @@ def create_manifestarray(
     The manifest is populated with a (somewhat) unique path, offset, and length for each key.
     """
 
-    zarray = ZArray(
+    zarray = ArrayV2Metadata(
         chunks=chunks,
-        compressor="zlib",
+        compressor={"id": "zlib"},
         dtype=np.dtype("float32"),
         fill_value=0.0,  # TODO change this to NaN?
         filters=None,
         order="C",
         shape=shape,
-        zarr_format=2,
     )
 
     chunk_grid_shape = tuple(

diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py
@@ -6,8 +6,6 @@
 import xarray as xr
 import xarray.testing as xrt
 from xarray.core.indexes import Index
-from zarr.array import Array
-from zarr.codecs import BytesCodec, ZstdCodec
 
 from virtualizarr import open_virtual_dataset
 from virtualizarr.manifests import ChunkManifest, ManifestArray
@@ -19,16 +17,15 @@ def test_wrapping():
     chunks = (5, 10)
     shape = (5, 20)
     dtype = np.dtype("int32")
-    # This passes for V3
-    zarray = Array.create(
-        store=None,
-        shape=shape,
+    zarray = ZArray(
+        chunks=chunks,
+        compressor="zlib",
         dtype=dtype,
-        chunk_shape=chunks,
-        codecs=[BytesCodec(), ZstdCodec()],
         fill_value=0.0,
         filters=None,
-        zarr_format=3,
+        order="C",
+        shape=shape,
+        zarr_format=2,
     )
 
     chunks_dict = {
@@ -50,11 +47,9 @@ class TestEquals:
     def test_equals(self):
         chunks = (5, 10)
         shape = (5, 20)
-        # This passes for v2
-        zarray = Array.create(
-            store=None,
-            chunk_shape=chunks,
-            compressor=dict(id="zlib", level=1),
+        zarray = ZArray(
+            chunks=chunks,
+            compressor="zlib",
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -89,13 +84,9 @@ def test_equals(self):
 class TestConcat:
     def test_concat_along_existing_dim(self):
         # both manifest arrays in this example have the same zarray properties
-        # Does this need to work for both Zarr v2 and v3?
-        # Because eventually the zarray.metadata object is different and the
-        # concatenation check has to branch based on v2 and v3
-        zarray = Array.create(
-            store=None,
-            chunk_shape=(1, 10),
-            compressor=dict(id="zlib", level=1),
+        zarray = ZArray(
+            chunks=(1, 10),
+            compressor="zlib",
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,

diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py
@@ -13,12 +13,13 @@ def test_zarr_v3_roundtrip(tmpdir):
         ),
         zarray=dict(
             shape=(2, 3),
-            dtype=np.dtype("<i8"),
-            chunks=(2, 3),
-            compressor=None,
-            filters=None,
+            data_type=np.dtype("<i8"),
+            chunk_grid={"name": "regular", "configuration": {"chunk_shape": [2, 3]}},
+            chunk_key_encoding={"name": "default", "configuration": {"separator": "."}},
+            codecs=(),
+            attributes={},
+            dimension_names=None,
             fill_value=np.nan,
-            order="C",
             zarr_format=3,
         ),
     )