Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace VirtualiZarr.ZArray with zarr ArrayMetadata #175

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions virtualizarr/kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@
import ujson # type: ignore
import xarray as xr
from xarray.coding.times import CFDatetimeCoder
from zarr.array import Array
from zarr.array import ArrayMetadata, ArrayV2Metadata

from virtualizarr.manifests.manifest import join
from virtualizarr.utils import _fsspec_openfile_from_filepath
from virtualizarr.zarr import ZAttrs
from virtualizarr.zarr import ZAttrs, from_kerchunk_refs, to_kerchunk_json

# Distinguishing these via type hints makes it a lot easier to mentally keep track of what the opaque kerchunk "reference dicts" actually mean
# (idea from https://kobzol.github.io/rust/python/2023/05/20/writing-python-like-its-rust.html)
Expand Down Expand Up @@ -196,8 +196,8 @@ def extract_array_refs(

def parse_array_refs(
arr_refs: KerchunkArrRefs,
) -> tuple[dict, Array, ZAttrs]:
zarray = Array.from_kerchunk_refs(arr_refs.pop(".zarray"))
) -> tuple[dict, ArrayMetadata, ZAttrs]:
zarray = from_kerchunk_refs(arr_refs.pop(".zarray"))
zattrs = arr_refs.pop(".zattrs", {})
chunk_dict = arr_refs

Expand Down Expand Up @@ -297,15 +297,15 @@ def variable_to_kerchunk_arr_refs(var: xr.Variable, var_name: str) -> KerchunkAr
# TODO can this be generalized to save individual chunks of a dask array?
# TODO will this fail for a scalar?
arr_refs = {join(0 for _ in np_arr.shape): inlined_data}

zarray = Array.create(
store=None, # type: ignore
zarray = ArrayV2Metadata(
chunks=np_arr.shape,
shape=np_arr.shape,
dtype=np_arr.dtype,
chunk_shape=np_arr.shape,
order="C",
fill_value=np.nan,
)

zarray_dict = ujson.dumps(zarray)
zarray_dict = to_kerchunk_json(zarray)
arr_refs[".zarray"] = zarray_dict

zattrs = {**var.attrs, **var.encoding}
Expand Down
38 changes: 27 additions & 11 deletions virtualizarr/manifests/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import Any, Callable, Union

import numpy as np
from zarr.array import Array as ZArray
from zarr.array import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata, RegularChunkGrid

from ..kerchunk import KerchunkArrRefs
from .array_api import MANIFESTARRAY_HANDLED_ARRAY_FUNCTIONS
Expand All @@ -22,27 +22,32 @@ class ManifestArray:
"""

_manifest: ChunkManifest
_zarray: ZArray
_zarray: ArrayMetadata

def __init__(
self,
zarray: ZArray | dict,
zarray: ArrayMetadata | dict,
chunkmanifest: dict | ChunkManifest,
) -> None:
"""
Create a ManifestArray directly from the .zarray information of a zarr array and the manifest of chunks.

Parameters
----------
zarray : dict or ZArray
zarray : dict or zarr.array.ArrayMetadata
chunkmanifest : dict or ChunkManifest
"""

if isinstance(zarray, ZArray):
_zarray = zarray
else:
# try unpacking the dict
_zarray = ZArray(**zarray)
match zarray:
case ArrayMetadata():
_zarray = zarray
case dict():
zarray = zarray.copy()
zarr_format = zarray.pop("zarr_format", None)
if zarr_format == 3:
_zarray = ArrayV3Metadata(**zarray)
else:
_zarray = ArrayV2Metadata(**zarray)

if isinstance(chunkmanifest, ChunkManifest):
_chunkmanifest = chunkmanifest
Expand Down Expand Up @@ -79,12 +84,20 @@ def manifest(self) -> ChunkManifest:
return self._manifest

@property
def zarray(self) -> ZArray:
def zarray(self) -> ArrayMetadata:
return self._zarray

@property
def chunks(self) -> tuple[int, ...]:
return tuple(self.zarray.chunks)
"""
Individual chunk size by number of elements.
"""
if isinstance(self._zarray.chunk_grid, RegularChunkGrid):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a nice check, but I think we could actually perform this check at ManifestArray construction time, because right now a lot of other things will break if the chunk grid is not regular.

return self._zarray.chunk_grid.chunk_shape
else:
raise NotImplementedError(
"Only RegularChunkGrid is currently supported for chunk size"
)

@property
def dtype(self) -> np.dtype:
Expand All @@ -93,6 +106,9 @@ def dtype(self) -> np.dtype:

@property
def shape(self) -> tuple[int, ...]:
"""
Array shape by number of elements along each dimension.
"""
return tuple(int(length) for length in list(self.zarray.shape))

@property
Expand Down
38 changes: 21 additions & 17 deletions virtualizarr/manifests/array_api.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from dataclasses import replace
from typing import TYPE_CHECKING, Callable, Iterable
from typing import TYPE_CHECKING, Callable, Iterable, Union

import numpy as np
from zarr.metadata import ArrayV3Metadata
from zarr.abc.codec import Codec as ZCodec
from zarr.array import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata, RegularChunkGrid

from virtualizarr.zarr import Codec, ceildiv

Expand Down Expand Up @@ -36,15 +37,7 @@ def _check_combineable_zarr_arrays(arrays: Iterable["ManifestArray"]) -> None:

# Can't combine different codecs in one manifest
# see https://github.com/zarr-developers/zarr-specs/issues/288
# If we want to support Zarr's v2 and v3 metadata, we have to branch here
# based on the type of arr.zarray.metadata
_check_same_codecs(
[
arr.zarray.metadata.codecs # type: ignore
for arr in arrays
if isinstance(arr.zarray.metadata, ArrayV3Metadata)
]
)
_check_same_codecs([arr.zarray for arr in arrays])

# Would require variable-length chunks ZEP
_check_same_chunk_shapes([arr.chunks for arr in arrays])
Expand All @@ -61,7 +54,20 @@ def _check_same_dtypes(dtypes: list[np.dtype]) -> None:
)


def _check_same_codecs(codecs: list[Codec]) -> None:
def _check_same_codecs(zarrays: list[ArrayMetadata]) -> None:
if len({zarry.zarr_format for zarry in zarrays}) > 1:
raise ValueError("Cannot concatenate arrays with different zarr formats.")

def to_codec(zarray: ArrayMetadata) -> Union[Codec | tuple[ZCodec, ...]]:
match zarray:
case ArrayV2Metadata(compressor=compressor, filters=filters):
return Codec(compressor=compressor, filters=filters)
case ArrayV3Metadata(codecs=codecs):
return codecs
case _:
raise ValueError("Unknown ArrayMetadata type")
Comment on lines +62 to +68
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason to prefer this match...case over a standard if isinstance()...else syntax?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mainly a style preference coming from languages with discriminated unions and compilers, but if you'd like the whole code base to be consistent I can rewrite it with isinstance.


codecs = [to_codec(zarray) for zarray in zarrays]
first_codec, *other_codecs = codecs
for codec in other_codecs:
if codec != first_codec:
Expand Down Expand Up @@ -155,7 +161,6 @@ def concatenate(

# chunk shape has not changed, there are just now more chunks along the concatenation axis
new_zarray = replace(first_arr.zarray, shape=tuple(new_shape))

return ManifestArray(chunkmanifest=concatenated_manifest, zarray=new_zarray)


Expand Down Expand Up @@ -247,11 +252,10 @@ def stack(
old_chunks = first_arr.chunks
new_chunks = list(old_chunks)
new_chunks.insert(axis, 1)

new_zarray = replace(
first_arr.zarray,
chunk_shape=tuple(new_chunks),
shape=tuple(new_shape),
chunk_grid=RegularChunkGrid(chunk_shape=tuple(new_chunks)),
)

return ManifestArray(chunkmanifest=stacked_manifest, zarray=new_zarray)
Expand Down Expand Up @@ -325,8 +329,8 @@ def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArra

new_zarray = replace(
x.zarray,
chunk_shape=new_chunk_shape,
shape=new_shape,
shape=tuple(new_shape),
chunk_grid=RegularChunkGrid(chunk_shape=tuple(new_chunk_shape)),
)

return ManifestArray(chunkmanifest=broadcasted_manifest, zarray=new_zarray)
Expand Down
8 changes: 4 additions & 4 deletions virtualizarr/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
import numpy as np
import pytest
from packaging.version import Version
from zarr.array import ArrayV2Metadata

from virtualizarr.manifests import ChunkManifest, ManifestArray
from virtualizarr.manifests.manifest import join
from virtualizarr.zarr import ZArray, ceildiv
from virtualizarr.zarr import ceildiv

network = pytest.mark.network

Expand Down Expand Up @@ -46,15 +47,14 @@ def create_manifestarray(
The manifest is populated with a (somewhat) unique path, offset, and length for each key.
"""

zarray = ZArray(
zarray = ArrayV2Metadata(
chunks=chunks,
compressor="zlib",
compressor={"id": "zlib"},
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the presence of "id" a v2 vs v3 difference? because you don't seem to use it in the other tests

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In V3, the codecs array is made up of (name, configuration) objects, but in V2 the id field serves as the name. In this particular test, I just chose to convert the ZArray to an ArrayV2Metadata

Actually please ignore any of the changes in tests for now because I will need to edit almost every test to excise ZArray :) The few changes I added were done up until I hit the kerchunk module import issue

dtype=np.dtype("float32"),
fill_value=0.0, # TODO change this to NaN?
filters=None,
order="C",
shape=shape,
zarr_format=2,
)

chunk_grid_shape = tuple(
Expand Down
33 changes: 12 additions & 21 deletions virtualizarr/tests/test_xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
import xarray as xr
import xarray.testing as xrt
from xarray.core.indexes import Index
from zarr.array import Array
from zarr.codecs import BytesCodec, ZstdCodec

from virtualizarr import open_virtual_dataset
from virtualizarr.manifests import ChunkManifest, ManifestArray
Expand All @@ -19,16 +17,15 @@ def test_wrapping():
chunks = (5, 10)
shape = (5, 20)
dtype = np.dtype("int32")
# This passes for V3
zarray = Array.create(
store=None,
shape=shape,
zarray = ZArray(
chunks=chunks,
compressor="zlib",
dtype=dtype,
chunk_shape=chunks,
codecs=[BytesCodec(), ZstdCodec()],
fill_value=0.0,
filters=None,
zarr_format=3,
order="C",
shape=shape,
zarr_format=2,
)

chunks_dict = {
Expand All @@ -50,11 +47,9 @@ class TestEquals:
def test_equals(self):
chunks = (5, 10)
shape = (5, 20)
# This passes for v2
zarray = Array.create(
store=None,
chunk_shape=chunks,
compressor=dict(id="zlib", level=1),
zarray = ZArray(
chunks=chunks,
compressor="zlib",
dtype=np.dtype("int32"),
fill_value=0.0,
filters=None,
Expand Down Expand Up @@ -89,13 +84,9 @@ def test_equals(self):
class TestConcat:
def test_concat_along_existing_dim(self):
# both manifest arrays in this example have the same zarray properties
# Does this need to work for both Zarr v2 and v3?
# Because eventually the zarray.metadata object is different and the
# concatenation check has to branch based on v2 and v3
zarray = Array.create(
store=None,
chunk_shape=(1, 10),
compressor=dict(id="zlib", level=1),
zarray = ZArray(
chunks=(1, 10),
compressor="zlib",
dtype=np.dtype("int32"),
fill_value=0.0,
filters=None,
Expand Down
11 changes: 6 additions & 5 deletions virtualizarr/tests/test_zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@ def test_zarr_v3_roundtrip(tmpdir):
),
zarray=dict(
shape=(2, 3),
dtype=np.dtype("<i8"),
chunks=(2, 3),
compressor=None,
filters=None,
data_type=np.dtype("<i8"),
chunk_grid={"name": "regular", "configuration": {"chunk_shape": [2, 3]}},
chunk_key_encoding={"name": "default", "configuration": {"separator": "."}},
codecs=(),
attributes={},
dimension_names=None,
fill_value=np.nan,
order="C",
zarr_format=3,
),
)
Expand Down
Loading
Loading