-
Notifications
You must be signed in to change notification settings - Fork 22
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Replace VirtualiZarr.ZArray with zarr ArrayMetadata #175
base: main
Are you sure you want to change the base?
Changes from 2 commits
5e12b88
92a7e81
42b3f3a
c8c9020
5fa1dea
cea2214
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,9 @@ | ||
from dataclasses import replace | ||
from typing import TYPE_CHECKING, Callable, Iterable | ||
from typing import TYPE_CHECKING, Callable, Iterable, Union | ||
|
||
import numpy as np | ||
from zarr.metadata import ArrayV3Metadata | ||
from zarr.abc.codec import Codec as ZCodec | ||
from zarr.array import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata, RegularChunkGrid | ||
|
||
from virtualizarr.zarr import Codec, ceildiv | ||
|
||
|
@@ -36,15 +37,7 @@ def _check_combineable_zarr_arrays(arrays: Iterable["ManifestArray"]) -> None: | |
|
||
# Can't combine different codecs in one manifest | ||
# see https://github.com/zarr-developers/zarr-specs/issues/288 | ||
# If we want to support Zarr's v2 and v3 metadata, we have to branch here | ||
# based on the type of arr.zarray.metadata | ||
_check_same_codecs( | ||
[ | ||
arr.zarray.metadata.codecs # type: ignore | ||
for arr in arrays | ||
if isinstance(arr.zarray.metadata, ArrayV3Metadata) | ||
] | ||
) | ||
_check_same_codecs([arr.zarray for arr in arrays]) | ||
|
||
# Would require variable-length chunks ZEP | ||
_check_same_chunk_shapes([arr.chunks for arr in arrays]) | ||
|
@@ -61,7 +54,20 @@ def _check_same_dtypes(dtypes: list[np.dtype]) -> None: | |
) | ||
|
||
|
||
def _check_same_codecs(codecs: list[Codec]) -> None: | ||
def _check_same_codecs(zarrays: list[ArrayMetadata]) -> None: | ||
if len({zarry.zarr_format for zarry in zarrays}) > 1: | ||
raise ValueError("Cannot concatenate arrays with different zarr formats.") | ||
|
||
def to_codec(zarray: ArrayMetadata) -> Union[Codec | tuple[ZCodec, ...]]: | ||
match zarray: | ||
case ArrayV2Metadata(compressor=compressor, filters=filters): | ||
return Codec(compressor=compressor, filters=filters) | ||
case ArrayV3Metadata(codecs=codecs): | ||
return codecs | ||
case _: | ||
raise ValueError("Unknown ArrayMetadata type") | ||
Comment on lines +62 to +68
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Is there a reason to prefer this […]? (inline code lost in extraction)
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Mainly a style preference coming from languages with discriminated unions and compilers, but if you'd like the whole code base to be consistent I can rewrite it with […] (inline code lost in extraction) |
||
|
||
codecs = [to_codec(zarray) for zarray in zarrays] | ||
first_codec, *other_codecs = codecs | ||
for codec in other_codecs: | ||
if codec != first_codec: | ||
|
@@ -155,7 +161,6 @@ def concatenate( | |
|
||
# chunk shape has not changed, there are just now more chunks along the concatenation axis | ||
new_zarray = replace(first_arr.zarray, shape=tuple(new_shape)) | ||
|
||
return ManifestArray(chunkmanifest=concatenated_manifest, zarray=new_zarray) | ||
|
||
|
||
|
@@ -247,11 +252,10 @@ def stack( | |
old_chunks = first_arr.chunks | ||
new_chunks = list(old_chunks) | ||
new_chunks.insert(axis, 1) | ||
|
||
new_zarray = replace( | ||
first_arr.zarray, | ||
chunk_shape=tuple(new_chunks), | ||
shape=tuple(new_shape), | ||
chunk_grid=RegularChunkGrid(chunk_shape=tuple(new_chunks)), | ||
) | ||
|
||
return ManifestArray(chunkmanifest=stacked_manifest, zarray=new_zarray) | ||
|
@@ -325,8 +329,8 @@ def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArra | |
|
||
new_zarray = replace( | ||
x.zarray, | ||
chunk_shape=new_chunk_shape, | ||
shape=new_shape, | ||
shape=tuple(new_shape), | ||
chunk_grid=RegularChunkGrid(chunk_shape=tuple(new_chunk_shape)), | ||
) | ||
|
||
return ManifestArray(chunkmanifest=broadcasted_manifest, zarray=new_zarray) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,10 +4,11 @@ | |
import numpy as np | ||
import pytest | ||
from packaging.version import Version | ||
from zarr.array import ArrayV2Metadata | ||
|
||
from virtualizarr.manifests import ChunkManifest, ManifestArray | ||
from virtualizarr.manifests.manifest import join | ||
from virtualizarr.zarr import ZArray, ceildiv | ||
from virtualizarr.zarr import ceildiv | ||
|
||
network = pytest.mark.network | ||
|
||
|
@@ -46,15 +47,14 @@ def create_manifestarray( | |
The manifest is populated with a (somewhat) unique path, offset, and length for each key. | ||
""" | ||
|
||
zarray = ZArray( | ||
zarray = ArrayV2Metadata( | ||
chunks=chunks, | ||
compressor="zlib", | ||
compressor={"id": "zlib"}, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Is the presence of […] (inline code and remainder of the question lost in extraction)
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
In V3, the […] (inline code and remainder of the sentence lost in extraction)
Actually, please ignore any of the changes in […] (inline code lost in extraction) |
||
dtype=np.dtype("float32"), | ||
fill_value=0.0, # TODO change this to NaN? | ||
filters=None, | ||
order="C", | ||
shape=shape, | ||
zarr_format=2, | ||
) | ||
|
||
chunk_grid_shape = tuple( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a nice check, but I think we could actually perform this check at `ManifestArray` construction time, because right now a lot of other things will break if the chunk grid is not regular.