diff --git a/README.md b/README.md
index a4f6205..5badefb 100644
--- a/README.md
+++ b/README.md
@@ -6,13 +6,18 @@ Unlike [binsparse-python](https://github.com/ivirshup/binsparse-python), the dif
 This does make reading specific parts (e.g. the coordinates) in a single request a bit harder, but having a single logical array map to a on-disk zarr array does have its advantages.
 
+Useful links:
+
+- zarr-python PR: https://github.com/zarr-developers/zarr-python/pull/3529
+- sparse indexing adapter: https://github.com/keewis/sparse-indexing-adapter
+
 ## Installation
 
 `zarr-sparse` currently requires a special version of zarr. To install it, use:
 
 ```sh
 pip install \
-    "zarr @ git+https://github.com/keewis/zarr-python.git@zarr-sparse-patch" \
+    "zarr @ git+https://github.com/keewis/zarr-python.git@array-registry" \
     "zarr-sparse @ git+https://github.com/keewis/zarr-sparse.git@main"
 ```
 
 
diff --git a/pyproject.toml b/pyproject.toml
index d8a77e4..b0d9454 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,6 +27,7 @@ test = [
 [dependency-groups]
 dev = [
     "black>=25.1.0",
+    "dask>=2025.10.0",
     "hatch>=1.14.1",
     "hypothesis>=6.138.0",
     "ipdb>=0.13.13",
@@ -34,6 +35,7 @@ dev = [
     "pyinstrument>=5.1.1",
     "pytest>=8.3.5",
     "pytest-xdist>=3.6.1",
+    "xarray>=2025.10.1",
 ]
 
 [tool.hatch]
diff --git a/zarr_sparse/buffer.py b/zarr_sparse/buffer.py
index 8216716..e013037 100644
--- a/zarr_sparse/buffer.py
+++ b/zarr_sparse/buffer.py
@@ -10,7 +10,7 @@ from zarr.registry import register_ndbuffer
 
 from zarr_sparse.chunk_grid import ChunkGrid
-from zarr_sparse.combine import combine_nd
+from zarr_sparse.combine import combine_nd, first_value
 from zarr_sparse.slices import slice_size
 from zarr_sparse.utils import as_decorator
 
 
@@ -21,6 +21,16 @@ def sparse_equal(a, b, equal_nan: bool) -> bool:
     equal_nan = equal_nan if a.dtype.kind not in ("U", "S", "T", "O", "V") else False
 
+    if isinstance(a, ChunkGrid):
+        if len(a.data) == 1:
+            a = next(iter(a.data.values()))
+        else:
+            raise RuntimeError("comparing multi-chunk grid")
+    if isinstance(b, ChunkGrid):
+        if len(b.data) == 1:
+            b = next(iter(b.data.values()))
+        else:
+            raise RuntimeError("comparing multi-chunk grid")
     if b.ndim == 0:
         if not np.array_equal(
@@ -104,7 +114,9 @@ def __getitem__(self, key: Any) -> Self:
 
     def __setitem__(self, key: Any, value: Any) -> None:
         if isinstance(value, NDBuffer):
-            value = value._data
+            if len(value._data.data) != 1:
+                raise RuntimeError("setting a non-one-sized buffer is not allowed")
+            value = first_value(value._data.data)
 
         slice_sizes = tuple(
             slice_size(slice_, size) for slice_, size in zip(key, self._data.shape)
diff --git a/zarr_sparse/codec/codec.py b/zarr_sparse/codec/codec.py
index b6ab650..9409c3c 100644
--- a/zarr_sparse/codec/codec.py
+++ b/zarr_sparse/codec/codec.py
@@ -9,11 +9,12 @@ from zarr.codecs import BytesCodec, ZstdCodec
 from zarr.core.array_spec import ArrayConfig, ArraySpec
 from zarr.core.buffer import Buffer, NDBuffer
+from zarr.core.buffer.cpu import Buffer as CPUBuffer
 from zarr.core.common import JSON, parse_named_configuration
 from zarr.core.dtype.npy.int import Int64
 from zarr.registry import get_pipeline_class, register_codec
 
-from zarr_sparse.buffer import sparse_buffer_prototype
+from zarr_sparse.buffer import SparseNDBuffer, sparse_buffer_prototype
 from zarr_sparse.codec import metadata
 from zarr_sparse.combine import first_value
 from zarr_sparse.comparison import compare_fill_value
 
@@ -104,6 +105,9 @@ async def decode_metadata_table(table_data: Buffer) -> dict[str, Any]:
 
 
 class SparseArrayCodec(ArrayBytesCodec):
+    codec_input = SparseNDBuffer
+    codec_output = CPUBuffer
+
     def __init__(self):
         self.array_codecs = (BytesCodec(), ZstdCodec())
         self.table_codecs = (BytesCodec(),)