diff --git a/pyproject.toml b/pyproject.toml index e9722711b..80e03322a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -177,7 +177,6 @@ extend-select = [ ignore = [ "RUF003", "RUF005", - "RUF009", "RUF012", "RUF015", ] diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 028d1757c..0836d878a 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -13,7 +13,8 @@ if TYPE_CHECKING: from typing_extensions import Self - from zarr.common import ArraySpec, SliceSelection + from zarr.common import ArraySpec + from zarr.indexing import SelectorTuple from zarr.metadata import ArrayMetadata @@ -155,13 +156,13 @@ class ArrayBytesCodecPartialDecodeMixin: """Mixin for array-to-bytes codecs that implement partial decoding.""" async def _decode_partial_single( - self, byte_getter: ByteGetter, selection: SliceSelection, chunk_spec: ArraySpec + self, byte_getter: ByteGetter, selection: SelectorTuple, chunk_spec: ArraySpec ) -> NDBuffer | None: raise NotImplementedError async def decode_partial( self, - batch_info: Iterable[tuple[ByteGetter, SliceSelection, ArraySpec]], + batch_info: Iterable[tuple[ByteGetter, SelectorTuple, ArraySpec]], ) -> Iterable[NDBuffer | None]: """Partially decodes a batch of chunks. This method determines parts of a chunk from the slice selection, @@ -169,7 +170,7 @@ async def decode_partial( Parameters ---------- - batch_info : Iterable[tuple[ByteGetter, SliceSelection, ArraySpec]] + batch_info : Iterable[tuple[ByteGetter, SelectorTuple, ArraySpec]] Ordered set of information about slices of encoded chunks. The slice selection determines which parts of the chunk will be fetched. The ByteGetter is used to fetch the necessary bytes. @@ -196,14 +197,14 @@ async def _encode_partial_single( self, byte_setter: ByteSetter, chunk_array: NDBuffer, - selection: SliceSelection, + selection: SelectorTuple, chunk_spec: ArraySpec, ) -> None: raise NotImplementedError async def encode_partial( self, - batch_info: Iterable[tuple[ByteSetter, NDBuffer, SliceSelection, ArraySpec]], + batch_info: Iterable[tuple[ByteSetter, NDBuffer, SelectorTuple, ArraySpec]], ) -> None: """Partially encodes a batch of chunks. This method determines parts of a chunk from the slice selection, encodes them and @@ -213,7 +214,7 @@ async def encode_partial( Parameters ---------- - batch_info : Iterable[tuple[ByteSetter, NDBuffer, SliceSelection, ArraySpec]] + batch_info : Iterable[tuple[ByteSetter, NDBuffer, SelectorTuple, ArraySpec]] Ordered set of information about slices of to-be-encoded chunks. The slice selection determines which parts of the chunk will be encoded. The ByteSetter is used to write the necessary bytes and fetch bytes for existing chunk data. @@ -342,15 +343,16 @@ async def encode( @abstractmethod async def read( self, - batch_info: Iterable[tuple[ByteGetter, ArraySpec, SliceSelection, SliceSelection]], + batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]], out: NDBuffer, + drop_axes: tuple[int, ...] = (), ) -> None: """Reads chunk data from the store, decodes it and writes it into an output array. Partial decoding may be utilized if the codecs and stores support it. Parameters ---------- - batch_info : Iterable[tuple[ByteGetter, ArraySpec, SliceSelection, SliceSelection]] + batch_info : Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]] Ordered set of information about the chunks. The first slice selection determines which parts of the chunk will be fetched. The second slice selection determines where in the output array the chunk data will be written. @@ -363,8 +365,9 @@ async def read( @abstractmethod async def write( self, - batch_info: Iterable[tuple[ByteSetter, ArraySpec, SliceSelection, SliceSelection]], + batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]], value: NDBuffer, + drop_axes: tuple[int, ...] = (), ) -> None: """Encodes chunk data and writes it to the store. Merges with existing chunk data by reading first, if necessary. @@ -372,7 +375,7 @@ async def write( Parameters ---------- - batch_info : Iterable[tuple[ByteSetter, ArraySpec, SliceSelection, SliceSelection]] + batch_info : Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]] Ordered set of information about the chunks. The first slice selection determines which parts of the chunk will be encoded. The second slice selection determines where in the value array the chunk data is located. diff --git a/src/zarr/array.py b/src/zarr/array.py index 2c6fab6e5..3e6cf5531 100644 --- a/src/zarr/array.py +++ b/src/zarr/array.py @@ -12,7 +12,7 @@ from asyncio import gather from collections.abc import Iterable from dataclasses import dataclass, replace -from typing import Any, Literal +from typing import Any, Literal, cast import numpy as np import numpy.typing as npt @@ -33,9 +33,32 @@ Selection, ZarrFormat, concurrent_map, + product, ) from zarr.config import config, parse_indexing_order -from zarr.indexing import BasicIndexer +from zarr.indexing import ( + BasicIndexer, + BasicSelection, + BlockIndex, + BlockIndexer, + BlockSelection, + CoordinateIndexer, + CoordinateSelection, + Fields, + Indexer, + MaskIndexer, + MaskSelection, + OIndex, + OrthogonalIndexer, + OrthogonalSelection, + VIndex, + check_fields, + check_no_multi_fields, + is_pure_fancy_indexing, + is_pure_orthogonal_indexing, + is_scalar, + pop_fields, +) from zarr.metadata import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata from zarr.store import StoreLike, StorePath, make_store_path from zarr.sync import sync @@ -378,6 +401,51 @@ def basename(self) -> str | None: return self.name.split("/")[-1] return None + async def _get_selection( + self, + indexer: Indexer, + *, + out: NDBuffer | None = None, + factory: Factory.Create = NDBuffer.create, + fields: Fields | None = None, + ) -> NDArrayLike: + # check fields are sensible + out_dtype = check_fields(fields, self.dtype) + + # setup output buffer + if out is not None: + if isinstance(out, NDBuffer): + out_buffer = out + else: + raise TypeError(f"out argument needs to be an NDBuffer. Got {type(out)!r}") + if out_buffer.shape != indexer.shape: + raise ValueError( + f"shape of out argument doesn't match. Expected {indexer.shape}, got {out.shape}" + ) + else: + out_buffer = factory( + shape=indexer.shape, + dtype=out_dtype, + order=self.order, + fill_value=self.metadata.fill_value, + ) + if product(indexer.shape) > 0: + # reading chunks and decoding them + await self.metadata.codec_pipeline.read( + [ + ( + self.store_path / self.metadata.encode_chunk_key(chunk_coords), + self.metadata.get_chunk_spec(chunk_coords, self.order), + chunk_selection, + out_selection, + ) + for chunk_coords, chunk_selection, out_selection in indexer + ], + out_buffer, + drop_axes=indexer.drop_axes, + ) + return out_buffer.as_ndarray_like() + async def getitem( self, selection: Selection, *, factory: Factory.Create = NDBuffer.create ) -> NDArrayLike: @@ -386,48 +454,24 @@ async def getitem( shape=self.metadata.shape, chunk_grid=self.metadata.chunk_grid, ) - - # setup output array - out = factory( - shape=indexer.shape, - dtype=self.metadata.dtype, - order=self.order, - fill_value=0, # TODO use fill_value - ) - - # reading chunks and decoding them - await self.metadata.codec_pipeline.read( - [ - ( - self.store_path / self.metadata.encode_chunk_key(chunk_coords), - self.metadata.get_chunk_spec(chunk_coords, self.order), - chunk_selection, - out_selection, - ) - for chunk_coords, chunk_selection, out_selection in indexer - ], - out, - ) - return out.as_ndarray_like() + return await self._get_selection(indexer, factory=factory) async def _save_metadata(self, metadata: ArrayMetadata) -> None: to_save = metadata.to_buffer_dict() awaitables = [set_or_delete(self.store_path / key, value) for key, value in to_save.items()] await gather(*awaitables) - async def setitem( + async def _set_selection( self, - selection: Selection, + indexer: Indexer, value: NDArrayLike, + *, factory: Factory.NDArrayLike = NDBuffer.from_ndarray_like, + fields: Fields | None = None, ) -> None: - indexer = BasicIndexer( - selection, - shape=self.metadata.shape, - chunk_grid=self.metadata.chunk_grid, - ) - - sel_shape = indexer.shape + # check fields are sensible + check_fields(fields, self.dtype) + fields = check_no_multi_fields(fields) # check value shape if np.isscalar(value): @@ -435,7 +479,9 @@ async def setitem( else: if not hasattr(value, "shape"): value = np.asarray(value, self.metadata.dtype) - assert value.shape == sel_shape + # assert ( + # value.shape == indexer.shape + # ), f"shape of value doesn't match indexer shape. Expected {indexer.shape}, got {value.shape}" if value.dtype.name != self.metadata.dtype.name: value = value.astype(self.metadata.dtype, order="A") @@ -456,7 +502,21 @@ async def setitem( for chunk_coords, chunk_selection, out_selection in indexer ], value_buffer, + drop_axes=indexer.drop_axes, + ) + + async def setitem( + self, + selection: Selection, + value: NDArrayLike, + factory: Factory.NDArrayLike = NDBuffer.from_ndarray_like, + ) -> None: + indexer = BasicIndexer( + selection, + shape=self.metadata.shape, + chunk_grid=self.metadata.chunk_grid, ) + return await self._set_selection(indexer, value, factory=factory) async def resize( self, new_shape: ChunkCoords, delete_outside_chunks: bool = True @@ -621,14 +681,135 @@ def order(self) -> Literal["C", "F"]: return self._async_array.order def __getitem__(self, selection: Selection) -> NDArrayLike: - return sync( - self._async_array.getitem(selection), - ) + fields, pure_selection = pop_fields(selection) + if is_pure_fancy_indexing(pure_selection, self.ndim): + return self.vindex[cast(CoordinateSelection | MaskSelection, selection)] + elif is_pure_orthogonal_indexing(pure_selection, self.ndim): + return self.get_orthogonal_selection(pure_selection, fields=fields) + else: + return self.get_basic_selection(cast(BasicSelection, pure_selection), fields=fields) def __setitem__(self, selection: Selection, value: NDArrayLike) -> None: - sync( - self._async_array.setitem(selection, value), - ) + fields, pure_selection = pop_fields(selection) + if is_pure_fancy_indexing(pure_selection, self.ndim): + self.vindex[cast(CoordinateSelection | MaskSelection, selection)] = value + elif is_pure_orthogonal_indexing(pure_selection, self.ndim): + self.set_orthogonal_selection(pure_selection, value, fields=fields) + else: + self.set_basic_selection(cast(BasicSelection, pure_selection), value, fields=fields) + + def get_basic_selection( + self, + selection: BasicSelection = Ellipsis, + out: NDBuffer | None = None, + fields: Fields | None = None, + ) -> NDArrayLike: + if self.shape == (): + raise NotImplementedError + else: + return sync( + self._async_array._get_selection( + BasicIndexer(selection, self.shape, self.metadata.chunk_grid), + out=out, + fields=fields, + ) + ) + + def set_basic_selection( + self, selection: BasicSelection, value: NDArrayLike, fields: Fields | None = None + ) -> None: + indexer = BasicIndexer(selection, self.shape, self.metadata.chunk_grid) + sync(self._async_array._set_selection(indexer, value, fields=fields)) + + def get_orthogonal_selection( + self, + selection: OrthogonalSelection, + out: NDBuffer | None = None, + fields: Fields | None = None, + ) -> NDArrayLike: + indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) + return sync(self._async_array._get_selection(indexer=indexer, out=out, fields=fields)) + + def set_orthogonal_selection( + self, selection: OrthogonalSelection, value: NDArrayLike, fields: Fields | None = None + ) -> None: + indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) + return sync(self._async_array._set_selection(indexer, value, fields=fields)) + + def get_mask_selection( + self, mask: MaskSelection, out: NDBuffer | None = None, fields: Fields | None = None + ) -> NDArrayLike: + indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) + return sync(self._async_array._get_selection(indexer=indexer, out=out, fields=fields)) + + def set_mask_selection( + self, mask: MaskSelection, value: NDArrayLike, fields: Fields | None = None + ) -> None: + indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) + sync(self._async_array._set_selection(indexer, value, fields=fields)) + + def get_coordinate_selection( + self, + selection: CoordinateSelection, + out: NDBuffer | None = None, + fields: Fields | None = None, + ) -> NDArrayLike: + indexer = CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid) + out_array = sync(self._async_array._get_selection(indexer=indexer, out=out, fields=fields)) + + # restore shape + out_array = out_array.reshape(indexer.sel_shape) + return out_array + + def set_coordinate_selection( + self, selection: CoordinateSelection, value: NDArrayLike, fields: Fields | None = None + ) -> None: + # setup indexer + indexer = CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid) + + # handle value - need ndarray-like flatten value + if not is_scalar(value, self.dtype): + try: + from numcodecs.compat import ensure_ndarray_like + + value = ensure_ndarray_like(value) # TODO replace with agnostic + except TypeError: + # Handle types like `list` or `tuple` + value = np.array(value) # TODO replace with agnostic + if hasattr(value, "shape") and len(value.shape) > 1: + value = value.reshape(-1) + + sync(self._async_array._set_selection(indexer, value, fields=fields)) + + def get_block_selection( + self, + selection: BlockSelection, + out: NDBuffer | None = None, + fields: Fields | None = None, + ) -> NDArrayLike: + indexer = BlockIndexer(selection, self.shape, self.metadata.chunk_grid) + return sync(self._async_array._get_selection(indexer=indexer, out=out, fields=fields)) + + def set_block_selection( + self, + selection: BlockSelection, + value: NDArrayLike, + fields: Fields | None = None, + ) -> None: + indexer = BlockIndexer(selection, self.shape, self.metadata.chunk_grid) + sync(self._async_array._set_selection(indexer, value, fields=fields)) + + @property + def vindex(self) -> VIndex: + return VIndex(self) + + @property + def oindex(self) -> OIndex: + return OIndex(self) + + @property + def blocks(self) -> BlockIndex: + return BlockIndex(self) def resize(self, new_shape: ChunkCoords) -> Array: return type(self)( diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 59994e70d..138c7f66d 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -63,7 +63,9 @@ def __getitem__(self, key: slice) -> Self: ... def __setitem__(self, key: slice, value: Any) -> None: ... - def reshape(self, shape: ChunkCoords, *, order: Literal["A", "C", "F"] = ...) -> Self: ... + def reshape( + self, shape: ChunkCoords | Literal[-1], *, order: Literal["A", "C", "F"] = ... + ) -> Self: ... def view(self, dtype: npt.DTypeLike) -> Self: ... @@ -304,7 +306,7 @@ class NDBuffer: """ def __init__(self, array: NDArrayLike): - assert array.ndim > 0 + # assert array.ndim > 0 assert array.dtype != object self._data = array @@ -418,7 +420,11 @@ def byteorder(self) -> Endian: else: return Endian(sys.byteorder) - def reshape(self, newshape: ChunkCoords) -> Self: + def reshape(self, newshape: ChunkCoords | Literal[-1]) -> Self: + return self.__class__(self._data.reshape(newshape)) + + def squeeze(self, axis: tuple[int, ...]) -> Self: + newshape = tuple(a for i, a in enumerate(self.shape) if i not in axis) return self.__class__(self._data.reshape(newshape)) def astype(self, dtype: npt.DTypeLike, order: Literal["K", "A", "C", "F"] = "K") -> Self: @@ -435,6 +441,9 @@ def __setitem__(self, key: Any, value: Any) -> None: def __len__(self) -> int: return self._data.__len__() + def __repr__(self) -> str: + return f"" + def all_equal(self, other: Any) -> bool: return bool((self._data == other).all()) diff --git a/src/zarr/chunk_grids.py b/src/zarr/chunk_grids.py index f6366b803..941f79984 100644 --- a/src/zarr/chunk_grids.py +++ b/src/zarr/chunk_grids.py @@ -1,8 +1,11 @@ from __future__ import annotations import itertools +import operator +from abc import abstractmethod from collections.abc import Iterator from dataclasses import dataclass +from functools import reduce from typing import TYPE_CHECKING from zarr.abc.metadata import Metadata @@ -13,7 +16,7 @@ parse_named_configuration, parse_shapelike, ) -from zarr.indexing import _ceildiv +from zarr.indexing import ceildiv if TYPE_CHECKING: from typing_extensions import Self @@ -31,8 +34,13 @@ def from_dict(cls, data: dict[str, JSON] | ChunkGrid) -> ChunkGrid: return RegularChunkGrid._from_dict(data) raise ValueError(f"Unknown chunk grid. Got {name_parsed}.") + @abstractmethod def all_chunk_coords(self, array_shape: ChunkCoords) -> Iterator[ChunkCoords]: - raise NotImplementedError + pass + + @abstractmethod + def get_nchunks(self, array_shape: ChunkCoords) -> int: + pass @dataclass(frozen=True) @@ -55,5 +63,12 @@ def to_dict(self) -> dict[str, JSON]: def all_chunk_coords(self, array_shape: ChunkCoords) -> Iterator[ChunkCoords]: return itertools.product( - *(range(0, _ceildiv(s, c)) for s, c in zip(array_shape, self.chunk_shape, strict=False)) + *(range(0, ceildiv(s, c)) for s, c in zip(array_shape, self.chunk_shape, strict=False)) + ) + + def get_nchunks(self, array_shape: ChunkCoords) -> int: + return reduce( + operator.mul, + (ceildiv(s, c) for s, c in zip(array_shape, self.chunk_shape, strict=True)), + 1, ) diff --git a/src/zarr/codecs/__init__.py b/src/zarr/codecs/__init__.py index 3ef3a87db..939428431 100644 --- a/src/zarr/codecs/__init__.py +++ b/src/zarr/codecs/__init__.py @@ -11,12 +11,12 @@ __all__ = [ "BatchedCodecPipeline", - "BloscCodec", "BloscCname", + "BloscCodec", "BloscShuffle", "BytesCodec", - "Endian", "Crc32cCodec", + "Endian", "GzipCodec", "ShardingCodec", "ShardingCodecIndexLocation", diff --git a/src/zarr/codecs/pipeline.py b/src/zarr/codecs/pipeline.py index 6f493c9e8..ada4ae23f 100644 --- a/src/zarr/codecs/pipeline.py +++ b/src/zarr/codecs/pipeline.py @@ -20,13 +20,13 @@ from zarr.codecs.registry import get_codec_class from zarr.common import JSON, concurrent_map, parse_named_configuration from zarr.config import config -from zarr.indexing import is_total_slice +from zarr.indexing import SelectorTuple, is_scalar, is_total_slice from zarr.metadata import ArrayMetadata if TYPE_CHECKING: from typing_extensions import Self - from zarr.common import ArraySpec, SliceSelection + from zarr.common import ArraySpec T = TypeVar("T") U = TypeVar("U") @@ -247,7 +247,7 @@ async def decode_batch( async def decode_partial_batch( self, - batch_info: Iterable[tuple[ByteGetter, SliceSelection, ArraySpec]], + batch_info: Iterable[tuple[ByteGetter, SelectorTuple, ArraySpec]], ) -> Iterable[NDBuffer | None]: assert self.supports_partial_decode assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialDecodeMixin) @@ -282,7 +282,7 @@ async def encode_batch( async def encode_partial_batch( self, - batch_info: Iterable[tuple[ByteSetter, NDBuffer, SliceSelection, ArraySpec]], + batch_info: Iterable[tuple[ByteSetter, NDBuffer, SelectorTuple, ArraySpec]], ) -> None: assert self.supports_partial_encode assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialEncodeMixin) @@ -290,8 +290,9 @@ async def encode_partial_batch( async def read_batch( self, - batch_info: Iterable[tuple[ByteGetter, ArraySpec, SliceSelection, SliceSelection]], + batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]], out: NDBuffer, + drop_axes: tuple[int, ...] = (), ) -> None: if self.supports_partial_decode: chunk_array_batch = await self.decode_partial_batch( @@ -326,14 +327,55 @@ async def read_batch( ): if chunk_array is not None: tmp = chunk_array[chunk_selection] + if drop_axes != (): + tmp = tmp.squeeze(axis=drop_axes) out[out_selection] = tmp else: out[out_selection] = chunk_spec.fill_value + def _merge_chunk_array( + self, + existing_chunk_array: NDBuffer | None, + value: NDBuffer, + out_selection: SelectorTuple, + chunk_spec: ArraySpec, + chunk_selection: SelectorTuple, + drop_axes: tuple[int, ...], + ) -> NDBuffer: + if is_total_slice(chunk_selection, chunk_spec.shape) and value.shape == chunk_spec.shape: + return value + if existing_chunk_array is None: + chunk_array = NDBuffer.create( + shape=chunk_spec.shape, + dtype=chunk_spec.dtype, + order=chunk_spec.order, + fill_value=chunk_spec.fill_value, + ) + else: + chunk_array = existing_chunk_array.copy() # make a writable copy + if chunk_selection == (): + chunk_value = value + elif is_scalar(value.as_ndarray_like(), chunk_spec.dtype): + chunk_value = value + else: + chunk_value = value[out_selection] + # handle missing singleton dimensions + if drop_axes != (): + item = tuple( + None # equivalent to np.newaxis + if idx in drop_axes + else slice(None) + for idx in range(chunk_spec.ndim) + ) + chunk_value = chunk_value[item] + chunk_array[chunk_selection] = chunk_value + return chunk_array + async def write_batch( self, - batch_info: Iterable[tuple[ByteSetter, ArraySpec, SliceSelection, SliceSelection]], + batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]], value: NDBuffer, + drop_axes: tuple[int, ...] = (), ) -> None: if self.supports_partial_encode: await self.encode_partial_batch( @@ -368,28 +410,10 @@ async def _read_key(byte_setter: ByteSetter | None) -> Buffer | None: ], ) - def _merge_chunk_array( - existing_chunk_array: NDBuffer | None, - new_chunk_array_slice: NDBuffer, - chunk_spec: ArraySpec, - chunk_selection: SliceSelection, - ) -> NDBuffer: - if is_total_slice(chunk_selection, chunk_spec.shape): - return new_chunk_array_slice - if existing_chunk_array is None: - chunk_array = NDBuffer.create( - shape=chunk_spec.shape, - dtype=chunk_spec.dtype, - order=chunk_spec.order, - fill_value=chunk_spec.fill_value, - ) - else: - chunk_array = existing_chunk_array.copy() # make a writable copy - chunk_array[chunk_selection] = new_chunk_array_slice - return chunk_array - chunk_array_batch = [ - _merge_chunk_array(chunk_array, value[out_selection], chunk_spec, chunk_selection) + self._merge_chunk_array( + chunk_array, value, out_selection, chunk_spec, chunk_selection, drop_axes + ) for chunk_array, (_, chunk_spec, chunk_selection, out_selection) in zip( chunk_array_batch, batch_info, strict=False ) @@ -450,12 +474,13 @@ async def encode( async def read( self, - batch_info: Iterable[tuple[ByteGetter, ArraySpec, SliceSelection, SliceSelection]], + batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]], out: NDBuffer, + drop_axes: tuple[int, ...] = (), ) -> None: await concurrent_map( [ - (single_batch_info, out) + (single_batch_info, out, drop_axes) for single_batch_info in batched(batch_info, self.batch_size) ], self.read_batch, @@ -464,12 +489,13 @@ async def read( async def write( self, - batch_info: Iterable[tuple[ByteSetter, ArraySpec, SliceSelection, SliceSelection]], + batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]], value: NDBuffer, + drop_axes: tuple[int, ...] = (), ) -> None: await concurrent_map( [ - (single_batch_info, value) + (single_batch_info, value, drop_axes) for single_batch_info in batched(batch_info, self.batch_size) ], self.write_batch, diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index a7b6edc3b..dab2810f3 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -33,11 +33,7 @@ parse_shapelike, product, ) -from zarr.indexing import ( - BasicIndexer, - c_order_iter, - morton_order_iter, -) +from zarr.indexing import BasicIndexer, SelectorTuple, c_order_iter, get_indexer, morton_order_iter from zarr.metadata import ArrayMetadata, parse_codecs if TYPE_CHECKING: @@ -45,7 +41,7 @@ from typing_extensions import Self - from zarr.common import JSON, SliceSelection + from zarr.common import JSON MAX_UINT_64 = 2**64 - 1 ShardMapping = Mapping[ChunkCoords, Buffer] @@ -423,7 +419,7 @@ async def _decode_single( async def _decode_partial_single( self, byte_getter: ByteGetter, - selection: SliceSelection, + selection: SelectorTuple, shard_spec: ArraySpec, ) -> NDBuffer | None: shard_shape = shard_spec.shape @@ -431,7 +427,7 @@ async def _decode_partial_single( chunks_per_shard = self._get_chunks_per_shard(shard_spec) chunk_spec = self._get_chunk_spec(shard_spec) - indexer = BasicIndexer( + indexer = get_indexer( selection, shape=shard_shape, chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape), @@ -520,7 +516,7 @@ async def _encode_partial_single( self, byte_setter: ByteSetter, shard_array: NDBuffer, - selection: SliceSelection, + selection: SelectorTuple, shard_spec: ArraySpec, ) -> None: shard_shape = shard_spec.shape @@ -535,10 +531,8 @@ async def _encode_partial_single( ) indexer = list( - BasicIndexer( - selection, - shape=shard_shape, - chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape), + get_indexer( + selection, shape=shard_shape, chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape) ) ) diff --git a/src/zarr/indexing.py b/src/zarr/indexing.py index 6bc83d506..98130fe0c 100644 --- a/src/zarr/indexing.py +++ b/src/zarr/indexing.py @@ -2,84 +2,294 @@ import itertools import math -from collections.abc import Iterator -from typing import TYPE_CHECKING, NamedTuple +import numbers +import operator +from collections.abc import Iterator, Sequence +from dataclasses import dataclass +from enum import Enum +from functools import reduce +from types import EllipsisType +from typing import ( + TYPE_CHECKING, + Any, + NamedTuple, + Protocol, + TypeGuard, + TypeVar, + cast, + runtime_checkable, +) -from zarr.common import ChunkCoords, Selection, SliceSelection, product +import numpy as np +import numpy.typing as npt + +from zarr.common import ChunkCoords, product if TYPE_CHECKING: + from zarr.array import Array + from zarr.buffer import NDArrayLike from zarr.chunk_grids import ChunkGrid +BasicSelector = int | slice | EllipsisType +BasicSelectorTuple = tuple[BasicSelector, ...] +BasicSelection = BasicSelector | BasicSelectorTuple +BasicSelectionNormalized = tuple[int | slice, ...] +CoordinateSelector = list[int] | npt.NDArray[np.intp] +CoordinateSelection = CoordinateSelector | tuple[CoordinateSelector, ...] +CoordinateSelectionNormalized = tuple[npt.NDArray[np.intp], ...] +BlockSelector = int | slice +BlockSelection = BlockSelector | tuple[BlockSelector, ...] +BlockSelectionNormalized = tuple[BlockSelector, ...] +MaskSelection = npt.NDArray[np.bool_] +OrthogonalSelector = int | slice | npt.NDArray[np.intp] | npt.NDArray[np.bool_] +OrthogonalSelection = OrthogonalSelector | tuple[OrthogonalSelector, ...] +OrthogonalSelectionNormalized = tuple[OrthogonalSelector, ...] -def _ensure_tuple(v: Selection) -> SliceSelection: - if not isinstance(v, tuple): - v = (v,) - return v +Selection = ( + BasicSelection | CoordinateSelection | BlockSelection | MaskSelection | OrthogonalSelection +) +SelectionNormalized = ( + BasicSelectionNormalized + | CoordinateSelectionNormalized + | BlockSelectionNormalized + | MaskSelection + | OrthogonalSelectionNormalized +) +Selector = int | slice | npt.NDArray[np.intp] | npt.NDArray[np.bool_] +SelectionWithFields = Selection | str | Sequence[str] +SelectorTuple = tuple[Selector, ...] | npt.NDArray[np.intp] | slice +Fields = str | list[str] | tuple[str, ...] + + +class ArrayIndexError(IndexError): + pass + + +class BoundsCheckError(IndexError): + _msg = "" + + def __init__(self, dim_len: int): + self._msg = f"index out of bounds for dimension with length {dim_len}" + + +class NegativeStepError(IndexError): + _msg = "only slices with step >= 1 are supported" -def _err_too_many_indices(selection: SliceSelection, shape: ChunkCoords) -> None: +class VindexInvalidSelectionError(IndexError): + _msg = ( + "unsupported selection type for vectorized indexing; only " + "coordinate selection (tuple of integer arrays) and mask selection " + "(single Boolean array) are supported; got {0!r}" + ) + + +def err_too_many_indices(selection: Any, shape: ChunkCoords) -> None: raise IndexError(f"too many indices for array; expected {len(shape)}, got {len(selection)}") -def _err_negative_step() -> None: - raise IndexError("only slices with step >= 1 are supported") +@runtime_checkable +class Indexer(Protocol): + shape: ChunkCoords + drop_axes: ChunkCoords + def __iter__(self) -> Iterator[ChunkProjection]: ... -def _check_selection_length(selection: SliceSelection, shape: ChunkCoords) -> None: - if len(selection) > len(shape): - _err_too_many_indices(selection, shape) +def ceildiv(a: float, b: float) -> int: + return math.ceil(a / b) -def _ensure_selection( - selection: Selection, - shape: ChunkCoords, -) -> SliceSelection: - selection = _ensure_tuple(selection) - # fill out selection if not completely specified - if len(selection) < len(shape): - selection += (slice(None),) * (len(shape) - len(selection)) +def is_integer(x: Any) -> TypeGuard[int]: + """True if x is an integer (both pure Python or NumPy). + + Note that Python's bool is considered an integer too. + """ + return isinstance(x, numbers.Integral) + + +def is_integer_list(x: Any) -> TypeGuard[list[int]]: + """True if x is a list of integers. + + This function assumes ie *does not check* that all elements of the list + have the same type. Mixed type lists will result in other errors that will + bubble up anyway. + """ + return isinstance(x, list) and len(x) > 0 and is_integer(x[0]) + + +def is_integer_array(x: Any, ndim: int | None = None) -> TypeGuard[npt.NDArray[np.intp]]: + t = not np.isscalar(x) and hasattr(x, "shape") and hasattr(x, "dtype") and x.dtype.kind in "ui" + if ndim is not None: + t = t and hasattr(x, "shape") and len(x.shape) == ndim + return t + + +def is_bool_array(x: Any, ndim: int | None = None) -> TypeGuard[npt.NDArray[np.bool_]]: + t = hasattr(x, "shape") and hasattr(x, "dtype") and x.dtype == bool + if ndim is not None: + t = t and hasattr(x, "shape") and len(x.shape) == ndim + return t + + +def is_scalar(value: Any, dtype: np.dtype[Any]) -> bool: + if np.isscalar(value): + return True + if hasattr(value, "shape") and value.shape == (): + return True + if isinstance(value, tuple) and dtype.names and len(value) == len(dtype.names): + return True + return False + + +def is_pure_fancy_indexing(selection: Any, ndim: int) -> bool: + """Check whether a selection contains only scalars or integer array-likes. + + Parameters + ---------- + selection : tuple, slice, or scalar + A valid selection value for indexing into arrays. + + Returns + ------- + is_pure : bool + True if the selection is a pure fancy indexing expression (ie not mixed + with boolean or slices). + """ + if ndim == 1: + if is_integer_list(selection) or is_integer_array(selection): + return True + # if not, we go through the normal path below, because a 1-tuple + # of integers is also allowed. + no_slicing = ( + isinstance(selection, tuple) + and len(selection) == ndim + and not (any(isinstance(elem, slice) or elem is Ellipsis for elem in selection)) + ) + return ( + no_slicing + and all( + is_integer(elem) or is_integer_list(elem) or is_integer_array(elem) + for elem in selection + ) + and any(is_integer_list(elem) or is_integer_array(elem) for elem in selection) + ) - # check selection not too long - _check_selection_length(selection, shape) - return selection +def is_pure_orthogonal_indexing(selection: Selection, ndim: int) -> TypeGuard[OrthogonalSelection]: + if not ndim: + return False + # Case 1: Selection is a single iterable of integers + if is_integer_list(selection) or is_integer_array(selection, ndim=1): + return True + + # Case two: selection contains either zero or one integer iterables. + # All other selection elements are slices or integers + return ( + isinstance(selection, tuple) + and len(selection) == ndim + and sum(is_integer_list(elem) or is_integer_array(elem) for elem in selection) <= 1 + and all( + is_integer_list(elem) or is_integer_array(elem) or isinstance(elem, int | slice) + for elem in selection + ) + ) + + +def get_chunk_shape(chunk_grid: ChunkGrid) -> ChunkCoords: + from zarr.chunk_grids import RegularChunkGrid + + assert isinstance( + chunk_grid, RegularChunkGrid + ), "Only regular chunk grid is supported, currently." + return chunk_grid.chunk_shape + + +def normalize_integer_selection(dim_sel: int, dim_len: int) -> int: + # normalize type to int + dim_sel = int(dim_sel) + + # handle wraparound + if dim_sel < 0: + dim_sel = dim_len + dim_sel + + # handle out of bounds + if dim_sel >= dim_len or dim_sel < 0: + raise BoundsCheckError(dim_len) + + return dim_sel + + +class ChunkDimProjection(NamedTuple): + """A mapping from chunk to output array for a single dimension. + + Parameters + ---------- + dim_chunk_ix + Index of chunk. + dim_chunk_sel + Selection of items from chunk array. + dim_out_sel + Selection of items in target (output) array. + + """ -class _ChunkDimProjection(NamedTuple): dim_chunk_ix: int - dim_chunk_sel: slice - dim_out_sel: slice | None + dim_chunk_sel: Selector + dim_out_sel: Selector | None -def _ceildiv(a: float, b: float) -> int: - return math.ceil(a / b) +@dataclass(frozen=True) +class IntDimIndexer: + dim_sel: int + dim_len: int + dim_chunk_len: int + nitems: int = 1 + + def __init__(self, dim_sel: int, dim_len: int, dim_chunk_len: int): + object.__setattr__(self, "dim_sel", normalize_integer_selection(dim_sel, dim_len)) + object.__setattr__(self, "dim_len", dim_len) + object.__setattr__(self, "dim_chunk_len", dim_chunk_len) + def __iter__(self) -> Iterator[ChunkDimProjection]: + dim_chunk_ix = self.dim_sel // self.dim_chunk_len + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_chunk_sel = self.dim_sel - dim_offset + dim_out_sel = None + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) -class _SliceDimIndexer: - dim_sel: slice + +@dataclass(frozen=True) +class SliceDimIndexer: dim_len: int dim_chunk_len: int nitems: int + nchunks: int start: int stop: int step: int def __init__(self, dim_sel: slice, dim_len: int, dim_chunk_len: int): - self.start, self.stop, self.step = dim_sel.indices(dim_len) - if self.step < 1: - _err_negative_step() + # normalize + start, stop, step = dim_sel.indices(dim_len) + if step < 1: + raise NegativeStepError + + object.__setattr__(self, "start", start) + object.__setattr__(self, "stop", stop) + object.__setattr__(self, "step", step) - self.dim_len = dim_len - self.dim_chunk_len = dim_chunk_len - self.nitems = max(0, _ceildiv((self.stop - self.start), self.step)) - self.nchunks = _ceildiv(self.dim_len, self.dim_chunk_len) + object.__setattr__(self, "dim_len", dim_len) + object.__setattr__(self, "dim_chunk_len", dim_chunk_len) + object.__setattr__(self, "nitems", max(0, ceildiv((stop - start), step))) + object.__setattr__(self, "nchunks", ceildiv(dim_len, dim_chunk_len)) - def __iter__(self) -> Iterator[_ChunkDimProjection]: + def __iter__(self) -> Iterator[ChunkDimProjection]: # figure out the range of chunks we need to visit dim_chunk_ix_from = self.start // self.dim_chunk_len - dim_chunk_ix_to = _ceildiv(self.stop, self.dim_chunk_len) + dim_chunk_ix_to = ceildiv(self.stop, self.dim_chunk_len) # iterate over chunks in range for dim_chunk_ix in range(dim_chunk_ix_from, dim_chunk_ix_to): @@ -97,7 +307,7 @@ def __iter__(self) -> Iterator[_ChunkDimProjection]: if remainder: dim_chunk_sel_start += self.step - remainder # compute number of previous items, provides offset into output array - dim_out_offset = _ceildiv((dim_offset - self.start), self.step) + dim_out_offset = ceildiv((dim_offset - self.start), self.step) else: # selection starts within current chunk @@ -113,43 +323,609 @@ def __iter__(self) -> Iterator[_ChunkDimProjection]: dim_chunk_sel_stop = self.stop - dim_offset dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop, self.step) - dim_chunk_nitems = _ceildiv((dim_chunk_sel_stop - dim_chunk_sel_start), self.step) + dim_chunk_nitems = ceildiv((dim_chunk_sel_stop - dim_chunk_sel_start), self.step) + + # If there are no elements on the selection within this chunk, then skip + if dim_chunk_nitems == 0: + continue + dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) - yield _ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + + +def check_selection_length(selection: SelectionNormalized, shape: ChunkCoords) -> None: + if len(selection) > len(shape): + err_too_many_indices(selection, shape) + + +def replace_ellipsis(selection: Any, shape: ChunkCoords) -> SelectionNormalized: + selection = ensure_tuple(selection) + + # count number of ellipsis present + n_ellipsis = sum(1 for i in selection if i is Ellipsis) + + if n_ellipsis > 1: + # more than 1 is an error + raise IndexError("an index can only have a single ellipsis ('...')") + + elif n_ellipsis == 1: + # locate the ellipsis, count how many items to left and right + n_items_l = selection.index(Ellipsis) # items to left of ellipsis + n_items_r = len(selection) - (n_items_l + 1) # items to right of ellipsis + n_items = len(selection) - 1 # all non-ellipsis items + + if n_items >= len(shape): + # ellipsis does nothing, just remove it + selection = tuple(i for i in selection if i != Ellipsis) + + else: + # replace ellipsis with as many slices are needed for number of dims + new_item = selection[:n_items_l] + ((slice(None),) * (len(shape) - n_items)) + if n_items_r: + new_item += selection[-n_items_r:] + selection = new_item + + # fill out selection if not completely specified + if len(selection) < len(shape): + selection += (slice(None),) * (len(shape) - len(selection)) + + # check selection not too long + check_selection_length(selection, shape) + + return cast(SelectionNormalized, selection) + + +def replace_lists(selection: SelectionNormalized) -> SelectionNormalized: + return tuple( + np.asarray(dim_sel) if isinstance(dim_sel, list) else dim_sel for dim_sel in selection + ) -class _ChunkProjection(NamedTuple): +T = TypeVar("T") + + +def ensure_tuple(v: Any) -> SelectionNormalized: + if not isinstance(v, tuple): + v = (v,) + return cast(SelectionNormalized, v) + + +class ChunkProjection(NamedTuple): + """A mapping of items from chunk to output array. Can be used to extract items from the + chunk array for loading into an output array. Can also be used to extract items from a + value array for setting/updating in a chunk array. + + Parameters + ---------- + chunk_coords + Indices of chunk. + chunk_selection + Selection of items from chunk array. + out_selection + Selection of items in target (output) array. + + """ + chunk_coords: ChunkCoords - chunk_selection: SliceSelection - out_selection: SliceSelection + chunk_selection: tuple[Selector, ...] | npt.NDArray[np.intp] + out_selection: tuple[Selector, ...] | npt.NDArray[np.intp] | slice + + +def is_slice(s: Any) -> TypeGuard[slice]: + return isinstance(s, slice) + + +def is_contiguous_slice(s: Any) -> TypeGuard[slice]: + return is_slice(s) and (s.step is None or s.step == 1) + +def is_positive_slice(s: Any) -> TypeGuard[slice]: + return is_slice(s) and (s.step is None or s.step >= 1) -class BasicIndexer: - dim_indexers: list[_SliceDimIndexer] + +def is_contiguous_selection(selection: Any) -> TypeGuard[slice]: + selection = ensure_tuple(selection) + return all((is_integer_array(s) or is_contiguous_slice(s) or s == Ellipsis) for s in selection) + + +def is_basic_selection(selection: Any) -> TypeGuard[BasicSelection]: + selection = ensure_tuple(selection) + return all(is_integer(s) or is_positive_slice(s) for s in selection) + + +@dataclass(frozen=True) +class BasicIndexer(Indexer): + dim_indexers: list[IntDimIndexer | SliceDimIndexer] shape: ChunkCoords + drop_axes: ChunkCoords def __init__( self, - selection: Selection, - shape: tuple[int, ...], + selection: BasicSelection, + shape: ChunkCoords, chunk_grid: ChunkGrid, ): - from zarr.chunk_grids import RegularChunkGrid + chunk_shape = get_chunk_shape(chunk_grid) + # handle ellipsis + selection_normalized = replace_ellipsis(selection, shape) - assert isinstance( - chunk_grid, RegularChunkGrid - ), "Only regular chunk grid is supported, currently." # setup per-dimension indexers - self.dim_indexers = [ - _SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) - for dim_sel, dim_len, dim_chunk_len in zip( - _ensure_selection(selection, shape), shape, chunk_grid.chunk_shape, strict=False + dim_indexers: list[IntDimIndexer | SliceDimIndexer] = [] + for dim_sel, dim_len, dim_chunk_len in zip( + selection_normalized, shape, chunk_shape, strict=True + ): + dim_indexer: IntDimIndexer | SliceDimIndexer + if is_integer(dim_sel): + dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) + + elif is_slice(dim_sel): + dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) + + else: + raise IndexError( + "unsupported selection item for basic indexing; " + f"expected integer or slice, got {type(dim_sel)!r}" + ) + + dim_indexers.append(dim_indexer) + + object.__setattr__(self, "dim_indexers", dim_indexers) + object.__setattr__( + self, + "shape", + tuple(s.nitems for s in self.dim_indexers if not isinstance(s, IntDimIndexer)), + ) + object.__setattr__(self, "drop_axes", ()) + + def __iter__(self) -> Iterator[ChunkProjection]: + for dim_projections in itertools.product(*self.dim_indexers): + chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) + chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) + out_selection = tuple( + p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None ) - ] - self.shape = tuple(s.nitems for s in self.dim_indexers) - def __iter__(self) -> Iterator[_ChunkProjection]: + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) + + +@dataclass(frozen=True) +class BoolArrayDimIndexer: + dim_sel: npt.NDArray[np.bool_] + dim_len: int + dim_chunk_len: int + nchunks: int + + chunk_nitems: npt.NDArray[Any] + chunk_nitems_cumsum: npt.NDArray[Any] + nitems: int + dim_chunk_ixs: npt.NDArray[np.intp] + + def __init__(self, dim_sel: npt.NDArray[np.bool_], dim_len: int, dim_chunk_len: int): + # check number of dimensions + if not is_bool_array(dim_sel, 1): + raise IndexError("Boolean arrays in an orthogonal selection must be 1-dimensional only") + + # check shape + if dim_sel.shape[0] != dim_len: + raise IndexError( + f"Boolean array has the wrong length for dimension; expected {dim_len}, got {dim_sel.shape[0]}" + ) + + # precompute number of selected items for each chunk + nchunks = ceildiv(dim_len, dim_chunk_len) + chunk_nitems = np.zeros(nchunks, dtype="i8") + for dim_chunk_ix in range(nchunks): + dim_offset = dim_chunk_ix * dim_chunk_len + chunk_nitems[dim_chunk_ix] = np.count_nonzero( + dim_sel[dim_offset : dim_offset + dim_chunk_len] + ) + chunk_nitems_cumsum = np.cumsum(chunk_nitems) + nitems = chunk_nitems_cumsum[-1] + dim_chunk_ixs = np.nonzero(chunk_nitems)[0] + + # store attributes + object.__setattr__(self, "dim_sel", dim_sel) + object.__setattr__(self, "dim_len", dim_len) + object.__setattr__(self, "dim_chunk_len", dim_chunk_len) + object.__setattr__(self, "nchunks", nchunks) + object.__setattr__(self, "chunk_nitems", chunk_nitems) + object.__setattr__(self, "chunk_nitems_cumsum", chunk_nitems_cumsum) + object.__setattr__(self, "nitems", nitems) + object.__setattr__(self, "dim_chunk_ixs", dim_chunk_ixs) + + def __iter__(self) -> Iterator[ChunkDimProjection]: + # iterate over chunks with at least one item + for dim_chunk_ix in self.dim_chunk_ixs: + # find region in chunk + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_chunk_sel = self.dim_sel[dim_offset : dim_offset + self.dim_chunk_len] + + # pad out if final chunk + if dim_chunk_sel.shape[0] < self.dim_chunk_len: + tmp = np.zeros(self.dim_chunk_len, dtype=bool) + tmp[: dim_chunk_sel.shape[0]] = dim_chunk_sel + dim_chunk_sel = tmp + + # find region in output + if dim_chunk_ix == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[dim_chunk_ix - 1] + stop = self.chunk_nitems_cumsum[dim_chunk_ix] + dim_out_sel = slice(start, stop) + + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + + +class Order(Enum): + UNKNOWN = 0 + INCREASING = 1 + DECREASING = 2 + UNORDERED = 3 + + @staticmethod + def check(a: npt.NDArray[Any]) -> Order: + diff = np.diff(a) + diff_positive = diff >= 0 + n_diff_positive = np.count_nonzero(diff_positive) + all_increasing = n_diff_positive == len(diff_positive) + any_increasing = n_diff_positive > 0 + if all_increasing: + order = Order.INCREASING + elif any_increasing: + order = Order.UNORDERED + else: + order = Order.DECREASING + return order + + +def wraparound_indices(x: npt.NDArray[Any], dim_len: int) -> None: + loc_neg = x < 0 + if np.any(loc_neg): + x[loc_neg] = x[loc_neg] + dim_len + + +def boundscheck_indices(x: npt.NDArray[Any], dim_len: int) -> None: + if np.any(x < 0) or np.any(x >= dim_len): + raise BoundsCheckError(dim_len) + + +@dataclass(frozen=True) +class IntArrayDimIndexer: + """Integer array selection against a single dimension.""" + + dim_len: int + dim_chunk_len: int + nchunks: int + nitems: int + order: Order + dim_sel: npt.NDArray[np.intp] + dim_out_sel: npt.NDArray[np.intp] + chunk_nitems: int + dim_chunk_ixs: npt.NDArray[np.intp] + chunk_nitems_cumsum: npt.NDArray[np.intp] + + def __init__( + self, + dim_sel: npt.NDArray[np.intp], + dim_len: int, + dim_chunk_len: int, + wraparound: bool = True, + boundscheck: bool = True, + order: Order = Order.UNKNOWN, + ): + # ensure 1d array + dim_sel = np.asanyarray(dim_sel) + if not is_integer_array(dim_sel, 1): + raise IndexError("integer arrays in an orthogonal selection must be 1-dimensional only") + + nitems = len(dim_sel) + nchunks = ceildiv(dim_len, dim_chunk_len) + + # handle wraparound + if wraparound: + wraparound_indices(dim_sel, dim_len) + + # handle out of bounds + if boundscheck: + boundscheck_indices(dim_sel, dim_len) + + # determine which chunk is needed for each selection item + # note: for dense integer selections, the division operation here is the + # bottleneck + dim_sel_chunk = dim_sel // dim_chunk_len + + # determine order of indices + if order == Order.UNKNOWN: + order = Order.check(dim_sel) + order = Order(order) + + if order == Order.INCREASING: + dim_sel = dim_sel + dim_out_sel = None + elif order == Order.DECREASING: + dim_sel = dim_sel[::-1] + # TODO should be possible to do this without creating an arange + dim_out_sel = np.arange(nitems - 1, -1, -1) + else: + # sort indices to group by chunk + dim_out_sel = np.argsort(dim_sel_chunk) + dim_sel = np.take(dim_sel, dim_out_sel) + + # precompute number of selected items for each chunk + chunk_nitems = np.bincount(dim_sel_chunk, minlength=nchunks) + + # find chunks that we need to visit + dim_chunk_ixs = np.nonzero(chunk_nitems)[0] + + # compute offsets into the output array + chunk_nitems_cumsum = np.cumsum(chunk_nitems) + + # store attributes + object.__setattr__(self, "dim_len", dim_len) + object.__setattr__(self, "dim_chunk_len", dim_chunk_len) + object.__setattr__(self, "nchunks", nchunks) + object.__setattr__(self, "nitems", nitems) + object.__setattr__(self, "order", order) + object.__setattr__(self, "dim_sel", dim_sel) + object.__setattr__(self, "dim_out_sel", dim_out_sel) + object.__setattr__(self, "chunk_nitems", chunk_nitems) + object.__setattr__(self, "dim_chunk_ixs", dim_chunk_ixs) + object.__setattr__(self, "chunk_nitems_cumsum", chunk_nitems_cumsum) + + def __iter__(self) -> Iterator[ChunkDimProjection]: + for dim_chunk_ix in self.dim_chunk_ixs: + dim_out_sel: slice | npt.NDArray[np.intp] + # find region in output + if dim_chunk_ix == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[dim_chunk_ix - 1] + stop = self.chunk_nitems_cumsum[dim_chunk_ix] + if self.order == Order.INCREASING: + dim_out_sel = slice(start, stop) + else: + dim_out_sel = self.dim_out_sel[start:stop] + + # find region in chunk + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_chunk_sel = self.dim_sel[start:stop] - dim_offset + + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + + +def slice_to_range(s: slice, length: int) -> range: + return range(*s.indices(length)) + + +def ix_(selection: Any, shape: ChunkCoords) -> npt.NDArray[np.intp]: + """Convert an orthogonal selection to a numpy advanced (fancy) selection, like numpy.ix_ + but with support for slices and single ints.""" + + # normalisation + selection = replace_ellipsis(selection, shape) + + # replace slice and int as these are not supported by numpy.ix_ + selection = [ + slice_to_range(dim_sel, dim_len) + if isinstance(dim_sel, slice) + else [dim_sel] + if is_integer(dim_sel) + else dim_sel + for dim_sel, dim_len in zip(selection, shape, strict=True) + ] + + # now get numpy to convert to a coordinate selection + selection = np.ix_(*selection) + + return cast(npt.NDArray[np.intp], selection) + + +def oindex(a: npt.NDArray[Any], selection: Selection) -> npt.NDArray[Any]: + """Implementation of orthogonal indexing with slices and ints.""" + selection = replace_ellipsis(selection, a.shape) + drop_axes = tuple(i for i, s in enumerate(selection) if is_integer(s)) + selection = ix_(selection, a.shape) + result = a[selection] + if drop_axes: + result = result.squeeze(axis=drop_axes) + return result + + +def oindex_set(a: npt.NDArray[Any], selection: Selection, value: Any) -> None: + selection = replace_ellipsis(selection, a.shape) + drop_axes = tuple(i for i, s in enumerate(selection) if is_integer(s)) + selection = ix_(selection, a.shape) + if not np.isscalar(value) and drop_axes: + value = np.asanyarray(value) + value_selection: list[Selector | None] = [slice(None)] * len(a.shape) + for i in drop_axes: + value_selection[i] = np.newaxis + value = value[tuple(value_selection)] + a[selection] = value + + +@dataclass(frozen=True) +class OrthogonalIndexer(Indexer): + dim_indexers: list[IntDimIndexer | SliceDimIndexer | IntArrayDimIndexer | BoolArrayDimIndexer] + shape: ChunkCoords + chunk_shape: ChunkCoords + is_advanced: bool + drop_axes: tuple[int, ...] + + def __init__(self, selection: Selection, shape: ChunkCoords, chunk_grid: ChunkGrid): + chunk_shape = get_chunk_shape(chunk_grid) + + # handle ellipsis + selection = replace_ellipsis(selection, shape) + + # normalize list to array + selection = replace_lists(selection) + + # setup per-dimension indexers + dim_indexers: list[ + IntDimIndexer | SliceDimIndexer | IntArrayDimIndexer | BoolArrayDimIndexer + ] = [] + for dim_sel, dim_len, dim_chunk_len in zip(selection, shape, chunk_shape, strict=True): + dim_indexer: IntDimIndexer | SliceDimIndexer | IntArrayDimIndexer | BoolArrayDimIndexer + if is_integer(dim_sel): + dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) + + elif isinstance(dim_sel, slice): + dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) + + elif is_integer_array(dim_sel): + dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) + + elif is_bool_array(dim_sel): + dim_indexer = BoolArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) + + else: + raise IndexError( + "unsupported selection item for orthogonal indexing; " + "expected integer, slice, integer array or Boolean " + f"array, got {type(dim_sel)!r}" + ) + + dim_indexers.append(dim_indexer) + + dim_indexers = dim_indexers + shape = tuple(s.nitems for s in dim_indexers if not isinstance(s, IntDimIndexer)) + chunk_shape = chunk_shape + is_advanced = not is_basic_selection(selection) + if is_advanced: + drop_axes = tuple( + i + for i, dim_indexer in enumerate(dim_indexers) + if isinstance(dim_indexer, IntDimIndexer) + ) + else: + drop_axes = () + + object.__setattr__(self, "dim_indexers", dim_indexers) + object.__setattr__(self, "shape", shape) + object.__setattr__(self, "chunk_shape", chunk_shape) + object.__setattr__(self, "is_advanced", is_advanced) + object.__setattr__(self, "drop_axes", drop_axes) + + def __iter__(self) -> Iterator[ChunkProjection]: + for dim_projections in itertools.product(*self.dim_indexers): + chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) + chunk_selection: tuple[Selector, ...] | npt.NDArray[Any] = tuple( + p.dim_chunk_sel for p in dim_projections + ) + out_selection: tuple[Selector, ...] | npt.NDArray[Any] = tuple( + p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None + ) + + # handle advanced indexing arrays orthogonally + if self.is_advanced: + # N.B., numpy doesn't support orthogonal indexing directly as yet, + # so need to work around via np.ix_. Also np.ix_ does not support a + # mixture of arrays and slices or integers, so need to convert slices + # and integers into ranges. + chunk_selection = ix_(chunk_selection, self.chunk_shape) + + # special case for non-monotonic indices + if not is_basic_selection(out_selection): + out_selection = ix_(out_selection, self.shape) + + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) + + +@dataclass(frozen=True) +class OIndex: + array: Array + + def __getitem__(self, selection: OrthogonalSelection) -> NDArrayLike: + fields, new_selection = pop_fields(selection) + new_selection = ensure_tuple(new_selection) + new_selection = replace_lists(new_selection) + return self.array.get_orthogonal_selection( + cast(OrthogonalSelection, new_selection), fields=fields + ) + + def __setitem__(self, selection: OrthogonalSelection, value: NDArrayLike) -> None: + fields, new_selection = pop_fields(selection) + new_selection = ensure_tuple(new_selection) + new_selection = replace_lists(new_selection) + return self.array.set_orthogonal_selection( + cast(OrthogonalSelection, new_selection), value, fields=fields + ) + + +@dataclass(frozen=True) +class BlockIndexer(Indexer): + dim_indexers: list[SliceDimIndexer] + shape: ChunkCoords + drop_axes: ChunkCoords + + def __init__(self, selection: BlockSelection, shape: ChunkCoords, chunk_grid: ChunkGrid): + chunk_shape = get_chunk_shape(chunk_grid) + + # handle ellipsis + selection_normalized = replace_ellipsis(selection, shape) + + # normalize list to array + selection_normalized = replace_lists(selection_normalized) + + # setup per-dimension indexers + dim_indexers = [] + for dim_sel, dim_len, dim_chunk_size in zip( + selection_normalized, shape, chunk_shape, strict=True + ): + dim_numchunks = int(np.ceil(dim_len / dim_chunk_size)) + + if is_integer(dim_sel): + if dim_sel < 0: + dim_sel = dim_numchunks + dim_sel + + start = dim_sel * dim_chunk_size + stop = start + dim_chunk_size + slice_ = slice(start, stop) + + elif is_slice(dim_sel): + start = dim_sel.start if dim_sel.start is not None else 0 + stop = dim_sel.stop if dim_sel.stop is not None else dim_numchunks + + if dim_sel.step not in {1, None}: + raise IndexError( + "unsupported selection item for block indexing; " + f"expected integer or slice with step=1, got {type(dim_sel)!r}" + ) + + # Can't reuse wraparound_indices because it expects a numpy array + # We have integers here. + if start < 0: + start = dim_numchunks + start + if stop < 0: + stop = dim_numchunks + stop + + start = start * dim_chunk_size + stop = stop * dim_chunk_size + slice_ = slice(start, stop) + + else: + raise IndexError( + "unsupported selection item for block indexing; " + f"expected integer or slice, got {type(dim_sel)!r}" + ) + + dim_indexer = SliceDimIndexer(slice_, dim_len, dim_chunk_size) + dim_indexers.append(dim_indexer) + + if start >= dim_len or start < 0: + raise BoundsCheckError(dim_len) + + dim_indexers = dim_indexers + shape = tuple(s.nitems for s in dim_indexers) + + object.__setattr__(self, "dim_indexers", dim_indexers) + object.__setattr__(self, "shape", shape) + object.__setattr__(self, "drop_axes", ()) + + def __iter__(self) -> Iterator[ChunkProjection]: for dim_projections in itertools.product(*self.dim_indexers): chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) @@ -157,7 +933,292 @@ def __iter__(self) -> Iterator[_ChunkProjection]: p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None ) - yield _ChunkProjection(chunk_coords, chunk_selection, out_selection) + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) + + +@dataclass(frozen=True) +class BlockIndex: + array: Array + + def __getitem__(self, selection: BlockSelection) -> NDArrayLike: + fields, new_selection = pop_fields(selection) + new_selection = ensure_tuple(new_selection) + new_selection = replace_lists(new_selection) + return self.array.get_block_selection(cast(BlockSelection, new_selection), fields=fields) + + def __setitem__(self, selection: BlockSelection, value: NDArrayLike) -> None: + fields, new_selection = pop_fields(selection) + new_selection = ensure_tuple(new_selection) + new_selection = replace_lists(new_selection) + return self.array.set_block_selection( + cast(BlockSelection, new_selection), value, fields=fields + ) + + +def is_coordinate_selection( + selection: SelectionNormalized, shape: ChunkCoords +) -> TypeGuard[CoordinateSelectionNormalized]: + return ( + isinstance(selection, tuple) + and len(selection) == len(shape) + and all(is_integer(dim_sel) or is_integer_array(dim_sel) for dim_sel in selection) + ) + + +def is_mask_selection(selection: Selection, shape: ChunkCoords) -> TypeGuard[MaskSelection]: + return ( + isinstance(selection, tuple) + and len(selection) == 1 + and is_bool_array(selection[0]) + and selection[0].shape == shape + ) + + +@dataclass(frozen=True) +class CoordinateIndexer(Indexer): + sel_shape: ChunkCoords + selection: CoordinateSelectionNormalized + sel_sort: npt.NDArray[np.intp] | None + chunk_nitems_cumsum: npt.NDArray[np.intp] + chunk_rixs: npt.NDArray[np.intp] + chunk_mixs: tuple[npt.NDArray[np.intp], ...] + shape: ChunkCoords + chunk_shape: ChunkCoords + drop_axes: ChunkCoords + + def __init__(self, selection: CoordinateSelection, shape: ChunkCoords, chunk_grid: ChunkGrid): + chunk_shape = get_chunk_shape(chunk_grid) + + cdata_shape: ChunkCoords + if shape == (): + cdata_shape = (1,) + else: + cdata_shape = tuple(math.ceil(s / c) for s, c in zip(shape, chunk_shape, strict=True)) + nchunks = reduce(operator.mul, cdata_shape, 1) + + # some initial normalization + selection_normalized = cast(CoordinateSelectionNormalized, ensure_tuple(selection)) + selection_normalized = tuple( + np.asarray([i]) if is_integer(i) else i for i in selection_normalized + ) + selection_normalized = cast( + CoordinateSelectionNormalized, replace_lists(selection_normalized) + ) + + # validation + if not is_coordinate_selection(selection_normalized, shape): + raise IndexError( + "invalid coordinate selection; expected one integer " + "(coordinate) array per dimension of the target array, " + f"got {selection!r}" + ) + + # handle wraparound, boundscheck + for dim_sel, dim_len in zip(selection_normalized, shape, strict=True): + # handle wraparound + wraparound_indices(dim_sel, dim_len) + + # handle out of bounds + boundscheck_indices(dim_sel, dim_len) + + # compute chunk index for each point in the selection + chunks_multi_index = tuple( + dim_sel // dim_chunk_len + for (dim_sel, dim_chunk_len) in zip(selection_normalized, chunk_shape, strict=True) + ) + + # broadcast selection - this will raise error if array dimensions don't match + selection_broadcast = tuple(np.broadcast_arrays(*selection_normalized)) + chunks_multi_index_broadcast = np.broadcast_arrays(*chunks_multi_index) + + # remember shape of selection, because we will flatten indices for processing + sel_shape = selection_broadcast[0].shape if selection_broadcast[0].shape else (1,) + + # flatten selection + selection_broadcast = tuple(dim_sel.reshape(-1) for dim_sel in selection_broadcast) + chunks_multi_index_broadcast = [ + dim_chunks.reshape(-1) for dim_chunks in chunks_multi_index_broadcast + ] + + # ravel chunk indices + chunks_raveled_indices = np.ravel_multi_index( + chunks_multi_index_broadcast, dims=cdata_shape + ) + + # group points by chunk + if np.any(np.diff(chunks_raveled_indices) < 0): + # optimisation, only sort if needed + sel_sort = np.argsort(chunks_raveled_indices) + selection_broadcast = tuple(dim_sel[sel_sort] for dim_sel in selection_broadcast) + else: + sel_sort = None + + shape = selection_broadcast[0].shape if selection_broadcast[0].shape else (1,) + + # precompute number of selected items for each chunk + chunk_nitems = np.bincount(chunks_raveled_indices, minlength=nchunks) + chunk_nitems_cumsum = np.cumsum(chunk_nitems) + # locate the chunks we need to process + chunk_rixs = np.nonzero(chunk_nitems)[0] + + # unravel chunk indices + chunk_mixs = np.unravel_index(chunk_rixs, cdata_shape) + + object.__setattr__(self, "sel_shape", sel_shape) + object.__setattr__(self, "selection", selection_broadcast) + object.__setattr__(self, "sel_sort", sel_sort) + object.__setattr__(self, "chunk_nitems_cumsum", chunk_nitems_cumsum) + object.__setattr__(self, "chunk_rixs", chunk_rixs) + object.__setattr__(self, "chunk_mixs", chunk_mixs) + object.__setattr__(self, "chunk_shape", chunk_shape) + object.__setattr__(self, "shape", shape) + object.__setattr__(self, "drop_axes", ()) + + def __iter__(self) -> Iterator[ChunkProjection]: + # iterate over chunks + for i, chunk_rix in enumerate(self.chunk_rixs): + chunk_coords = tuple(m[i] for m in self.chunk_mixs) + if chunk_rix == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[chunk_rix - 1] + stop = self.chunk_nitems_cumsum[chunk_rix] + out_selection: slice | npt.NDArray[np.intp] + if self.sel_sort is None: + out_selection = slice(start, stop) + else: + out_selection = self.sel_sort[start:stop] + + chunk_offsets = tuple( + dim_chunk_ix * dim_chunk_len + for dim_chunk_ix, dim_chunk_len in zip(chunk_coords, self.chunk_shape, strict=True) + ) + chunk_selection = tuple( + dim_sel[start:stop] - dim_chunk_offset + for (dim_sel, dim_chunk_offset) in zip(self.selection, chunk_offsets, strict=True) + ) + + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) + + +@dataclass(frozen=True) +class MaskIndexer(CoordinateIndexer): + def __init__(self, selection: MaskSelection, shape: ChunkCoords, chunk_grid: ChunkGrid): + # some initial normalization + selection_normalized = cast(tuple[MaskSelection], ensure_tuple(selection)) + selection_normalized = cast(tuple[MaskSelection], replace_lists(selection_normalized)) + + # validation + if not is_mask_selection(selection_normalized, shape): + raise IndexError( + "invalid mask selection; expected one Boolean (mask)" + f"array with the same shape as the target array, got {selection_normalized!r}" + ) + + # convert to indices + selection_indices = np.nonzero(selection_normalized[0]) + + # delegate the rest to superclass + super().__init__(selection_indices, shape, chunk_grid) + + +@dataclass(frozen=True) +class VIndex: + array: Array + + def __getitem__(self, selection: CoordinateSelection | MaskSelection) -> NDArrayLike: + fields, new_selection = pop_fields(selection) + new_selection = ensure_tuple(new_selection) + new_selection = replace_lists(new_selection) + if is_coordinate_selection(new_selection, self.array.shape): + return self.array.get_coordinate_selection(new_selection, fields=fields) + elif is_mask_selection(new_selection, self.array.shape): + return self.array.get_mask_selection(new_selection, fields=fields) + else: + raise VindexInvalidSelectionError(new_selection) + + def __setitem__( + self, selection: CoordinateSelection | MaskSelection, value: NDArrayLike + ) -> None: + fields, new_selection = pop_fields(selection) + new_selection = ensure_tuple(new_selection) + new_selection = replace_lists(new_selection) + if is_coordinate_selection(new_selection, self.array.shape): + self.array.set_coordinate_selection(new_selection, value, fields=fields) + elif is_mask_selection(new_selection, self.array.shape): + self.array.set_mask_selection(new_selection, value, fields=fields) + else: + raise VindexInvalidSelectionError(new_selection) + + +def check_fields(fields: Fields | None, dtype: np.dtype[Any]) -> np.dtype[Any]: + # early out + if fields is None: + return dtype + # check type + if not isinstance(fields, str | list | tuple): + raise IndexError( + f"'fields' argument must be a string or list of strings; found {type(fields)!r}" + ) + if fields: + if dtype.names is None: + raise IndexError("invalid 'fields' argument, array does not have any fields") + try: + if isinstance(fields, str): + # single field selection + out_dtype = dtype[fields] + else: + # multiple field selection + out_dtype = np.dtype([(f, dtype[f]) for f in fields]) + except KeyError as e: + raise IndexError(f"invalid 'fields' argument, field not found: {e!r}") from e + else: + return out_dtype + else: + return dtype + + +def check_no_multi_fields(fields: Fields | None) -> Fields | None: + if isinstance(fields, list): + if len(fields) == 1: + return fields[0] + elif len(fields) > 1: + raise IndexError("multiple fields are not supported for this operation") + return fields + + +def pop_fields(selection: SelectionWithFields) -> tuple[Fields | None, Selection]: + if isinstance(selection, str): + # single field selection + return selection, () + elif not isinstance(selection, tuple): + # single selection item, no fields + # leave selection as-is + return None, cast(Selection, selection) + else: + # multiple items, split fields from selection items + fields: Fields = [f for f in selection if isinstance(f, str)] + fields = fields[0] if len(fields) == 1 else fields + selection_tuple = tuple(s for s in selection if not isinstance(s, str)) + selection = cast( + Selection, selection_tuple[0] if len(selection_tuple) == 1 else selection_tuple + ) + return fields, selection + + +def make_slice_selection(selection: Any) -> list[int | slice]: + ls: list[int | slice] = [] + for dim_selection in selection: + if is_integer(dim_selection): + ls.append(slice(int(dim_selection), int(dim_selection) + 1, 1)) + elif isinstance(dim_selection, np.ndarray): + if len(dim_selection) == 1: + ls.append(slice(int(dim_selection[0]), int(dim_selection[0]) + 1, 1)) + else: + raise ArrayIndexError + else: + ls.append(dim_selection) + return ls def morton_order_iter(chunk_shape: ChunkCoords) -> Iterator[ChunkCoords]: @@ -198,7 +1259,8 @@ def is_total_slice(item: Selection, shape: ChunkCoords) -> bool: item = (item,) if isinstance(item, tuple): return all( - ( + isinstance(dim_sel, slice) + and ( (dim_sel == slice(None)) or ((dim_sel.stop - dim_sel.start == dim_len) and (dim_sel.step in [1, None])) ) @@ -206,3 +1268,22 @@ def is_total_slice(item: Selection, shape: ChunkCoords) -> bool: ) else: raise TypeError(f"expected slice or tuple of slices, found {item!r}") + + +def get_indexer( + selection: SelectionWithFields, shape: ChunkCoords, chunk_grid: ChunkGrid +) -> Indexer: + _, pure_selection = pop_fields(selection) + if is_pure_fancy_indexing(pure_selection, len(shape)): + new_selection = ensure_tuple(selection) + new_selection = replace_lists(new_selection) + if is_coordinate_selection(new_selection, shape): + return CoordinateIndexer(cast(CoordinateSelection, selection), shape, chunk_grid) + elif is_mask_selection(new_selection, shape): + return MaskIndexer(cast(MaskSelection, selection), shape, chunk_grid) + else: + raise VindexInvalidSelectionError(new_selection) + elif is_pure_orthogonal_indexing(pure_selection, len(shape)): + return OrthogonalIndexer(cast(OrthogonalSelection, selection), shape, chunk_grid) + else: + return BasicIndexer(cast(BasicSelection, selection), shape, chunk_grid) diff --git a/src/zarr/metadata.py b/src/zarr/metadata.py index 39a1d5319..ca8cf1cdd 100644 --- a/src/zarr/metadata.py +++ b/src/zarr/metadata.py @@ -5,7 +5,7 @@ from collections.abc import Iterable from dataclasses import dataclass, field, replace from enum import Enum -from typing import TYPE_CHECKING, Any, cast +from typing import TYPE_CHECKING, Any import numpy as np import numpy.typing as npt @@ -118,6 +118,7 @@ def from_dtype(cls, dtype: np.dtype[Any]) -> DataType: class ArrayMetadata(Metadata, ABC): shape: ChunkCoords chunk_grid: ChunkGrid + fill_value: Any attributes: dict[str, JSON] @property @@ -310,7 +311,7 @@ class ArrayV2Metadata(ArrayMetadata): filters: list[dict[str, JSON]] | None = None dimension_separator: Literal[".", "/"] = "." compressor: dict[str, JSON] | None = None - attributes: dict[str, JSON] = cast(dict[str, JSON], field(default_factory=dict)) + attributes: dict[str, JSON] = field(default_factory=dict) zarr_format: Literal[2] = field(init=False, default=2) def __init__( diff --git a/src/zarr/py.typed b/src/zarr/py.typed new file mode 100644 index 000000000..e69de29bb diff --git a/src/zarr/v2/indexing.py b/src/zarr/v2/indexing.py index 0e266ad90..242e9ae84 100644 --- a/src/zarr/v2/indexing.py +++ b/src/zarr/v2/indexing.py @@ -346,7 +346,7 @@ def __init__(self, selection, array): self.dim_indexers = dim_indexers self.shape = tuple(s.nitems for s in self.dim_indexers if not isinstance(s, IntDimIndexer)) - self.drop_axes = None + self.drop_axes = () def __iter__(self): for dim_projections in itertools.product(*self.dim_indexers): @@ -625,7 +625,7 @@ def __init__(self, selection, array): if isinstance(dim_indexer, IntDimIndexer) ) else: - self.drop_axes = None + self.drop_axes = () def __iter__(self): for dim_projections in itertools.product(*self.dim_indexers): @@ -724,7 +724,7 @@ def __init__(self, selection, array): self.dim_indexers = dim_indexers self.shape = tuple(s.nitems for s in self.dim_indexers) - self.drop_axes = None + self.drop_axes = () def __iter__(self): for dim_projections in itertools.product(*self.dim_indexers): @@ -823,7 +823,7 @@ def __init__(self, selection, array): self.selection = selection self.sel_sort = sel_sort self.shape = selection[0].shape if selection[0].shape else (1,) - self.drop_axes = None + self.drop_axes = () self.array = array # precompute number of selected items for each chunk diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py index af89bab62..9ce9b07a2 100644 --- a/tests/v3/test_group.py +++ b/tests/v3/test_group.py @@ -368,7 +368,7 @@ async def test_asyncgroup_update_attributes( @pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"]) @pytest.mark.parametrize("zarr_format", (2, 3)) -async def test_group_init(store: LocalStore | MemoryStore, zarr_format: ZarrFormat) -> None: +def test_group_init(store: LocalStore | MemoryStore, zarr_format: ZarrFormat) -> None: agroup = sync(AsyncGroup.create(store=store, zarr_format=zarr_format)) group = Group(agroup) assert group._async_group == agroup diff --git a/tests/v3/test_indexing.py b/tests/v3/test_indexing.py new file mode 100644 index 000000000..9ce485945 --- /dev/null +++ b/tests/v3/test_indexing.py @@ -0,0 +1,1721 @@ +from __future__ import annotations + +from collections import Counter +from collections.abc import Iterator +from typing import Any +from uuid import uuid4 + +import numpy as np +import numpy.typing as npt +import pytest +from numpy.testing import assert_array_equal + +import zarr +from zarr.abc.store import Store +from zarr.buffer import NDBuffer +from zarr.common import ChunkCoords +from zarr.indexing import ( + make_slice_selection, + normalize_integer_selection, + oindex, + oindex_set, + replace_ellipsis, +) +from zarr.store.core import StorePath +from zarr.store.memory import MemoryStore + + +@pytest.fixture +def store() -> Iterator[Store]: + yield StorePath(MemoryStore(mode="w")) + + +def zarr_array_from_numpy_array( + store: StorePath, + a: npt.NDArray[Any], + chunk_shape: ChunkCoords | None = None, +) -> zarr.Array: + z = zarr.Array.create( + store=store / str(uuid4()), + shape=a.shape, + dtype=a.dtype, + chunk_shape=chunk_shape or a.shape, + chunk_key_encoding=("v2", "."), + ) + z[:] = a + return z + + +class CountingDict(MemoryStore): + def __init__(self): + super().__init__(mode="w") + self.counter = Counter() + + async def get(self, key, byte_range=None): + key_suffix = "/".join(key.split("/")[1:]) + self.counter["__getitem__", key_suffix] += 1 + return await super().get(key, byte_range) + + async def set(self, key, value, byte_range=None): + key_suffix = "/".join(key.split("/")[1:]) + self.counter["__setitem__", key_suffix] += 1 + return await super().set(key, value, byte_range) + + +def test_normalize_integer_selection(): + assert 1 == normalize_integer_selection(1, 100) + assert 99 == normalize_integer_selection(-1, 100) + with pytest.raises(IndexError): + normalize_integer_selection(100, 100) + with pytest.raises(IndexError): + normalize_integer_selection(1000, 100) + with pytest.raises(IndexError): + normalize_integer_selection(-1000, 100) + + +def test_replace_ellipsis(): + # 1D, single item + assert (0,) == replace_ellipsis(0, (100,)) + + # 1D + assert (slice(None),) == replace_ellipsis(Ellipsis, (100,)) + assert (slice(None),) == replace_ellipsis(slice(None), (100,)) + assert (slice(None, 100),) == replace_ellipsis(slice(None, 100), (100,)) + assert (slice(0, None),) == replace_ellipsis(slice(0, None), (100,)) + assert (slice(None),) == replace_ellipsis((slice(None), Ellipsis), (100,)) + assert (slice(None),) == replace_ellipsis((Ellipsis, slice(None)), (100,)) + + # 2D, single item + assert (0, 0) == replace_ellipsis((0, 0), (100, 100)) + assert (-1, 1) == replace_ellipsis((-1, 1), (100, 100)) + + # 2D, single col/row + assert (0, slice(None)) == replace_ellipsis((0, slice(None)), (100, 100)) + assert (0, slice(None)) == replace_ellipsis((0,), (100, 100)) + assert (slice(None), 0) == replace_ellipsis((slice(None), 0), (100, 100)) + + # 2D slice + assert (slice(None), slice(None)) == replace_ellipsis(Ellipsis, (100, 100)) + assert (slice(None), slice(None)) == replace_ellipsis(slice(None), (100, 100)) + assert (slice(None), slice(None)) == replace_ellipsis((slice(None), slice(None)), (100, 100)) + assert (slice(None), slice(None)) == replace_ellipsis((Ellipsis, slice(None)), (100, 100)) + assert (slice(None), slice(None)) == replace_ellipsis((slice(None), Ellipsis), (100, 100)) + assert (slice(None), slice(None)) == replace_ellipsis( + (slice(None), Ellipsis, slice(None)), (100, 100) + ) + assert (slice(None), slice(None)) == replace_ellipsis( + (Ellipsis, slice(None), slice(None)), (100, 100) + ) + assert (slice(None), slice(None)) == replace_ellipsis( + (slice(None), slice(None), Ellipsis), (100, 100) + ) + + +@pytest.mark.xfail(reason="zero-dimension arrays are not supported in v3") +def test_get_basic_selection_0d(store: StorePath): + # setup + a = np.array(42) + z = zarr_array_from_numpy_array(store, a) + + assert_array_equal(a, z.get_basic_selection(Ellipsis)) + assert_array_equal(a, z[...]) + assert 42 == z.get_basic_selection(()) + assert 42 == z[()] + + # test out param + b = NDBuffer.from_numpy_array(np.zeros_like(a)) + z.get_basic_selection(Ellipsis, out=b) + assert_array_equal(a, b) + + # test structured array + value = (b"aaa", 1, 4.2) + a = np.array(value, dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")]) + z = zarr_array_from_numpy_array(store, a) + z[()] = value + assert_array_equal(a, z.get_basic_selection(Ellipsis)) + assert_array_equal(a, z[...]) + assert a[()] == z.get_basic_selection(()) + assert a[()] == z[()] + assert b"aaa" == z.get_basic_selection((), fields="foo") + assert b"aaa" == z["foo"] + assert a[["foo", "bar"]] == z.get_basic_selection((), fields=["foo", "bar"]) + assert a[["foo", "bar"]] == z["foo", "bar"] + # test out param + b = NDBuffer.from_numpy_array(np.zeros_like(a)) + z.get_basic_selection(Ellipsis, out=b) + assert_array_equal(a, b) + c = NDBuffer.from_numpy_array(np.zeros_like(a[["foo", "bar"]])) + z.get_basic_selection(Ellipsis, out=c, fields=["foo", "bar"]) + assert_array_equal(a[["foo", "bar"]], c) + + +basic_selections_1d = [ + # single value + 42, + -1, + # slices + slice(0, 1050), + slice(50, 150), + slice(0, 2000), + slice(-150, -50), + slice(-2000, 2000), + slice(0, 0), # empty result + slice(-1, 0), # empty result + # total selections + slice(None), + Ellipsis, + (), + (Ellipsis, slice(None)), + # slice with step + slice(None), + slice(None, None), + slice(None, None, 1), + slice(None, None, 10), + slice(None, None, 100), + slice(None, None, 1000), + slice(None, None, 10000), + slice(0, 1050), + slice(0, 1050, 1), + slice(0, 1050, 10), + slice(0, 1050, 100), + slice(0, 1050, 1000), + slice(0, 1050, 10000), + slice(1, 31, 3), + slice(1, 31, 30), + slice(1, 31, 300), + slice(81, 121, 3), + slice(81, 121, 30), + slice(81, 121, 300), + slice(50, 150), + slice(50, 150, 1), + slice(50, 150, 10), +] + + +basic_selections_1d_bad = [ + # only positive step supported + slice(None, None, -1), + slice(None, None, -10), + slice(None, None, -100), + slice(None, None, -1000), + slice(None, None, -10000), + slice(1050, -1, -1), + slice(1050, -1, -10), + slice(1050, -1, -100), + slice(1050, -1, -1000), + slice(1050, -1, -10000), + slice(1050, 0, -1), + slice(1050, 0, -10), + slice(1050, 0, -100), + slice(1050, 0, -1000), + slice(1050, 0, -10000), + slice(150, 50, -1), + slice(150, 50, -10), + slice(31, 1, -3), + slice(121, 81, -3), + slice(-1, 0, -1), + # bad stuff + 2.3, + "foo", + b"xxx", + None, + (0, 0), + (slice(None), slice(None)), +] + + +def _test_get_basic_selection(a, z, selection): + print(a, z, selection) + expect = a[selection] + actual = z.get_basic_selection(selection) + assert_array_equal(expect, actual) + actual = z[selection] + assert_array_equal(expect, actual) + + # test out param + b = NDBuffer.from_numpy_array(np.empty(shape=expect.shape, dtype=expect.dtype)) + z.get_basic_selection(selection, out=b) + assert_array_equal(expect, b.as_numpy_array()) + + +# noinspection PyStatementEffect +def test_get_basic_selection_1d(store: StorePath): + # setup + a = np.arange(1050, dtype=int) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(100,)) + + for selection in basic_selections_1d: + _test_get_basic_selection(a, z, selection) + + for selection in basic_selections_1d_bad: + with pytest.raises(IndexError): + z.get_basic_selection(selection) + with pytest.raises(IndexError): + z[selection] + + with pytest.raises(IndexError): + z.get_basic_selection([1, 0]) + + +basic_selections_2d = [ + # single row + 42, + -1, + (42, slice(None)), + (-1, slice(None)), + # single col + (slice(None), 4), + (slice(None), -1), + # row slices + slice(None), + slice(0, 1000), + slice(250, 350), + slice(0, 2000), + slice(-350, -250), + slice(0, 0), # empty result + slice(-1, 0), # empty result + slice(-2000, 0), + slice(-2000, 2000), + # 2D slices + (slice(None), slice(1, 5)), + (slice(250, 350), slice(None)), + (slice(250, 350), slice(1, 5)), + (slice(250, 350), slice(-5, -1)), + (slice(250, 350), slice(-50, 50)), + (slice(250, 350, 10), slice(1, 5)), + (slice(250, 350), slice(1, 5, 2)), + (slice(250, 350, 33), slice(1, 5, 3)), + # total selections + (slice(None), slice(None)), + Ellipsis, + (), + (Ellipsis, slice(None)), + (Ellipsis, slice(None), slice(None)), +] + + +basic_selections_2d_bad = [ + # bad stuff + 2.3, + "foo", + b"xxx", + None, + (2.3, slice(None)), + # only positive step supported + slice(None, None, -1), + (slice(None, None, -1), slice(None)), + (0, 0, 0), + (slice(None), slice(None), slice(None)), +] + + +# noinspection PyStatementEffect +def test_get_basic_selection_2d(store: StorePath): + # setup + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(300, 3)) + + for selection in basic_selections_2d: + _test_get_basic_selection(a, z, selection) + + bad_selections = basic_selections_2d_bad + [ + # integer arrays + [0, 1], + (slice(None), [0, 1]), + ] + for selection in bad_selections: + with pytest.raises(IndexError): + z.get_basic_selection(selection) + # check fallback on fancy indexing + fancy_selection = ([0, 1], [0, 1]) + np.testing.assert_array_equal(z[fancy_selection], [0, 11]) + + +def test_fancy_indexing_fallback_on_get_setitem(store: StorePath): + z = zarr_array_from_numpy_array(store, np.zeros((20, 20))) + z[[1, 2, 3], [1, 2, 3]] = 1 + np.testing.assert_array_equal( + z[:4, :4], + [ + [0, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 1, 0], + [0, 0, 0, 1], + ], + ) + np.testing.assert_array_equal(z[[1, 2, 3], [1, 2, 3]], 1) + # test broadcasting + np.testing.assert_array_equal(z[1, [1, 2, 3]], [1, 0, 0]) + # test 1D fancy indexing + z2 = zarr_array_from_numpy_array(store, np.zeros(5)) + z2[[1, 2, 3]] = 1 + np.testing.assert_array_equal(z2[:], [0, 1, 1, 1, 0]) + + +@pytest.mark.parametrize( + "index,expected_result", + [ + # Single iterable of integers + ([0, 1], [[0, 1, 2], [3, 4, 5]]), + # List first, then slice + (([0, 1], slice(None)), [[0, 1, 2], [3, 4, 5]]), + # List first, then slice + (([0, 1], slice(1, None)), [[1, 2], [4, 5]]), + # Slice first, then list + ((slice(0, 2), [0, 2]), [[0, 2], [3, 5]]), + # Slices only + ((slice(0, 2), slice(0, 2)), [[0, 1], [3, 4]]), + # List with repeated index + (([1, 0, 1], slice(1, None)), [[4, 5], [1, 2], [4, 5]]), + # 1D indexing + (([1, 0, 1]), [[3, 4, 5], [0, 1, 2], [3, 4, 5]]), + ], +) +def test_orthogonal_indexing_fallback_on_getitem_2d(store: StorePath, index, expected_result): + """ + Tests the orthogonal indexing fallback on __getitem__ for a 2D matrix. + + In addition to checking expected behavior, all indexing + is also checked against numpy. + """ + # [0, 1, 2], + # [3, 4, 5], + # [6, 7, 8] + a = np.arange(9).reshape(3, 3) + z = zarr_array_from_numpy_array(store, a) + + np.testing.assert_array_equal(z[index], a[index], err_msg="Indexing disagrees with numpy") + np.testing.assert_array_equal(z[index], expected_result) + + +@pytest.mark.parametrize( + "index,expected_result", + [ + # Single iterable of integers + ([0, 1], [[[0, 1, 2], [3, 4, 5], [6, 7, 8]], [[9, 10, 11], [12, 13, 14], [15, 16, 17]]]), + # One slice, two integers + ((slice(0, 2), 1, 1), [4, 13]), + # One integer, two slices + ((slice(0, 2), 1, slice(0, 2)), [[3, 4], [12, 13]]), + # Two slices and a list + ((slice(0, 2), [1, 2], slice(0, 2)), [[[3, 4], [6, 7]], [[12, 13], [15, 16]]]), + ], +) +def test_orthogonal_indexing_fallback_on_getitem_3d(store: StorePath, index, expected_result): + """ + Tests the orthogonal indexing fallback on __getitem__ for a 3D matrix. + + In addition to checking expected behavior, all indexing + is also checked against numpy. + """ + # [[[ 0, 1, 2], + # [ 3, 4, 5], + # [ 6, 7, 8]], + + # [[ 9, 10, 11], + # [12, 13, 14], + # [15, 16, 17]], + + # [[18, 19, 20], + # [21, 22, 23], + # [24, 25, 26]]] + a = np.arange(27).reshape(3, 3, 3) + z = zarr_array_from_numpy_array(store, a) + + np.testing.assert_array_equal(z[index], a[index], err_msg="Indexing disagrees with numpy") + np.testing.assert_array_equal(z[index], expected_result) + + +@pytest.mark.parametrize( + "index,expected_result", + [ + # Single iterable of integers + ([0, 1], [[1, 1, 1], [1, 1, 1], [0, 0, 0]]), + # List and slice combined + (([0, 1], slice(1, 3)), [[0, 1, 1], [0, 1, 1], [0, 0, 0]]), + # Index repetition is ignored on setitem + (([0, 1, 1, 1, 1, 1, 1], slice(1, 3)), [[0, 1, 1], [0, 1, 1], [0, 0, 0]]), + # Slice with step + (([0, 2], slice(None, None, 2)), [[1, 0, 1], [0, 0, 0], [1, 0, 1]]), + ], +) +def test_orthogonal_indexing_fallback_on_setitem_2d(store: StorePath, index, expected_result): + """ + Tests the orthogonal indexing fallback on __setitem__ for a 3D matrix. + + In addition to checking expected behavior, all indexing + is also checked against numpy. + """ + # Slice + fancy index + a = np.zeros((3, 3)) + z = zarr_array_from_numpy_array(store, a) + z[index] = 1 + a[index] = 1 + np.testing.assert_array_equal(z[:], expected_result) + np.testing.assert_array_equal(z[:], a, err_msg="Indexing disagrees with numpy") + + +def test_fancy_indexing_doesnt_mix_with_implicit_slicing(store: StorePath): + z2 = zarr_array_from_numpy_array(store, np.zeros((5, 5, 5))) + with pytest.raises(IndexError): + z2[[1, 2, 3], [1, 2, 3]] = 2 + with pytest.raises(IndexError): + np.testing.assert_array_equal(z2[[1, 2, 3], [1, 2, 3]], 0) + with pytest.raises(IndexError): + z2[..., [1, 2, 3]] = 2 + with pytest.raises(IndexError): + np.testing.assert_array_equal(z2[..., [1, 2, 3]], 0) + + +@pytest.mark.xfail(reason="zero-dimension arrays are not supported in v3") +def test_set_basic_selection_0d(store: StorePath): + # setup + v = np.array(42) + a = np.zeros_like(v) + z = zarr_array_from_numpy_array(store, v) + assert_array_equal(a, z[:]) + + # tests + z.set_basic_selection(Ellipsis, v) + assert_array_equal(v, z[:]) + z[...] = 0 + assert_array_equal(a, z[:]) + z[...] = v + assert_array_equal(v, z[:]) + + # test structured array + value = (b"aaa", 1, 4.2) + v = np.array(value, dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")]) + a = np.zeros_like(v) + z = zarr_array_from_numpy_array(store, a) + + # tests + z.set_basic_selection(Ellipsis, v) + assert_array_equal(v, z[:]) + z.set_basic_selection(Ellipsis, a) + assert_array_equal(a, z[:]) + z[...] = v + assert_array_equal(v, z[:]) + z[...] = a + assert_array_equal(a, z[:]) + # with fields + z.set_basic_selection(Ellipsis, v["foo"], fields="foo") + assert v["foo"] == z["foo"] + assert a["bar"] == z["bar"] + assert a["baz"] == z["baz"] + z["bar"] = v["bar"] + assert v["foo"] == z["foo"] + assert v["bar"] == z["bar"] + assert a["baz"] == z["baz"] + # multiple field assignment not supported + with pytest.raises(IndexError): + z.set_basic_selection(Ellipsis, v[["foo", "bar"]], fields=["foo", "bar"]) + with pytest.raises(IndexError): + z[..., "foo", "bar"] = v[["foo", "bar"]] + + +def _test_get_orthogonal_selection(a, z, selection): + expect = oindex(a, selection) + actual = z.get_orthogonal_selection(selection) + assert_array_equal(expect, actual) + actual = z.oindex[selection] + assert_array_equal(expect, actual) + + +# noinspection PyStatementEffect +def test_get_orthogonal_selection_1d_bool(store: StorePath): + # setup + a = np.arange(1050, dtype=int) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(100,)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + _test_get_orthogonal_selection(a, z, ix) + + # test errors + with pytest.raises(IndexError): + z.oindex[np.zeros(50, dtype=bool)] # too short + with pytest.raises(IndexError): + z.oindex[np.zeros(2000, dtype=bool)] # too long + with pytest.raises(IndexError): + z.oindex[[[True, False], [False, True]]] # too many dimensions + + +# noinspection PyStatementEffect +def test_get_orthogonal_selection_1d_int(store: StorePath): + # setup + a = np.arange(1050, dtype=int) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(100,)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 2, 0.5, 0.1, 0.01: + # unordered + ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + _test_get_orthogonal_selection(a, z, ix) + # increasing + ix.sort() + _test_get_orthogonal_selection(a, z, ix) + # decreasing + ix = ix[::-1] + _test_get_orthogonal_selection(a, z, ix) + + selections = basic_selections_1d + [ + # test wraparound + [0, 3, 10, -23, -12, -1], + # explicit test not sorted + [3, 105, 23, 127], + ] + for selection in selections: + _test_get_orthogonal_selection(a, z, selection) + + bad_selections = basic_selections_1d_bad + [ + [a.shape[0] + 1], # out of bounds + [-(a.shape[0] + 1)], # out of bounds + [[2, 4], [6, 8]], # too many dimensions + ] + for selection in bad_selections: + with pytest.raises(IndexError): + z.get_orthogonal_selection(selection) + with pytest.raises(IndexError): + z.oindex[selection] + + +def _test_get_orthogonal_selection_2d(a, z, ix0, ix1): + selections = [ + # index both axes with array + (ix0, ix1), + # mixed indexing with array / slice + (ix0, slice(1, 5)), + (ix0, slice(1, 5, 2)), + (slice(250, 350), ix1), + (slice(250, 350, 10), ix1), + # mixed indexing with array / int + (ix0, 4), + (42, ix1), + ] + for selection in selections: + _test_get_orthogonal_selection(a, z, selection) + + +# noinspection PyStatementEffect +def test_get_orthogonal_selection_2d(store: StorePath): + # setup + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(300, 3)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + # boolean arrays + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) + _test_get_orthogonal_selection_2d(a, z, ix0, ix1) + + # mixed int array / bool array + selections = ( + (ix0, np.nonzero(ix1)[0]), + (np.nonzero(ix0)[0], ix1), + ) + for selection in selections: + _test_get_orthogonal_selection(a, z, selection) + + # integer arrays + ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * 0.5), replace=True) + _test_get_orthogonal_selection_2d(a, z, ix0, ix1) + ix0.sort() + ix1.sort() + _test_get_orthogonal_selection_2d(a, z, ix0, ix1) + ix0 = ix0[::-1] + ix1 = ix1[::-1] + _test_get_orthogonal_selection_2d(a, z, ix0, ix1) + + for selection in basic_selections_2d: + _test_get_orthogonal_selection(a, z, selection) + + for selection in basic_selections_2d_bad: + with pytest.raises(IndexError): + z.get_orthogonal_selection(selection) + with pytest.raises(IndexError): + z.oindex[selection] + + +def _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2): + selections = [ + # single value + (84, 42, 4), + (-1, -1, -1), + # index all axes with array + (ix0, ix1, ix2), + # mixed indexing with single array / slices + (ix0, slice(15, 25), slice(1, 5)), + (slice(50, 70), ix1, slice(1, 5)), + (slice(50, 70), slice(15, 25), ix2), + (ix0, slice(15, 25, 5), slice(1, 5, 2)), + (slice(50, 70, 3), ix1, slice(1, 5, 2)), + (slice(50, 70, 3), slice(15, 25, 5), ix2), + # mixed indexing with single array / ints + (ix0, 42, 4), + (84, ix1, 4), + (84, 42, ix2), + # mixed indexing with single array / slice / int + (ix0, slice(15, 25), 4), + (42, ix1, slice(1, 5)), + (slice(50, 70), 42, ix2), + # mixed indexing with two array / slice + (ix0, ix1, slice(1, 5)), + (slice(50, 70), ix1, ix2), + (ix0, slice(15, 25), ix2), + # mixed indexing with two array / integer + (ix0, ix1, 4), + (42, ix1, ix2), + (ix0, 42, ix2), + ] + for selection in selections: + _test_get_orthogonal_selection(a, z, selection) + + +def test_get_orthogonal_selection_3d(store: StorePath): + # setup + a = np.arange(100000, dtype=int).reshape(200, 50, 10) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(60, 20, 3)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + # boolean arrays + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) + ix2 = np.random.binomial(1, 0.5, size=a.shape[2]).astype(bool) + _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2) + + # integer arrays + ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * 0.5), replace=True) + ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * 0.5), replace=True) + _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2) + ix0.sort() + ix1.sort() + ix2.sort() + _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2) + ix0 = ix0[::-1] + ix1 = ix1[::-1] + ix2 = ix2[::-1] + _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2) + + +def test_orthogonal_indexing_edge_cases(store: StorePath): + a = np.arange(6).reshape(1, 2, 3) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(1, 2, 3)) + + expect = oindex(a, (0, slice(None), [0, 1, 2])) + actual = z.oindex[0, :, [0, 1, 2]] + assert_array_equal(expect, actual) + + expect = oindex(a, (0, slice(None), [True, True, True])) + actual = z.oindex[0, :, [True, True, True]] + assert_array_equal(expect, actual) + + +def _test_set_orthogonal_selection(v, a, z, selection): + for value in 42, oindex(v, selection), oindex(v, selection).tolist(): + if isinstance(value, list) and value == []: + # skip these cases as cannot preserve all dimensions + continue + # setup expectation + a[:] = 0 + oindex_set(a, selection, value) + # long-form API + z[:] = 0 + z.set_orthogonal_selection(selection, value) + assert_array_equal(a, z[:]) + # short-form API + z[:] = 0 + z.oindex[selection] = value + assert_array_equal(a, z[:]) + + +def test_set_orthogonal_selection_1d(store: StorePath): + # setup + v = np.arange(1050, dtype=int) + a = np.empty(v.shape, dtype=int) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(100,)) + + # test with different degrees of sparseness + np.random.seed(42) + for p in 0.5, 0.1, 0.01: + # boolean arrays + ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + _test_set_orthogonal_selection(v, a, z, ix) + + # integer arrays + ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + _test_set_orthogonal_selection(v, a, z, ix) + ix.sort() + _test_set_orthogonal_selection(v, a, z, ix) + ix = ix[::-1] + _test_set_orthogonal_selection(v, a, z, ix) + + # basic selections + for selection in basic_selections_1d: + _test_set_orthogonal_selection(v, a, z, selection) + + +def _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1): + selections = [ + # index both axes with array + (ix0, ix1), + # mixed indexing with array / slice or int + (ix0, slice(1, 5)), + (slice(250, 350), ix1), + (ix0, 4), + (42, ix1), + ] + for selection in selections: + _test_set_orthogonal_selection(v, a, z, selection) + + +def test_set_orthogonal_selection_2d(store: StorePath): + # setup + v = np.arange(10000, dtype=int).reshape(1000, 10) + a = np.empty_like(v) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(300, 3)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + # boolean arrays + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) + _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1) + + # integer arrays + ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * 0.5), replace=True) + _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1) + ix0.sort() + ix1.sort() + _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1) + ix0 = ix0[::-1] + ix1 = ix1[::-1] + _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1) + + for selection in basic_selections_2d: + _test_set_orthogonal_selection(v, a, z, selection) + + +def _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2): + selections = ( + # single value + (84, 42, 4), + (-1, -1, -1), + # index all axes with bool array + (ix0, ix1, ix2), + # mixed indexing with single bool array / slice or int + (ix0, slice(15, 25), slice(1, 5)), + (slice(50, 70), ix1, slice(1, 5)), + (slice(50, 70), slice(15, 25), ix2), + (ix0, 42, 4), + (84, ix1, 4), + (84, 42, ix2), + (ix0, slice(15, 25), 4), + (slice(50, 70), ix1, 4), + (slice(50, 70), 42, ix2), + # indexing with two arrays / slice + (ix0, ix1, slice(1, 5)), + # indexing with two arrays / integer + (ix0, ix1, 4), + ) + for selection in selections: + _test_set_orthogonal_selection(v, a, z, selection) + + +def test_set_orthogonal_selection_3d(store: StorePath): + # setup + v = np.arange(100000, dtype=int).reshape(200, 50, 10) + a = np.empty_like(v) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(60, 20, 3)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + # boolean arrays + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) + ix2 = np.random.binomial(1, 0.5, size=a.shape[2]).astype(bool) + _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2) + + # integer arrays + ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * 0.5), replace=True) + ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * 0.5), replace=True) + _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2) + + # sorted increasing + ix0.sort() + ix1.sort() + ix2.sort() + _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2) + + # sorted decreasing + ix0 = ix0[::-1] + ix1 = ix1[::-1] + ix2 = ix2[::-1] + _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2) + + +def test_orthogonal_indexing_fallback_on_get_setitem(store: StorePath): + z = zarr_array_from_numpy_array(store, np.zeros((20, 20))) + z[[1, 2, 3], [1, 2, 3]] = 1 + np.testing.assert_array_equal( + z[:4, :4], + [ + [0, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 1, 0], + [0, 0, 0, 1], + ], + ) + np.testing.assert_array_equal(z[[1, 2, 3], [1, 2, 3]], 1) + # test broadcasting + np.testing.assert_array_equal(z[1, [1, 2, 3]], [1, 0, 0]) + # test 1D fancy indexing + z2 = zarr_array_from_numpy_array(store, np.zeros(5)) + z2[[1, 2, 3]] = 1 + np.testing.assert_array_equal(z2[:], [0, 1, 1, 1, 0]) + + +def _test_get_coordinate_selection(a, z, selection): + expect = a[selection] + actual = z.get_coordinate_selection(selection) + assert_array_equal(expect, actual) + actual = z.vindex[selection] + assert_array_equal(expect, actual) + + +coordinate_selections_1d_bad = [ + # slice not supported + slice(5, 15), + slice(None), + Ellipsis, + # bad stuff + 2.3, + "foo", + b"xxx", + None, + (0, 0), + (slice(None), slice(None)), +] + + +# noinspection PyStatementEffect +def test_get_coordinate_selection_1d(store: StorePath): + # setup + a = np.arange(1050, dtype=int) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(100,)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 2, 0.5, 0.1, 0.01: + n = int(a.size * p) + ix = np.random.choice(a.shape[0], size=n, replace=True) + _test_get_coordinate_selection(a, z, ix) + ix.sort() + _test_get_coordinate_selection(a, z, ix) + ix = ix[::-1] + _test_get_coordinate_selection(a, z, ix) + + selections = [ + # test single item + 42, + -1, + # test wraparound + [0, 3, 10, -23, -12, -1], + # test out of order + [3, 105, 23, 127], # not monotonically increasing + # test multi-dimensional selection + np.array([[2, 4], [6, 8]]), + ] + for selection in selections: + _test_get_coordinate_selection(a, z, selection) + + # test errors + bad_selections = coordinate_selections_1d_bad + [ + [a.shape[0] + 1], # out of bounds + [-(a.shape[0] + 1)], # out of bounds + ] + for selection in bad_selections: + with pytest.raises(IndexError): + z.get_coordinate_selection(selection) + with pytest.raises(IndexError): + z.vindex[selection] + + +def test_get_coordinate_selection_2d(store: StorePath): + # setup + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(300, 3)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 2, 0.5, 0.1, 0.01: + n = int(a.size * p) + ix0 = np.random.choice(a.shape[0], size=n, replace=True) + ix1 = np.random.choice(a.shape[1], size=n, replace=True) + selections = [ + # single value + (42, 4), + (-1, -1), + # index both axes with array + (ix0, ix1), + # mixed indexing with array / int + (ix0, 4), + (42, ix1), + (42, 4), + ] + for selection in selections: + _test_get_coordinate_selection(a, z, selection) + + # not monotonically increasing (first dim) + ix0 = [3, 3, 4, 2, 5] + ix1 = [1, 3, 5, 7, 9] + _test_get_coordinate_selection(a, z, (ix0, ix1)) + + # not monotonically increasing (second dim) + ix0 = [1, 1, 2, 2, 5] + ix1 = [1, 3, 2, 1, 0] + _test_get_coordinate_selection(a, z, (ix0, ix1)) + + # multi-dimensional selection + ix0 = np.array([[1, 1, 2], [2, 2, 5]]) + ix1 = np.array([[1, 3, 2], [1, 0, 0]]) + _test_get_coordinate_selection(a, z, (ix0, ix1)) + + with pytest.raises(IndexError): + selection = slice(5, 15), [1, 2, 3] + z.get_coordinate_selection(selection) + with pytest.raises(IndexError): + selection = [1, 2, 3], slice(5, 15) + z.get_coordinate_selection(selection) + with pytest.raises(IndexError): + selection = Ellipsis, [1, 2, 3] + z.get_coordinate_selection(selection) + with pytest.raises(IndexError): + selection = Ellipsis + z.get_coordinate_selection(selection) + + +def _test_set_coordinate_selection(v, a, z, selection): + for value in 42, v[selection], v[selection].tolist(): + # setup expectation + a[:] = 0 + a[selection] = value + # test long-form API + z[:] = 0 + z.set_coordinate_selection(selection, value) + assert_array_equal(a, z[:]) + # test short-form API + z[:] = 0 + z.vindex[selection] = value + assert_array_equal(a, z[:]) + + +def test_set_coordinate_selection_1d(store: StorePath): + # setup + v = np.arange(1050, dtype=int) + a = np.empty(v.shape, dtype=v.dtype) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(100,)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 2, 0.5, 0.1, 0.01: + n = int(a.size * p) + ix = np.random.choice(a.shape[0], size=n, replace=True) + _test_set_coordinate_selection(v, a, z, ix) + + # multi-dimensional selection + ix = np.array([[2, 4], [6, 8]]) + _test_set_coordinate_selection(v, a, z, ix) + + for selection in coordinate_selections_1d_bad: + with pytest.raises(IndexError): + z.set_coordinate_selection(selection, 42) + with pytest.raises(IndexError): + z.vindex[selection] = 42 + + +def test_set_coordinate_selection_2d(store: StorePath): + # setup + v = np.arange(10000, dtype=int).reshape(1000, 10) + a = np.empty_like(v) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(300, 3)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 2, 0.5, 0.1, 0.01: + n = int(a.size * p) + ix0 = np.random.choice(a.shape[0], size=n, replace=True) + ix1 = np.random.choice(a.shape[1], size=n, replace=True) + + selections = ( + (42, 4), + (-1, -1), + # index both axes with array + (ix0, ix1), + # mixed indexing with array / int + (ix0, 4), + (42, ix1), + ) + for selection in selections: + _test_set_coordinate_selection(v, a, z, selection) + + # multi-dimensional selection + ix0 = np.array([[1, 2, 3], [4, 5, 6]]) + ix1 = np.array([[1, 3, 2], [2, 0, 5]]) + _test_set_coordinate_selection(v, a, z, (ix0, ix1)) + + +def _test_get_block_selection(a, z, selection, expected_idx): + expect = a[expected_idx] + actual = z.get_block_selection(selection) + assert_array_equal(expect, actual) + actual = z.blocks[selection] + assert_array_equal(expect, actual) + + +block_selections_1d = [ + # test single item + 0, + 5, + # test wraparound + -1, + -4, + # test slice + slice(5), + slice(None, 3), + slice(5, 6), + slice(-3, -1), + slice(None), # Full slice +] + +block_selections_1d_array_projection = [ + # test single item + slice(100), + slice(500, 600), + # test wraparound + slice(1000, None), + slice(700, 800), + # test slice + slice(500), + slice(None, 300), + slice(500, 600), + slice(800, 1000), + slice(None), +] + +block_selections_1d_bad = [ + # slice not supported + slice(3, 8, 2), + # bad stuff + 2.3, + # "foo", # TODO + b"xxx", + None, + (0, 0), + (slice(None), slice(None)), + [0, 5, 3], +] + + +def test_get_block_selection_1d(store: StorePath): + # setup + a = np.arange(1050, dtype=int) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(100,)) + + for selection, expected_idx in zip( + block_selections_1d, block_selections_1d_array_projection, strict=True + ): + _test_get_block_selection(a, z, selection, expected_idx) + + bad_selections = block_selections_1d_bad + [ + z.metadata.chunk_grid.get_nchunks(z.shape) + 1, # out of bounds + -(z.metadata.chunk_grid.get_nchunks(z.shape) + 1), # out of bounds + ] + + for selection in bad_selections: + with pytest.raises(IndexError): + z.get_block_selection(selection) + with pytest.raises(IndexError): + z.blocks[selection] + + +block_selections_2d = [ + # test single item + (0, 0), + (1, 2), + # test wraparound + (-1, -1), + (-3, -2), + # test slice + (slice(1), slice(2)), + (slice(None, 2), slice(-2, -1)), + (slice(2, 3), slice(-2, None)), + (slice(-3, -1), slice(-3, -2)), + (slice(None), slice(None)), # Full slice +] + +block_selections_2d_array_projection = [ + # test single item + (slice(300), slice(3)), + (slice(300, 600), slice(6, 9)), + # test wraparound + (slice(900, None), slice(9, None)), + (slice(300, 600), slice(6, 9)), + # test slice + (slice(300), slice(6)), + (slice(None, 600), slice(6, 9)), + (slice(600, 900), slice(6, None)), + (slice(300, 900), slice(3, 6)), + (slice(None), slice(None)), # Full slice +] + + +def test_get_block_selection_2d(store: StorePath): + # setup + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(300, 3)) + + for selection, expected_idx in zip( + block_selections_2d, block_selections_2d_array_projection, strict=True + ): + _test_get_block_selection(a, z, selection, expected_idx) + + with pytest.raises(IndexError): + selection = slice(5, 15), [1, 2, 3] + z.get_block_selection(selection) + with pytest.raises(IndexError): + selection = Ellipsis, [1, 2, 3] + z.get_block_selection(selection) + with pytest.raises(IndexError): # out of bounds + selection = slice(15, 20), slice(None) + z.get_block_selection(selection) + + +def _test_set_block_selection(v: np.ndarray, a: np.ndarray, z: zarr.Array, selection, expected_idx): + for value in 42, v[expected_idx], v[expected_idx].tolist(): + # setup expectation + a[:] = 0 + a[expected_idx] = value + # test long-form API + z[:] = 0 + z.set_block_selection(selection, value) + assert_array_equal(a, z[:]) + # test short-form API + z[:] = 0 + z.blocks[selection] = value + assert_array_equal(a, z[:]) + + +def test_set_block_selection_1d(store: StorePath): + # setup + v = np.arange(1050, dtype=int) + a = np.empty(v.shape, dtype=v.dtype) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(100,)) + + for selection, expected_idx in zip( + block_selections_1d, block_selections_1d_array_projection, strict=True + ): + _test_set_block_selection(v, a, z, selection, expected_idx) + + for selection in block_selections_1d_bad: + with pytest.raises(IndexError): + z.set_block_selection(selection, 42) + with pytest.raises(IndexError): + z.blocks[selection] = 42 + + +def test_set_block_selection_2d(store: StorePath): + # setup + v = np.arange(10000, dtype=int).reshape(1000, 10) + a = np.empty(v.shape, dtype=v.dtype) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(300, 3)) + + for selection, expected_idx in zip( + block_selections_2d, block_selections_2d_array_projection, strict=True + ): + _test_set_block_selection(v, a, z, selection, expected_idx) + + with pytest.raises(IndexError): + selection = slice(5, 15), [1, 2, 3] + z.set_block_selection(selection, 42) + with pytest.raises(IndexError): + selection = Ellipsis, [1, 2, 3] + z.set_block_selection(selection, 42) + with pytest.raises(IndexError): # out of bounds + selection = slice(15, 20), slice(None) + z.set_block_selection(selection, 42) + + +def _test_get_mask_selection(a, z, selection): + expect = a[selection] + actual = z.get_mask_selection(selection) + assert_array_equal(expect, actual) + actual = z.vindex[selection] + assert_array_equal(expect, actual) + + +mask_selections_1d_bad = [ + # slice not supported + slice(5, 15), + slice(None), + Ellipsis, + # bad stuff + 2.3, + "foo", + b"xxx", + None, + (0, 0), + (slice(None), slice(None)), +] + + +# noinspection PyStatementEffect +def test_get_mask_selection_1d(store: StorePath): + # setup + a = np.arange(1050, dtype=int) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(100,)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + _test_get_mask_selection(a, z, ix) + + # test errors + bad_selections = mask_selections_1d_bad + [ + np.zeros(50, dtype=bool), # too short + np.zeros(2000, dtype=bool), # too long + [[True, False], [False, True]], # too many dimensions + ] + for selection in bad_selections: + with pytest.raises(IndexError): + z.get_mask_selection(selection) + with pytest.raises(IndexError): + z.vindex[selection] + + +# noinspection PyStatementEffect +def test_get_mask_selection_2d(store: StorePath): + # setup + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(300, 3)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix = np.random.binomial(1, p, size=a.size).astype(bool).reshape(a.shape) + _test_get_mask_selection(a, z, ix) + + # test errors + with pytest.raises(IndexError): + z.vindex[np.zeros((1000, 5), dtype=bool)] # too short + with pytest.raises(IndexError): + z.vindex[np.zeros((2000, 10), dtype=bool)] # too long + with pytest.raises(IndexError): + z.vindex[[True, False]] # wrong no. dimensions + + +def _test_set_mask_selection(v, a, z, selection): + a[:] = 0 + z[:] = 0 + a[selection] = v[selection] + z.set_mask_selection(selection, v[selection]) + assert_array_equal(a, z[:]) + z[:] = 0 + z.vindex[selection] = v[selection] + assert_array_equal(a, z[:]) + + +def test_set_mask_selection_1d(store: StorePath): + # setup + v = np.arange(1050, dtype=int) + a = np.empty_like(v) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(100,)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + _test_set_mask_selection(v, a, z, ix) + + for selection in mask_selections_1d_bad: + with pytest.raises(IndexError): + z.set_mask_selection(selection, 42) + with pytest.raises(IndexError): + z.vindex[selection] = 42 + + +def test_set_mask_selection_2d(store: StorePath): + # setup + v = np.arange(10000, dtype=int).reshape(1000, 10) + a = np.empty_like(v) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(300, 3)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix = np.random.binomial(1, p, size=a.size).astype(bool).reshape(a.shape) + _test_set_mask_selection(v, a, z, ix) + + +def test_get_selection_out(store: StorePath): + # basic selections + a = np.arange(1050) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(100,)) + + selections = [ + slice(50, 150), + slice(0, 1050), + slice(1, 2), + ] + for selection in selections: + expect = a[selection] + out = NDBuffer.from_numpy_array(np.empty(expect.shape)) + z.get_basic_selection(selection, out=out) + assert_array_equal(expect, out.as_numpy_array()[:]) + + with pytest.raises(TypeError): + z.get_basic_selection(Ellipsis, out=[]) + + # orthogonal selections + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(300, 3)) + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) + selections = [ + # index both axes with array + (ix0, ix1), + # mixed indexing with array / slice + (ix0, slice(1, 5)), + (slice(250, 350), ix1), + # mixed indexing with array / int + (ix0, 4), + (42, ix1), + # mixed int array / bool array + (ix0, np.nonzero(ix1)[0]), + (np.nonzero(ix0)[0], ix1), + ] + for selection in selections: + expect = oindex(a, selection) + out = NDBuffer.from_numpy_array(np.zeros(expect.shape, dtype=expect.dtype)) + z.get_orthogonal_selection(selection, out=out) + assert_array_equal(expect, out.as_numpy_array()[:]) + + # coordinate selections + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(300, 3)) + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + n = int(a.size * p) + ix0 = np.random.choice(a.shape[0], size=n, replace=True) + ix1 = np.random.choice(a.shape[1], size=n, replace=True) + selections = [ + # index both axes with array + (ix0, ix1), + # mixed indexing with array / int + (ix0, 4), + (42, ix1), + ] + for selection in selections: + expect = a[selection] + out = NDBuffer.from_numpy_array(np.zeros(expect.shape, dtype=expect.dtype)) + z.get_coordinate_selection(selection, out=out) + assert_array_equal(expect, out.as_numpy_array()[:]) + + +@pytest.mark.xfail(reason="fields are not supported in v3") +def test_get_selections_with_fields(store: StorePath): + a = [("aaa", 1, 4.2), ("bbb", 2, 8.4), ("ccc", 3, 12.6)] + a = np.array(a, dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")]) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(2,)) + + fields_fixture = [ + "foo", + ["foo"], + ["foo", "bar"], + ["foo", "baz"], + ["bar", "baz"], + ["foo", "bar", "baz"], + ["bar", "foo"], + ["baz", "bar", "foo"], + ] + + for fields in fields_fixture: + # total selection + expect = a[fields] + actual = z.get_basic_selection(Ellipsis, fields=fields) + assert_array_equal(expect, actual) + # alternative API + if isinstance(fields, str): + actual = z[fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z[fields[0], fields[1]] + assert_array_equal(expect, actual) + if isinstance(fields, str): + actual = z[..., fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z[..., fields[0], fields[1]] + assert_array_equal(expect, actual) + + # basic selection with slice + expect = a[fields][0:2] + actual = z.get_basic_selection(slice(0, 2), fields=fields) + assert_array_equal(expect, actual) + # alternative API + if isinstance(fields, str): + actual = z[0:2, fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z[0:2, fields[0], fields[1]] + assert_array_equal(expect, actual) + + # basic selection with single item + expect = a[fields][1] + actual = z.get_basic_selection(1, fields=fields) + assert_array_equal(expect, actual) + # alternative API + if isinstance(fields, str): + actual = z[1, fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z[1, fields[0], fields[1]] + assert_array_equal(expect, actual) + + # orthogonal selection + ix = [0, 2] + expect = a[fields][ix] + actual = z.get_orthogonal_selection(ix, fields=fields) + assert_array_equal(expect, actual) + # alternative API + if isinstance(fields, str): + actual = z.oindex[ix, fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z.oindex[ix, fields[0], fields[1]] + assert_array_equal(expect, actual) + + # coordinate selection + ix = [0, 2] + expect = a[fields][ix] + actual = z.get_coordinate_selection(ix, fields=fields) + assert_array_equal(expect, actual) + # alternative API + if isinstance(fields, str): + actual = z.vindex[ix, fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z.vindex[ix, fields[0], fields[1]] + assert_array_equal(expect, actual) + + # mask selection + ix = [True, False, True] + expect = a[fields][ix] + actual = z.get_mask_selection(ix, fields=fields) + assert_array_equal(expect, actual) + # alternative API + if isinstance(fields, str): + actual = z.vindex[ix, fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z.vindex[ix, fields[0], fields[1]] + assert_array_equal(expect, actual) + + # missing/bad fields + with pytest.raises(IndexError): + z.get_basic_selection(Ellipsis, fields=["notafield"]) + with pytest.raises(IndexError): + z.get_basic_selection(Ellipsis, fields=slice(None)) + + +@pytest.mark.xfail(reason="fields are not supported in v3") +def test_set_selections_with_fields(store: StorePath): + v = [("aaa", 1, 4.2), ("bbb", 2, 8.4), ("ccc", 3, 12.6)] + v = np.array(v, dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")]) + a = np.empty_like(v) + z = zarr_array_from_numpy_array(store, v, chunk_shape=(2,)) + + fields_fixture = [ + "foo", + [], + ["foo"], + ["foo", "bar"], + ["foo", "baz"], + ["bar", "baz"], + ["foo", "bar", "baz"], + ["bar", "foo"], + ["baz", "bar", "foo"], + ] + + for fields in fields_fixture: + # currently multi-field assignment is not supported in numpy, so we won't support + # it either + if isinstance(fields, list) and len(fields) > 1: + with pytest.raises(IndexError): + z.set_basic_selection(Ellipsis, v, fields=fields) + with pytest.raises(IndexError): + z.set_orthogonal_selection([0, 2], v, fields=fields) + with pytest.raises(IndexError): + z.set_coordinate_selection([0, 2], v, fields=fields) + with pytest.raises(IndexError): + z.set_mask_selection([True, False, True], v, fields=fields) + + else: + if isinstance(fields, list) and len(fields) == 1: + # work around numpy does not support multi-field assignment even if there + # is only one field + key = fields[0] + elif isinstance(fields, list) and len(fields) == 0: + # work around numpy ambiguity about what is a field selection + key = Ellipsis + else: + key = fields + + # setup expectation + a[:] = ("", 0, 0) + z[:] = ("", 0, 0) + assert_array_equal(a, z[:]) + a[key] = v[key] + # total selection + z.set_basic_selection(Ellipsis, v[key], fields=fields) + assert_array_equal(a, z[:]) + + # basic selection with slice + a[:] = ("", 0, 0) + z[:] = ("", 0, 0) + a[key][0:2] = v[key][0:2] + z.set_basic_selection(slice(0, 2), v[key][0:2], fields=fields) + assert_array_equal(a, z[:]) + + # orthogonal selection + a[:] = ("", 0, 0) + z[:] = ("", 0, 0) + ix = [0, 2] + a[key][ix] = v[key][ix] + z.set_orthogonal_selection(ix, v[key][ix], fields=fields) + assert_array_equal(a, z[:]) + + # coordinate selection + a[:] = ("", 0, 0) + z[:] = ("", 0, 0) + ix = [0, 2] + a[key][ix] = v[key][ix] + z.set_coordinate_selection(ix, v[key][ix], fields=fields) + assert_array_equal(a, z[:]) + + # mask selection + a[:] = ("", 0, 0) + z[:] = ("", 0, 0) + ix = [True, False, True] + a[key][ix] = v[key][ix] + z.set_mask_selection(ix, v[key][ix], fields=fields) + assert_array_equal(a, z[:]) + + +def test_slice_selection_uints(): + arr = np.arange(24).reshape((4, 6)) + idx = np.uint64(3) + slice_sel = make_slice_selection((idx,)) + assert arr[tuple(slice_sel)].shape == (1, 6) + + +def test_numpy_int_indexing(store: StorePath): + a = np.arange(1050) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(100,)) + assert a[42] == z[42] + assert a[np.int64(42)] == z[np.int64(42)] + + +@pytest.mark.parametrize( + "shape, chunks, ops", + [ + # 1D test cases + ((1070,), (50,), [("__getitem__", (slice(200, 400),))]), + ((1070,), (50,), [("__getitem__", (slice(200, 400, 100),))]), + ( + (1070,), + (50,), + [ + ("__getitem__", (slice(200, 400),)), + ("__setitem__", (slice(200, 400, 100),)), + ], + ), + # 2D test cases + ( + (40, 50), + (5, 8), + [ + ("__getitem__", (slice(6, 37, 13), (slice(4, 10)))), + ("__setitem__", (slice(None), (slice(None)))), + ], + ), + ], +) +def test_accessed_chunks(shape, chunks, ops): + # Test that only the required chunks are accessed during basic selection operations + # shape: array shape + # chunks: chunk size + # ops: list of tuples with (optype, tuple of slices) + # optype = "__getitem__" or "__setitem__", tuple length must match number of dims + import itertools + + # Use a counting dict as the backing store so we can track the items access + store = CountingDict() + z = zarr_array_from_numpy_array(StorePath(store), np.zeros(shape), chunk_shape=chunks) + + for ii, (optype, slices) in enumerate(ops): + # Resolve the slices into the accessed chunks for each dimension + chunks_per_dim = [] + for N, C, sl in zip(shape, chunks, slices, strict=True): + chunk_ind = np.arange(N, dtype=int)[sl] // C + chunks_per_dim.append(np.unique(chunk_ind)) + + # Combine and generate the cartesian product to determine the chunks keys that + # will be accessed + chunks_accessed = [] + for comb in itertools.product(*chunks_per_dim): + chunks_accessed.append(".".join([str(ci) for ci in comb])) + + counts_before = store.counter.copy() + + # Perform the operation + if optype == "__getitem__": + z[slices] + else: + z[slices] = ii + + # Get the change in counts + delta_counts = store.counter - counts_before + + # Check that the access counts for the operation have increased by one for all + # the chunks we expect to be included + for ci in chunks_accessed: + assert delta_counts.pop((optype, ci)) == 1 + + # If the chunk was partially written to it will also have been read once. We + # don't determine if the chunk was actually partial here, just that the + # counts are consistent that this might have happened + if optype == "__setitem__": + assert ("__getitem__", ci) not in delta_counts or delta_counts.pop( + ("__getitem__", ci) + ) == 1 + # Check that no other chunks were accessed + assert len(delta_counts) == 0 diff --git a/tests/v3/test_sync.py b/tests/v3/test_sync.py index 5b953573d..7e3b8dd11 100644 --- a/tests/v3/test_sync.py +++ b/tests/v3/test_sync.py @@ -1,5 +1,4 @@ import asyncio -import time from collections.abc import AsyncGenerator from unittest.mock import AsyncMock, patch @@ -48,7 +47,7 @@ def test_sync_timeout() -> None: duration = 0.002 async def foo() -> None: - time.sleep(duration) + await asyncio.sleep(duration) with pytest.raises(asyncio.TimeoutError): sync(foo(), timeout=duration / 2)