Skip to content

Commit

Permalink
Merge branch 'v3' into feature/zip-store
Browse files Browse the repository at this point in the history
  • Loading branch information
d-v-b committed Sep 13, 2024
2 parents e9f808b + 52d6849 commit 9cbd5df
Show file tree
Hide file tree
Showing 9 changed files with 448 additions and 385 deletions.
3 changes: 2 additions & 1 deletion src/zarr/api/asynchronous.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
from zarr.core.array import Array, AsyncArray
from zarr.core.common import JSON, AccessModeLiteral, ChunkCoords, MemoryOrder, ZarrFormat
from zarr.core.group import AsyncGroup
from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata
from zarr.core.metadata.v2 import ArrayV2Metadata
from zarr.core.metadata.v3 import ArrayV3Metadata
from zarr.store import (
StoreLike,
make_store_path,
Expand Down
2 changes: 1 addition & 1 deletion src/zarr/codecs/sharding.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
get_indexer,
morton_order_iter,
)
from zarr.core.metadata import parse_codecs
from zarr.core.metadata.v3 import parse_codecs
from zarr.registry import get_ndbuffer_class, get_pipeline_class, register_codec

if TYPE_CHECKING:
Expand Down
4 changes: 3 additions & 1 deletion src/zarr/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@
is_scalar,
pop_fields,
)
from zarr.core.metadata import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata
from zarr.core.metadata.v2 import ArrayV2Metadata
from zarr.core.metadata.v3 import ArrayV3Metadata
from zarr.core.sync import sync
from zarr.registry import get_pipeline_class
from zarr.store import StoreLike, StorePath, make_store_path
Expand All @@ -67,6 +68,7 @@
from collections.abc import Iterable

from zarr.abc.codec import Codec, CodecPipeline
from zarr.core.metadata.common import ArrayMetadata

# Array and AsyncArray are defined in the base ``zarr`` namespace
__all__ = ["parse_array_metadata", "create_codec_pipeline"]
Expand Down
4 changes: 4 additions & 0 deletions src/zarr/core/metadata/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .v2 import ArrayV2Metadata
from .v3 import ArrayV3Metadata

# Public API of zarr.core.metadata: re-export the concrete metadata classes
# for both Zarr formats so callers need not import the v2/v3 submodules.
__all__ = ["ArrayV2Metadata", "ArrayV3Metadata"]
67 changes: 67 additions & 0 deletions src/zarr/core/metadata/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
from typing import Any, Literal

import numpy as np
from typing_extensions import Self

from zarr.core.array_spec import ArraySpec
from zarr.core.buffer import Buffer, BufferPrototype
from zarr.core.chunk_grids import ChunkGrid
from zarr.core.common import JSON, ChunkCoords, ZarrFormat

from abc import ABC, abstractmethod
from dataclasses import dataclass

from zarr.abc.metadata import Metadata


@dataclass(frozen=True, kw_only=True)
class ArrayMetadata(Metadata, ABC):
    """
    Abstract base class shared by the Zarr v2 and v3 array metadata classes.

    Declares the attributes and operations the rest of the codebase relies on
    independently of the on-disk metadata format. Instances are frozen
    dataclasses; the ``update_*`` methods return modified copies.
    """

    # Dimensions of the array.
    shape: ChunkCoords
    # Value used for unwritten regions of the array; concrete type depends on dtype.
    fill_value: Any
    # Partitioning of the array into chunks.
    chunk_grid: ChunkGrid
    # Arbitrary user attributes attached to the array.
    attributes: dict[str, JSON]
    # Metadata format version (2 or 3).
    zarr_format: ZarrFormat

    @property
    @abstractmethod
    def dtype(self) -> np.dtype[Any]:
        """Return the numpy dtype of the array's elements."""
        pass

    @property
    @abstractmethod
    def ndim(self) -> int:
        """Return the number of dimensions of the array."""
        pass

    @abstractmethod
    def get_chunk_spec(
        self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype
    ) -> ArraySpec:
        """Return the :class:`ArraySpec` describing the chunk at the given grid coordinates."""
        pass

    @abstractmethod
    def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str:
        """Return the storage key under which the chunk at ``chunk_coords`` is stored."""
        pass

    @abstractmethod
    def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
        """Serialize this metadata to a mapping of document name to encoded :class:`Buffer`."""
        pass

    @abstractmethod
    def update_shape(self, shape: ChunkCoords) -> Self:
        """Return a copy of this metadata with ``shape`` replaced."""
        pass

    @abstractmethod
    def update_attributes(self, attributes: dict[str, JSON]) -> Self:
        """Return a copy of this metadata with ``attributes`` replaced."""
        pass


def parse_attributes(data: None | dict[str, JSON]) -> dict[str, JSON]:
    """Normalize an optional attributes mapping: ``None`` becomes an empty dict."""
    return {} if data is None else data
235 changes: 235 additions & 0 deletions src/zarr/core/metadata/v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
from typing import Any, Literal

import numpy.typing as npt
from typing_extensions import Self

from zarr.core.buffer import Buffer, BufferPrototype
from zarr.core.common import JSON, ChunkCoords

import json
from dataclasses import dataclass, field, replace

import numpy as np

from zarr.core.array_spec import ArraySpec
from zarr.core.chunk_grids import RegularChunkGrid
from zarr.core.chunk_key_encodings import parse_separator
from zarr.core.common import ZARRAY_JSON, ZATTRS_JSON, parse_dtype, parse_shapelike
from zarr.core.config import config, parse_indexing_order
from zarr.core.metadata.common import ArrayMetadata, parse_attributes


@dataclass(frozen=True, kw_only=True)
class ArrayV2Metadata(ArrayMetadata):
    """
    Metadata for a Zarr format 2 array.

    Mirrors the contents of the ``.zarray`` JSON document, with user attributes
    (stored separately in ``.zattrs``) carried in ``attributes``. Instances are
    immutable; use ``update_shape`` / ``update_attributes`` to derive copies.
    """

    shape: ChunkCoords
    chunk_grid: RegularChunkGrid
    data_type: np.dtype[Any]
    fill_value: None | int | float = 0
    order: Literal["C", "F"] = "C"
    filters: list[dict[str, JSON]] | None = None
    dimension_separator: Literal[".", "/"] = "."
    compressor: dict[str, JSON] | None = None
    attributes: dict[str, JSON] = field(default_factory=dict)
    # Always 2 for this class; excluded from __init__ on purpose.
    zarr_format: Literal[2] = field(init=False, default=2)

    def __init__(
        self,
        *,
        shape: ChunkCoords,
        dtype: npt.DTypeLike,
        chunks: ChunkCoords,
        fill_value: Any,
        order: Literal["C", "F"],
        dimension_separator: Literal[".", "/"] = ".",
        compressor: dict[str, JSON] | None = None,
        filters: list[dict[str, JSON]] | None = None,
        attributes: dict[str, JSON] | None = None,
    ):
        """
        Metadata for a Zarr version 2 array.

        Every argument is normalized through its corresponding ``parse_*``
        helper before being stored. ``object.__setattr__`` is used because the
        dataclass is frozen, which blocks ordinary attribute assignment.
        """
        shape_parsed = parse_shapelike(shape)
        data_type_parsed = parse_dtype(dtype)
        chunks_parsed = parse_shapelike(chunks)
        compressor_parsed = parse_compressor(compressor)
        order_parsed = parse_indexing_order(order)
        dimension_separator_parsed = parse_separator(dimension_separator)
        filters_parsed = parse_filters(filters)
        fill_value_parsed = parse_fill_value(fill_value, dtype=data_type_parsed)
        attributes_parsed = parse_attributes(attributes)

        object.__setattr__(self, "shape", shape_parsed)
        object.__setattr__(self, "data_type", data_type_parsed)
        object.__setattr__(self, "chunk_grid", RegularChunkGrid(chunk_shape=chunks_parsed))
        object.__setattr__(self, "compressor", compressor_parsed)
        object.__setattr__(self, "order", order_parsed)
        object.__setattr__(self, "dimension_separator", dimension_separator_parsed)
        object.__setattr__(self, "filters", filters_parsed)
        object.__setattr__(self, "fill_value", fill_value_parsed)
        object.__setattr__(self, "attributes", attributes_parsed)

        # ensure that the metadata document is consistent
        _ = parse_metadata(self)

    @property
    def ndim(self) -> int:
        """Number of dimensions of the array."""
        return len(self.shape)

    @property
    def dtype(self) -> np.dtype[Any]:
        """Numpy dtype of the array's elements (alias for ``data_type``)."""
        return self.data_type

    @property
    def chunks(self) -> ChunkCoords:
        """Shape of a single chunk, taken from the regular chunk grid."""
        return self.chunk_grid.chunk_shape

    def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
        """
        Serialize this metadata to the two Zarr v2 documents: ``.zarray``
        (array structure) and ``.zattrs`` (user attributes), each JSON-encoded
        into a buffer created via ``prototype``.
        """

        def _json_convert(
            o: Any,
        ) -> Any:
            # JSON fallback encoder: dtypes become their string (or structured
            # descr) form, numpy scalars become plain Python values.
            if isinstance(o, np.dtype):
                if o.fields is None:
                    return o.str
                else:
                    return o.descr
            if np.isscalar(o):
                # convert numpy scalar to python type, and pass
                # python types through
                return getattr(o, "item", lambda: o)()
            raise TypeError

        zarray_dict = self.to_dict()

        # todo: remove this check when we can ensure that to_dict always returns dicts.
        if not isinstance(zarray_dict, dict):
            raise TypeError(f"Invalid type: got {type(zarray_dict)}, expected dict.")

        # Attributes live in .zattrs, not .zarray, so split them out.
        zattrs_dict = zarray_dict.pop("attributes", {})
        json_indent = config.get("json_indent")
        return {
            ZARRAY_JSON: prototype.buffer.from_bytes(
                json.dumps(zarray_dict, default=_json_convert, indent=json_indent).encode()
            ),
            ZATTRS_JSON: prototype.buffer.from_bytes(
                json.dumps(zattrs_dict, indent=json_indent).encode()
            ),
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
        """
        Build an instance from a parsed ``.zarray`` document.

        Raises ``ValueError`` if ``zarr_format`` is not 2, and ``KeyError`` if
        it is absent from ``data``.
        """
        # make a copy to protect the original from modification
        _data = data.copy()
        # check that the zarr_format attribute is correct
        _ = parse_zarr_format(_data.pop("zarr_format"))
        return cls(**_data)

    def to_dict(self) -> JSON:
        """
        Return a ``.zarray``-shaped dict: the internal ``chunk_grid`` /
        ``data_type`` fields are replaced by the spec's ``chunks`` / ``dtype`` keys.
        """
        zarray_dict = super().to_dict()

        # todo: remove this check when we can ensure that to_dict always returns dicts.
        if not isinstance(zarray_dict, dict):
            raise TypeError(f"Invalid type: got {type(zarray_dict)}, expected dict.")

        _ = zarray_dict.pop("chunk_grid")
        zarray_dict["chunks"] = self.chunk_grid.chunk_shape

        _ = zarray_dict.pop("data_type")
        zarray_dict["dtype"] = self.data_type.str

        return zarray_dict

    def get_chunk_spec(
        self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype
    ) -> ArraySpec:
        """
        Return the spec for a chunk. ``_chunk_coords`` is unused: in a regular
        chunk grid every chunk has the same shape.
        """
        return ArraySpec(
            shape=self.chunk_grid.chunk_shape,
            dtype=self.dtype,
            fill_value=self.fill_value,
            order=order,
            prototype=prototype,
        )

    def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str:
        """
        Join the chunk coordinates with the dimension separator. An empty
        coordinate tuple (zero-dimensional array) maps to the key "0".
        """
        chunk_identifier = self.dimension_separator.join(map(str, chunk_coords))
        return "0" if chunk_identifier == "" else chunk_identifier

    def update_shape(self, shape: ChunkCoords) -> Self:
        """Return a copy of this metadata with ``shape`` replaced."""
        return replace(self, shape=shape)

    def update_attributes(self, attributes: dict[str, JSON]) -> Self:
        """Return a copy of this metadata with ``attributes`` replaced."""
        return replace(self, attributes=attributes)


def parse_zarr_format(data: Literal[2]) -> Literal[2]:
    """Validate that ``data`` is the Zarr v2 format marker (the value 2)."""
    if data != 2:
        raise ValueError(f"Invalid value. Expected 2. Got {data}.")
    return data


def parse_filters(data: list[dict[str, JSON]] | None) -> list[dict[str, JSON]] | None:
    """Parse the ``filters`` field. Currently a pass-through placeholder; no validation is performed."""
    return data


def parse_compressor(data: dict[str, JSON] | None) -> dict[str, JSON] | None:
    """Parse the ``compressor`` field. Currently a pass-through placeholder; no validation is performed."""
    return data


def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata:
    """
    Check that the metadata document is internally consistent — ``chunks``
    and ``shape`` must have the same rank — and return it unchanged.
    """
    n_chunks = len(data.chunks)
    n_shape = len(data.shape)
    if n_chunks == n_shape:
        return data
    raise ValueError(
        f"The `shape` and `chunks` attributes must have the same length. "
        f"`chunks` has length {n_chunks}, but `shape` has length {n_shape}."
    )


def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any:
    """
    Parse a potential fill value into a value that is compatible with the
    provided dtype.

    Parameters
    ----------
    fill_value : Any
        A potential fill value.
    dtype : np.dtype[Any]
        A numpy dtype.

    Returns
    -------
    An instance of ``dtype``, or ``None``, or any python object (in the case
    of an object dtype).
    """
    # None and object dtypes pass through untouched: there is nothing to coerce.
    if fill_value is None or dtype.hasobject:
        return fill_value

    # Zero is representable in every non-void dtype, including structured ones;
    # np.zeros gives the canonical zero scalar across numpy versions.
    if not isinstance(fill_value, np.void) and fill_value == 0:
        return np.zeros((), dtype=dtype)[()]

    # special case unicode because of encoding issues on Windows if passed through numpy
    # https://github.com/alimanfoo/zarr/pull/172#issuecomment-343782713
    if dtype.kind == "U":
        if isinstance(fill_value, str):
            return fill_value
        raise ValueError(
            f"fill_value {fill_value!r} is not valid for dtype {dtype}; must be a unicode string"
        )

    try:
        if isinstance(fill_value, bytes) and dtype.kind == "V":
            # special case for numpy 1.14 compatibility
            return np.array(fill_value, dtype=dtype.str).view(dtype)[()]
        return np.array(fill_value, dtype=dtype)[()]
    except Exception as e:
        raise ValueError(f"Fill_value {fill_value} is not valid for dtype {dtype}.") from e
Loading

0 comments on commit 9cbd5df

Please sign in to comment.