From 697846af60274016f84ed76a3ba81585c5f3985a Mon Sep 17 00:00:00 2001 From: David Stansby Date: Tue, 9 Sep 2025 14:21:38 +0100 Subject: [PATCH 1/5] Add pydantic classes for codecs --- docs/api/v3.md | 2 +- docs/api/v3/codecs.md | 1 + mkdocs.yaml | 7 +- pyproject.toml | 3 +- src/pydantic_zarr/{v3.py => v3/__init__.py} | 3 +- src/pydantic_zarr/v3/codecs.py | 152 ++++++++++++++++++++ test.py | 5 + tests/test_pydantic_zarr/test_v3.py | 7 +- 8 files changed, 173 insertions(+), 7 deletions(-) create mode 100644 docs/api/v3/codecs.md rename src/pydantic_zarr/{v3.py => v3/__init__.py} (99%) create mode 100644 src/pydantic_zarr/v3/codecs.py create mode 100644 test.py diff --git a/docs/api/v3.md b/docs/api/v3.md index 4934fa4..04cb984 100644 --- a/docs/api/v3.md +++ b/docs/api/v3.md @@ -1 +1 @@ -::: pydantic_zarr.v3 \ No newline at end of file +::: pydantic_zarr.v3 diff --git a/docs/api/v3/codecs.md b/docs/api/v3/codecs.md new file mode 100644 index 0000000..b74ab60 --- /dev/null +++ b/docs/api/v3/codecs.md @@ -0,0 +1 @@ +::: pydantic_zarr.v3.codecs diff --git a/mkdocs.yaml b/mkdocs.yaml index ddf31c8..3feb6d4 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -33,7 +33,10 @@ nav: - API: - core: api/core.md - v2: api/v2.md - - v3: api/v3.md + - v3: + - Core: api/v3.md + - Codecs: api/v3/codecs.md + plugins: - mkdocstrings: @@ -47,6 +50,8 @@ plugins: docstring_options: ignore_init_summary: true merge_init_into_class: true + extensions: + - griffe_pydantic: markdown_extensions: - pymdownx.highlight: diff --git a/pyproject.toml b/pyproject.toml index e9fd4b8..ac54387 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,11 +31,12 @@ Source = "https://github.com/zarr-developers/pydantic-zarr" test = ["coverage", "pytest<8.4", "pytest-cov", "pytest-examples"] docs = [ + "griffe-pydantic", "mkdocs-material", "mkdocstrings[python]", "pytest-examples", "pydantic==2.11", - "zarr>=3.1.0" + "zarr>=3.1.0", ] [tool.hatch] diff --git a/src/pydantic_zarr/v3.py b/src/pydantic_zarr/v3/__init__.py similarity index 99% rename from src/pydantic_zarr/v3.py rename to src/pydantic_zarr/v3/__init__.py index da40886..9c4f809 100644 --- a/src/pydantic_zarr/v3.py +++ b/src/pydantic_zarr/v3/__init__.py @@ -35,6 +35,7 @@ model_like, tuplify_json, ) +from pydantic_zarr.v3.codecs import Codec if TYPE_CHECKING: from collections.abc import Sequence @@ -95,7 +96,7 @@ class AnyNamedConfig(NamedConfig[str, Mapping[str, object]]): """ -CodecLike = str | AnyNamedConfig +CodecLike = str | AnyNamedConfig | Codec """A type modelling the permissible declarations for codecs""" diff --git a/src/pydantic_zarr/v3/codecs.py b/src/pydantic_zarr/v3/codecs.py new file mode 100644 index 0000000..9391406 --- /dev/null +++ b/src/pydantic_zarr/v3/codecs.py @@ -0,0 +1,152 @@ +from typing import Any, Literal + +from pydantic import BaseModel, Field, PositiveInt, PrivateAttr, field_validator, model_serializer + + +class Codec(BaseModel): + """ + Base class for codec models. + """ + + name: str + configuration: BaseModel + _codec_type: Literal["array-array", "array-bytes", "bytes-bytes"] = PrivateAttr() + + +class BloscConfiguration(BaseModel): + """ + Configuration for blosc codec. + """ + + cname: Literal["lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib"] + clevel: Literal[0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + shuffle: Literal["noshuffle", "shuffle", "bitshuffle"] + typesize: PositiveInt + blocksize: int + + +class Blosc(Codec): + """ + Blosc codec. + """ + + name: Literal["blosc"] = "blosc" + configuration: BloscConfiguration + _codec_type: Literal["bytes-bytes"] = "bytes-bytes" + + +class BytesConfig(BaseModel): + """ + Configuration for bytes codec. + """ + + endian: Literal["big", "little"] | None = None + + @model_serializer + def ser_model(self) -> dict[str, Any]: + if self.endian is None: + return {} + else: + return super().model_dump() + + +class Bytes(Codec): + """ + Bytes codec. + """ + + name: Literal["bytes"] = "bytes" + configuration: BytesConfig + _codec_type: Literal["array-bytes"] = "array-bytes" + + +class CRC32CConfig(BaseModel): + """ + Configuration for crc32c codec. + """ + + +class CRC32C(Codec): + """ + CRC32C codec. + """ + + name: Literal["crc32c"] = "crc32c" + configuration: CRC32CConfig = Field(default=CRC32CConfig()) + + _codec_type: Literal["bytes-bytes"] = "bytes-bytes" + + +class GzipConfig(BaseModel): + """ + Configuration for gzip codec. + """ + + level: Literal[0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + + +class Gzip(Codec): + """ + Gzip codec. + """ + + name: Literal["gzip"] = "gzip" + configuration: GzipConfig + + _codec_type: Literal["bytes-bytes"] = "bytes-bytes" + + +class ShardingConfig(BaseModel): + """ + Configuration for sharding codec. + """ + + chunk_shape: tuple[int, ...] + codecs: tuple[Codec, ...] + # Default is recommended in the specification + index_codecs: tuple[Codec, ...] = Field( + default=(Bytes(configuration=BytesConfig(endian="little")), CRC32C()) + ) + index_location: Literal["start", "end"] = "end" + + @field_validator("codecs", "index_codecs") + @classmethod + def check_single_array_bytes_codec(cls, codecs: tuple[Codec, ...]) -> tuple[Codec, ...]: + if sum([(codec._codec_type == "array-bytes") for codec in codecs]) != 1: + raise ValueError("Codec list must contain exactly one array-bytes codec") + return codecs + + +class Sharding(Codec): + """ + Sharding codec. + """ + + name: Literal["sharding_indexed"] = "sharding_indexed" + configuration: ShardingConfig + _codec_type: Literal["bytes-bytes"] = "bytes-bytes" + + +class TransposeConfig(BaseModel): + """ + Configuration for transpose codec. + """ + + order: tuple[int, ...] + + @field_validator("order") + @classmethod + def check_order(cls, order: tuple[int, ...]) -> tuple[int, ...]: + if set(range(len(order))) != set(order): + raise ValueError("order must be a permutation of positive integers starting from 0") + return order + + +class Transpose(Codec): + """ + Transpose codec. + """ + + name: Literal["transpose"] = "transpose" + configuration: TransposeConfig + _codec_type: Literal["array-array"] = "array-array" diff --git a/test.py b/test.py new file mode 100644 index 0000000..9c5b0c2 --- /dev/null +++ b/test.py @@ -0,0 +1,5 @@ +import zarr + +from pydantic_zarr.v3 import ArraySpec + +ArraySpec.from_array(zarr.empty((1, 1, 1))) diff --git a/tests/test_pydantic_zarr/test_v3.py b/tests/test_pydantic_zarr/test_v3.py index 74f8bdd..ca6b223 100644 --- a/tests/test_pydantic_zarr/test_v3.py +++ b/tests/test_pydantic_zarr/test_v3.py @@ -22,6 +22,7 @@ RegularChunkingConfig, auto_codecs, ) +from pydantic_zarr.v3.codecs import Bytes, BytesConfig, Gzip, GzipConfig from .conftest import DTYPE_EXAMPLES_V3, DTypeExample @@ -31,14 +32,14 @@ def test_serialize_deserialize() -> None: group_attributes = {"group": True} - array_spec = ArraySpec( + array_spec: AnyArraySpec = ArraySpec( attributes=array_attributes, shape=[1000, 1000], dimension_names=["rows", "columns"], data_type="float64", chunk_grid=NamedConfig(name="regular", configuration={"chunk_shape": [1000, 100]}), chunk_key_encoding=NamedConfig(name="default", configuration={"separator": "/"}), - codecs=[NamedConfig(name="GZip", configuration={"level": 1})], + codecs=[Gzip(configuration=GzipConfig(level=1))], fill_value="NaN", storage_transformers=[], ) @@ -205,7 +206,7 @@ def test_from_flat() -> None: @staticmethod def test_from_zarr_depth() -> None: - codecs = ({"name": "bytes", "configuration": {}},) + codecs = (Bytes(configuration=BytesConfig()),) tree: dict[str, AnyGroupSpec | AnyArraySpec] = { "": GroupSpec(members=None, attributes={"level": 0, "type": "group"}), "/1": GroupSpec(members=None, attributes={"level": 1, "type": "group"}), From 325b460637635ec297cc42936c67ffe2fe563a5e Mon Sep 17 00:00:00 2001 From: David Stansby Date: Thu, 2 Oct 2025 15:44:43 +0100 Subject: [PATCH 2/5] Watch src for docs changes --- mkdocs.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mkdocs.yaml b/mkdocs.yaml index 3feb6d4..3cdeb9e 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -37,6 +37,8 @@ nav: - Core: api/v3.md - Codecs: api/v3/codecs.md +watch: + - src plugins: - mkdocstrings: From c0ff9e4a29aa170d8a0d46f3101821c00e33eb73 Mon Sep 17 00:00:00 2001 From: David Stansby Date: Thu, 2 Oct 2025 15:45:22 +0100 Subject: [PATCH 3/5] Remove _codec_type field --- src/pydantic_zarr/v3/codecs.py | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/src/pydantic_zarr/v3/codecs.py b/src/pydantic_zarr/v3/codecs.py index 9391406..24b5960 100644 --- a/src/pydantic_zarr/v3/codecs.py +++ b/src/pydantic_zarr/v3/codecs.py @@ -1,6 +1,9 @@ +""" +Models for Zarr v3 codecs.""" + from typing import Any, Literal -from pydantic import BaseModel, Field, PositiveInt, PrivateAttr, field_validator, model_serializer +from pydantic import BaseModel, Field, PositiveInt, field_validator, model_serializer class Codec(BaseModel): @@ -10,7 +13,6 @@ class Codec(BaseModel): name: str configuration: BaseModel - _codec_type: Literal["array-array", "array-bytes", "bytes-bytes"] = PrivateAttr() class BloscConfiguration(BaseModel): @@ -32,7 +34,6 @@ class Blosc(Codec): name: Literal["blosc"] = "blosc" configuration: BloscConfiguration - _codec_type: Literal["bytes-bytes"] = "bytes-bytes" class BytesConfig(BaseModel): @@ -57,7 +58,6 @@ class Bytes(Codec): name: Literal["bytes"] = "bytes" configuration: BytesConfig - _codec_type: Literal["array-bytes"] = "array-bytes" class CRC32CConfig(BaseModel): @@ -74,8 +74,6 @@ class CRC32C(Codec): name: Literal["crc32c"] = "crc32c" configuration: CRC32CConfig = Field(default=CRC32CConfig()) - _codec_type: Literal["bytes-bytes"] = "bytes-bytes" - class GzipConfig(BaseModel): """ @@ -93,8 +91,6 @@ class Gzip(Codec): name: Literal["gzip"] = "gzip" configuration: GzipConfig - _codec_type: Literal["bytes-bytes"] = "bytes-bytes" - class ShardingConfig(BaseModel): """ @@ -109,13 +105,6 @@ class ShardingConfig(BaseModel): ) index_location: Literal["start", "end"] = "end" - @field_validator("codecs", "index_codecs") - @classmethod - def check_single_array_bytes_codec(cls, codecs: tuple[Codec, ...]) -> tuple[Codec, ...]: - if sum([(codec._codec_type == "array-bytes") for codec in codecs]) != 1: - raise ValueError("Codec list must contain exactly one array-bytes codec") - return codecs - class Sharding(Codec): """ @@ -124,7 +113,6 @@ class Sharding(Codec): name: Literal["sharding_indexed"] = "sharding_indexed" configuration: ShardingConfig - _codec_type: Literal["bytes-bytes"] = "bytes-bytes" class TransposeConfig(BaseModel): @@ -149,4 +137,3 @@ class Transpose(Codec): name: Literal["transpose"] = "transpose" configuration: TransposeConfig - _codec_type: Literal["array-array"] = "array-array" From a612a1cec80d10b552472be58a8d103dd3778a42 Mon Sep 17 00:00:00 2001 From: David Stansby Date: Thu, 2 Oct 2025 15:45:32 +0100 Subject: [PATCH 4/5] Require all codecs to be Codecs --- src/pydantic_zarr/v3/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pydantic_zarr/v3/__init__.py b/src/pydantic_zarr/v3/__init__.py index 9c4f809..1e2ef8b 100644 --- a/src/pydantic_zarr/v3/__init__.py +++ b/src/pydantic_zarr/v3/__init__.py @@ -96,7 +96,7 @@ class AnyNamedConfig(NamedConfig[str, Mapping[str, object]]): """ -CodecLike = str | AnyNamedConfig | Codec +CodecLike = str | Codec """A type modelling the permissible declarations for codecs""" From 539ae579c0f7b0f24c4e0559e8d34f243c4a200c Mon Sep 17 00:00:00 2001 From: David Stansby Date: Thu, 2 Oct 2025 15:54:10 +0100 Subject: [PATCH 5/5] Delete test file --- test.py | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 test.py diff --git a/test.py b/test.py deleted file mode 100644 index 9c5b0c2..0000000 --- a/test.py +++ /dev/null @@ -1,5 +0,0 @@ -import zarr - -from pydantic_zarr.v3 import ArraySpec - -ArraySpec.from_array(zarr.empty((1, 1, 1)))