From 441c2a4982045cac2ff1badbd1e78e18f347337c Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Sun, 1 Dec 2024 15:07:39 -0500 Subject: [PATCH] Add zarr codec with test (#69) --- gribberish/tests/read.rs | 4 +- python/README.md | 12 ++++++ python/gribberish/zarr/__init__.py | 0 python/gribberish/zarr/codec.py | 65 ++++++++++++++++++++++++++++++ python/pyproject.toml | 9 +++++ python/tests/test_zarr_codec.py | 34 ++++++++++++++++ 6 files changed, 123 insertions(+), 1 deletion(-) create mode 100644 python/gribberish/zarr/__init__.py create mode 100644 python/gribberish/zarr/codec.py create mode 100644 python/tests/test_zarr_codec.py diff --git a/gribberish/tests/read.rs b/gribberish/tests/read.rs index 5d826f5..47647e9 100644 --- a/gribberish/tests/read.rs +++ b/gribberish/tests/read.rs @@ -65,6 +65,8 @@ fn read_simple() { assert!(message.is_some()); let message = message.unwrap(); + println!("{:?}", message.grid_dimensions().unwrap()); + let start = Instant::now(); let data = message.data(); let end = Instant::now(); @@ -128,4 +130,4 @@ fn read_complex_zerod() { assert!(data.is_ok()); let data = data.unwrap(); println!("spatial complex zero unpacking data() took {:?} for {} data points", end.duration_since(start), data.len()); -} \ No newline at end of file +} diff --git a/python/README.md b/python/README.md index 12036b8..dfccc2b 100644 --- a/python/README.md +++ b/python/README.md @@ -30,6 +30,12 @@ With optional `kerchunk` support: pip install "gribberish[kerchunk]" ``` +With optional `zarr` support: + +```bash +pip install "gribberish[zarr]" +``` + ### Manually With pip: @@ -74,3 +80,9 @@ This package also supports building virtual datasets with [`kerchunk`](https://g - [`kerchunk_gefs_wave.ipynb`](./examples/kerchunk_gefs_wave.ipynb) shows how to build a single virtual dataset from an entire GEFS Wave Ensemble model run (30 ensemble members, 384 hour time horizon) - [`kerchunk_hrrr_subhourly.ipynb`](./examples/kerchunk_hrrr_subhourly.ipynb) shows how to build a single virtual dataset from an entire HRRR subhourly surface model run. This results in a virtual dataset with data at 15 minute time intervals over the following 18 hours. + +### `zarr` + +This package also supports use with `zarr` for reading unmodified GRIB2 messages (arrays) as chunks using the `gribberish.zarr.GribberishCodec` codec. This usually will not be used directly, but with [`VirtualiZarr`](https://virtualizarr.readthedocs.io/en/latest/) or [`kerchunk`](https://github.com/fsspec/kerchunk) + +Examples to come soon. \ No newline at end of file diff --git a/python/gribberish/zarr/__init__.py b/python/gribberish/zarr/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/gribberish/zarr/codec.py b/python/gribberish/zarr/codec.py new file mode 100644 index 0000000..d5c9197 --- /dev/null +++ b/python/gribberish/zarr/codec.py @@ -0,0 +1,65 @@ +from dataclasses import dataclass +from typing import Self + +from gribberish import parse_grib_array, parse_grib_message_metadata +from zarr.abc.codec import ArrayBytesCodec +from zarr.core.array_spec import ArraySpec +from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer +from zarr.core.common import JSON, parse_named_configuration +from zarr.registry import register_codec + + +@dataclass(frozen=True) +class GribberishCodec(ArrayBytesCodec): + """Transform GRIB2 bytes into zarr arrays using gribberish library""" + + var: str | None + + def __init__(self, var: str | None) -> Self: + object.__setattr__(self, "var", var) + + @classmethod + def from_dict(cls, data: dict[str, JSON]) -> Self: + _, configuration_parsed = parse_named_configuration( + data, "gribberish", require_configuration=False + ) + configuration_parsed = configuration_parsed or {} + return cls(**configuration_parsed) # type: ignore[arg-type] + + def to_dict(self) -> dict[str, JSON]: + if not self.var: + return {"name": "gribberish"} + else: + return {"name": "gribberish", "configuration": {"var": self.var}} + + async def _decode_single( + self, + chunk_data: Buffer, + chunk_spec: ArraySpec, + ) -> NDBuffer: + assert isinstance(chunk_data, Buffer) + chunk_bytes = chunk_data.to_bytes() + + if self.var == 'latitude' or self.var == 'longitude': + message = parse_grib_message_metadata(chunk_bytes, 0) + lat, lng = message.latlng() + data: NDArrayLike = lat if self.var == 'latitude' else lng + else: + data: NDArrayLike = parse_grib_array(chunk_bytes, 0) + + if chunk_spec.dtype != data.dtype: + data = data.astype(chunk_spec.dtype) + if data.shape != chunk_spec.shape: + data = data.reshape(chunk_spec.shape) + + return data + + async def _encode_single( + self, + chunk_data: NDBuffer, + chunk_spec: ArraySpec, + ) -> Buffer | None: + # This is a read-only codec + raise NotImplementedError + +register_codec("gribberish", GribberishCodec) diff --git a/python/pyproject.toml b/python/pyproject.toml index 8cb2538..55dfa68 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -17,6 +17,7 @@ dependencies = ["numpy >= 2"] [project.optional-dependencies] xarray = ["xarray", "fsspec"] kerchunk = ["kerchunk", "zarr", "numcodecs", "fsspec"] +zarr = ["zarr==3.0.0b2"] dev = [ "matplotlib", "cf_xarray", @@ -31,6 +32,8 @@ dev = [ "s3fs", "cfgrib", "pyproj", + "pytest", + "pytest-asyncio", ] [project.entry-points."xarray.backends"] @@ -39,6 +42,12 @@ gribberish = "gribberish.gribberish_backend:GribberishBackend" [project.entry-points."numcodecs.codecs"] gribberish = "gribberish.kerchunk:GribberishCodec" +[project.entry-points."zarr.codecs"] +gribberish = "gribberish.zarr:GribberishCodec" + [tool.maturin] features = ["pyo3/extension-module"] module-name = "gribberish._gribberish_python" + +[tool.pytest.ini_options] +asyncio_mode = "auto" diff --git a/python/tests/test_zarr_codec.py b/python/tests/test_zarr_codec.py new file mode 100644 index 0000000..2c87dec --- /dev/null +++ b/python/tests/test_zarr_codec.py @@ -0,0 +1,34 @@ +import pytest + +import numpy as np + +zarr = pytest.importorskip("zarr") + + +async def test_decode_data_var_gribberish(): + from gribberish.zarr.codec import GribberishCodec + from zarr.core.array_spec import ArraySpec + from zarr.core.buffer import default_buffer_prototype + + with open("./../gribberish/tests/data/hrrr.t06z.wrfsfcf01-UGRD.grib2", "rb") as f: + raw_data = f.read() + + buffer = default_buffer_prototype().buffer.from_bytes(raw_data) + codec = GribberishCodec(var="UGRD") + data = await codec._decode_single( + buffer, + ArraySpec( + shape=(1059, 1799), + dtype="float64", + fill_value=0, + order="C", + prototype=np.ndarray, + ), + ) + + assert data.shape == (1059, 1799) + assert data.dtype == np.dtype("float64") + ( + np.testing.assert_almost_equal(data[0][1000], -4.46501350402832), + "Data not decoded correctly", + )