Skip to content

Commit

Permalink
Add zarr codec with test (#69)
Browse files Browse the repository at this point in the history
  • Loading branch information
mpiannucci authored Dec 1, 2024
1 parent 542bb93 commit 441c2a4
Show file tree
Hide file tree
Showing 6 changed files with 123 additions and 1 deletion.
4 changes: 3 additions & 1 deletion gribberish/tests/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ fn read_simple() {
assert!(message.is_some());
let message = message.unwrap();

println!("{:?}", message.grid_dimensions().unwrap());

let start = Instant::now();
let data = message.data();
let end = Instant::now();
Expand Down Expand Up @@ -128,4 +130,4 @@ fn read_complex_zerod() {
assert!(data.is_ok());
let data = data.unwrap();
println!("spatial complex zero unpacking data() took {:?} for {} data points", end.duration_since(start), data.len());
}
}
12 changes: 12 additions & 0 deletions python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,12 @@ With optional `kerchunk` support:
pip install "gribberish[kerchunk]"
```

With optional `zarr` support:

```bash
pip install "gribberish[zarr]"
```

### Manually

With pip:
Expand Down Expand Up @@ -74,3 +80,9 @@ This package also supports building virtual datasets with [`kerchunk`](https://g

- [`kerchunk_gefs_wave.ipynb`](./examples/kerchunk_gefs_wave.ipynb) shows how to build a single virtual dataset from an entire GEFS Wave Ensemble model run (30 ensemble members, 384 hour time horizon)
- [`kerchunk_hrrr_subhourly.ipynb`](./examples/kerchunk_hrrr_subhourly.ipynb) shows how to build a single virtual dataset from an entire HRRR subhourly surface model run. This results in a virtual dataset with data at 15 minute time intervals over the following 18 hours.

### `zarr`

This package also supports use with `zarr` for reading unmodified GRIB2 messages (arrays) as chunks using the `gribberish.zarr.GribberishCodec` codec. This usually will not be used directly, but with [`VirtualiZarr`](https://virtualizarr.readthedocs.io/en/latest/) or [`kerchunk`](https://github.com/fsspec/kerchunk)

Examples to come soon.
Empty file.
65 changes: 65 additions & 0 deletions python/gribberish/zarr/codec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from dataclasses import dataclass
from typing import Self

from gribberish import parse_grib_array, parse_grib_message_metadata
from zarr.abc.codec import ArrayBytesCodec
from zarr.core.array_spec import ArraySpec
from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer
from zarr.core.common import JSON, parse_named_configuration
from zarr.registry import register_codec


@dataclass(frozen=True)
class GribberishCodec(ArrayBytesCodec):
"""Transform GRIB2 bytes into zarr arrays using gribberish library"""

var: str | None

def __init__(self, var: str | None) -> Self:
object.__setattr__(self, "var", var)

@classmethod
def from_dict(cls, data: dict[str, JSON]) -> Self:
_, configuration_parsed = parse_named_configuration(
data, "gribberish", require_configuration=False
)
configuration_parsed = configuration_parsed or {}
return cls(**configuration_parsed) # type: ignore[arg-type]

def to_dict(self) -> dict[str, JSON]:
if not self.var:
return {"name": "gribberish"}
else:
return {"name": "gribberish", "configuration": {"var": self.var}}

async def _decode_single(
self,
chunk_data: Buffer,
chunk_spec: ArraySpec,
) -> NDBuffer:
assert isinstance(chunk_data, Buffer)
chunk_bytes = chunk_data.to_bytes()

if self.var == 'latitude' or self.var == 'longitude':
message = parse_grib_message_metadata(chunk_bytes, 0)
lat, lng = message.latlng()
data: NDArrayLike = lat if self.var == 'latitude' else lng
else:
data: NDArrayLike = parse_grib_array(chunk_bytes, 0)

if chunk_spec.dtype != data.dtype:
data = data.astype(chunk_spec.dtype)
if data.shape != chunk_spec.shape:
data = data.reshape(chunk_spec.shape)

return data

async def _encode_single(
self,
chunk_data: NDBuffer,
chunk_spec: ArraySpec,
) -> Buffer | None:
# This is a read-only codec
raise NotImplementedError

register_codec("gribberish", GribberishCodec)
9 changes: 9 additions & 0 deletions python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ dependencies = ["numpy >= 2"]
[project.optional-dependencies]
xarray = ["xarray", "fsspec"]
kerchunk = ["kerchunk", "zarr", "numcodecs", "fsspec"]
zarr = ["zarr==3.0.0b2"]
dev = [
"matplotlib",
"cf_xarray",
Expand All @@ -31,6 +32,8 @@ dev = [
"s3fs",
"cfgrib",
"pyproj",
"pytest",
"pytest-asyncio",
]

[project.entry-points."xarray.backends"]
Expand All @@ -39,6 +42,12 @@ gribberish = "gribberish.gribberish_backend:GribberishBackend"
[project.entry-points."numcodecs.codecs"]
gribberish = "gribberish.kerchunk:GribberishCodec"

[project.entry-points."zarr.codecs"]
gribberish = "gribberish.zarr:GribberishCodec"

[tool.maturin]
features = ["pyo3/extension-module"]
module-name = "gribberish._gribberish_python"

[tool.pytest.ini_options]
asyncio_mode = "auto"
34 changes: 34 additions & 0 deletions python/tests/test_zarr_codec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import pytest

import numpy as np

zarr = pytest.importorskip("zarr")


async def test_decode_data_var_gribberish():
from gribberish.zarr.codec import GribberishCodec
from zarr.core.array_spec import ArraySpec
from zarr.core.buffer import default_buffer_prototype

with open("./../gribberish/tests/data/hrrr.t06z.wrfsfcf01-UGRD.grib2", "rb") as f:
raw_data = f.read()

buffer = default_buffer_prototype().buffer.from_bytes(raw_data)
codec = GribberishCodec(var="UGRD")
data = await codec._decode_single(
buffer,
ArraySpec(
shape=(1059, 1799),
dtype="float64",
fill_value=0,
order="C",
prototype=np.ndarray,
),
)

assert data.shape == (1059, 1799)
assert data.dtype == np.dtype("float64")
(
np.testing.assert_almost_equal(data[0][1000], -4.46501350402832),
"Data not decoded correctly",
)

0 comments on commit 441c2a4

Please sign in to comment.