Add Zarr compatibility functions #478

Open
wants to merge 14 commits into base: main
1 change: 1 addition & 0 deletions .github/workflows/tests.yml
@@ -23,4 +23,5 @@ jobs:
- name: Test with pytest
shell: bash -l {0}
run: |
export ZARR_V3_EXPERIMENTAL_API=1
pytest -v --cov
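The exported variable gates zarr-python 2.x's experimental v3 store API, which the zarr_version=3 code paths below rely on. A minimal sketch (not part of this diff) of checking the same flag from Python:

import os

def v3_api_enabled() -> bool:
    # zarr-python 2.x only exposes its v3 stores when this variable is set
    return os.environ.get("ZARR_V3_EXPERIMENTAL_API") == "1"

if not v3_api_enabled():
    raise RuntimeError("set ZARR_V3_EXPERIMENTAL_API=1 before using zarr v3 code paths")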
11 changes: 6 additions & 5 deletions kerchunk/fits.py
@@ -3,12 +3,10 @@
import numcodecs
import numcodecs.abc
import numpy as np
import zarr

from fsspec.implementations.reference import LazyReferenceMapper


from kerchunk.utils import class_factory
from kerchunk.utils import class_factory, _zarr_open
from kerchunk.codecs import AsciiTableCodec, VarArrCodec

try:
@@ -40,6 +38,7 @@ def process_file(
inline_threshold=100,
primary_attr_to_group=False,
out=None,
zarr_version=None,
):
"""
Create JSON references for a single FITS file as a zarr group
@@ -62,7 +61,9 @@
This allows you to supply an fsspec.implementations.reference.LazyReferenceMapper
to write out parquet as the references get filled, or some other dictionary-like class
to customise how references get stored

zarr_version: int
The zarr spec version to target (currently 2 or 3). If None, the zarr
library's default version is used.

Returns
-------
@@ -72,7 +73,7 @@

storage_options = storage_options or {}
out = out or {}
g = zarr.open(out)
g = _zarr_open(out, zarr_version=zarr_version)

with fsspec.open(url, mode="rb", **storage_options) as f:
infile = fits.open(f, do_not_scale_image_data=True)
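The kerchunk/utils.py hunk defining _zarr_open is not shown above; a minimal sketch of what such a helper could look like, assuming zarr.open accepts the zarr_version keyword (zarr-python >= 2.13):

import zarr

def _zarr_open(store, mode="a", zarr_version=None):
    # Hypothetical sketch: only pass zarr_version through when the caller
    # requested a specific spec version, otherwise keep zarr's default.
    if zarr_version is None:
        return zarr.open(store, mode=mode)
    return zarr.open(store, mode=mode, zarr_version=zarr_version)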
11 changes: 7 additions & 4 deletions kerchunk/grib2.py
@@ -20,11 +20,10 @@
)

import fsspec
import zarr
import xarray
import numpy as np

from kerchunk.utils import class_factory, _encode_for_JSON
from kerchunk.utils import class_factory, _encode_for_JSON, _zarr_init_group_and_store
from kerchunk.codecs import GRIBCodec
from kerchunk.combine import MultiZarrToZarr, drop

@@ -113,6 +112,7 @@ def scan_grib(
inline_threshold=100,
skip=0,
filter={},
zarr_version=None,
):
"""
Generate references for a GRIB2 file
@@ -134,6 +134,9 @@
the exact value or is in the given set, are processed.
E.g., the cf-style filter ``{'typeOfLevel': 'heightAboveGround', 'level': 2}``
only keeps messages where heightAboveGround==2.
zarr_version: int
The zarr spec version to target (currently 2 or 3). If None, the zarr
library's default version is used.

Returns
-------
@@ -192,7 +195,7 @@ def scan_grib(
if good is False:
continue

z = zarr.open_group(store)
z, store = _zarr_init_group_and_store(store, zarr_version=zarr_version)
global_attrs = {
f"GRIB_{k}": m[k]
for k in cfgrib.dataset.GLOBAL_ATTRIBUTES_KEYS
@@ -399,7 +402,7 @@ def grib_tree(

# TODO allow passing a LazyReferenceMapper as output?
zarr_store = {}
zroot = zarr.open_group(store=zarr_store)
zroot, zarr_store = _zarr_init_group_and_store(zarr_store, overwrite=False)

aggregations: Dict[str, List] = defaultdict(list)
aggregation_dims: Dict[str, Set] = defaultdict(set)
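Likewise, _zarr_init_group_and_store is defined in kerchunk/utils.py outside this hunk; a hedged sketch of its likely shape, returning both the root group and the store so callers can keep writing reference entries into the same mapping:

import zarr

def _zarr_init_group_and_store(store=None, overwrite=True, zarr_version=None):
    # Hypothetical sketch: reuse (or create) a dict-like store and open the
    # root group against it, targeting the requested spec version if any.
    store = {} if store is None else store
    kwargs = {} if zarr_version is None else {"zarr_version": zarr_version}
    group = zarr.group(store=store, overwrite=overwrite, **kwargs)
    return group, store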
21 changes: 8 additions & 13 deletions kerchunk/hdf.py
@@ -10,7 +10,7 @@
import numcodecs

from .codecs import FillStringsCodec
from .utils import _encode_for_JSON
from .utils import _encode_for_JSON, encode_fill_value, _zarr_init_group_and_store

try:
import h5py
@@ -21,12 +21,6 @@
"for more details."
)

try:
from zarr.meta import encode_fill_value
except ModuleNotFoundError:
# https://github.com/zarr-developers/zarr-python/issues/2021
from zarr.v2.meta import encode_fill_value

lggr = logging.getLogger("h5-to-zarr")
_HIDDEN_ATTRS = { # from h5netcdf.attrs
"REFERENCE_LIST",
@@ -71,10 +65,10 @@ class SingleHdf5ToZarr:
encode: save the ID-to-value mapping in a codec, to produce the real values at read
time; requires this library to be available. Can be efficient storage where there
are few unique values.
out: dict-like or None
out: dict-like, StoreLike, or None
This allows you to supply an fsspec.implementations.reference.LazyReferenceMapper
to write out parquet as the references get filled, or some other dictionary-like class
to customise how references get stored
or a ZarrV3 StoreLike to write out parquet as the references get filled,
or some other dictionary-like class to customise how references get stored
"""

def __init__(
@@ -87,6 +81,7 @@ def __init__(
error="warn",
vlen_encode="embed",
out=None,
zarr_version=None,
):

# Open HDF5 file in read mode...
@@ -111,9 +106,9 @@ def __init__(
if vlen_encode not in ["embed", "null", "leave", "encode"]:
raise NotImplementedError
self.vlen = vlen_encode
self.store = out or {}
self._zroot = zarr.group(store=self.store, overwrite=True)

self._zroot, self.store = _zarr_init_group_and_store(
out or {}, zarr_version=zarr_version or 2
)
self._uri = url
self.error = error
lggr.debug(f"HDF5 file URI: {self._uri}")
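Illustrative usage of the new zarr_version argument on SingleHdf5ToZarr; the file name is a placeholder, not part of the change:

import fsspec
import zarr
from kerchunk.hdf import SingleHdf5ToZarr

with fsspec.open("example.h5", mode="rb") as f:
    # build references targeting the zarr v2 spec (the default applied above)
    refs = SingleHdf5ToZarr(f, url="example.h5", zarr_version=2).translate()

m = fsspec.get_mapper("reference://", fo=refs)
z = zarr.open(m, zarr_version=2)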
59 changes: 45 additions & 14 deletions kerchunk/netCDF3.py
@@ -5,7 +5,11 @@
from fsspec.implementations.reference import LazyReferenceMapper
import fsspec

from kerchunk.utils import _encode_for_JSON, inline_array
from kerchunk.utils import (
_encode_for_JSON,
inline_array,
_zarr_open,
)

try:
from scipy.io._netcdf import ZERO, NC_VARIABLE, netcdf_file, netcdf_variable
@@ -31,6 +35,7 @@ def __init__(
inline_threshold=100,
max_chunk_size=0,
out=None,
zarr_version=None,
**kwargs,
):
"""
@@ -52,6 +57,9 @@
This allows you to supply an fsspec.implementations.reference.LazyReferenceMapper
to write out parquet as the references get filled, or some other dictionary-like class
to customise how references get stored
zarr_version: int
The zarr spec version to target (currently 2 or 3). If None, the zarr
library's default version is used.
args, kwargs: passed to scipy superclass ``scipy.io.netcdf.netcdf_file``
"""
assert kwargs.pop("mmap", False) is False
@@ -63,6 +71,7 @@ def __init__(
self.chunks = {}
self.threshold = inline_threshold
self.max_chunk_size = max_chunk_size
self.zarr_version = zarr_version
self.out = out or {}
self.storage_options = storage_options
self.fp = fsspec.open(filename, **(storage_options or {})).open()
@@ -164,10 +173,9 @@ def translate(self):
Parameters
----------
"""
import zarr

out = self.out
z = zarr.open(out, mode="w")
zroot = _zarr_open(out, mode="w")
for dim, var in self.variables.items():
if dim in self.chunks:
shape = self.chunks[dim][-1]
@@ -191,18 +199,25 @@ def translate(self):
fill = float(fill)
if fill is not None and var.data.dtype.kind == "i":
fill = int(fill)
arr = z.create_dataset(
arr = zroot.create_dataset(
name=dim,
shape=shape,
dtype=var.data.dtype,
fill_value=fill,
chunks=shape,
compression=None,
overwrite=True,
)
part = ".".join(["0"] * len(shape)) or "0"
k = f"{dim}/{part}"
out[k] = [
self.filename,

if self.zarr_version == 3:
part = "/".join(["0"] * len(shape)) or "0"
key = f"data/root/{dim}/c{part}"
else:
part = ".".join(["0"] * len(shape)) or "0"

key = f"{dim}/{part}"

self.out[key] = [self.filename] + [
int(self.chunks[dim][0]),
int(self.chunks[dim][1]),
]
@@ -245,13 +260,14 @@ def translate(self):
fill = float(fill)
if fill is not None and base.kind == "i":
fill = int(fill)
arr = z.create_dataset(
arr = zroot.create_dataset(
name=name,
shape=shape,
dtype=base,
fill_value=fill,
chunks=(1,) + dtype.shape,
compression=None,
overwrite=True,
)
arr.attrs.update(
{
@@ -266,18 +282,33 @@ def translate(self):

arr.attrs["_ARRAY_DIMENSIONS"] = list(var.dimensions)

suffix = (
("." + ".".join("0" for _ in dtype.shape)) if dtype.shape else ""
)
if self.zarr_version == 3:
suffix = (
("/" + "/".join("0" for _ in dtype.shape))
if dtype.shape
else ""
)
else:
suffix = (
("." + ".".join("0" for _ in dtype.shape))
if dtype.shape
else ""
)

for i in range(outer_shape):
out[f"{name}/{i}{suffix}"] = [
if self.zarr_version == 3:
key = f"data/root/{name}/c{i}{suffix}"
else:
key = f"{name}/{i}{suffix}"

self.out[key] = [
self.filename,
int(offset + i * dt.itemsize),
int(dtype.itemsize),
]

offset += dtype.itemsize
z.attrs.update(
zroot.attrs.update(
{
k: v.decode() if isinstance(v, bytes) else str(v)
for k, v in self._attributes.items()
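The two branches above differ only in how chunk keys are spelled; for a 2-D variable named "temp" (the name is illustrative) and chunk index (0, 0) the keys come out roughly as:

name, idx = "temp", (0, 0)
v2_key = f"{name}/" + ".".join(str(i) for i in idx)             # "temp/0.0"
v3_key = f"data/root/{name}/c" + "/".join(str(i) for i in idx)  # "data/root/temp/c0/0"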
31 changes: 18 additions & 13 deletions kerchunk/tests/test_fits.py
@@ -13,12 +13,13 @@
var = os.path.join(testdir, "variable_length_table.fits")


def test_ascii_table():
@pytest.mark.parametrize("zarr_version", [2])
def test_ascii_table(zarr_version):
# this one directly hits a remote server - should cache?
url = "https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits"
out = kerchunk.fits.process_file(url, extension=1)
out = kerchunk.fits.process_file(url, extension=1, zarr_version=zarr_version)
m = fsspec.get_mapper("reference://", fo=out, remote_protocol="https")
g = zarr.open(m)
g = zarr.open(m, zarr_version=zarr_version)
arr = g["u5780205r_cvt.c0h.tab"][:]
with fsspec.open(
"https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits"
@@ -28,10 +29,11 @@ def test_ascii_table():
assert list(hdu.data.astype(arr.dtype) == arr) == [True, True, True, True]


def test_binary_table():
out = kerchunk.fits.process_file(btable, extension=1)
@pytest.mark.parametrize("zarr_version", [2, 3])
def test_binary_table(zarr_version):
out = kerchunk.fits.process_file(btable, extension=1, zarr_version=zarr_version)
m = fsspec.get_mapper("reference://", fo=out)
z = zarr.open(m)
z = zarr.open(m, zarr_version=zarr_version)
arr = z["1"]
with open(btable, "rb") as f:
hdul = fits.open(f)
@@ -45,38 +47,41 @@ def test_binary_table():
).all()  # strings come out as bytes


def test_cube():
out = kerchunk.fits.process_file(range_im)
@pytest.mark.parametrize("zarr_version", [2])
def test_cube(zarr_version):
out = kerchunk.fits.process_file(range_im, zarr_version=zarr_version)
m = fsspec.get_mapper("reference://", fo=out)
z = zarr.open(m)
z = zarr.open(m, zarr_version=zarr_version)
arr = z["PRIMARY"]
with open(range_im, "rb") as f:
hdul = fits.open(f)
expected = hdul[0].data
assert (arr[:] == expected).all()


def test_with_class():
@pytest.mark.parametrize("zarr_version", [2])
def test_with_class(zarr_version):
ftz = kerchunk.fits.FitsToZarr(range_im)
out = ftz.translate()
assert "fits" in repr(ftz)
m = fsspec.get_mapper("reference://", fo=out)
z = zarr.open(m)
z = zarr.open(m, zarr_version=zarr_version)
arr = z["PRIMARY"]
with open(range_im, "rb") as f:
hdul = fits.open(f)
expected = hdul[0].data
assert (arr[:] == expected).all()


def test_var():
@pytest.mark.parametrize("zarr_version", [2])
def test_var(zarr_version):
data = fits.open(var)[1].data
expected = [_.tolist() for _ in data["var"]]

ftz = kerchunk.fits.FitsToZarr(var)
out = ftz.translate()
m = fsspec.get_mapper("reference://", fo=out)
z = zarr.open(m)
z = zarr.open(m, zarr_version=zarr_version)
arr = z["1"]
vars = [_.tolist() for _ in arr["var"]]

11 changes: 8 additions & 3 deletions kerchunk/tests/test_grib.py
@@ -21,14 +21,19 @@
here = os.path.dirname(__file__)


def test_one():
@pytest.mark.parametrize("zarr_version", [2])
def test_one(zarr_version):
# from https://dd.weather.gc.ca/model_gem_regional/10km/grib2/00/000
fn = os.path.join(here, "CMC_reg_DEPR_ISBL_10_ps10km_2022072000_P000.grib2")
out = scan_grib(fn)
out = scan_grib(fn, zarr_version=zarr_version)
ds = xr.open_dataset(
"reference://",
engine="zarr",
backend_kwargs={"consolidated": False, "storage_options": {"fo": out[0]}},
backend_kwargs={
"consolidated": False,
"zarr_version": zarr_version,
"storage_options": {"fo": out[0]},
},
)

assert ds.attrs["GRIB_centre"] == "cwao"