Skip to content

Commit

Permalink
Merge pull request #349 from martindurant/zarr_archive
Browse files Browse the repository at this point in the history
working with zarr in zip
  • Loading branch information
martindurant authored Aug 25, 2023
2 parents a81f22a + fdd8dcc commit 8ebc8e3
Show file tree
Hide file tree
Showing 9 changed files with 85 additions and 9 deletions.
3 changes: 2 additions & 1 deletion ci/environment-py310.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ dependencies:
- zarr
- xarray
- h5netcdf
- h5py<3.9
- pandas
- cfgrib
- cftime
Expand All @@ -24,7 +25,7 @@ dependencies:
- black
- fastparquet
- pip
- pyopenssl=23.1.1
- pyopenssl
- tifffile
- netCDF4
- pip:
Expand Down
3 changes: 2 additions & 1 deletion ci/environment-py38.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ dependencies:
- zarr
- xarray
- h5netcdf
- h5py<3.9
- pandas
- cfgrib
- cftime
Expand All @@ -22,7 +23,7 @@ dependencies:
- python-blosc
- flake8
- fastparquet
- pyopenssl=23.1.1
- pyopenssl
- black
- pip
- tifffile
Expand Down
3 changes: 2 additions & 1 deletion ci/environment-py39.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ dependencies:
- zarr
- xarray
- h5netcdf
- h5py<3.9
- pandas
- cfgrib
- cftime
Expand All @@ -22,7 +23,7 @@ dependencies:
- python-blosc
- flake8
- fastparquet
- pyopenssl=23.1.1
- pyopenssl
- black
- pip
- tifffile
Expand Down
3 changes: 2 additions & 1 deletion kerchunk/netCDF3.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def __init__(
self.threshold = inline_threshold
self.max_chunk_size = max_chunk_size
self.out = {}
self.storage_options = storage_options
with fsspec.open(filename, **(storage_options or {})) as fp:
super().__init__(
fp, *args, mmap=False, mode="r", maskandscale=False, **kwargs
Expand Down Expand Up @@ -259,7 +260,7 @@ def translate(self):
)

if self.threshold > 0:
out = do_inline(out, self.threshold)
out = do_inline(out, self.threshold, remote_options=self.storage_options)
out = _encode_for_JSON(out)

return {"version": 1, "refs": out}
Expand Down
2 changes: 1 addition & 1 deletion kerchunk/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def test_subchunk_exact(m, chunks):

out = kerchunk.utils.subchunk(ref, "data", 5)
nchunk = 10 // chunks[0] * 5
assert list(ref) == [".zgroup", "data/.zarray"] + [
assert list(out) == [".zgroup", "data/.zarray"] + [
f"data/{_}.0" for _ in range(nchunk)
]

Expand Down
65 changes: 65 additions & 0 deletions kerchunk/tests/test_zarr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import xarray as xr
import pandas as pd
import pytest
import numpy as np

import kerchunk.zarr
import kerchunk.utils


@pytest.fixture(scope="module")
def ds():
ds = xr.Dataset(
{
"x": xr.DataArray(np.linspace(-np.pi, np.pi, 10), dims=["x"]),
"y": xr.DataArray(np.linspace(-np.pi / 2, np.pi / 2, 10), dims=["y"]),
"time": xr.DataArray(pd.date_range("2020", "2021"), dims=["time"]),
},
)
ds["temp"] = (
np.cos(ds.x)
* np.sin(ds.y)
* xr.ones_like(ds.time).astype("float")
* np.random.random(ds.time.shape)
)
return ds


@pytest.fixture
def zarr_in_zip(tmpdir, ds):
def _zip(file):
import os
import zipfile

filename = file + os.path.extsep + "zip"
with zipfile.ZipFile(
filename, "w", compression=zipfile.ZIP_STORED, allowZip64=True
) as fh:
for root, _, filenames in os.walk(file):
for each_filename in filenames:
each_filename = os.path.join(root, each_filename)
fh.write(each_filename, os.path.relpath(each_filename, file))
return filename

fn = f"{tmpdir}/test.zarr"
ds.to_zarr(fn, mode="w")
return _zip(fn)


def test_zarr_in_zip(zarr_in_zip, ds):
out = kerchunk.zarr.ZarrToZarr(
url="zip://", storage_options={"fo": zarr_in_zip}
).translate()
ds2 = xr.open_dataset(
"reference://",
engine="zarr",
backend_kwargs={
"storage_options": {
"fo": out,
"remote_protocol": "zip",
"remote_options": {"fo": zarr_in_zip},
},
"consolidated": False,
},
)
assert ds.equals(ds2)
10 changes: 8 additions & 2 deletions kerchunk/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,13 +134,18 @@ def _encode_for_JSON(store):
return store


def do_inline(store, threshold, remote_options=None):
def do_inline(store, threshold, remote_options=None, remote_protocol=None):
"""Replace short chunks with the value of that chunk
The chunk may need encoding with base64 if not ascii, so actual
length may be larger than threshold.
"""
fs = fsspec.filesystem("reference", fo=store, **(remote_options or {}))
fs = fsspec.filesystem(
"reference",
fo=store,
remote_options=remote_options,
remote_protocol=remote_protocol,
)
out = fs.references.copy()
get_keys = [
k
Expand Down Expand Up @@ -232,6 +237,7 @@ def subchunk(store, variable, factor):
modified store
"""
fs = fsspec.filesystem("reference", fo=store)
store = copy.deepcopy(store)
meta_file = f"{variable}/.zarray"
meta = ujson.loads(fs.cat(meta_file))
if meta["compressor"] is not None:
Expand Down
3 changes: 2 additions & 1 deletion kerchunk/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ def single_zarr(uri_or_store, storage_options=None, inline_threshold=100, inline
refs[k] = mapper[k]
else:
refs[k] = [fsspec.utils._unstrip_protocol(mapper._key_to_str(k), mapper.fs)]
refs = do_inline(refs, inline_threshold)
if inline_threshold:
refs = do_inline(refs, inline_threshold, remote_options=storage_options)
return refs


Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
"fill_hdf_strings = kerchunk.codecs:FillStringsCodec",
"FITSAscii = kerchunk.codecs:AsciiTableCodec",
"FITSVarBintable = kerchunk.codecs:VarArrCodec",
"record_member = kerchunk.codecs.RecordArrayMember",
"record_member = kerchunk.codecs:RecordArrayMember",
],
},
zip_safe=False,
Expand Down

0 comments on commit 8ebc8e3

Please sign in to comment.