object-store-based Store implementation #1661

Open · wants to merge 44 commits into main from kyle/object-store

Commits (44; changes shown from 32 commits):
- 14be826 Initial object-store implementation (kylebarron, Feb 8, 2024)
- a492bf0 Merge branch 'v3' into kyle/object-store (d-v-b, Feb 12, 2024)
- 50b6c47 Merge branch 'v3' into kyle/object-store (jhamman, Feb 27, 2024)
- afa79af Update src/zarr/v3/store/object_store.py (kylebarron, Feb 27, 2024)
- c466f9f Merge branch 'main' into kyle/object-store (kylebarron, Oct 21, 2024)
- c3e7296 Merge branch 'main' into kyle/object-store (kylebarron, Oct 22, 2024)
- f5c884b update (kylebarron, Oct 22, 2024)
- af2a39b Handle list streams (kylebarron, Nov 1, 2024)
- d7cfbee Update get (kylebarron, Nov 1, 2024)
- cb40015 wip refactor get_partial_values (kylebarron, Nov 1, 2024)
- 619df43 Merge branch 'main' into kyle/object-store (kylebarron, Nov 1, 2024)
- b976450 Fixes to _get_partial_values (kylebarron, Nov 7, 2024)
- cca70d7 Merge branch 'main' into kyle/object-store (kylebarron, Nov 7, 2024)
- f2c827d Fix constructing prototype from get (kylebarron, Nov 7, 2024)
- 5c8903f lint (kylebarron, Nov 7, 2024)
- 50e1dec Merge branch 'main' into kyle/object-store (kylebarron, Nov 18, 2024)
- 8bb252e Add docstring (kylebarron, Nov 18, 2024)
- 559eafd Make names private (kylebarron, Nov 18, 2024)
- 5486e69 Implement eq (kylebarron, Nov 18, 2024)
- 9a05c01 Add obstore as a test dep (maxrjones, Nov 18, 2024)
- 56b7a0b Run store tests on ObjectStore (maxrjones, Nov 18, 2024)
- d5d0d4d Merge pull request #1 from maxrjones/object-store-tests (kylebarron, Nov 20, 2024)
- b38ada1 import or skip (kylebarron, Nov 21, 2024)
- ab00b46 Bump obstore beta version (kylebarron, Nov 22, 2024)
- 9c65e4d bump pre-commit (kylebarron, Nov 22, 2024)
- 77d7c12 Add read_only param for __init__ (kylebarron, Dec 5, 2024)
- 4418426 Bump obstore (maxrjones, Dec 13, 2024)
- 7a71174 Add fixtures for object store tests (maxrjones, Dec 13, 2024)
- e73bcc9 Cast return from __eq__ as bool (maxrjones, Dec 13, 2024)
- c2cd6b8 Avoid recursion error on repr (maxrjones, Dec 13, 2024)
- a95ec59 Check store type at runtime (maxrjones, Dec 13, 2024)
- fb8b16d Merge pull request #2 from maxrjones/update-tests (kylebarron, Dec 13, 2024)
- ca261b1 Check if store is writable for setting or deleting objects (maxrjones, Dec 13, 2024)
- 0eb416a Add test for object store repr (maxrjones, Dec 13, 2024)
- 247432f Add attribute tests (maxrjones, Dec 13, 2024)
- 4b31b3c Add get and set methods to test class (maxrjones, Dec 14, 2024)
- d49d1ff Raise an exeption for previously set key (maxrjones, Dec 14, 2024)
- c2ebc8f Update src/zarr/testing/store.py (maxrjones, Dec 14, 2024)
- eb76698 Update _transform_list_dir to not remove all items (maxrjones, Dec 15, 2024)
- f310260 Return bytes from GetResult (maxrjones, Dec 15, 2024)
- 86951b8 Don't raise an exception on set_if_not_exists (maxrjones, Dec 15, 2024)
- f989884 Remove test that stores empty file (maxrjones, Dec 15, 2024)
- 40e1b25 Handle None as start or end of byte range request (maxrjones, Dec 15, 2024)
- 3aa3578 Merge pull request #3 from maxrjones/check_writable (kylebarron, Dec 16, 2024)
9 changes: 5 additions & 4 deletions .pre-commit-config.yaml
@@ -9,9 +9,9 @@ repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.7.3
hooks:
- id: ruff
args: ["--fix", "--show-fixes"]
- id: ruff-format
- id: ruff
args: ["--fix", "--show-fixes"]
- id: ruff-format
- repo: https://github.com/codespell-project/codespell
rev: v2.3.0
hooks:
@@ -20,7 +20,7 @@
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: check-yaml
- id: check-yaml
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.13.0
hooks:
@@ -35,6 +35,7 @@ repos:
- numpy
- typing_extensions
- universal-pathlib
- obstore==0.3.0-beta.8
# Tests
- pytest
- repo: https://github.com/scientific-python/cookie
1 change: 1 addition & 0 deletions pyproject.toml
@@ -70,6 +70,7 @@ test = [
"mypy",
"hypothesis",
"universal-pathlib",
"obstore==0.3.0b9",
]

jupyter = [
319 changes: 319 additions & 0 deletions src/zarr/storage/object_store.py
@@ -0,0 +1,319 @@
from __future__ import annotations

import asyncio
from collections import defaultdict
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any, TypedDict

import obstore as obs

from zarr.abc.store import ByteRangeRequest, Store
from zarr.core.buffer import Buffer
from zarr.core.buffer.core import BufferPrototype

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator, Coroutine, Iterable
    from typing import Any

    from obstore import ListStream, ObjectMeta, OffsetRange, SuffixRange
    from obstore.store import ObjectStore as _ObjectStore

    from zarr.core.buffer import Buffer, BufferPrototype
    from zarr.core.common import BytesLike


class ObjectStore(Store):
    """A Zarr store that uses obstore for fast read/write from AWS, GCP, and Azure.

    Parameters
    ----------
    store : obstore.store.ObjectStore
        An obstore store instance that is set up with the proper credentials.
    """

    store: _ObjectStore
    """The underlying obstore instance."""

    def __eq__(self, value: object) -> bool:
        if not isinstance(value, ObjectStore):
            return False

        return bool(self.store.__eq__(value.store))

    def __init__(self, store: _ObjectStore, *, read_only: bool = False) -> None:
        if not isinstance(
            store,
            (
                obs.store.AzureStore,
                obs.store.GCSStore,
                obs.store.HTTPStore,
                obs.store.S3Store,
                obs.store.LocalStore,
                obs.store.MemoryStore,
            ),
        ):
            raise TypeError(f"expected ObjectStore class, got {store!r}")
        self.store = store
        super().__init__(read_only=read_only)

    def __str__(self) -> str:
        return f"object://{self.store}"

    def __repr__(self) -> str:
        return f"ObjectStore({self})"

    async def get(
        self, key: str, prototype: BufferPrototype, byte_range: ByteRangeRequest | None = None
    ) -> Buffer:
        if byte_range is None:
            resp = await obs.get_async(self.store, key)
            return prototype.buffer.from_bytes(await resp.bytes_async())

        start, end = byte_range
        if start is not None and end is not None:
            resp = await obs.get_range_async(self.store, key, start=start, end=end)
            return prototype.buffer.from_bytes(memoryview(resp))
        elif start is not None:
            if start >= 0:
                # Offset request
                resp = await obs.get_async(self.store, key, options={"range": {"offset": start}})
            else:
                # Suffix request: fetch the last `abs(start)` bytes
                resp = await obs.get_async(
                    self.store, key, options={"range": {"suffix": abs(start)}}
                )

            return prototype.buffer.from_bytes(await resp.bytes_async())
        else:
            raise ValueError(f"Unexpected input to `get`: {start=}, {end=}")

    async def get_partial_values(
        self,
        prototype: BufferPrototype,
        key_ranges: Iterable[tuple[str, ByteRangeRequest]],
    ) -> list[Buffer | None]:
        return await _get_partial_values(self.store, prototype=prototype, key_ranges=key_ranges)

    async def exists(self, key: str) -> bool:
        try:
            await obs.head_async(self.store, key)
        except FileNotFoundError:
            return False
        else:
            return True

    @property
    def supports_writes(self) -> bool:
        return True

    async def set(self, key: str, value: Buffer) -> None:
        buf = value.to_bytes()
        await obs.put_async(self.store, key, buf)

    async def set_if_not_exists(self, key: str, value: Buffer) -> None:
        buf = value.to_bytes()
        await obs.put_async(self.store, key, buf, mode="create")
Review comment (Member) — suggested change for set_if_not_exists:

     async def set_if_not_exists(self, key: str, value: Buffer) -> None:
         buf = value.to_bytes()
-        await obs.put_async(self.store, key, buf, mode="create")
+        try:
+            await obs.put_async(self.store, key, buf, mode="create")
+        except AlreadyExistsError:
+            pass

Reply (Member): thanks Joe, pre-commit recommended a slightly different version (kylebarron@86951b8) but this fixed one of the failing tests.


    @property
    def supports_deletes(self) -> bool:
        return True

    async def delete(self, key: str) -> None:
        await obs.delete_async(self.store, key)

    @property
    def supports_partial_writes(self) -> bool:
        return False

    async def set_partial_values(
        self, key_start_values: Iterable[tuple[str, int, BytesLike]]
    ) -> None:
        raise NotImplementedError

    @property
    def supports_listing(self) -> bool:
        return True

    def list(self) -> AsyncGenerator[str, None]:
        objects: ListStream[list[ObjectMeta]] = obs.list(self.store)
        return _transform_list(objects)

    def list_prefix(self, prefix: str) -> AsyncGenerator[str, None]:
        objects: ListStream[list[ObjectMeta]] = obs.list(self.store, prefix=prefix)
        return _transform_list(objects)

    def list_dir(self, prefix: str) -> AsyncGenerator[str, None]:
        objects: ListStream[list[ObjectMeta]] = obs.list(self.store, prefix=prefix)
        return _transform_list_dir(objects, prefix)


async def _transform_list(
    list_stream: AsyncGenerator[list[ObjectMeta], None],
) -> AsyncGenerator[str, None]:
    async for batch in list_stream:
        for item in batch:
            yield item["path"]


async def _transform_list_dir(
    list_stream: AsyncGenerator[list[ObjectMeta], None], prefix: str
) -> AsyncGenerator[str, None]:
    # We assume that the underlying object-store implementation correctly handles the
    # prefix, so we don't double-check that the returned results actually start with the
    # given prefix.
    prefix_len = len(prefix)
    async for batch in list_stream:
        for item in batch:
            # Yield this item if "/" does not exist after the prefix.
            if "/" not in item["path"][prefix_len:]:
                yield item["path"]
Review thread on _transform_list_dir:

Author: It was brought to my attention in developmentseed/obstore#101 that in the current object_store implementation a final "/" is stripped from directories, so this _transform_list_dir will include the names of any subdirectories. Does zarr expect zero-byte files in normal usage? Or we could assume that a zero-byte file may be a directory.

Member: fsspec has struggled with this for the longest time. My recommendation: you won't come up with a convention that matches the expectations of all users and backends, so choose something you're happy with and maintain it consistently, as long as the documentation is clear about how the convention works.

Author: Thanks, that's useful background. What does that mean for Zarr? What does Zarr expect? Should we document behavior at the Store level for what all stores should implement, so that at least the fsspec-based and obstore-based stores could have the same behavior?

Member: I thought that listing and caring about directories versus files is something that only matters for v2, no?

Author: I'm not intricately familiar with Zarr v3, but there are still Store methods for list, list_prefix, and list_dir. Will it give incorrect behavior if one of the listed files is a directory prefix with no content?

Member: I am unsure, but I would have thought not.

Author: Well, good for this backend to be considered experimental for now 🙂
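To make the concern in this thread concrete, here is a small standalone sketch (not part of the diff) that runs a synthetic listing through the same filtering rule as _transform_list_dir above. The paths and the zero-byte "directory marker" entry are hypothetical, chosen only to show that such a marker object would be yielded as if it were an ordinary key.

# Standalone illustration; not part of this PR.
import asyncio
from collections.abc import AsyncGenerator


async def fake_list_stream() -> AsyncGenerator[list[dict[str, str]], None]:
    # A single batch, shaped like the ObjectMeta dicts obstore's list stream yields.
    yield [
        {"path": "group/zarr.json"},  # ordinary key directly under the prefix
        {"path": "group/c/0/0"},      # nested key: filtered out ("/" after the prefix)
        {"path": "group/c"},          # a zero-byte "directory marker" object would look like this
    ]


async def list_dir(prefix: str) -> list[str]:
    # Same rule as _transform_list_dir: keep paths with no "/" after the prefix.
    prefix_len = len(prefix)
    out: list[str] = []
    async for batch in fake_list_stream():
        for item in batch:
            if "/" not in item["path"][prefix_len:]:
                out.append(item["path"])
    return out


print(asyncio.run(list_dir("group/")))
# -> ['group/zarr.json', 'group/c']: the marker object is indistinguishable from a real key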



class _BoundedRequest(TypedDict):
    """Range request with a known start and end byte.

    These requests can be multiplexed natively on the Rust side with
    `obstore.get_ranges_async`.
    """

    original_request_index: int
    """The positional index in the original key_ranges input"""

    start: int
    """Start byte offset."""

    end: int
    """End byte offset."""


class _OtherRequest(TypedDict):
    """Offset or suffix range requests.

    These requests cannot be concurrent on the Rust side, and each need their own call
    to `obstore.get_async`, passing in the `range` parameter.
    """

    original_request_index: int
    """The positional index in the original key_ranges input"""

    path: str
    """The path to request from."""

    range: OffsetRange | SuffixRange
    """The range request type."""


class _Response(TypedDict):
    """A response buffer associated with the original index that it should be restored to."""

    original_request_index: int
    """The positional index in the original key_ranges input"""

    buffer: Buffer
    """The buffer returned from obstore's range request."""


async def _make_bounded_requests(
    store: obs.store.ObjectStore,
    path: str,
    requests: list[_BoundedRequest],
    prototype: BufferPrototype,
) -> list[_Response]:
    """Make all bounded requests for a specific file.

    `obstore.get_ranges_async` allows for making concurrent requests for multiple ranges
    within a single file, and will e.g. merge concurrent requests. This only uses one
    single Python coroutine.
    """

    starts = [r["start"] for r in requests]
    ends = [r["end"] for r in requests]
    responses = await obs.get_ranges_async(store, path=path, starts=starts, ends=ends)

    buffer_responses: list[_Response] = []
    for request, response in zip(requests, responses, strict=True):
        buffer_responses.append(
            {
                "original_request_index": request["original_request_index"],
                "buffer": prototype.buffer.from_bytes(memoryview(response)),
            }
        )

    return buffer_responses


async def _make_other_request(
    store: obs.store.ObjectStore,
    request: _OtherRequest,
    prototype: BufferPrototype,
) -> list[_Response]:
    """Make suffix or offset requests.

    We return a `list[_Response]` for symmetry with `_make_bounded_requests` so that all
    futures can be gathered together.
    """
    resp = await obs.get_async(store, request["path"], options={"range": request["range"]})
    buffer = await resp.bytes_async()
    return [
        {
            "original_request_index": request["original_request_index"],
            "buffer": prototype.buffer.from_bytes(buffer),
        }
    ]


async def _get_partial_values(
    store: obs.store.ObjectStore,
    prototype: BufferPrototype,
    key_ranges: Iterable[tuple[str, ByteRangeRequest]],
) -> list[Buffer | None]:
    """Make multiple range requests.

    ObjectStore has a `get_ranges` method that will additionally merge nearby ranges,
    but it's _per_ file. So we need to split these key_ranges into **per-file** key
    ranges, and then reassemble the results in the original order.

    We separate into different requests:

    - One call to `obstore.get_ranges_async` **per target file**
    - One call to `obstore.get_async` for each other request.
    """
    key_ranges = list(key_ranges)
    per_file_bounded_requests: dict[str, list[_BoundedRequest]] = defaultdict(list)
    other_requests: list[_OtherRequest] = []

    for idx, (path, (start, end)) in enumerate(key_ranges):
        if start is None:
            raise ValueError("Cannot pass `None` for the start of the range request.")

        if end is not None:
            # This is a bounded request with known start and end byte.
            per_file_bounded_requests[path].append(
                {"original_request_index": idx, "start": start, "end": end}
            )
        elif start < 0:
            # Suffix request from the end
            other_requests.append(
                {"original_request_index": idx, "path": path, "range": {"suffix": abs(start)}}
            )
        elif start >= 0:
            # Offset request to the end
            other_requests.append(
                {"original_request_index": idx, "path": path, "range": {"offset": start}}
            )
        else:
            raise ValueError(f"Unsupported range input: {start=}, {end=}")

    futs: list[Coroutine[Any, Any, list[_Response]]] = []
    for path, bounded_ranges in per_file_bounded_requests.items():
        futs.append(_make_bounded_requests(store, path, bounded_ranges, prototype))

    for request in other_requests:
        futs.append(_make_other_request(store, request, prototype))  # noqa: PERF401

    buffers: list[Buffer | None] = [None] * len(key_ranges)

    # TODO: this gathers a list of lists of Response; not sure if there's a way to
    # unpack these lists inside of an `asyncio.gather`?
    for responses in await asyncio.gather(*futs):
        for resp in responses:
            buffers[resp["original_request_index"]] = resp["buffer"]

    return buffers
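The docstring of _get_partial_values above describes how key_ranges are split into per-file bounded requests plus individual offset/suffix requests. As a hedged usage sketch (not part of the diff), here is roughly how a caller might exercise that path; the key names are made up, obstore's MemoryStore is used purely for illustration, and the default_buffer_prototype import path is my assumption.

# Hypothetical usage sketch; not part of this PR.
import asyncio

import obstore
from zarr.core.buffer import cpu, default_buffer_prototype
from zarr.storage.object_store import ObjectStore


async def main() -> None:
    store = ObjectStore(obstore.store.MemoryStore())
    await store.set("c/0", cpu.Buffer.from_bytes(b"x" * 1024))
    await store.set("c/1", cpu.Buffer.from_bytes(b"y" * 1024))

    buffers = await store.get_partial_values(
        default_buffer_prototype(),
        [
            ("c/0", (0, 100)),     # bounded: multiplexed via obstore.get_ranges_async
            ("c/0", (256, 512)),   # bounded: same file, same get_ranges_async call
            ("c/1", (-16, None)),  # suffix: last 16 bytes via its own obstore.get_async call
            ("c/1", (128, None)),  # offset: byte 128 to the end via its own obstore.get_async call
        ],
    )
    # Four buffers, restored to the original request order.
    print([len(b.to_bytes()) for b in buffers])


asyncio.run(main())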
22 changes: 22 additions & 0 deletions tests/test_store/test_object.py
@@ -0,0 +1,22 @@
# ruff: noqa: E402
import pytest

obstore = pytest.importorskip("obstore")

from zarr.core.buffer import cpu
from zarr.storage.object_store import ObjectStore
from zarr.testing.store import StoreTests


class TestObjectStore(StoreTests[ObjectStore, cpu.Buffer]):
    store_cls = ObjectStore
    buffer_cls = cpu.Buffer

    @pytest.fixture
    def store_kwargs(self, tmpdir) -> dict[str, str | bool]:
        store = obstore.store.LocalStore(prefix=tmpdir)
        return {"store": store, "read_only": False}

    @pytest.fixture
    def store(self, store_kwargs: dict[str, str | bool]) -> ObjectStore:
        return self.store_cls(**store_kwargs)
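A possible follow-on, sketched here only as an illustration and not part of this PR (the class name and fixture shape are hypothetical): the same suite could be pointed at obstore's in-memory backend by overriding the store_kwargs fixture.

class TestObjectStoreMemory(TestObjectStore):
    @pytest.fixture
    def store_kwargs(self) -> dict[str, object]:
        # Swap the LocalStore fixture above for an in-memory obstore backend.
        return {"store": obstore.store.MemoryStore(), "read_only": False}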