Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Document how we serialize metadata and warn when it is pickled #2156

Merged
merged 2 commits into from
Jan 31, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 38 additions & 3 deletions docs/mkdocs/docs/tutorials/metadata.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# Metadata

ArcticDB enables you to store arbitrary binary-blobs alongside symbols and versions. The data is pickled when using the Python API. Note that there is a 4GB limit to the size of a single blob.
ArcticDB enables you to store arbitrary binary-blobs alongside symbols and versions.

The below example shows a basic example of writing and reading metadata (in this case a pickled Python dictionary):
The example below shows the basics of writing and reading metadata (in this case a Python dictionary):

```Python
```python
import arcticdb as adb
# This example assumes the below variables (host, bucket, access, secret) are validly set
ac = adb.Arctic(f"s3://{HOST}:{BUCKET}?access={ACCESS}&secret={SECRET}")
Expand All @@ -27,6 +27,41 @@ assert lib.read("meta").metadata == metadata
assert lib.read_metadata("meta").metadata == metadata # Same as read, but doesn't return data from storage
```

New versions of symbols do not "inherit" the metadata of a previous version. Metadata needs to be specified explicitly
each time that you create a new version of the symbol:

```python
lib.write("new_sym", data=pd.DataFrame(), metadata=metadata)
lib.write("new_sym", data=pd.DataFrame())

assert lib.read("new_sym").metadata is None
assert lib.read("new_sym", as_of=0).metadata == metadata
```

### Serialization Format

We use `msgpack` serialization for metadata when possible. We support the built-in `msgpack` types and also:

- Pandas timestamps `pd.Timestamp`
- Python datetime `datetime.datetime`
- Python timedelta `datetime.timedelta`

Documentation of supported `msgpack` structures is available [here](https://github.com/msgpack/msgpack/blob/master/spec.md).
Arrays and maps correspond to Python lists and dicts.

When this `msgpack` serialization of the metadata fails due to unsupported types we fall back to pickling the metadata.
Pickling can have [serious downsides](https://nedbatchelder.com/blog/202006/pickles_nine_flaws.html) as it may not be possible to
unpickle data written with one set of library versions from a client with a different set of library versions.

Because of this, we log a warning when metadata gets pickled. You can disable the warning by setting an environment
variable `ARCTICDB_PickledMetadata_loglevel_str` to `DEBUG`. The log message looks like:

```
Pickling metadata - may not be readable by other clients
```

The metadata may be up to 4GB in size.

### Practical example - using metadata to track vendor timelines

One common use of metadata is to store the vendor-provided date alongside the version. For example, let's say we are processing three files - `data-2004-01-01.csv`, `data-2004-01-02.csv` and `data-2004-01-03.csv`. Each file name contains a date which we'd like to be able to store alongside the version information in ArcticDB.
Expand Down
25 changes: 25 additions & 0 deletions python/arcticdb/version_store/_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import pickle
from abc import ABCMeta, abstractmethod

from arcticdb_ext import get_config_string
from pandas.api.types import is_integer_dtype
from arcticc.pb2.descriptors_pb2 import UserDefinedMetadata, NormalizationMetadata, MsgPackSerialization
from arcticc.pb2.storage_pb2 import VersionStoreConfig
Expand All @@ -40,6 +41,7 @@

from arcticdb._msgpack_compat import packb, padded_packb, unpackb, ExtType
from arcticdb.log import version as log
from arcticdb_ext.log import LogLevel
from arcticdb.version_store._common import _column_name_to_strings, TimeFrame

PICKLE_PROTOCOL = 4
Expand Down Expand Up @@ -78,6 +80,28 @@ def check_is_utc_if_newer_pandas(*args, **kwargs):
NormalizedInput = NamedTuple("NormalizedInput", [("item", NPDDataFrame), ("metadata", NormalizationMetadata)])


_PICKLED_METADATA_LOGLEVEL = None # set lazily with function below


def get_pickled_metadata_loglevel():
    """Return the LogLevel at which to log that metadata is being pickled.

    Reads the "PickledMetadata.LogLevel" config string once and caches the
    result in the module-level ``_PICKLED_METADATA_LOGLEVEL``. Valid settings
    are DEBUG/INFO/WARN/ERROR (case-insensitive); an unrecognised value logs a
    warning and falls back to WARN, as does an absent setting, so a bad
    configuration never blocks the write path.

    Returns
    -------
    LogLevel
        The configured (or default WARN) log level.
    """
    global _PICKLED_METADATA_LOGLEVEL
    # Compare against None rather than relying on truthiness: if the cached
    # enum member's underlying value is falsy (e.g. 0) a truthiness check
    # would miss the cache and re-read the config string on every call.
    if _PICKLED_METADATA_LOGLEVEL is not None:
        return _PICKLED_METADATA_LOGLEVEL

    log_level = get_config_string("PickledMetadata.LogLevel")
    expected_settings = ("DEBUG", "INFO", "WARN", "ERROR")
    if log_level:
        if log_level.upper() not in expected_settings:
            log.warn(f"Expected PickledMetadata.LogLevel setting to be in {expected_settings} or absent but was {log_level}")
            _PICKLED_METADATA_LOGLEVEL = LogLevel.WARN
        else:
            _PICKLED_METADATA_LOGLEVEL = getattr(LogLevel, log_level.upper())
    else:
        # Default when the setting is absent or empty.
        _PICKLED_METADATA_LOGLEVEL = LogLevel.WARN

    return _PICKLED_METADATA_LOGLEVEL


# To simplify unit testing of serialization logic. This maps the cpp _FrameData exposed object
class FrameData(
NamedTuple("FrameData", [("data", List[np.ndarray]), ("names", List[str]), ("index_columns", List[str])])
Expand Down Expand Up @@ -1064,6 +1088,7 @@ def read(data, pickled_in_python2=False):

@staticmethod
def write(obj):
    """Serialize user metadata *obj* by pickling.

    Called as the fallback when msgpack serialization of the metadata is not
    possible. Logs at the configured PickledMetadata.LogLevel because pickled
    metadata may not be readable by clients running different library
    versions.

    Parameters
    ----------
    obj
        Arbitrary Python object supplied as user metadata.

    Returns
    -------
    bytes
        The pickled payload, written with PICKLE_PROTOCOL.
    """
    # Plain string literal: the message has no placeholders, so an f-string
    # prefix (flagged by linters as F541) is unnecessary.
    log.log(get_pickled_metadata_loglevel(), "Pickling metadata - may not be readable by other clients")
    return pickle.dumps(obj, protocol=PICKLE_PROTOCOL)


Expand Down
6 changes: 6 additions & 0 deletions python/arcticdb/version_store/library.py
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,8 @@ def write(
Any non-`DatetimeIndex` will converted into an internal `RowCount` index. That is, ArcticDB will assign each
row a monotonically increasing integer identifier and that will be used for the index.

See the Metadata section of our online documentation for details about how metadata is persisted and caveats.

Parameters
----------
symbol : str
Expand Down Expand Up @@ -1707,6 +1709,8 @@ def write_metadata(

This method should be faster than `write` as it involves no data segment read/write operations.

See the Metadata section of our online documentation for details about how metadata is persisted and caveats.

Parameters
----------
symbol
Expand Down Expand Up @@ -1734,6 +1738,8 @@ def write_metadata_batch(
Note that this isn't an atomic operation - it's possible for the metadata for one symbol to be fully written and
readable before another symbol.

See the Metadata section of our online documentation for details about how metadata is persisted and caveats.

Parameters
----------
write_metadata_payloads : `List[WriteMetadataPayload]`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"""
import numpy as np
import pandas as pd
from arcticdb_ext import set_config_string, unset_config_string
from pandas import DataFrame, Timestamp
import pytest

Expand All @@ -32,6 +33,36 @@ def test_rt_df_with_small_meta(object_and_mem_and_lmdb_version_store):
assert meta == vit.metadata


@pytest.mark.parametrize("log_level", ("error", "warn", "debug", "info", "ERROR", "eRror", "", None))
def test_pickled_metadata_warning(lmdb_version_store_v1, log_level):
    """Pickled-metadata writes round-trip under every valid (case-insensitive)
    PickledMetadata.LogLevel setting, as well as empty and absent settings."""
    import arcticdb.version_store._normalization as norm

    # Reset the module-level cache so this parametrization's config value is
    # actually picked up rather than a level cached by an earlier test.
    norm._PICKLED_METADATA_LOGLEVEL = None
    if log_level is not None:
        set_config_string("PickledMetadata.LogLevel", log_level)
    # Ensure the config string is removed even if an assertion fails below;
    # otherwise the setting would leak into subsequent tests.
    try:
        lib = lmdb_version_store_v1
        df = DataFrame(data=["A", "B", "C"])
        meta = df  # a DataFrame is not msgpack-serializable, forcing the pickle path
        lib.write("pandas", df, metadata=meta)
        vit = lib.read("pandas")
        assert_frame_equal(df, vit.data)
        assert_frame_equal(df, vit.metadata)
    finally:
        unset_config_string("PickledMetadata.LogLevel")


def test_pickled_metadata_warning_bad_config(lmdb_version_store_v1):
    """Don't block writes just because they set this wrong: an invalid
    PickledMetadata.LogLevel value must still allow the write to succeed."""
    import arcticdb.version_store._normalization as norm

    # Reset the module-level cache so the invalid setting is actually read.
    norm._PICKLED_METADATA_LOGLEVEL = None
    set_config_string("PickledMetadata.LogLevel", "cat")
    # Always remove the invalid setting afterwards — the original left it set,
    # leaking the bad config into every subsequent test in the session.
    try:
        lib = lmdb_version_store_v1
        df = DataFrame(data=["A", "B", "C"])
        meta = df  # a DataFrame is not msgpack-serializable, forcing the pickle path
        lib.write("pandas", df, metadata=meta)
        vit = lib.read("pandas")
        assert_frame_equal(df, vit.data)
        assert_frame_equal(df, vit.metadata)
    finally:
        unset_config_string("PickledMetadata.LogLevel")


def test_rt_df_with_humonguous_meta(object_and_mem_and_lmdb_version_store):
with pytest.raises(ArcticDbNotYetImplemented):
from arcticdb.version_store._normalization import _MAX_USER_DEFINED_META as MAX
Expand Down
Loading