diff --git a/docs/mkdocs/docs/tutorials/metadata.md b/docs/mkdocs/docs/tutorials/metadata.md index 4f8b1c034b..2cd45fb508 100644 --- a/docs/mkdocs/docs/tutorials/metadata.md +++ b/docs/mkdocs/docs/tutorials/metadata.md @@ -1,10 +1,10 @@ # Metadata -ArcticDB enables you to store arbitrary binary-blobs alongside symbols and versions. The data is pickled when using the Python API. Note that there is a 4GB limit to the size of a single blob. +ArcticDB enables you to store arbitrary binary-blobs alongside symbols and versions. -The below example shows a basic example of writing and reading metadata (in this case a pickled Python dictionary): +The example below shows how to write and read metadata (in this case a Python dictionary): -```Python +```python import arcticdb as adb # This example assumes the below variables (host, bucket, access, secret) are validly set ac = adb.Arctic(f"s3://{HOST}:{BUCKET}?access={ACCESS}&secret={SECRET}) @@ -27,6 +27,41 @@ assert lib.read("meta").metadata == metadata assert lib.read_metadata("meta").metadata == metadata # Same as read, but doesn't return data from storage ``` +New versions of symbols do not "inherit" the metadata of a previous version. Metadata needs to be specified explicitly +each time that you create a new version of the symbol: + +```python +lib.write("new_sym", data=pd.DataFrame(), metadata=metadata) +lib.write("new_sym", data=pd.DataFrame()) + +assert lib.read("new_sym").metadata is None +assert lib.read("new_sym", as_of=0).metadata == metadata +``` + +### Serialization Format + +We use `msgpack` serialization for metadata when possible. We support the built-in `msgpack` types and also: + +- Pandas timestamps `pd.Timestamp` +- Python datetime `datetime.datetime` +- Python timedelta `datetime.timedelta` + +Documentation of supported `msgpack` structures is available [here](https://github.com/msgpack/msgpack/blob/master/spec.md). +Arrays and maps correspond to Python lists and dicts. 
+ +When this `msgpack` serialization of the metadata fails due to unsupported types, we fall back to pickling the metadata. +Pickling can have [serious downsides](https://nedbatchelder.com/blog/202006/pickles_nine_flaws.html) as it may not be possible to +unpickle data written with one set of library versions from a client with a different set of library versions. + +Because of this, we log a warning when metadata gets pickled. You can suppress the warning by setting the environment +variable `ARCTICDB_PickledMetadata_loglevel_str` to `DEBUG`. The log message looks like: + +``` +Pickling metadata - may not be readable by other clients +``` + +The metadata may be up to 4GB in size. + ### Practical example - using metadata to track vendor timelines One common example for metadata is to store the vendor-provided date alongside the version. For example, let's say we are processing three files - `data-2004-01-01.csv`, `data-2004-01-02.csv` and `data-2004-01-03.csv`. Each file name contains a date which we'd like to be able to store along side the version information in ArcticDB. 
diff --git a/python/arcticdb/version_store/_normalization.py b/python/arcticdb/version_store/_normalization.py index 424a5f393d..a8760ffd45 100644 --- a/python/arcticdb/version_store/_normalization.py +++ b/python/arcticdb/version_store/_normalization.py @@ -19,6 +19,7 @@ import pickle from abc import ABCMeta, abstractmethod +from arcticdb_ext import get_config_string from pandas.api.types import is_integer_dtype from arcticc.pb2.descriptors_pb2 import UserDefinedMetadata, NormalizationMetadata, MsgPackSerialization from arcticc.pb2.storage_pb2 import VersionStoreConfig @@ -40,6 +41,7 @@ from arcticdb._msgpack_compat import packb, padded_packb, unpackb, ExtType from arcticdb.log import version as log +from arcticdb_ext.log import LogLevel from arcticdb.version_store._common import _column_name_to_strings, TimeFrame PICKLE_PROTOCOL = 4 @@ -78,6 +80,28 @@ def check_is_utc_if_newer_pandas(*args, **kwargs): NormalizedInput = NamedTuple("NormalizedInput", [("item", NPDDataFrame), ("metadata", NormalizationMetadata)]) +_PICKLED_METADATA_LOGLEVEL = None # set lazily with function below + + +def get_pickled_metadata_loglevel(): + global _PICKLED_METADATA_LOGLEVEL + if _PICKLED_METADATA_LOGLEVEL is not None: + return _PICKLED_METADATA_LOGLEVEL + + log_level = get_config_string("PickledMetadata.LogLevel") + expected_settings = ("DEBUG", "INFO", "WARN", "ERROR") + if log_level: + if log_level.upper() not in expected_settings: + log.warn(f"Expected PickledMetadata.LogLevel setting to be in {expected_settings} or absent but was {log_level}") + _PICKLED_METADATA_LOGLEVEL = LogLevel.WARN + else: + _PICKLED_METADATA_LOGLEVEL = getattr(LogLevel, log_level.upper()) + else: + _PICKLED_METADATA_LOGLEVEL = LogLevel.WARN + + return _PICKLED_METADATA_LOGLEVEL + + # To simplify unit testing of serialization logic. 
This maps the cpp _FrameData exposed object class FrameData( NamedTuple("FrameData", [("data", List[np.ndarray]), ("names", List[str]), ("index_columns", List[str])]) @@ -1064,6 +1088,7 @@ def read(data, pickled_in_python2=False): @staticmethod def write(obj): + log.log(get_pickled_metadata_loglevel(), "Pickling metadata - may not be readable by other clients") return pickle.dumps(obj, protocol=PICKLE_PROTOCOL) diff --git a/python/arcticdb/version_store/library.py b/python/arcticdb/version_store/library.py index 66bd0d740d..f6430df9ac 100644 --- a/python/arcticdb/version_store/library.py +++ b/python/arcticdb/version_store/library.py @@ -648,6 +648,8 @@ def write( Any non-`DatetimeIndex` will converted into an internal `RowCount` index. That is, ArcticDB will assign each row a monotonically increasing integer identifier and that will be used for the index. + See the Metadata section of our online documentation for details about how metadata is persisted and caveats. + Parameters ---------- symbol : str @@ -1707,6 +1709,8 @@ def write_metadata( This method should be faster than `write` as it involves no data segment read/write operations. + See the Metadata section of our online documentation for details about how metadata is persisted and caveats. + Parameters ---------- symbol @@ -1734,6 +1738,8 @@ def write_metadata_batch( Note that this isn't an atomic operation - it's possible for the metadata for one symbol to be fully written and readable before another symbol. + See the Metadata section of our online documentation for details about how metadata is persisted and caveats. 
+ Parameters ---------- write_metadata_payloads : `List[WriteMetadataPayload]` diff --git a/python/tests/integration/arcticdb/version_store/test_metadata_support.py b/python/tests/integration/arcticdb/version_store/test_metadata_support.py index 83f3215a8d..205992b014 100644 --- a/python/tests/integration/arcticdb/version_store/test_metadata_support.py +++ b/python/tests/integration/arcticdb/version_store/test_metadata_support.py @@ -7,6 +7,7 @@ """ import numpy as np import pandas as pd +from arcticdb_ext import set_config_string, unset_config_string from pandas import DataFrame, Timestamp import pytest @@ -32,6 +33,37 @@ def test_rt_df_with_small_meta(object_and_mem_and_lmdb_version_store): assert meta == vit.metadata +@pytest.mark.parametrize("log_level", ("error", "warn", "debug", "info", "ERROR", "eRror", "", None)) +def test_pickled_metadata_warning(lmdb_version_store_v1, log_level): + import arcticdb.version_store._normalization as norm + norm._PICKLED_METADATA_LOGLEVEL = None + if log_level is not None: + set_config_string("PickledMetadata.LogLevel", log_level) + lib = lmdb_version_store_v1 + df = DataFrame(data=["A", "B", "C"]) + meta = df + lib.write("pandas", df, metadata=meta) + vit = lib.read("pandas") + assert_frame_equal(df, vit.data) + assert_frame_equal(df, vit.metadata) + unset_config_string("PickledMetadata.LogLevel") + + +def test_pickled_metadata_warning_bad_config(lmdb_version_store_v1): + """Don't block writes just because they set this wrong.""" + import arcticdb.version_store._normalization as norm + norm._PICKLED_METADATA_LOGLEVEL = None + set_config_string("PickledMetadata.LogLevel", "cat") + lib = lmdb_version_store_v1 + df = DataFrame(data=["A", "B", "C"]) + meta = df + lib.write("pandas", df, metadata=meta) + vit = lib.read("pandas") + assert_frame_equal(df, vit.data) + assert_frame_equal(df, vit.metadata) + unset_config_string("PickledMetadata.LogLevel") + + def test_rt_df_with_humonguous_meta(object_and_mem_and_lmdb_version_store): with pytest.raises(ArcticDbNotYetImplemented): from 
arcticdb.version_store._normalization import _MAX_USER_DEFINED_META as MAX