Rename all the things
antonymilne committed Apr 3, 2024
1 parent 5578fb5 commit 02e7879
Showing 3 changed files with 59 additions and 62 deletions.
3 changes: 0 additions & 3 deletions vizro-core/docs/pages/API-reference/manager.md

This file was deleted.

1 change: 0 additions & 1 deletion vizro-core/mkdocs.yml
@@ -43,7 +43,6 @@ nav:
   - API Reference:
       - Vizro: pages/API-reference/vizro.md
       - Models: pages/API-reference/models.md
-      - Data manager: pages/API-reference/manager.md
       - Actions: pages/API-reference/actions.md
       - Table functions: pages/API-reference/captured-callables.md
   - Explanation:
117 changes: 59 additions & 58 deletions vizro-core/src/vizro/managers/_data_manager.py
@@ -24,23 +24,24 @@

 logger = logging.getLogger(__name__)

-# Really ComponentID and DatasetName should be NewType and not just aliases but then for a user's code to type check
+# Really ComponentID and DataSourceName should be NewType and not just aliases but then for a user's code to type check
 # correctly they would need to cast all strings to these types.
 # TODO: remove these type aliases once have moved component to data mapping to models
 ComponentID = str
-DatasetName = str
+DataSourceName = str
 pd_DataFrameCallable = Callable[[], pd.DataFrame]


 # wrapt.decorator is the cleanest way to decorate a bound method when instance properties (here instance.timeout)
 # are required as arguments.
 @wrapt.decorator
-def memoize(wrapped: Callable, instance: _Dataset, args, kwargs):
+def memoize(wrapped: Callable, instance: _DynamicData, args, kwargs):
     """Caches the result of wrapped function, taking its arguments into account in the cache key.

     This delegates to flask_caching.memoize functionality, but with an important modification. flask_caching.memoize
     is designed to apply the same timeout to every call of the function or bound method, but we want to have a
-    dataset-dependent timeout. This means rewriting the wrapped function's __qualname__ so that different instances
-    of Dataset have completely independent caches. Without this, flask_caching.utils.function_namespace gives the
+    data source-dependent timeout. This means rewriting the wrapped function's __qualname__ so that different instances
+    of _DynamicData have completely independent caches. Without this, flask_caching.utils.function_namespace gives the
     same namespace for each, which means that the timeouts cannot be set independently.

     The following things *do not* work to achieve the same thing - there will still be interference between
@@ -56,39 +57,39 @@ def memoize(wrapped: Callable, instance: _Dataset, args, kwargs):
     Args:
         wrapped: function that will be memoized.
-        instance: Dataset object to which wrapped is bound.
+        instance: _DynamicData object to which wrapped is bound.
         args: positional arguments supplied to wrapped, taken into account for generating cache key.
         kwargs: keyword arguments supplied to wrapped, taken into account for generating cache key.

     Returns:
         Memoized call.
     """
-    # Before altering, wrapped.__func__.__qualname__ is "Dataset.__call__"
-    # After altering, it becomes Dataset.__call__.<vizro.managers._data_manager.Dataset object at 0x11d5fc2d0>
+    # Before altering, wrapped.__func__.__qualname__ is "_DynamicData.__call__"
+    # After altering, it becomes _DynamicData.__call__.<vizro.managers._data_manager._DynamicData object at 0x11d5fc2d0>
     # where the repr(instance) depends on id(instance).
     # Note this doesn't work by using += since the qualname will just be appended to fresh every time the function is
     # called so will be different every time and not ever hit the cache.
     wrapped.__func__.__qualname__ = ".".join([instance.__class__.__name__, wrapped.__func__.__name__, repr(instance)])
     return data_manager.cache.memoize(timeout=instance.timeout)(wrapped)(*args, **kwargs)


-class _Dataset:
+class _DynamicData:
     """Wrapper for a pd_DataFrameCallable, i.e. a function that produces a pandas DataFrame. Crucially this means
     that the data can be refreshed during runtime, since the loading function can be re-run.

     This is currently private since it's not expected that a user would instantiate it directly. Instead, you would use
     through the data_manager interface as follows:
-        >>> def live_data():
-        >>>     return pd.read_csv("live_data.csv")
-        >>> data_manager["live_data"] = live_data
-        >>> data_manager["live_data"].timeout = 5  # if you want to change the cache timeout to 5 seconds
+        >>> def dynamic_data():
+        >>>     return pd.read_csv("dynamic_data.csv")
+        >>> data_manager["dynamic_data"] = dynamic_data
+        >>> data_manager["dynamic_data"].timeout = 5  # if you want to change the cache timeout to 5 seconds

     Possibly in future, this will become a public class so you could directly do:
-        >>> data_manager["live_data"] = Dataset(live_data, timeout=5)
+        >>> data_manager["dynamic_data"] = DynamicData(dynamic_data, timeout=5)

     At this point we might like to disable the behaviour so that data_manager setitem and getitem handle the same
-    object rather than doing an implicit conversion to _Dataset.
+    object rather than doing an implicit conversion to _DynamicData.
     """

     def __init__(self, load_data: pd_DataFrameCallable, /):
@@ -119,32 +120,32 @@ def __repr__(self):
         independently of cache key, but this doesn't seem to be well documented or work well, so it's better to rely
         on __repr__.

-        In future we might like to change the cache so that it works on dataset name rather than the place in memory.
-        This would necessitate a new Dataset._name attribute. This would get us closer to getting gunicorn to work
+        In future we might like to change the cache so that it works on data source name rather than the place in memory.
+        This would necessitate a new _DynamicData._name attribute. This would get us closer to getting gunicorn to work
         without relying on --preload, although it would not get all the way there:
         * model_manager would need to fix a random seed or alternative solution (just like Dash does for its
         component ids)
-        * not clear how to handle the case of unnamed datasets, where the name is currently generated
+        * not clear how to handle the case of unnamed data sources, where the name is currently generated
         automatically by the id, since without --preload this would give mismatched names. If use a random number with
-        fixed seed for this then lose advantage of multiple plots that use the same dataset having just one
+        fixed seed for this then lose advantage of multiple plots that use the same data source having just one
         underlying dataframe in memory.
         * would need to make it possible to disable cache at build time so that data would be loaded once at build
         time and then again once at runtime, which is generally not what we want
         """
         return super().__repr__()


-class _StaticDataset:
+class _StaticData:
     """Wrapper for a pd.DataFrame. This data cannot be updated during runtime.

     This is currently private since it's not expected that a user would instantiate it directly. Instead, you would use
     through the data_manager interface as follows:
-        >>> data_manager["data"] = pd.read_csv("data.csv")
+        >>> data_manager["static_data"] = pd.read_csv("static_data.csv")

     This class does not have much functionality but exists for a couple of reasons:
-        1. to align interface with _Dataset by providing a load method so that fetching data from data_manager can
-        transparently handle loading both _StaticDataset and _Dataset without needing switching logic
-        2. to raise a clear error message if a user tries to set a timeout on the dataset
+        1. to align interface with _DynamicData by providing a load method so that fetching data from data_manager can
+        transparently handle loading both _StaticData and _DynamicData without needing switching logic
+        2. to raise a clear error message if a user tries to set a timeout on the data source
     """

     def __init__(self, data: pd.DataFrame, /):
@@ -159,11 +160,11 @@ def load(self) -> pd.DataFrame:
         return self.__data.copy()

     def __setattr__(self, name, value):
-        # Any attributes that are only relevant for _Dataset should go here to raise a clear error message.
+        # Any attributes that are only relevant for _DynamicData should go here to raise a clear error message.
         if name in ["timeout"]:
             raise AttributeError(
-                f"Dataset that is a pandas.DataFrame itself does not support {name}; you should instead use a dataset "
-                "that is a function that returns a pandas.DataFrame."
+                f"Static data that is a pandas.DataFrame itself does not support {name}; you should instead use a "
+                "dynamic data source that is a function that returns a pandas.DataFrame."
             )
         super().__setattr__(name, value)

@@ -175,16 +176,16 @@ class DataManager:
         >>> # Static data that cannot be refreshed during runtime
         >>> data_manager["data"] = pd.read_csv("data.csv")

         >>> # Data that can be refreshed during runtime
-        >>> def live_data():
-        >>>     return pd.read_csv("live_data.csv")
-        >>> data_manager["live_data"] = live_data
-        >>> data_manager["live_data"].timeout = 5  # if you want to change the cache timeout to 5 seconds
+        >>> def dynamic_data():
+        >>>     return pd.read_csv("dynamic_data.csv")
+        >>> data_manager["dynamic_data"] = dynamic_data
+        >>> data_manager["dynamic_data"].timeout = 5  # if you want to change the cache timeout to 5 seconds
     """

     def __init__(self):
-        self.__datasets: Dict[DatasetName, Union[_Dataset, _StaticDataset]] = {}
-        self.__component_to_dataset: Dict[ComponentID, DatasetName] = {}
+        self.__data: Dict[DataSourceName, Union[_DynamicData, _StaticData]] = {}
+        self.__component_to_data: Dict[ComponentID, DataSourceName] = {}
         self._frozen_state = False
         self.cache = Cache(config={"CACHE_TYPE": "SimpleCache"})
         # In future, possibly we will accept just a config dict. Would need to work out whether to handle merging with
@@ -196,50 +197,50 @@ def __init__(self):
     # _cache = property(fset=__set_cache)

     @_state_modifier
-    def __setitem__(self, dataset_name: DatasetName, data: Union[pd.DataFrame, pd_DataFrameCallable]):
-        """Adds `data` to the `DataManager` with key `dataset_name`."""
-        if dataset_name in self.__datasets:
-            raise ValueError(f"Dataset {dataset_name} already exists.")
+    def __setitem__(self, name: DataSourceName, data: Union[pd.DataFrame, pd_DataFrameCallable]):
+        """Adds `data` to the `DataManager` with key `name`."""
+        if name in self.__data:
+            raise ValueError(f"Data source {name} already exists.")

         if callable(data):
-            self.__datasets[dataset_name] = _Dataset(data)
+            self.__data[name] = _DynamicData(data)
         elif isinstance(data, pd.DataFrame):
-            self.__datasets[dataset_name] = _StaticDataset(data)
+            self.__data[name] = _StaticData(data)
         else:
             raise TypeError(
-                f"Dataset {dataset_name} must be a pandas DataFrame or function that returns a pandas DataFrame."
+                f"Data source {name} must be a pandas DataFrame or function that returns a pandas DataFrame."
             )

-    def __getitem__(self, dataset_name: DatasetName) -> Union[_Dataset, _StaticDataset]:
-        """Returns the `_Dataset` or `_StaticDataset` object associated with `dataset_name`."""
+    def __getitem__(self, name: DataSourceName) -> Union[_DynamicData, _StaticData]:
+        """Returns the `_DynamicData` or `_StaticData` object associated with `name`."""
         try:
-            return self.__datasets[dataset_name]
+            return self.__data[name]
         except KeyError as exc:
-            raise KeyError(f"Dataset {dataset_name} does not exist.") from exc
+            raise KeyError(f"Data source {name} does not exist.") from exc

     @_state_modifier
-    def _add_component(self, component_id: ComponentID, dataset_name: DatasetName):
-        """Adds a mapping from `component_id` to `dataset_name`."""
-        if dataset_name not in self.__datasets:
-            raise KeyError(f"Dataset {dataset_name} does not exist.")
-        if component_id in self.__component_to_dataset:
+    def _add_component(self, component_id: ComponentID, name: DataSourceName):
+        """Adds a mapping from `component_id` to `name`."""
+        if name not in self.__data:
+            raise KeyError(f"Data source {name} does not exist.")
+        if component_id in self.__component_to_data:
             raise ValueError(
-                f"Component with id={component_id} already exists and is mapped to dataset "
-                f"{self.__component_to_dataset[component_id]}. Components must uniquely map to a dataset across the "
+                f"Component with id={component_id} already exists and is mapped to data "
+                f"{self.__component_to_data[component_id]}. Components must uniquely map to a data source across the "
                 f"whole dashboard. If you are working from a Jupyter Notebook, please either restart the kernel, or "
                 f"use 'from vizro import Vizro; Vizro._reset()`."
             )
-        self.__component_to_dataset[component_id] = dataset_name
+        self.__component_to_data[component_id] = name

     def _get_component_data(self, component_id: ComponentID) -> pd.DataFrame:
-        # TODO: once have removed self.__component_to_dataset, we shouldn't need this function any more. Calling
-        # functions would just do data_manager[dataset_name].load().
+        # TODO: once have removed self.__component_to_data, we shouldn't need this function any more. Calling
+        # functions would just do data_manager[name].load().
         """Returns the original data for `component_id`."""
-        if component_id not in self.__component_to_dataset:
+        if component_id not in self.__component_to_data:
             raise KeyError(f"Component {component_id} does not exist. You need to call add_component first.")
-        dataset_name = self.__component_to_dataset[component_id]
-        logger.debug("Loading dataset %s with id %s", dataset_name, id(self[dataset_name]))
-        return self[dataset_name].load()
+        name = self.__component_to_data[component_id]
+        logger.debug("Loading data %s with id %s", name, id(self[name]))
+        return self[name].load()

     def _clear(self):
         # Make sure the cache itself is cleared before the reference to it is lost by calling __init__.
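
Taken together, the renamed interface works roughly as sketched below. This is a minimal illustration based on the docstrings in the diff, assuming `data_manager` is importable from `vizro.managers` and treating the CSV path as a placeholder for your own data:

import pandas as pd
from vizro.managers import data_manager

# Static data: register a DataFrame directly. It cannot be refreshed at runtime,
# and _StaticData.load() hands back a copy of it.
data_manager["static_data"] = pd.DataFrame({"species": ["setosa"], "sepal_length": [5.1]})

# Dynamic data: register a function that returns a DataFrame. The loading function
# can be re-run, so the data can refresh while the dashboard is running.
def dynamic_data():
    return pd.read_csv("dynamic_data.csv")  # placeholder path

data_manager["dynamic_data"] = dynamic_data
data_manager["dynamic_data"].timeout = 5  # cache this data source's result for 5 seconds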
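
The memoize docstring above hinges on one idea: flask_caching derives its cache namespace from the wrapped function's __qualname__, so appending repr(instance) gives every _DynamicData instance an independent cache. Below is a toy, dict-based sketch of that mechanism; it is not the real flask_caching implementation, and all names in it are illustrative only.

toy_cache = {}  # stands in for the flask_caching backend, keyed by namespace

class ToyLoader:
    """Minimal stand-in for _DynamicData: calling the instance loads the data."""

    def __init__(self, value):
        self.value = value

    def __call__(self):
        return self.value

def toy_memoize(instance):
    # Mimic the __qualname__ rewrite: include repr(instance), which depends on id(instance),
    # so two instances can never share a cache namespace.
    namespace = ".".join([type(instance).__name__, "__call__", repr(instance)])

    def wrapper():
        if namespace not in toy_cache:
            toy_cache[namespace] = instance()
        return toy_cache[namespace]

    return wrapper

a, b = ToyLoader("A"), ToyLoader("B")
assert toy_memoize(a)() == "A"
assert toy_memoize(b)() == "B"  # without per-instance namespaces, this could return the cached "A"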

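The renamed error paths can also be exercised directly. A short sketch of the two guard rails shown in the diff, again assuming the `vizro.managers` import; the data source name is arbitrary:

import pandas as pd
from vizro.managers import data_manager

data_manager["my_static_source"] = pd.DataFrame({"x": [1, 2, 3]})

# _StaticData.__setattr__ rejects timeout: only dynamic (function-based) data sources are cached.
try:
    data_manager["my_static_source"].timeout = 5
except AttributeError as err:
    print(err)

# DataManager.__setitem__ rejects re-registering an existing data source name.
try:
    data_manager["my_static_source"] = pd.DataFrame({"x": [4]})
except ValueError as err:
    print(err)  # Data source my_static_source already exists.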
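
Finally, the private mapping methods renamed at the bottom of the diff wire a component id to a data source. A hedged sketch of that internal flow, with made-up component and source names for illustration:

import pandas as pd
from vizro.managers import data_manager

data_manager["example_source"] = pd.DataFrame({"x": [1, 2, 3]})

# Each component id must map to exactly one data source across the whole dashboard.
data_manager._add_component("example_chart", "example_source")

# Components then fetch their data through the manager, which calls .load() on the
# underlying _StaticData or _DynamicData wrapper.
df = data_manager._get_component_data("example_chart")
print(df.shape)  # (3, 1)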
