diff --git a/vizro-core/docs/pages/API-reference/manager.md b/vizro-core/docs/pages/API-reference/manager.md
deleted file mode 100644
index 73f8c4440..000000000
--- a/vizro-core/docs/pages/API-reference/manager.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Data Manager
-
-::: vizro.managers._data_manager
diff --git a/vizro-core/mkdocs.yml b/vizro-core/mkdocs.yml
index abf399c15..75e86c9a6 100644
--- a/vizro-core/mkdocs.yml
+++ b/vizro-core/mkdocs.yml
@@ -43,7 +43,6 @@ nav:
   - API Reference:
       - Vizro: pages/API-reference/vizro.md
       - Models: pages/API-reference/models.md
-      - Data manager: pages/API-reference/manager.md
       - Actions: pages/API-reference/actions.md
       - Table functions: pages/API-reference/captured-callables.md
   - Explanation:
diff --git a/vizro-core/src/vizro/managers/_data_manager.py b/vizro-core/src/vizro/managers/_data_manager.py
index 8f9d6ab24..117025360 100644
--- a/vizro-core/src/vizro/managers/_data_manager.py
+++ b/vizro-core/src/vizro/managers/_data_manager.py
@@ -24,23 +24,24 @@
 logger = logging.getLogger(__name__)


-# Really ComponentID and DatasetName should be NewType and not just aliases but then for a user's code to type check
+# Really ComponentID and DataSourceName should be NewType and not just aliases but then for a user's code to type check
 # correctly they would need to cast all strings to these types.
+# TODO: remove these type aliases once have moved component to data mapping to models
 ComponentID = str
-DatasetName = str
+DataSourceName = str
 pd_DataFrameCallable = Callable[[], pd.DataFrame]


 # wrapt.decorator is the cleanest way to decorate a bound method when instance properties (here instance.timeout)
 # are required as arguments.
 @wrapt.decorator
-def memoize(wrapped: Callable, instance: _Dataset, args, kwargs):
+def memoize(wrapped: Callable, instance: _DynamicData, args, kwargs):
     """Caches the result of wrapped function, taking its arguments into account in the cache key.

     This delegates to flask_caching.memoize functionality, but with an important modification. flask_caching.memoize
     is designed to apply the same timeout to every call of the function or bound method, but we want to have a
-    dataset-dependent timeout. This means rewriting the wrapped function's __qualname__ so that different instances
-    of Dataset have completely independent caches. Without this, flask_caching.utils.function_namespace gives the
+    data source-dependent timeout. This means rewriting the wrapped function's __qualname__ so that different instances
+    of _DynamicData have completely independent caches. Without this, flask_caching.utils.function_namespace gives the
     same namespace for each, which means that the timeouts cannot be set independently.

     The following things *do not* work to achieve the same thing - there will still be interference between
@@ -56,7 +57,7 @@ def memoize(wrapped: Callable, instance: _Dataset, args, kwargs):

     Args:
         wrapped: function that will be memoized.
-        instance: Dataset object to which wrapped is bound.
+        instance: _DynamicData object to which wrapped is bound.
         args: positional arguments supplied to wrapped, taken into account for generating cache key.
         kwargs: keyword arguments supplied to wrapped, taken into account for generating cache key.

     Returns:
         Memoized call.
     """
-    # Before altering, wrapped.__func__.__qualname__ is "Dataset.__call__"
-    # After altering, it becomes Dataset.__call__.
+    # Before altering, wrapped.__func__.__qualname__ is "_DynamicData.__call__"
+    # After altering, it becomes _DynamicData.__call__.
     # where the repr(instance) depends on id(instance).
     # Note this doesn't work by using += since the qualname will just be appended to fresh every time the function is
     # called so will be different every time and not ever hit the cache.
@@ -73,22 +74,22 @@
     return data_manager.cache.memoize(timeout=instance.timeout)(wrapped)(*args, **kwargs)


-class _Dataset:
+class _DynamicData:
     """Wrapper for a pd_DataFrameCallable, i.e. a function that produces a pandas DataFrame.

     Crucially this means that the data can be refreshed during runtime, since the loading function can be re-run.

     This is currently private since it's not expected that a user would instantiate it directly. Instead, you would
     use through the data_manager interface as follows:
-        >>> def live_data():
-        >>>     return pd.read_csv("live_data.csv")
-        >>> data_manager["live_data"] = live_data
-        >>> data_manager["live_data"].timeout = 5  # if you want to change the cache timeout to 5 seconds
+        >>> def dynamic_data():
+        >>>     return pd.read_csv("dynamic_data.csv")
+        >>> data_manager["dynamic_data"] = dynamic_data
+        >>> data_manager["dynamic_data"].timeout = 5  # if you want to change the cache timeout to 5 seconds

     Possibly in future, this will become a public class so you could directly do:
-        >>> data_manager["live_data"] = Dataset(live_data, timeout=5)
+        >>> data_manager["dynamic_data"] = DynamicData(dynamic_data, timeout=5)

     At this point we might like to disable the behaviour so that data_manager setitem and getitem handle the same
-    object rather than doing an implicit conversion to _Dataset.
+    object rather than doing an implicit conversion to _DynamicData.
     """

     def __init__(self, load_data: pd_DataFrameCallable, /):
@@ -119,14 +120,14 @@ def __repr__(self):
         independently of cache key, but this doesn't seem to be well documented or work well, so it's better to rely
         on __repr__.

-        In future we might like to change the cache so that it works on dataset name rather than the place in memory.
-        This would necessitate a new Dataset._name attribute. This would get us closer to getting gunicorn to work
+        In future we might like to change the cache so that it works on data source name rather than the place in memory.
+        This would necessitate a new _DynamicData._name attribute. This would get us closer to getting gunicorn to work
         without relying on --preload, although it would not get all the way there:
         * model_manager would need to fix a random seed or alternative solution (just like Dash does for its component ids)
-        * not clear how to handle the case of unnamed datasets, where the name is currently generated
+        * not clear how to handle the case of unnamed data sources, where the name is currently generated
           automatically by the id, since without --preload this would give mismatched names. If use a random number with
-          fixed seed for this then lose advantage of multiple plots that use the same dataset having just one
+          fixed seed for this then lose advantage of multiple plots that use the same data source having just one
          underlying dataframe in memory.
         * would need to make it possible to disable cache at build time so that data would be loaded once at build
           time and then again once at runtime, which is generally not what we want
@@ -134,17 +135,17 @@ def __repr__(self):
         return super().__repr__()


-class _StaticDataset:
+class _StaticData:
     """Wrapper for a pd.DataFrame. This data cannot be updated during runtime.

     This is currently private since it's not expected that a user would instantiate it directly. Instead, you would
     use through the data_manager interface as follows:
-        >>> data_manager["data"] = pd.read_csv("data.csv")
+        >>> data_manager["static_data"] = pd.read_csv("static_data.csv")

     This class does not have much functionality but exists for a couple of reasons:
-        1. to align interface with _Dataset by providing a load method so that fetching data from data_manager can
-           transparently handle loading both _StaticDataset and _Dataset without needing switching logic
-        2. to raise a clear error message if a user tries to set a timeout on the dataset
+        1. to align interface with _DynamicData by providing a load method so that fetching data from data_manager can
+           transparently handle loading both _StaticData and _DynamicData without needing switching logic
+        2. to raise a clear error message if a user tries to set a timeout on the data source
     """

     def __init__(self, data: pd.DataFrame, /):
@@ -159,11 +160,11 @@ def load(self) -> pd.DataFrame:
         return self.__data.copy()

     def __setattr__(self, name, value):
-        # Any attributes that are only relevant for _Dataset should go here to raise a clear error message.
+        # Any attributes that are only relevant for _DynamicData should go here to raise a clear error message.
         if name in ["timeout"]:
             raise AttributeError(
-                f"Dataset that is a pandas.DataFrame itself does not support {name}; you should instead use a dataset "
-                "that is a function that returns a pandas.DataFrame."
+                f"Static data that is a pandas.DataFrame itself does not support {name}; you should instead use a "
+                "dynamic data source that is a function that returns a pandas.DataFrame."
             )
         super().__setattr__(name, value)
@@ -175,16 +176,16 @@ class DataManager:
     >>> # Static data that cannot be refreshed during runtime
     >>> data_manager["data"] = pd.read_csv("data.csv")
     >>> # Data that can be refreshed during runtime
-    >>> def live_data():
-    >>>     return pd.read_csv("live_data.csv")
-    >>> data_manager["live_data"] = live_data
-    >>> data_manager["live_data"].timeout = 5  # if you want to change the cache timeout to 5 seconds
+    >>> def dynamic_data():
+    >>>     return pd.read_csv("dynamic_data.csv")
+    >>> data_manager["dynamic_data"] = dynamic_data
+    >>> data_manager["dynamic_data"].timeout = 5  # if you want to change the cache timeout to 5 seconds
     """

     def __init__(self):
-        self.__datasets: Dict[DatasetName, Union[_Dataset, _StaticDataset]] = {}
-        self.__component_to_dataset: Dict[ComponentID, DatasetName] = {}
+        self.__data: Dict[DataSourceName, Union[_DynamicData, _StaticData]] = {}
+        self.__component_to_data: Dict[ComponentID, DataSourceName] = {}
         self._frozen_state = False
         self.cache = Cache(config={"CACHE_TYPE": "SimpleCache"})
         # In future, possibly we will accept just a config dict. Would need to work out whether to handle merging with
@@ -196,50 +197,50 @@ def __init__(self):
     # _cache = property(fset=__set_cache)

     @_state_modifier
-    def __setitem__(self, dataset_name: DatasetName, data: Union[pd.DataFrame, pd_DataFrameCallable]):
-        """Adds `data` to the `DataManager` with key `dataset_name`."""
-        if dataset_name in self.__datasets:
-            raise ValueError(f"Dataset {dataset_name} already exists.")
+    def __setitem__(self, name: DataSourceName, data: Union[pd.DataFrame, pd_DataFrameCallable]):
+        """Adds `data` to the `DataManager` with key `name`."""
+        if name in self.__data:
+            raise ValueError(f"Data source {name} already exists.")
         if callable(data):
-            self.__datasets[dataset_name] = _Dataset(data)
+            self.__data[name] = _DynamicData(data)
         elif isinstance(data, pd.DataFrame):
-            self.__datasets[dataset_name] = _StaticDataset(data)
+            self.__data[name] = _StaticData(data)
         else:
             raise TypeError(
-                f"Dataset {dataset_name} must be a pandas DataFrame or function that returns a pandas DataFrame."
+                f"Data source {name} must be a pandas DataFrame or function that returns a pandas DataFrame."
             )

-    def __getitem__(self, dataset_name: DatasetName) -> Union[_Dataset, _StaticDataset]:
-        """Returns the `_Dataset` or `_StaticDataset` object associated with `dataset_name`."""
+    def __getitem__(self, name: DataSourceName) -> Union[_DynamicData, _StaticData]:
+        """Returns the `_DynamicData` or `_StaticData` object associated with `name`."""
         try:
-            return self.__datasets[dataset_name]
+            return self.__data[name]
         except KeyError as exc:
-            raise KeyError(f"Dataset {dataset_name} does not exist.") from exc
+            raise KeyError(f"Data source {name} does not exist.") from exc

     @_state_modifier
-    def _add_component(self, component_id: ComponentID, dataset_name: DatasetName):
-        """Adds a mapping from `component_id` to `dataset_name`."""
-        if dataset_name not in self.__datasets:
-            raise KeyError(f"Dataset {dataset_name} does not exist.")
-        if component_id in self.__component_to_dataset:
+    def _add_component(self, component_id: ComponentID, name: DataSourceName):
+        """Adds a mapping from `component_id` to `name`."""
+        if name not in self.__data:
+            raise KeyError(f"Data source {name} does not exist.")
+        if component_id in self.__component_to_data:
             raise ValueError(
-                f"Component with id={component_id} already exists and is mapped to dataset "
-                f"{self.__component_to_dataset[component_id]}. Components must uniquely map to a dataset across the "
+                f"Component with id={component_id} already exists and is mapped to data "
+                f"{self.__component_to_data[component_id]}. Components must uniquely map to a data source across the "
                 f"whole dashboard. If you are working from a Jupyter Notebook, please either restart the kernel, or "
                 f"use 'from vizro import Vizro; Vizro._reset()`."
             )
-        self.__component_to_dataset[component_id] = dataset_name
+        self.__component_to_data[component_id] = name

     def _get_component_data(self, component_id: ComponentID) -> pd.DataFrame:
-        # TODO: once have removed self.__component_to_dataset, we shouldn't need this function any more. Calling
-        # functions would just do data_manager[dataset_name].load().
+        # TODO: once have removed self.__component_to_data, we shouldn't need this function any more. Calling
+        # functions would just do data_manager[name].load().
         """Returns the original data for `component_id`."""
-        if component_id not in self.__component_to_dataset:
+        if component_id not in self.__component_to_data:
             raise KeyError(f"Component {component_id} does not exist. You need to call add_component first.")
-        dataset_name = self.__component_to_dataset[component_id]
-        logger.debug("Loading dataset %s with id %s", dataset_name, id(self[dataset_name]))
-        return self[dataset_name].load()
+        name = self.__component_to_data[component_id]
+        logger.debug("Loading data %s with id %s", name, id(self[name]))
+        return self[name].load()

     def _clear(self):
         # Make sure the cache itself is cleared before the reference to it is lost by calling __init__.
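For reference, here is a minimal usage sketch of the renamed data_manager API, pieced together from the docstrings in this diff. It assumes the public `data_manager` import from `vizro.managers`; the data source names and CSV path are placeholders, not part of the change.

```python
import pandas as pd

from vizro.managers import data_manager

# Static data: register a DataFrame directly. It cannot be refreshed at runtime,
# and setting .timeout on it raises the AttributeError added in _StaticData.__setattr__.
data_manager["static_data"] = pd.DataFrame({"x": [1, 2, 3]})


# Dynamic data: register a function that returns a DataFrame. The loading function
# can be re-run at runtime, with results cached per data source.
def load_dynamic_data():
    return pd.read_csv("dynamic_data.csv")  # placeholder path


data_manager["dynamic_data"] = load_dynamic_data
data_manager["dynamic_data"].timeout = 5  # cache this data source's results for 5 seconds
```

Fetching then goes through `data_manager["dynamic_data"].load()`, which is what `_get_component_data` ultimately calls for the component-to-data-source mapping.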