Skip to content

Commit

Permalink
Make dataset name the cache key so that preload is not needed
Browse files Browse the repository at this point in the history
  • Loading branch information
antonymilne committed Apr 5, 2024
1 parent 02e7879 commit 87f0fa2
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 69 deletions.
42 changes: 24 additions & 18 deletions vizro-core/examples/_dev/app.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,39 @@
"""Example to show dashboard configuration."""
from flask_caching import Cache

import vizro.models as vm
import vizro.plotly.express as px
from vizro import Vizro
from vizro.managers import data_manager
from vizro.tables import dash_ag_grid

df = px.data.gapminder()
from vizro.managers import data_manager

df = px.data.iris()

# Cache of default_expire_data expires every 20 seconds, the CACHE_DEFAULT_TIMEOUT set below
data_manager.cache = Cache(config={"CACHE_TYPE": "FileSystemCache", "CACHE_DIR": "cache", "CACHE_DEFAULT_TIMEOUT": 20})
data_manager["default_expire_data"] = lambda: px.data.iris()

# Set cache of fast_expire_data to expire every 5 seconds
data_manager["fast_expire_data"] = lambda: px.data.iris()
data_manager["fast_expire_data"].timeout = 5
# Set cache of no_expire_data to never expire
data_manager["no_expire_data"] = lambda: px.data.iris()
data_manager["no_expire_data"].timeout = 0

page = vm.Page(
title="Enhanced AG Grid",
title="Blah",
components=[
vm.AgGrid(
title="Dash AG Grid",
figure=dash_ag_grid(
data_frame=df,
columnDefs=[
{"field": "country", "floatingFilter": True, "suppressHeaderMenuButton": True},
{"field": "continent", "floatingFilter": True, "suppressHeaderMenuButton": True},
{"field": "year"},
{"field": "lifeExp", "cellDataType": "numeric"},
{"field": "pop", "cellDataType": "numeric"},
{"field": "gdpPercap", "cellDataType": "euro"},
],
),
),
vm.Graph(figure=px.scatter(df, "sepal_width", "sepal_length")),
vm.Graph(figure=px.scatter("default_expire_data", "sepal_width", "sepal_length")),
vm.Graph(figure=px.scatter("fast_expire_data", "sepal_width", "sepal_length")),
vm.Graph(figure=px.scatter("no_expire_data", "sepal_width", "sepal_length")),
],
controls=[vm.Filter(column="continent")],
)
dashboard = vm.Dashboard(pages=[page])
app = Vizro().build(dashboard)
server = app.dash.server

if __name__ == "__main__":
Vizro().build(dashboard).run()
app.run()
48 changes: 20 additions & 28 deletions vizro-core/src/vizro/managers/_data_manager.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""The data manager handles access to all DataFrames used in a Vizro app."""

from __future__ import annotations

import os
import logging
from typing import Callable, Dict, Optional, Union

Expand All @@ -17,13 +17,11 @@
# * new error messages that are raised
# * set cache to null in all other tests


# * rename in code
# * make sure no mention of lazy or eager/active
# TODO: __main__ in this file: remove/move to docs


logger = logging.getLogger(__name__)

logger.setLevel(logging.DEBUG)
# Really ComponentID and DataSourceName should be NewType and not just aliases but then for a user's code to type check
# correctly they would need to cast all strings to these types.
# TODO: remove these type aliases once have moved component to data mapping to models
Expand Down Expand Up @@ -87,13 +85,16 @@ class _DynamicData:
Possibly in future, this will become a public class so you could directly do:
>>> data_manager["dynamic_data"] = DynamicData(dynamic_data, timeout=5)
But we'd need to make sure that name is not an argument in __init__ then.
At this point we might like to disable the behaviour so that data_manager setitem and getitem handle the same
object rather than doing an implicit conversion to _DynamicData.
"""

def __init__(self, load_data: pd_DataFrameCallable, /):
def __init__(self, name: str, load_data: pd_DataFrameCallable):
self.__load_data: pd_DataFrameCallable = load_data
# name is needed for the cache key and should not be modified by users.
self._name = name
self.timeout: Optional[int] = None
# We might also want a self.cache_arguments dictionary in future that allows user to customise more than just
# timeout, but no rush to do this since other arguments are unlikely to be useful.
Expand All @@ -109,30 +110,20 @@ def load(self) -> pd.DataFrame:
return self.__load_data()

def __repr__(self):
"""This is just the default repr so behaviour would be the same if we removed the function definition.
"""Flask-caching uses repr to form the cache key, so this is very important to set correctly.
The reason for defining this is to have somewhere to put the following warning: caching currently relies on
this returning a string that depends on id(self). This is relied on by flask_caching.utils.function_namespace
and our own memoize decorator. If this method were changed to no longer include some representation of id(self)
then cache keys would be mixed up.
In particular, it must depend on something that uniquely labels the data source and is the same across all
workers: self._name. Using id(self), as in Python's default repr, only works in the case that gunicorn is
running with --preload: without preloading, the id of the same data source in different processes will be
different so the cache will not match up.
flask_caching makes it possible to set a __cached_id__ attribute to handle this so that repr can be set
independently of cache key, but this doesn't seem to be well documented or work well, so it's better to rely
on __repr__.
In future we might like to change the cache so that it works on data source name rather than the place in memory.
This would necessitate a new _DynamicData._name attribute. This would get us closer to getting gunicorn to work
without relying on --preload, although it would not get all the way there:
* model_manager would need to fix a random seed or alternative solution (just like Dash does for its
component ids)
* not clear how to handle the case of unnamed data sources, where the name is currently generated
automatically by the id, since without --preload this would give mismatched names. If use a random number with
fixed seed for this then lose advantage of multiple plots that use the same data source having just one
underlying dataframe in memory.
* would need to make it possible to disable cache at build time so that data would be loaded once at build
time and then again once at runtime, which is generally not what we want
"""
return super().__repr__()
# Note that using repr(self.__load_data) is not good since it depends on the id of self.__load_data and so
# would not be consistent across processes.
return f"{self.__class__.__name__}({self._name}, {self.__load_data.__qualname__})"


class _StaticData:
Expand All @@ -148,7 +139,8 @@ class _StaticData:
2. to raise a clear error message if a user tries to set a timeout on the data source
"""

def __init__(self, data: pd.DataFrame, /):
def __init__(self, data: pd.DataFrame):
# No need for _name here because static data doesn't need a cache key.
self.__data = data

def load(self) -> pd.DataFrame:
Expand Down Expand Up @@ -203,7 +195,7 @@ def __setitem__(self, name: DataSourceName, data: Union[pd.DataFrame, pd_DataFra
raise ValueError(f"Data source {name} already exists.")

if callable(data):
self.__data[name] = _DynamicData(data)
self.__data[name] = _DynamicData(name, data)
elif isinstance(data, pd.DataFrame):
self.__data[name] = _StaticData(data)
else:
Expand Down Expand Up @@ -239,7 +231,8 @@ def _get_component_data(self, component_id: ComponentID) -> pd.DataFrame:
if component_id not in self.__component_to_data:
raise KeyError(f"Component {component_id} does not exist. You need to call add_component first.")
name = self.__component_to_data[component_id]
logger.debug("Loading data %s with id %s", name, id(self[name]))

logger.debug("Loading data %s on process %s", name, os.getpid())
return self[name].load()

def _clear(self):
Expand All @@ -252,7 +245,6 @@ def _clear(self):


if __name__ == "__main__":
# TODO: remove this/move to docs
from functools import partial

import vizro.plotly.express as px
Expand Down
2 changes: 2 additions & 0 deletions vizro-core/src/vizro/managers/_model_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from vizro.models import VizroBaseModel
from vizro.models._action._actions_chain import ActionsChain

# As done for Dash components in dash.development.base_component, fixing the random seed is required to make sure that
# the randomly generated model ID for the same model matches up across workers when running gunicorn without --preload.
rd = random.Random(0)

ModelID = NewType("ModelID", str)
Expand Down
37 changes: 18 additions & 19 deletions vizro-core/src/vizro/models/_components/_components_utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import uuid

import logging
from functools import partial

Expand Down Expand Up @@ -26,26 +28,23 @@ def _callable_mode_validator_factory(mode: str):
def _process_callable_data_frame(captured_callable, values):
data_frame = captured_callable["data_frame"]

# Enable running e.g. px.scatter("iris") from the Python API and specification of "data_frame": "iris" through JSON.
# In these cases, data already exists in the data manager and just needs to be linked to the component.
if isinstance(data_frame, str):
data_manager._add_component(values["id"], data_frame)
return captured_callable

# Standard case for px.scatter(df: pd.DataFrame).
# Extract dataframe from the captured function and put it into the data manager.
dataset_name = str(id(data_frame))

logger.debug("Adding data to data manager for Figure with id %s", values["id"])
# If the dataset already exists in the data manager then it's not a problem, it just means that we don't need
# to duplicate it. Just log the exception for debugging purposes.
try:
data_manager[dataset_name] = data_frame
except ValueError as exc:
logger.debug(exc)

data_manager._add_component(values["id"], dataset_name)

# Enable running with DynamicData, e.g. px.scatter("iris") from the Python API and specification of
# "data_frame": "iris" through JSON. In these cases, data already exists in the data manager and just needs to be
# linked to the component.
data_source_name = data_frame
else:
# Standard StaticData case for px.scatter(df: pd.DataFrame).
# Extract dataframe from the captured function and put it into the data manager.
# Unlike with model_manager, it doesn't matter if the random seed is different across workers here. So long as we
always fetch StaticData data from the data manager by going through the appropriate Figure component, the right
# data name will be fetched. It also doesn't matter that multiple Figures with the same underlying data
# each have their own entry in the data manager, since the underlying pd.DataFrame will still be the same and not
# copied into each one, so no memory is wasted.
logger.debug("Adding data to data manager for Figure with id %s", values["id"])
data_source_name = str(uuid.uuid4())

data_manager._add_component(values["id"], data_source_name)
# No need to keep the data in the captured function any more so remove it to save memory.
del captured_callable["data_frame"]
return captured_callable
6 changes: 2 additions & 4 deletions vizro-core/src/vizro/models/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,10 +282,8 @@ def wrapped(*args, **kwargs) -> _DashboardReadyFigure:

if isinstance(captured_callable["data_frame"], str):
# Enable running e.g. px.scatter("iris") from the Python API. Don't actually run the function
because it won't work as there's no data. It's vital we don't fetch data from the data manager
# yet either, because otherwise all lazy data will be loaded before the dashboard is started.
# This case is not relevant for the JSON/YAML API, which is handled separately through validation of
# CapturedCallable.
because it won't work as there's no data. This case is not relevant for the JSON/YAML API,
# which is handled separately through validation of CapturedCallable.
fig = _DashboardReadyFigure()
else:
# Standard case for px.scatter(df: pd.DataFrame).
Expand Down

0 comments on commit 87f0fa2

Please sign in to comment.