Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

datapane embedded #58

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -225,3 +225,8 @@ Detecting drift concept and get analyses and explainability of this drift. An is
Adapting Eurybia for models consumed in API mode. An issue is open: [Adapt Eurybia to API mode](https://github.com/MAIF/eurybia/issues/9)

If you want to contribute, you can contact us in the [discussion tab](https://github.com/MAIF/eurybia/discussions)


## Note

Eurybia uses [datapane](https://github.com/datapane/datapane) to generate its reports. Because support for datapane was dropped in 2023, the package is now embedded as a module of Eurybia.
154 changes: 120 additions & 34 deletions eurybia/core/smartplotter.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,9 @@ class SmartPlotter:

def __init__(self, smartdrift):
self._palette_name = list(colors_loading().keys())[0]
self._style_dict = define_style(select_palette(colors_loading(), self._palette_name))
self._style_dict = define_style(
select_palette(colors_loading(), self._palette_name)
)
self.smartdrift = smartdrift

def generate_fig_univariate(
Expand Down Expand Up @@ -86,15 +88,24 @@ def generate_fig_univariate(
hue = self.smartdrift._datadrift_target
if df_all is None:
df_all = self.smartdrift._df_concat
df_all.loc[df_all[hue] == 0, hue] = list(self.smartdrift.dataset_names.keys())[1]
df_all.loc[df_all[hue] == 1, hue] = list(self.smartdrift.dataset_names.keys())[0]
df_all.loc[df_all[hue] == 0, hue] = list(
self.smartdrift.dataset_names.keys()
)[1]
df_all.loc[df_all[hue] == 1, hue] = list(
self.smartdrift.dataset_names.keys()
)[0]
if dict_color_palette is None:
dict_color_palette = self._style_dict
col_types = compute_col_types(df_all=df_all)

if col_types[col] == VarType.TYPE_NUM:
fig = self.generate_fig_univariate_continuous(df_all, col, hue=hue, dict_color_palette=dict_color_palette)
fig = self.generate_fig_univariate_continuous(
df_all, col, hue=hue, dict_color_palette=dict_color_palette
)
elif col_types[col] == VarType.TYPE_CAT:
fig = self.generate_fig_univariate_categorical(df_all, col, hue=hue, dict_color_palette=dict_color_palette)
fig = self.generate_fig_univariate_categorical(
df_all, col, hue=hue, dict_color_palette=dict_color_palette
)
else:
raise NotImplementedError("Series dtype not supported")
return fig
Expand All @@ -114,7 +125,6 @@ def generate_fig_univariate_continuous(
width: Optional[str] = None,
hovermode: Optional[str] = None,
) -> plotly.graph_objs._figure.Figure:

"""
Returns a plotly figure containing the distribution of a continuous feature.

Expand Down Expand Up @@ -147,7 +157,10 @@ def generate_fig_univariate_continuous(
plotly.graph_objs._figure.Figure
"""
df_all.loc[:, col].fillna(0, inplace=True)
datasets = [df_all[df_all[hue] == val][col].values.tolist() for val in df_all[hue].unique()]
datasets = [
df_all[df_all[hue] == val][col].values.tolist()
for val in df_all[hue].unique()
]

fig = ff.create_distplot(
datasets,
Expand Down Expand Up @@ -249,20 +262,33 @@ def generate_fig_univariate_categorical(
-------
plotly.graph_objs._figure.Figure
"""
df_cat = df_all.groupby([col, hue]).agg({col: "count"}).rename(columns={col: "count"}).reset_index()
df_cat["Percent"] = df_cat["count"] * 100 / df_cat.groupby(hue)["count"].transform("sum")
df_cat = (
df_all.groupby([col, hue])
.agg({col: "count"})
.rename(columns={col: "count"})
.reset_index()
)
df_cat["Percent"] = (
df_cat["count"] * 100 / df_cat.groupby(hue)["count"].transform("sum")
)

if pd.api.types.is_numeric_dtype(df_cat[col].dtype):
df_cat = df_cat.sort_values(col, ascending=True)
df_cat[col] = df_cat[col].astype(str)

nb_cat = df_cat.groupby([col]).agg({"count": "sum"}).reset_index()[col].nunique()
nb_cat = (
df_cat.groupby([col]).agg({"count": "sum"}).reset_index()[col].nunique()
)

if nb_cat > nb_cat_max:
df_cat = self._merge_small_categories(df_cat=df_cat, col=col, hue=hue, nb_cat_max=nb_cat_max)
df_cat = self._merge_small_categories(
df_cat=df_cat, col=col, hue=hue, nb_cat_max=nb_cat_max
)

df_to_sort = df_cat.copy().reset_index(drop=True)
df_to_sort["Sorted_indicator"] = df_to_sort.sort_values([col]).groupby([col])["Percent"].diff()
df_to_sort["Sorted_indicator"] = (
df_to_sort.sort_values([col]).groupby([col])["Percent"].diff()
)
df_to_sort["Sorted_indicator"] = np.abs(df_to_sort["Sorted_indicator"])
df_sorted = df_to_sort.dropna()[[col, "Sorted_indicator"]]

Expand All @@ -272,7 +298,9 @@ def generate_fig_univariate_categorical(
.drop("Sorted_indicator", axis=1)
)

df_cat["Percent_displayed"] = df_cat["Percent"].apply(lambda row: str(round(row, 2)) + " %")
df_cat["Percent_displayed"] = df_cat["Percent"].apply(
lambda row: str(round(row, 2)) + " %"
)

modalities = df_cat[hue].unique().tolist()

Expand All @@ -285,7 +313,10 @@ def generate_fig_univariate_categorical(
color=hue,
text="Percent_displayed",
)
fig1.update_traces(marker_color=list(self._style_dict["univariate_cat_bar"].values())[0], showlegend=True)
fig1.update_traces(
marker_color=list(self._style_dict["univariate_cat_bar"].values())[0],
showlegend=True,
)

fig2 = px.bar(
df_cat[df_cat[hue] == modalities[1]],
Expand All @@ -296,7 +327,10 @@ def generate_fig_univariate_categorical(
color=hue,
text="Percent_displayed",
)
fig2.update_traces(marker_color=list(self._style_dict["univariate_cat_bar"].values())[1], showlegend=True)
fig2.update_traces(
marker_color=list(self._style_dict["univariate_cat_bar"].values())[1],
showlegend=True,
)

fig = fig1.add_trace(fig2.data[0])

Expand Down Expand Up @@ -336,21 +370,31 @@ def generate_fig_univariate_categorical(

return fig

def _merge_small_categories(self, df_cat: pd.DataFrame, col: str, hue: str, nb_cat_max: int) -> pd.DataFrame:
def _merge_small_categories(
self, df_cat: pd.DataFrame, col: str, hue: str, nb_cat_max: int
) -> pd.DataFrame:
"""
Merges categories of column 'col' of df_cat into 'Other' category so that
the number of categories is less than nb_cat_max.
"""
df_cat_sum_hue = df_cat.groupby([col]).agg({"count": "sum"}).reset_index()
list_cat_to_merge = df_cat_sum_hue.sort_values("count", ascending=False)[col].to_list()[nb_cat_max - 1 :]
list_cat_to_merge = df_cat_sum_hue.sort_values("count", ascending=False)[
col
].to_list()[nb_cat_max - 1 :]
df_cat_other = (
df_cat.loc[df_cat[col].isin(list_cat_to_merge)].groupby(hue, as_index=False)[["count", "Percent"]].sum()
df_cat.loc[df_cat[col].isin(list_cat_to_merge)]
.groupby(hue, as_index=False)[["count", "Percent"]]
.sum()
)
df_cat_other[col] = "Other"
return pd.concat([df_cat.loc[~df_cat[col].isin(list_cat_to_merge)], df_cat_other])
return pd.concat(
[df_cat.loc[~df_cat[col].isin(list_cat_to_merge)], df_cat_other]
)

def scatter_feature_importance(
self, feature_importance: pd.DataFrame = None, datadrift_stat_test: pd.DataFrame = None
self,
feature_importance: pd.DataFrame = None,
datadrift_stat_test: pd.DataFrame = None,
) -> plotly.graph_objs._figure.Figure:
"""
Displays scatter of feature importance between drift
Expand Down Expand Up @@ -392,7 +436,16 @@ def scatter_feature_importance(
+ f"Datadrift test: {t} - pvalue: {pv:.5f}<br />"
+ f"Datadrift model Importance: {ddrimp*100:.1f}"
for feat, depimp, t, pv, ddrimp in zip(
*map(data.get, ["features", "deployed_model", "testname", "pvalue", "datadrift_classifier"])
*map(
data.get,
[
"features",
"deployed_model",
"testname",
"pvalue",
"datadrift_classifier",
],
)
)
]

Expand All @@ -401,20 +454,30 @@ def scatter_feature_importance(
go.Scatter(
x=data["datadrift_classifier"],
y=data["deployed_model"],
marker_symbol=datadrift_stat_test["testname"].apply(lambda x: symbol_dict[x]),
marker_symbol=datadrift_stat_test["testname"].apply(
lambda x: symbol_dict[x]
),
mode="markers",
showlegend=False,
hovertext=hv_text,
hovertemplate="%{hovertext}<extra></extra>",
)
)

fig.update_traces(marker={"size": 15, "opacity": 0.8, "line": {"width": 0.8, "color": "white"}})
fig.update_traces(
marker={
"size": 15,
"opacity": 0.8,
"line": {"width": 0.8, "color": "white"},
}
)

fig.data[0].marker.color = data["pvalue"]
fig.data[0].marker.coloraxis = "coloraxis"
fig.layout.coloraxis.colorscale = self._style_dict["featimportance_colorscale"]
fig.layout.coloraxis.colorbar = {"title": {"text": "Univariate<br />DataDrift Test<br />Pvalue"}}
fig.layout.coloraxis.colorbar = {
"title": {"text": "Univariate<br />DataDrift Test<br />Pvalue"}
}

height = self._style_dict["height"]
width = self._style_dict["width"]
Expand Down Expand Up @@ -476,24 +539,31 @@ def generate_historical_datadrift_metric(
datadrift_historical = self.smartdrift.historical_auc
if datadrift_historical is not None:
if self.smartdrift.deployed_model is not None:
datadrift_historical = datadrift_historical[["date", "auc", "JS_predict"]]
datadrift_historical = datadrift_historical[
["date", "auc", "JS_predict"]
]
datadrift_historical = (
datadrift_historical.groupby(["date"])[["auc", "JS_predict"]].mean().reset_index()
datadrift_historical.groupby(["date"])[["auc", "JS_predict"]]
.mean()
.reset_index()
)
datadrift_historical.sort_values(by="date", inplace=True)
else:
datadrift_historical = datadrift_historical[["date", "auc"]]
datadrift_historical = datadrift_historical.groupby("date")["auc"].mean().reset_index()
datadrift_historical = (
datadrift_historical.groupby("date")["auc"].mean().reset_index()
)
datadrift_historical.sort_values(by="date", inplace=True)

datadrift_historical["auc_displayed"] = datadrift_historical["auc"].round(2)

if self.smartdrift.deployed_model is not None:

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(
go.Scatter(
x=datadrift_historical["date"], y=datadrift_historical["auc"], name="Datadrift classifier AUC"
x=datadrift_historical["date"],
y=datadrift_historical["auc"],
name="Datadrift classifier AUC",
),
secondary_y=False,
)
Expand All @@ -508,8 +578,13 @@ def generate_historical_datadrift_metric(
)

fig.update_layout(title_text="Evolution of data drift")
fig.update_yaxes(title_text="<b>Datadrift classifier AUC</b> ", secondary_y=False)
fig.update_yaxes(title_text="<b>Jensen_Shannon Prediction Divergence</b> ", secondary_y=True)
fig.update_yaxes(
title_text="<b>Datadrift classifier AUC</b> ", secondary_y=False
)
fig.update_yaxes(
title_text="<b>Jensen_Shannon Prediction Divergence</b> ",
secondary_y=True,
)
fig.update_yaxes(range=[0.5, 1], secondary_y=False)
fig.update_yaxes(range=[0, 0.3], secondary_y=True)
else:
Expand Down Expand Up @@ -600,7 +675,9 @@ def generate_modeldrift_data(
For more information see the documentation"""
)
data_modeldrift[metric] = data_modeldrift[metric].apply(
lambda row: round(row, len([char for char in str(row).split(".")[1] if char == "0"]) + 3)
lambda row: round(
row, len([char for char in str(row).split(".")[1] if char == "0"]) + 3
)
)

fig = px.line(
Expand Down Expand Up @@ -688,7 +765,12 @@ def generate_indicator(
color = sns.blend_palette(["green", "yellow", "orange", "red"], 100)
color = color.as_hex()
list_color_glob = list()
threshold = [i for i in np.arange(min_gauge, max_gauge, (max_gauge - min_gauge) / len(color))]
threshold = [
i
for i in np.arange(
min_gauge, max_gauge, (max_gauge - min_gauge) / len(color)
)
]
for i in range(1, len(threshold) + 1):
dict_color = dict()
if i == len(threshold):
Expand All @@ -705,7 +787,11 @@ def generate_indicator(
domain={"x": [0, 1], "y": [0, 1]},
title={"text": title, "align": "center", "font": {"size": 20}},
gauge={
"axis": {"range": [min_gauge, max_gauge], "ticktext": ["No Drift", "High Drift"], "tickwidth": 1},
"axis": {
"range": [min_gauge, max_gauge],
"ticktext": ["No Drift", "High Drift"],
"tickwidth": 1,
},
"bar": {"color": "black"},
"borderwidth": 0,
"steps": list_color_glob,
Expand Down
24 changes: 17 additions & 7 deletions eurybia/report/common.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
"""
Common functions used in report
"""

import os
from enum import Enum
from numbers import Number
from typing import Callable, Dict, Optional, Union

import pandas as pd
from pandas.api.types import is_bool_dtype, is_categorical_dtype, is_numeric_dtype, is_string_dtype
from pandas.api.types import (
is_numeric_dtype,
infer_dtype,
)


class VarType(Enum):
Expand All @@ -23,7 +27,9 @@ def __str__(self):
return str(self.value)


def display_value(value: float, thousands_separator: str = ",", decimal_separator: str = ".") -> str:
def display_value(
value: float, thousands_separator: str = ",", decimal_separator: str = "."
) -> str:
"""
Display a value as a string with specific format.
Parameters
Expand All @@ -43,7 +49,9 @@ def display_value(value: float, thousands_separator: str = ",", decimal_separato
'1,255,000'
"""
value_str = f"{value:,}".replace(",", "/thousands/").replace(".", "/decimal/")
return value_str.replace("/thousands/", thousands_separator).replace("/decimal/", decimal_separator)
return value_str.replace("/thousands/", thousands_separator).replace(
"/decimal/", decimal_separator
)


def replace_dict_values(obj: Dict, replace_fn: Callable, *args) -> dict:
Expand Down Expand Up @@ -76,11 +84,11 @@ def series_dtype(s: pd.Series) -> VarType:
-------
VarType
"""
if is_bool_dtype(s):
if infer_dtype(s) == "boolean":
return VarType.TYPE_CAT
elif is_string_dtype(s):
elif infer_dtype(s, skipna=True) == "string":
return VarType.TYPE_CAT
elif is_categorical_dtype(s):
elif isinstance(s.dtype, pd.CategoricalDtype):
return VarType.TYPE_CAT
elif is_numeric_dtype(s):
if numeric_is_continuous(s):
Expand Down Expand Up @@ -139,7 +147,9 @@ def get_callable(path: str):
try:
import_module(mod)
except Exception as e:
raise ImportError(f"Encountered error: `{e}` when loading module '{path}'") from e
raise ImportError(
f"Encountered error: `{e}` when loading module '{path}'"
) from e
obj = getattr(obj, part)
if isinstance(obj, type):
obj_type: type = obj
Expand Down
Loading
Loading