Skip to content

182 add standard deviation to pig plots in regression case #183

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
226 changes: 140 additions & 86 deletions cobra/evaluation/pigs_tables.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib.ticker import FuncFormatter

import cobra.utils as utils
import logging

# logging.basicConfig(level=logging.DEBUG)

def generate_pig_tables(basetable: pd.DataFrame,
target_column_name: str,
preprocessed_predictors: list,
id_column_name: str = None) -> pd.DataFrame:

def generate_pig_tables(
basetable: pd.DataFrame,
target_column_name: str,
preprocessed_predictors: list,
id_column_name: str = None,
) -> pd.DataFrame:
"""Compute PIG tables for all predictors in preprocessed_predictors.

The output is a DataFrame with columns ``variable``, ``label``,
Expand All @@ -26,35 +30,37 @@ def generate_pig_tables(basetable: pd.DataFrame,
List of basetable column names containing preprocessed predictors.
id_column_name : str, default=None
Name of the basetable column containing the IDs of the basetable rows
(e.g. customernumber).
(e.g. customernumber).
Returns
-------
pd.DataFrame
DataFrame containing a PIG table for all predictors.
"""

#check if there is a id-column and define no_predictor accordingly
if id_column_name == None:
# check if there is a id-column and define no_predictor accordingly
if id_column_name is None:
no_predictor = [target_column_name]
else:
no_predictor = [id_column_name, target_column_name]


pigs = [
compute_pig_table(basetable,
column_name,
target_column_name,
)
compute_pig_table(
basetable,
column_name,
target_column_name,
)
for column_name in sorted(preprocessed_predictors)
if column_name not in no_predictor
]
output = pd.concat(pigs, ignore_index=True)
return output


def compute_pig_table(basetable: pd.DataFrame,
predictor_column_name: str,
target_column_name: str) -> pd.DataFrame:
def compute_pig_table(
basetable: pd.DataFrame,
predictor_column_name: str,
target_column_name: str
) -> pd.DataFrame:
"""Compute the PIG table of a given predictor for a given target.

Parameters
Expand All @@ -76,38 +82,44 @@ def compute_pig_table(basetable: pd.DataFrame,
# group by the binned variable, compute the incidence
# (= mean of the target for the given bin) and compute the bin size
# (e.g. COUNT(id_column_name)). After that, rename the columns

res = (basetable.groupby(predictor_column_name)
.agg(
avg_target = (target_column_name, "mean"),
pop_size = (target_column_name, "size")
)
.reset_index()
.rename(
columns={predictor_column_name: "label"}
)
res = (
basetable.groupby(predictor_column_name)
.agg(
avg_target=(target_column_name, "mean"),
pop_size=(target_column_name, "size"),
std_dev_target=(target_column_name, "std"),
)
.reset_index()
.rename(columns={predictor_column_name: "label"})
)


# add the column name to a variable column
# add the average incidence
# replace population size by a percentage of total population
res["variable"] = utils.clean_predictor_name(predictor_column_name)
res["global_avg_target"] = global_avg_target
res["pop_size"] = res["pop_size"]/len(basetable.index)

res["pop_size"] = res["pop_size"] / len(basetable.index)
# make sure to always return the data with the proper column order
column_order = ["variable", "label", "pop_size",
"global_avg_target", "avg_target"]
column_order = [
"variable",
"label",
"pop_size",
"global_avg_target",
"avg_target",
"std_dev_target",
]

return res[column_order]


def plot_incidence(pig_tables: pd.DataFrame,
variable: str,
model_type: str,
column_order: list=None,
dim: tuple=(12, 8)):
def plot_incidence(
pig_tables: pd.DataFrame,
variable: str,
model_type: str,
column_order: list = None,
dim: tuple = (12, 8),
show_error: bool = False,
):
"""Plots a Predictor Insights Graph (PIG), a graph in which the mean
target value is plotted for a number of bins constructed from a predictor
variable. When the target is a binary classification target,
Expand All @@ -130,28 +142,33 @@ def plot_incidence(pig_tables: pd.DataFrame,
on the PIG.
dim: tuple, default=(12, 8)
Optional tuple to configure the width and length of the plot.
show_error: bool, default=False
Indicate if the standard deviation per bin should be showed. This can
be useful in regression.
"""
if model_type not in ["classification", "regression"]:
raise ValueError("An unexpected value was set for the model_type "
"parameter. Expected 'classification' or "
"'regression'.")
raise ValueError(
"An unexpected value was set for the model_type "
"parameter. Expected 'classification' or "
"'regression'."
)

df_plot = pig_tables[pig_tables['variable'] == variable].copy()
df_plot = pig_tables[pig_tables["variable"] == variable].copy()

if column_order is not None:
if not set(df_plot['label']) == set(column_order):
if not set(df_plot["label"]) == set(column_order):
raise ValueError(
'The column_order and pig_tables parameters do not contain '
'the same set of variables.')
"The column_order and pig_tables parameters do not contain "
"the same set of variables."
)

df_plot['label'] = df_plot['label'].astype('category')
df_plot['label'].cat.reorder_categories(column_order,
inplace=True)
df_plot["label"] = df_plot["label"].astype("category")
df_plot["label"].cat.reorder_categories(column_order, inplace=True)

df_plot.sort_values(by=['label'], ascending=True, inplace=True)
df_plot.sort_values(by=["label"], ascending=True, inplace=True)
df_plot.reset_index(inplace=True)
else:
df_plot.sort_values(by=['avg_target'], ascending=False, inplace=True)
df_plot.sort_values(by=["avg_target"], ascending=False, inplace=True)
df_plot.reset_index(inplace=True)

with plt.style.context("seaborn-ticks"):
Expand All @@ -160,35 +177,54 @@ def plot_incidence(pig_tables: pd.DataFrame,
# --------------------------
# Left axis - average target
# --------------------------
ax.plot(df_plot['label'], df_plot['avg_target'],
color="#00ccff", marker=".",
markersize=20, linewidth=3,
label='incidence rate per bin' if model_type == "classification" else "mean target value per bin",
zorder=10)

ax.plot(df_plot['label'], df_plot['global_avg_target'],
color="#022252", linestyle='--', linewidth=4,
label='average incidence rate' if model_type == "classification" else "global mean target value",
zorder=10)
error = df_plot["std_dev_target"] / 2 if show_error else None
ax.errorbar(
df_plot["label"],
df_plot["avg_target"],
yerr=error,
color="#00ccff",
marker=".",
markersize=15,
linewidth=3,
elinewidth=2,
capsize=5,
barsabove=True,
label="incidence rate per bin"
if model_type == "classification"
else "mean target value per bin",
zorder=10,
)

ax.plot(
df_plot["label"],
df_plot["global_avg_target"],
color="#022252",
linestyle="--",
linewidth=4,
label="average incidence rate"
if model_type == "classification"
else "global mean target value",
zorder=10,
)

# Dummy line to have label on second axis from first
ax.plot(np.nan, "#939598", linewidth=6, label='bin size')
ax.plot(np.nan, "#939598", linewidth=6, label="bin size")

# Set labels & ticks
ax.set_ylabel('Incidence' if model_type == "classification" else "Mean target value",
fontsize=16)
ax.set_ylabel(
"Incidence" if model_type == "classification" else "Mean target value",
fontsize=16,
)
ax.set_xlabel("Bins", fontsize=15)
ax.xaxis.set_tick_params(labelsize=14)
plt.setp(ax.get_xticklabels(),
rotation=45, ha="right", rotation_mode="anchor")
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
ax.yaxis.set_tick_params(labelsize=14)

if model_type == "classification":
# Mean target values are between 0 and 1 (target incidence rate),
# so format them as percentages
ax.set_yticks(np.arange(0, max(df_plot['avg_target'])+0.05, 0.05))
ax.yaxis.set_major_formatter(
FuncFormatter(lambda y, _: '{:.1%}'.format(y)))
ax.set_yticks(np.arange(0, max(df_plot["avg_target"]) + 0.05, 0.05))
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: "{:.1%}".format(y)))
elif model_type == "regression":
# If the difference between the highest avg. target of all bins
# versus the global avg. target AND the difference between the
Expand All @@ -200,40 +236,52 @@ def plot_incidence(pig_tables: pd.DataFrame,
# the bins and versus the global avg. target.
# (Motivation for the AND above: if on one end there IS enough
# difference, the effect that we discuss here does not occur.)
global_avg_target = max(df_plot['global_avg_target']) # series of same number, for every bin.
if ((np.abs((max(df_plot['avg_target']) - global_avg_target)) / global_avg_target < 0.25)
and (np.abs((min(df_plot['avg_target']) - global_avg_target)) / global_avg_target < 0.25)):
ax.set_ylim(global_avg_target * 0.75,
global_avg_target * 1.25)
global_avg_target = max(
df_plot["global_avg_target"]
) # series of same number, for every bin.
if (
np.abs((max(df_plot["avg_target"]) - global_avg_target))
/ global_avg_target
< 0.25
) and (
np.abs((min(df_plot["avg_target"]) - global_avg_target))
/ global_avg_target
< 0.25
):
ax.set_ylim(global_avg_target * 0.75, global_avg_target * 1.25)

# Remove ticks but keep the labels
ax.tick_params(axis='both', which='both', length=0)
ax.tick_params(axis='y', colors="#00ccff")
ax.yaxis.label.set_color('#00ccff')
ax.tick_params(axis="both", which="both", length=0)
ax.tick_params(axis="y", colors="#00ccff")
ax.yaxis.label.set_color("#00ccff")

# -----------------
# Right Axis - bins
# -----------------
ax2 = ax.twinx()

ax2.bar(df_plot['label'], df_plot['pop_size'],
align='center', color="#939598", zorder=1)
ax2.bar(
df_plot["label"],
df_plot["pop_size"],
align="center",
color="#939598",
zorder=1,
)

# Set labels & ticks
ax2.set_xlabel("Bins", fontsize=15)
ax2.xaxis.set_tick_params(rotation=45, labelsize=14)

ax2.yaxis.set_tick_params(labelsize=14)
ax2.yaxis.set_major_formatter(
FuncFormatter(lambda y, _: '{:.1%}'.format(y)))
ax2.set_ylabel('Population size', fontsize=15)
ax2.tick_params(axis='y', colors="#939598")
ax2.yaxis.label.set_color('#939598')
ax2.yaxis.set_major_formatter(FuncFormatter(lambda y, _: "{:.1%}".format(y)))
ax2.set_ylabel("Population size", fontsize=15)
ax2.tick_params(axis="y", colors="#939598")
ax2.yaxis.label.set_color("#939598")

# Despine & prettify
sns.despine(ax=ax, right=True, left=True)
sns.despine(ax=ax2, left=True, right=False)
ax2.spines['right'].set_color('white')
ax2.spines["right"].set_color("white")

ax2.grid(False)

Expand All @@ -244,9 +292,15 @@ def plot_incidence(pig_tables: pd.DataFrame,
title = "Mean target plot"
fig.suptitle(title, fontsize=20)
plt.title(variable, fontsize=17)
ax.legend(frameon=False, bbox_to_anchor=(0., 1.01, 1., .102),
loc=3, ncol=1, mode="expand", borderaxespad=0.,
prop={"size": 14})
ax.legend(
frameon=False,
bbox_to_anchor=(0.0, 1.01, 1.0, 0.102),
loc=3,
ncol=1,
mode="expand",
borderaxespad=0.0,
prop={"size": 14},
)

# Set order of layers
ax.set_zorder(1)
Expand Down
13 changes: 3 additions & 10 deletions tests/preprocessing/test_pig_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from cobra.evaluation.pigs_tables import generate_pig_tables

from typing import Optional
import numpy as np


class TestPigTablesGeneration:
Expand Down Expand Up @@ -36,20 +37,12 @@ def test_col_id(self, id_col_name: Optional[str]):
# expected
expected = pd.DataFrame(
{
"variable": [
"age",
"age",
"age",
"age",
"pclass",
"pclass",
"sex",
"sex",
],
"variable": ["age", "age", "age", "age", "pclass", "pclass", "sex", "sex",],
"label": [22.0, 35.0, 38.0, 54.0, 1, 3, "female", "male"],
"pop_size": [0.2, 0.4, 0.2, 0.2, 0.6, 0.4, 0.4, 0.6],
"global_avg_target": [0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4],
"avg_target": [0.0, 0.5, 1.0, 0.0, 0.6666666666666666, 0.0, 1.0, 0.0],
"std_dev_target": [np.nan, 0.7071067811865476, np.nan, np.nan, 0.5773502691896258, 0.0, 0.0, 0.0,],
}
)

Expand Down
Loading