Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace broken Plotly plots in widget with matplotlib #17

Merged
merged 23 commits into from
Oct 24, 2017
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
32c1cb4
changed html widget to dom output widget for rendering matplotlib plots
shwinnn Oct 16, 2017
fe767e1
amended typo widget->widgets
shwinnn Oct 16, 2017
e46474f
Fixed display of matplotlib plots within output widget
shwinnn Oct 16, 2017
85e0187
Set plot size in both matplotlib figure and Output widget
zblz Oct 16, 2017
85786c2
Add plot_pairdensity_mpl method to plotting
zblz Oct 16, 2017
2a72cbe
re-enabled pairdensity plot using output widgets
shwinnn Oct 16, 2017
ddd4666
added plot_correlation_mpl to create correlation plots in matplotlib
shwinnn Oct 16, 2017
5efba7b
renamed figure, plot area and updated docstring in update_plot
shwinnn Oct 16, 2017
4767c9b
fixed correlation plot maximum width & height
shwinnn Oct 16, 2017
f18eec9
moved axis ticks & labels, inverted colour map for correlation matrix…
shwinnn Oct 16, 2017
657aef2
changed colourmap name
shwinnn Oct 16, 2017
7b3bfba
enabled correlation matrix plots to be displayed in output widgets
shwinnn Oct 16, 2017
a7d39d0
corrected plot sizing in update_plot
shwinnn Oct 16, 2017
b99b67d
fixed the cells in correlation plots to be squares
shwinnn Oct 16, 2017
dde199b
changed number display format on correlation plots
shwinnn Oct 16, 2017
76163f5
modified aspect ratio for correlation plots displayed in widget
shwinnn Oct 16, 2017
1157969
Fix alignment of bar chart in plot_distribution
zblz Oct 16, 2017
692e29b
Remove alpha for ax.bar
zblz Oct 16, 2017
28c2ce5
Formatting and order fixes
zblz Oct 16, 2017
5a14ff9
enforced PEP8 compliance
shwinnn Oct 16, 2017
074f0f8
Enforces at least version 6 of ipywidgets
shwinnn Oct 23, 2017
771e04f
Added comment explaining 'magic numbers' for enforcing plot width
shwinnn Oct 23, 2017
b95118c
Simplify correlation plot width computation
zblz Oct 23, 2017
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
198 changes: 159 additions & 39 deletions lens/plotting.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter, MaxNLocator
import numpy as np
import plotly.graph_objs as go
import seaborn as sns
try:
import plotly.figure_factory as pff
except ImportError:
Expand Down Expand Up @@ -48,7 +50,8 @@ def plot_distribution(ls, column, bins=None):

fig, ax = plt.subplots()

ax.bar(edges[:-1], counts, width=np.diff(edges), label=column, alpha=0.4)
ax.bar(edges[:-1], counts, width=np.diff(edges), label=column,
align='edge')

ax.set_ylim(bottom=0)

Expand All @@ -60,6 +63,161 @@ def plot_distribution(ls, column, bins=None):
return fig


def _set_integer_tick_labels(axis, labels):
    """Label the ticks of ``axis`` by looking tick positions up in ``labels``.

    ``labels`` is a dict keyed by tick position. A tick whose position is not
    a key of ``labels`` is rendered as an empty string. The axis is also given
    a ``MaxNLocator(integer=True)`` as its major locator.
    """
    def _label_for(position, _tick_index):
        # The second argument supplied by FuncFormatter is not used.
        return labels.get(position, '')

    axis.set_major_formatter(FuncFormatter(_label_for))
    axis.set_major_locator(MaxNLocator(integer=True))


def plot_pairdensity_mpl(ls, column1, column2):
    """Plot the pairwise density between two columns with matplotlib.

    The density grid stored in the summary's pair details is drawn as a
    colour mesh, with ``column1`` on the x axis and ``column2`` on the y
    axis. For two numerical variables this approximates a scatterplot
    through a 2D Kernel Density Estimate. When one of the variables is
    categorical, a 1D KDE for each of the categories is shown, normalised to
    the total number of non-null observations. For two categorical
    variables, the plot is a heatmap representation of the contingency
    table.

    Parameters
    ----------
    ls : :class:`~lens.Summary`
        Lens `Summary`.
    column1 : str
        First column (x axis).
    column2 : str
        Second column (y axis).

    Returns
    -------
    :class:`plt.Figure`
        Matplotlib figure containing the pairwise density plot.
    """
    pairdensity = ls.pair_details(column1, column2)['pairdensity']

    x_values = np.array(pairdensity['x'])
    y_values = np.array(pairdensity['y'])
    density = np.array(pairdensity['density'])

    fig, ax = plt.subplots()

    if ls.summary(column1)['desc'] == 'categorical':
        # Sort the categories and reorder the density columns to match.
        order = np.argsort(x_values)
        x_values = x_values[order]
        density = density[:, order]
        # Category i is labelled at integer tick position i.
        _set_integer_tick_labels(ax.xaxis, dict(enumerate(x_values)))
        # Replace the categories by cell edges at half-integer positions,
        # one more edge than there are categories.
        x_values = np.arange(-0.5, len(x_values), 1.0)

    if ls.summary(column2)['desc'] == 'categorical':
        # Same treatment for the y axis; here the density rows are reordered.
        order = np.argsort(y_values)
        y_values = y_values[order]
        density = density[order]
        _set_integer_tick_labels(ax.yaxis, dict(enumerate(y_values)))
        y_values = np.arange(-0.5, len(y_values), 1.0)

    grid_x, grid_y = np.meshgrid(x_values, y_values)

    ax.pcolormesh(grid_x, grid_y, density, cmap=DEFAULT_COLORSCALE.lower())

    ax.set_xlabel(column1)
    ax.set_ylabel(column2)

    # NOTE(review): the column names are interpolated unescaped into a
    # mathtext expression; names containing characters that are special in
    # mathtext (e.g. '_') may not render as intended -- confirm.
    title = r'$\it{{ {} }}$ vs $\it{{ {} }}$'.format(column1, column2)
    ax.set_title(title)

    return fig


def plot_correlation_mpl(ls, include=None, exclude=None):
    """Plot the correlation matrix for numeric columns.

    Plot a Spearman rank order correlation coefficient matrix showing the
    correlation between columns. The matrix is reordered to group together
    columns that have a higher correlation coefficient. The columns to be
    plotted in the correlation plot can be selected through either the
    ``include`` or ``exclude`` keyword arguments. Only one of them can be
    given.

    Parameters
    ----------
    ls : :class:`~lens.Summary`
        Lens `Summary`.
    include : list of str
        List of columns to include in the correlation plot.
    exclude : list of str
        List of columns to exclude from the correlation plot.

    Returns
    -------
    :class:`plt.Figure`
        Matplotlib figure containing the correlation matrix heatmap.
    """
    columns, correlation_matrix = ls.correlation_matrix(include, exclude)
    num_cols = len(columns)

    # Coefficients are only written inside the cells for matrices of at most
    # 10 columns.
    annotate = num_cols <= 10

    fig, ax = plt.subplots()
    sns.heatmap(correlation_matrix, annot=annotate, fmt='.2f', ax=ax,
                xticklabels=columns, yticklabels=columns, vmin=-1, vmax=1,
                cmap='RdBu_r', square=True)

    # Show the column labels along the top edge of the matrix.
    ax.xaxis.tick_top()

    # Figure sizing. The square figure starts at 2.5 inches per column; while
    # its side is larger than 10 inches it is repeatedly divided by
    # sqrt(1.4), so the final side never exceeds 10 inches.
    #
    # NOTE(review): reviewers asked "Why those numbers? What does this do?"
    # and "Why not just set w = 10 here?". The loop is kept as is to preserve
    # the existing output sizes: it does not clamp to exactly 10 but lands
    # somewhere in (10 / sqrt(1.4), 10]. The rationale for the 1.4 factor is
    # not visible in this file -- confirm before simplifying to a plain
    # ``min(side, 10)``.
    inches_per_column = 2.5
    max_side_inches = 10
    shrink_factor = np.sqrt(1.4)

    side = num_cols * inches_per_column
    while side > max_side_inches:
        side /= shrink_factor

    fig.set_size_inches(side, side)

    return fig


def plot_cdf(ls, column, N_cdf=100):
    """Plot the empirical cumulative distribution function of a column.

    Creates a matplotlib plot with the empirical CDF of a column: the
    column's t-digest is evaluated at ``N_cdf`` evenly spaced percentiles
    between 0 and 100, and the percentiles are plotted against the resulting
    values.

    Parameters
    ----------
    ls : :class:`~lens.Summary`
        Lens `Summary`.
    column : str
        Name of the column.
    N_cdf : int
        Number of points in the CDF plot.

    Returns
    -------
    :class:`plt.Figure`
        Matplotlib figure containing the CDF plot.
    """
    tdigest = ls.tdigest(column)

    # Percentiles (0-100) go on the y axis, the matching values on the x axis.
    cdfs = np.linspace(0, 100, N_cdf)
    xs = [tdigest.percentile(p) for p in cdfs]

    fig, ax = plt.subplots()

    ax.set_ylabel('Percentile')
    ax.set_xlabel(column)
    ax.plot(xs, cdfs)

    # Use a logarithmic x axis when the column summary carries a truthy
    # 'logtrans' entry. This reads the summary's private ``_report`` dict.
    if ls._report['column_summary'][column]['logtrans']:
        ax.set_xscale('log')

    ax.set_title('Empirical Cumulative Distribution Function')

    return fig


def plot_pairdensity(ls, column1, column2):
"""Plot the pairwise density between two columns.

Expand Down Expand Up @@ -190,41 +348,3 @@ def plot_correlation(ls, include=None, exclude=None):
fig.data[0]['showscale'] = True

return fig


def plot_cdf(ls, column, N_cdf=100):
    """Plot the empirical cumulative distribution function of a column.

    Creates a matplotlib plot with the empirical CDF of a column: the
    column's t-digest is evaluated at ``N_cdf`` evenly spaced percentiles
    between 0 and 100, and the percentiles are plotted against the resulting
    values.

    Parameters
    ----------
    ls : :class:`~lens.Summary`
        Lens `Summary`.
    column : str
        Name of the column.
    N_cdf : int
        Number of points in the CDF plot.

    Returns
    -------
    :class:`plt.Figure`
        Matplotlib figure containing the CDF plot.
    """
    tdigest = ls.tdigest(column)

    # Percentiles (0-100) go on the y axis, the matching values on the x axis.
    cdfs = np.linspace(0, 100, N_cdf)
    xs = [tdigest.percentile(p) for p in cdfs]

    fig, ax = plt.subplots()

    ax.set_ylabel('Percentile')
    ax.set_xlabel(column)
    ax.plot(xs, cdfs)

    # Use a logarithmic x axis when the column summary carries a truthy
    # 'logtrans' entry. This reads the summary's private ``_report`` dict.
    if ls._report['column_summary'][column]['logtrans']:
        ax.set_xscale('log')

    ax.set_title('Empirical Cumulative Distribution Function')

    return fig
71 changes: 31 additions & 40 deletions lens/widget.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
from __future__ import division
import sys
import logging
import matplotlib.pyplot as plt
import plotly
import plotly.offline as py
from ipywidgets import widgets

from IPython.display import display
from lens.plotting import (plot_distribution,
plot_cdf,
plot_pairdensity,
plot_correlation)
plot_pairdensity_mpl,
plot_correlation_mpl)

logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
Expand All @@ -18,23 +16,28 @@
IN_NOTEBOOK = 'ipykernel' in sys.modules

PADDING = '10px'
PLOT_HEIGHT = 400
PLOT_WIDTH = 600
DPI = 72


def update_plot(f, args, plot_area, **kwargs):
"""Updates the content of an output widget with rendered function"""

def render_plotly_js(fig, width=800, height=600):
"""Return the plotly html for a plot"""
if isinstance(fig, plt.Axes):
fig = fig.figure
else:
fig = fig
fig = f(*args)
plot_area.clear_output()

if isinstance(fig, plt.Figure):
fig = plotly.tools.mpl_to_plotly(fig, strip_style=True, resize=True)
height = kwargs.get('height', PLOT_HEIGHT)
width = kwargs.get('width', PLOT_WIDTH)
dpi = kwargs.get('dpi', DPI)

fig.layout['width'] = width
fig.layout['height'] = height
fig.set_size_inches(width / dpi, height / dpi)

return py.plot(fig, output_type='div', include_plotlyjs=False,
show_link=False)
plot_area.layout.height = '{:.0f}px'.format(height)
plot_area.layout.width = '{:.0f}px'.format(width)

with plot_area:
display(fig)


def create_correlation_plot_widget(ls):
Expand All @@ -50,24 +53,18 @@ def create_correlation_plot_widget(ls):
:class:`ipywidgets.Widget`
Jupyter widget to explore correlation matrix plot.
"""
fig = plot_correlation(ls)
return widgets.HTML(render_plotly_js(fig, width=fig.layout['width'],
height=fig.layout['height']),
height='{:.0f}px'.format(fig.layout['height']))

plot_area = widgets.Output()

update_plot(plot_correlation_mpl, [ls], plot_area,
height=PLOT_WIDTH, width=PLOT_WIDTH*1.3)

def update_plot(f, args, html_area, **kwargs):
"""Updates the content of an html_area with rendered function"""
html_area.value = render_plotly_js(f(*args), **kwargs)
if 'height' in kwargs.keys():
html_area.height = '{:.0f}px'.format(kwargs['height'])
if 'width' in kwargs.keys():
html_area.width = '{:.0f}px'.format(kwargs['width'])
return plot_area


def _update_pairdensity_plot(ls, dd1, dd2, plot_area):
if dd1.value != dd2.value:
update_plot(plot_pairdensity,
update_plot(plot_pairdensity_mpl,
[ls, dd1.value, dd2.value],
plot_area, height=600, width=600)

Expand All @@ -93,7 +90,7 @@ def create_pairdensity_plot_widget(ls):
if len(numeric_columns) > 1:
dropdown1.value, dropdown2.value = numeric_columns[:2]

plot_area = widgets.HTML()
plot_area = widgets.Output()

for dropdown in [dropdown1, dropdown2]:
dropdown.observe(lambda x: _update_pairdensity_plot(ls, dropdown1,
Expand All @@ -110,11 +107,11 @@ def _simple_columnwise_widget(ls, plot_function, columns):
"""Basic column-wise plot widget"""

dropdown = widgets.Dropdown(options=columns, description='Column:')
plot_area = widgets.HTML()
update_plot(plot_function, [ls, columns[0]], plot_area, height=500)
plot_area = widgets.Output()
update_plot(plot_function, [ls, columns[0]], plot_area, height=PLOT_HEIGHT)

dropdown.observe(lambda x: update_plot(plot_function, [ls, x['new']],
plot_area, height=500),
plot_area, height=PLOT_HEIGHT),
names='value', type='change')

return widgets.VBox([dropdown, plot_area], padding=PADDING)
Expand Down Expand Up @@ -174,12 +171,6 @@ def interactive_explore(ls):
' Jupyter notebook')
logger.error(message)
raise ValueError(message)
else:
# This is a bit of a hack, but it is the only place where the state of
# plotly initialization is stored. We need to do it because otherwise
# plotly fails silently if the notebook mode is not initialized.
if not py.offline.__PLOTLY_OFFLINE_INITIALIZED:
py.init_notebook_mode()

tabs = widgets.Tab()
tabs.children = [create_distribution_plot_widget(ls),
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,5 +42,6 @@ def read_version():
'plotly',
'scipy',
'tdigest',
'seaborn',
],
)