facultyai · zblz · Oct 24, 2017 · Oct 16, 2017 · Oct 16, 2017 · Oct 16, 2017
diff --git a/lens/plotting.py b/lens/plotting.py
@@ -1,6 +1,8 @@
 import matplotlib.pyplot as plt
+from matplotlib.ticker import FuncFormatter, MaxNLocator
 import numpy as np
 import plotly.graph_objs as go
+import seaborn as sns
 try:
     import plotly.figure_factory as pff
 except ImportError:
@@ -48,7 +50,8 @@ def plot_distribution(ls, column, bins=None):
 
     fig, ax = plt.subplots()
 
-    ax.bar(edges[:-1], counts, width=np.diff(edges), label=column, alpha=0.4)
+    ax.bar(edges[:-1], counts, width=np.diff(edges), label=column,
+           align='edge')
 
     ax.set_ylim(bottom=0)
 
@@ -60,6 +63,161 @@ def plot_distribution(ls, column, bins=None):
     return fig
 
 
+def _set_integer_tick_labels(axis, labels):
+    """Use labels dict to set labels on axis"""
+    axis.set_major_formatter(FuncFormatter(lambda x, _: labels.get(x, '')))
+    axis.set_major_locator(MaxNLocator(integer=True))
+
+
+def plot_pairdensity_mpl(ls, column1, column2):
+    """Plot the pairwise density between two columns.
+
+    This plot is an approximation of a scatterplot through a 2D Kernel
+    Density Estimate for two numerical variables. When one of the variables
+    is categorical, a 1D KDE for each of the categories is shown,
+    normalised to the total number of non-null observations. For two
+    categorical variables, the plot produced is a heatmap representation of
+    the contingency table.
+
+    Parameters
+    ----------
+    ls : :class:`~lens.Summary`
+        Lens `Summary`.
+    column1 : str
+        First column.
+    column2 : str
+        Second column.
+
+    Returns
+    -------
+    :class:`plt.Figure`
+        Matplotlib figure containing the pairwise density plot.
+    """
+    pair_details = ls.pair_details(column1, column2)
+    pairdensity = pair_details['pairdensity']
+
+    x = np.array(pairdensity['x'])
+    y = np.array(pairdensity['y'])
+    Z = np.array(pairdensity['density'])
+
+    fig, ax = plt.subplots()
+
+    if ls.summary(column1)['desc'] == 'categorical':
+        idx = np.argsort(x)
+        x = x[idx]
+        Z = Z[:, idx]
+        # Create labels and positions for categorical axis
+        x_labels = dict(enumerate(x))
+        _set_integer_tick_labels(ax.xaxis, x_labels)
+        x = np.arange(-0.5, len(x), 1.0)
+
+    if ls.summary(column2)['desc'] == 'categorical':
+        idx = np.argsort(y)
+        y = y[idx]
+        Z = Z[idx]
+        y_labels = dict(enumerate(y))
+        _set_integer_tick_labels(ax.yaxis, y_labels)
+        y = np.arange(-0.5, len(y), 1.0)
+
+    X, Y = np.meshgrid(x, y)
+
+    ax.pcolormesh(X, Y, Z, cmap=DEFAULT_COLORSCALE.lower())
+
+    ax.set_xlabel(column1)
+    ax.set_ylabel(column2)
+
+    ax.set_title(r'$\it{{ {} }}$ vs $\it{{ {} }}$'.format(column1, column2))
+
+    return fig
+
+
+def plot_correlation_mpl(ls, include=None, exclude=None):
+    """Plot the correlation matrix for numeric columns
+
+    Plot a Spearman rank order correlation coefficient matrix showing the
+    correlation between columns. The matrix is reordered to group together
+    columns that have a higher correlation coefficient.  The columns to be
+    plotted in the correlation plot can be selected through either the
+    ``include`` or ``exclude`` keyword arguments. Only one of them can be
+    given.
+
+    Parameters
+    ----------
+    ls : :class:`~lens.Summary`
+        Lens `Summary`.
+    include : list of str
+        List of columns to include in the correlation plot.
+    exclude : list of str
+        List of columns to exclude from the correlation plot.
+
+    Returns
+    -------
+    :class:`plt.Figure`
+        Matplotlib figure containing the correlation matrix plot.
+    """
+
+    columns, correlation_matrix = ls.correlation_matrix(include, exclude)
+    num_cols = len(columns)
+
+    if num_cols > 10:
+        annotate = False
+    else:
+        annotate = True
+
+    fig, ax = plt.subplots()
+    sns.heatmap(correlation_matrix, annot=annotate, fmt='.2f', ax=ax,
+                xticklabels=columns, yticklabels=columns, vmin=-1, vmax=1,
+                cmap='RdBu_r', square=True)
+
+    ax.xaxis.tick_top()
+
+    w = len(columns) * 2.5
+    while w > 10:
+        w /= np.sqrt(1.4)
+
+    fig.set_size_inches(w, w)
+
+    return fig
+
+
+def plot_cdf(ls, column, N_cdf=100):
+    """Plot the empirical cumulative distribution function of a column.
+
+    Creates a matplotlib plot with the empirical CDF of a column.
+
+    Parameters
+    ----------
+    ls : :class:`~lens.Summary`
+        Lens `Summary`.
+    column : str
+        Name of the column.
+    N_cdf : int
+        Number of points in the CDF plot.
+
+    Returns
+    -------
+    :class:`plt.Figure`
+        Matplotlib figure containing the CDF plot.
+    """
+    tdigest = ls.tdigest(column)
+
+    cdfs = np.linspace(0, 100, N_cdf)
+    xs = [tdigest.percentile(p) for p in cdfs]
+
+    fig, ax = plt.subplots()
+
+    ax.set_ylabel('Percentile')
+    ax.set_xlabel(column)
+    ax.plot(xs, cdfs)
+
+    if ls._report['column_summary'][column]['logtrans']:
+        ax.set_xscale('log')
+
+    ax.set_title('Empirical Cumulative Distribution Function')
+
+    return fig
+
+
 def plot_pairdensity(ls, column1, column2):
     """Plot the pairwise density between two columns.
 
@@ -190,41 +348,3 @@ def plot_correlation(ls, include=None, exclude=None):
     fig.data[0]['showscale'] = True
 
     return fig
-
-
-def plot_cdf(ls, column, N_cdf=100):
-    """Plot the empirical cumulative distribution function of a column.
-
-    Creates a plotly plot with the empirical CDF of a column.
-
-    Parameters
-    ----------
-    ls : :class:`~lens.Summary`
-        Lens `Summary`.
-    column : str
-        Name of the column.
-    N_cdf : int
-        Number of points in the CDF plot.
-
-    Returns
-    -------
-    :class:`~matplotlib.Axes`
-        Matplotlib axes containing the distribution plot.
-    """
-    tdigest = ls.tdigest(column)
-
-    cdfs = np.linspace(0, 100, N_cdf)
-    xs = [tdigest.percentile(p) for p in cdfs]
-
-    fig, ax = plt.subplots()
-
-    ax.set_ylabel('Percentile')
-    ax.set_xlabel(column)
-    ax.plot(xs, cdfs)
-
-    if ls._report['column_summary'][column]['logtrans']:
-        ax.set_xscale('log')
-
-    ax.set_title('Empirical Cumulative Distribution Function')
-
-    return fig
diff --git a/lens/widget.py b/lens/widget.py
@@ -1,14 +1,12 @@
+from __future__ import division
 import sys
 import logging
-import matplotlib.pyplot as plt
-import plotly
-import plotly.offline as py
 from ipywidgets import widgets
-
+from IPython.display import display
 from lens.plotting import (plot_distribution,
                            plot_cdf,
-                           plot_pairdensity,
-                           plot_correlation)
+                           plot_pairdensity_mpl,
+                           plot_correlation_mpl)
 
 logger = logging.getLogger(__name__)
 logger.addHandler(logging.StreamHandler())
@@ -18,23 +16,28 @@
 IN_NOTEBOOK = 'ipykernel' in sys.modules
 
 PADDING = '10px'
+PLOT_HEIGHT = 400
+PLOT_WIDTH = 600
+DPI = 72
+
 
+def update_plot(f, args, plot_area, **kwargs):
+    """Updates the content of an output widget with rendered function"""
 
-def render_plotly_js(fig, width=800, height=600):
-    """Return the plotly html for a plot"""
-    if isinstance(fig, plt.Axes):
-        fig = fig.figure
-    else:
-        fig = fig
+    fig = f(*args)
+    plot_area.clear_output()
 
-    if isinstance(fig, plt.Figure):
-        fig = plotly.tools.mpl_to_plotly(fig, strip_style=True, resize=True)
+    height = kwargs.get('height', PLOT_HEIGHT)
+    width = kwargs.get('width', PLOT_WIDTH)
+    dpi = kwargs.get('dpi', DPI)
 
-    fig.layout['width'] = width
-    fig.layout['height'] = height
+    fig.set_size_inches(width / dpi, height / dpi)
 
-    return py.plot(fig, output_type='div', include_plotlyjs=False,
-                   show_link=False)
+    plot_area.layout.height = '{:.0f}px'.format(height)
+    plot_area.layout.width = '{:.0f}px'.format(width)
+
+    with plot_area:
+        display(fig)
 
 
 def create_correlation_plot_widget(ls):
@@ -50,24 +53,18 @@ def create_correlation_plot_widget(ls):
     :class:`ipywidgets.Widget`
         Jupyter widget to explore correlation matrix plot.
     """
-    fig = plot_correlation(ls)
-    return widgets.HTML(render_plotly_js(fig, width=fig.layout['width'],
-                                         height=fig.layout['height']),
-                        height='{:.0f}px'.format(fig.layout['height']))
 
+    plot_area = widgets.Output()
+
+    update_plot(plot_correlation_mpl, [ls], plot_area,
+                height=PLOT_WIDTH, width=PLOT_WIDTH*1.3)
 
-def update_plot(f, args, html_area, **kwargs):
-    """Updates the content of an html_area with rendered function"""
-    html_area.value = render_plotly_js(f(*args), **kwargs)
-    if 'height' in kwargs.keys():
-        html_area.height = '{:.0f}px'.format(kwargs['height'])
-    if 'width' in kwargs.keys():
-        html_area.width = '{:.0f}px'.format(kwargs['width'])
+    return plot_area
 
 
 def _update_pairdensity_plot(ls, dd1, dd2, plot_area):
     if dd1.value != dd2.value:
-        update_plot(plot_pairdensity,
+        update_plot(plot_pairdensity_mpl,
                     [ls, dd1.value, dd2.value],
                     plot_area, height=600, width=600)
 
@@ -93,7 +90,7 @@ def create_pairdensity_plot_widget(ls):
     if len(numeric_columns) > 1:
         dropdown1.value, dropdown2.value = numeric_columns[:2]
 
-    plot_area = widgets.HTML()
+    plot_area = widgets.Output()
 
     for dropdown in [dropdown1, dropdown2]:
         dropdown.observe(lambda x: _update_pairdensity_plot(ls, dropdown1,
@@ -110,11 +107,11 @@ def _simple_columnwise_widget(ls, plot_function, columns):
     """Basic column-wise plot widget"""
 
     dropdown = widgets.Dropdown(options=columns, description='Column:')
-    plot_area = widgets.HTML()
-    update_plot(plot_function, [ls, columns[0]], plot_area, height=500)
+    plot_area = widgets.Output()
+    update_plot(plot_function, [ls, columns[0]], plot_area, height=PLOT_HEIGHT)
 
     dropdown.observe(lambda x: update_plot(plot_function, [ls, x['new']],
-                                           plot_area, height=500),
+                                           plot_area, height=PLOT_HEIGHT),
                      names='value', type='change')
 
     return widgets.VBox([dropdown, plot_area], padding=PADDING)
@@ -174,12 +171,6 @@ def interactive_explore(ls):
                    ' Jupyter notebook')
         logger.error(message)
         raise ValueError(message)
-    else:
-        # This is a bit of a hack, but it is the only place where the state of
-        # plotly initialization is stored. We need to do it because otherwise
-        # plotly fails silently if the notebook mode is not initialized.
-        if not py.offline.__PLOTLY_OFFLINE_INITIALIZED:
-            py.init_notebook_mode()
 
     tabs = widgets.Tab()
     tabs.children = [create_distribution_plot_widget(ls),

diff --git a/setup.py b/setup.py
@@ -42,5 +42,6 @@ def read_version():
         'plotly',
         'scipy',
         'tdigest',
+        'seaborn',
     ],
 )