skrub-data · reshamas · Dec 7, 2024 · Dec 15, 2024 · Dec 15, 2024
diff --git a/skrub/_column_associations.py b/skrub/_column_associations.py
@@ -2,6 +2,7 @@
 import warnings
 
 import numpy as np
+import pandas as pd
 from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder
 
 from . import _dataframe as sbd
@@ -13,18 +14,17 @@
 def column_associations(df):
     """Get measures of statistical associations between all pairs of columns.
 
-    At the moment, the only reported metric is Cramer's V statistic. More may
-    be added in the future.
+    Reported metrics include Cramer's V statistic and Pearson's Correlation
+    Coefficient. More may be added in the future.
 
     The result is returned as a dataframe with columns:
 
     ``['left_column_name', 'left_column_idx', 'right_column_name',
-    'right_column_idx', 'cramer_v']``
+    'right_column_idx', 'cramer_v', 'pearson']``
 
     As the function is commutative, each pair of columns appears only once
     (either ``col_1``, ``col_2`` or ``col_2``, ``col_1`` but not both).
-    The results are sorted
-    from most associated to least associated.
+    The results are sorted from most associated to least associated.
 
     To compute the Cramer's V statistic, all columns are discretized. Numeric
     columns are binned with 10 bins. For categorical columns, only the 10 most
@@ -33,6 +33,10 @@ def column_associations(df):
     associations between the values of 2 columns or between their missingness
     patterns may be captured.
 
+    To compute Pearson's correlation coefficient, only numeric columns are
+    considered. The coefficient is computed with ``pandas.DataFrame.corr``
+    using ``method="pearson"``.
+
     Parameters
     ----------
     df : dataframe
@@ -50,6 +54,15 @@ def column_associations(df):
 
     * `Cramer's V <https://en.wikipedia.org/wiki/Cramér%27s_V>`_
 
+    Pearson's Correlation Coefficient is a measure of the linear correlation
+    between two variables, giving a value between -1 and +1 (inclusive).
+
+    * `Pearson's Correlation Coefficient
+      <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
+    * `pandas.DataFrame.corr
+      <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.corr.html>`_
+
+
     Examples
     --------
     >>> import numpy as np
@@ -70,29 +83,23 @@ def column_associations(df):
     2  9.0810  9.4011  1.9257  5.7429  6.2358  val 2
     3  2.5425  2.9678  9.7801  9.9879  6.0709  val 3
     4  5.8878  9.3223  5.3840  7.2006  2.1494  val 4
+    >>> # Compute the associations
     >>> associations = skrub.column_associations(df)
     >>> associations # doctest: +SKIP
-       left_column_name  left_column_idx right_column_name  right_column_idx  cramer_v
-    0               c_3                3             c_str                 5    0.8215
-    1               c_1                1               c_4                 4    0.8215
-    2               c_0                0               c_1                 1    0.8215
-    3               c_2                2             c_str                 5    0.7551
-    4               c_0                0             c_str                 5    0.7551
-    5               c_0                0               c_3                 3    0.7551
-    6               c_1                1               c_3                 3    0.6837
-    7               c_0                0               c_4                 4    0.6837
-    8               c_4                4             c_str                 5    0.6837
-    9               c_3                3               c_4                 4    0.6053
-    10              c_2                2               c_3                 3    0.6053
-    11              c_1                1             c_str                 5    0.6053
-    12              c_0                0               c_2                 2    0.6053
-    13              c_2                2               c_4                 4    0.5169
-    14              c_1                1               c_2                 2    0.4122
     >>> pd.reset_option('display.width')
     >>> pd.reset_option('display.max_columns')
     >>> pd.reset_option('display.precision')
     """
-    return _stack_symmetric_associations(_cramer_v_matrix(df), df)
+    cramer_v_table = _stack_symmetric_associations(_cramer_v_matrix(df), df)
+    pearson_c_table = _compute_pearsons(df)
+    stats = pd.merge(
+        cramer_v_table,
+        pearson_c_table,
+        left_on=["left_column_name", "right_column_name"],
+        right_on=["left", "right"],
+        how="left",
+    ).drop(columns=["left", "right"])
+    return stats
 
 
 def _stack_symmetric_associations(associations, df):
@@ -247,3 +254,23 @@ def _compute_cramer(table, n_samples):
     stat = np.sqrt(chi_stat / (n_samples * np.maximum(min_dim, 1)))
     stat[min_dim == 0] = 0.0
     return stat
+
+
+def _compute_pearsons(df):
+    """Compute the Pearson correlation coefficient statistic given a
+    pandas dataframe.
+
+    Only numeric columns are considered (``numeric_only=True``); the
+    coefficients come from ``DataFrame.corr(method="pearson")``.
+
+    This returns a long-format dataframe with columns ``['left', 'right',
+    'pearson']``, obtained by stacking the symmetric correlation matrix.
+
+    """
+    correlations = df.corr(method="pearson", min_periods=1, numeric_only=True)
+    stat = (
+        correlations.stack()
+        .reset_index()
+        .set_axis(["left", "right", "pearson"], axis=1)
+    )
+    return stat