From 9d8d16945cc010c69303a995c30b446465c40129 Mon Sep 17 00:00:00 2001
From: Reshama Shaikh <reshama.stat@gmail.com>
Date: Sat, 7 Dec 2024 07:27:46 -0500
Subject: [PATCH 1/3] start pearsons

---
 skrub/_column_associations.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/skrub/_column_associations.py b/skrub/_column_associations.py
index 6b6aba696..4367964aa 100644
--- a/skrub/_column_associations.py
+++ b/skrub/_column_associations.py
@@ -13,8 +13,8 @@
 def column_associations(df):
     """Get measures of statistical associations between all pairs of columns.
 
-    At the moment, the only reported metric is Cramer's V statistic. More may
-    be added in the future.
+    Reported metrics include Cramer's V statistic and Pearson's Correlation
+    Coefficient. More may be added in the future.
 
     The result is returned as a dataframe with columns:
 
@@ -247,3 +247,20 @@ def _compute_cramer(table, n_samples):
     stat = np.sqrt(chi_stat / (n_samples * np.maximum(min_dim, 1)))
     stat[min_dim == 0] = 0.0
     return stat
+
+
+def _compute_pearsons(table, n_samples):
+    """Compute the Pearson correlation coefficient statistic given a
+    contingency table / pandas dataframe.
+
+    The input is a pandas dataframe; the correlation is computed between its
+    numeric columns only (``numeric_only=True``).
+
+    This returns the symmetric matrix with shape (n cols, n cols) where entry
+    i, j contains the statistic for column i x column j.
+
+    NOTE: get correct number
+    """
+    stats = table.corr(method="pearson", min_periods=1, numeric_only=True)
+
+    return stats

From 7f50701ee3019abec57bf84b3356d31d47338294 Mon Sep 17 00:00:00 2001
From: Reshama Shaikh <reshama.stat@gmail.com>
Date: Sun, 15 Dec 2024 16:30:16 -0500
Subject: [PATCH 2/3] fix line wrap

---
 skrub/_column_associations.py | 54 +++++++++++++++++++++++++++++++----
 1 file changed, 49 insertions(+), 5 deletions(-)

diff --git a/skrub/_column_associations.py b/skrub/_column_associations.py
index 4367964aa..a31fe8737 100644
--- a/skrub/_column_associations.py
+++ b/skrub/_column_associations.py
@@ -2,6 +2,7 @@
 import warnings
 
 import numpy as np
+import pandas as pd
 from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder
 
 from . import _dataframe as sbd
@@ -70,6 +71,7 @@ def column_associations(df):
     2  9.0810  9.4011  1.9257  5.7429  6.2358  val 2
     3  2.5425  2.9678  9.7801  9.9879  6.0709  val 3
     4  5.8878  9.3223  5.3840  7.2006  2.1494  val 4
+    >>> # Compute the associations
     >>> associations = skrub.column_associations(df)
     >>> associations # doctest: +SKIP
        left_column_name  left_column_idx right_column_name  right_column_idx  cramer_v
@@ -91,8 +93,39 @@ def column_associations(df):
     >>> pd.reset_option('display.width')
     >>> pd.reset_option('display.max_columns')
     >>> pd.reset_option('display.precision')
+
+    This is an example of the Pearson correlation coefficient:
+    >>> # Compute the correlations
+    >>> correlations = df.corr(method="pearson", min_periods=1, numeric_only=True)
+    >>> correlations
+            c_0     c_1     c_2     c_3     c_4
+    c_0  1.0000  0.1123 -0.0578  0.3212 -0.3202
+    c_1  0.1123  1.0000 -0.4986 -0.1887  0.1597
+    c_2 -0.0578 -0.4986  1.0000  0.1757 -0.2885
+    c_3  0.3212 -0.1887  0.1757  1.0000 -0.0150
+    c_4 -0.3202  0.1597 -0.2885 -0.0150  1.0000
+    >>> correlations = (correlations.stack().reset_index().set_axis
+    ...     (["left", "right", "pearson"], axis=1))
+    >>> correlations.head()
+      left  right pearson
+    0  c_0   c_0   1.0000
+    1  c_0   c_1   0.1123
+    2  c_0   c_2  -0.0578
+    3  c_0   c_3   0.3212
+    4  c_0   c_4  -0.3202
+    >>> associations = pd.merge(
+    ...     associations,
+    ...     correlations,
+    ...     left_on=["left_column_name", "right_column_name"],
+    ...     right_on=["left", "right"],
+    ...     how="left",
+    ... ).drop(columns=["left", "right"])
+    >>> pd.reset_option('display.width')
+    >>> pd.reset_option('display.max_columns')
+    >>> pd.reset_option('display.precision')
     """
-    return _stack_symmetric_associations(_cramer_v_matrix(df), df)
+    associations_table = _stack_symmetric_associations(_cramer_v_matrix(df), df)
+    return _compute_pearsons(associations_table)
 
 
 def _stack_symmetric_associations(associations, df):
@@ -249,7 +282,7 @@ def _compute_cramer(table, n_samples):
     return stat
 
 
-def _compute_pearsons(table, n_samples):
+def _compute_pearsons(table):
     """Compute the Pearson correlation coefficient statistic given a
     contingency table / pandas dataframe.
 
@@ -259,8 +292,19 @@ def _compute_pearsons(table, n_samples):
     This returns the symmetric matrix with shape (n cols, n cols) where entry
     i, j contains the statistic for column i x column j.
 
-    NOTE: get correct number
     """
-    stats = table.corr(method="pearson", min_periods=1, numeric_only=True)
-
+    associations = table
+    correlations = table.corr(method="pearson", min_periods=1, numeric_only=True)
+    correlations = (
+        correlations.stack()
+        .reset_index()
+        .set_axis(["left", "right", "pearson"], axis=1)
+    )
+    stats = pd.merge(
+        associations,
+        correlations,
+        left_on=["left_column_name", "right_column_name"],
+        right_on=["left", "right"],
+        how="left",
+    ).drop(columns=["left", "right"])
     return stats

From 69d72fc3ecec19ed72ba8cc578bf7fd7305a4d10 Mon Sep 17 00:00:00 2001
From: Reshama Shaikh <reshama.stat@gmail.com>
Date: Sun, 15 Dec 2024 17:02:53 -0500
Subject: [PATCH 3/3] rm example output for now

---
 skrub/_column_associations.py | 92 +++++++++++------------------------
 1 file changed, 29 insertions(+), 63 deletions(-)

diff --git a/skrub/_column_associations.py b/skrub/_column_associations.py
index a31fe8737..9a44ef663 100644
--- a/skrub/_column_associations.py
+++ b/skrub/_column_associations.py
@@ -20,12 +20,11 @@ def column_associations(df):
     The result is returned as a dataframe with columns:
 
     ``['left_column_name', 'left_column_idx', 'right_column_name',
-    'right_column_idx', 'cramer_v']``
+    'right_column_idx', 'cramer_v', 'pearson']``
 
     As the function is commutative, each pair of columns appears only once
     (either ``col_1``, ``col_2`` or ``col_2``, ``col_1`` but not both).
-    The results are sorted
-    from most associated to least associated.
+    The results are sorted from most associated to least associated.
 
     To compute the Cramer's V statistic, all columns are discretized. Numeric
     columns are binned with 10 bins. For categorical columns, only the 10 most
@@ -34,6 +33,10 @@ def column_associations(df):
     associations between the values of 2 columns or between their missingness
     patterns may be captured.
 
+    To compute Pearson's correlation coefficient, only numeric columns are
+    considered; pairs involving a non-numeric column get a missing value. The
+    correlation is computed with ``pandas.DataFrame.corr(method="pearson")``.
+
     Parameters
     ----------
     df : dataframe
@@ -51,6 +54,15 @@ def column_associations(df):
 
     * `Cramer's V <https://en.wikipedia.org/wiki/Cramér%27s_V>`_
 
+    Pearson's Correlation Coefficient is a measure of the linear correlation
+    between two variables, giving a value between -1 and +1 (inclusive).
+
+    * `Pearson's Correlation Coefficient
+      <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
+    * `pandas.DataFrame.corr
+      <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.corr.html>`_
+
+
     Examples
     --------
     >>> import numpy as np
@@ -74,58 +86,20 @@ def column_associations(df):
     >>> # Compute the associations
     >>> associations = skrub.column_associations(df)
     >>> associations # doctest: +SKIP
-       left_column_name  left_column_idx right_column_name  right_column_idx  cramer_v
-    0               c_3                3             c_str                 5    0.8215
-    1               c_1                1               c_4                 4    0.8215
-    2               c_0                0               c_1                 1    0.8215
-    3               c_2                2             c_str                 5    0.7551
-    4               c_0                0             c_str                 5    0.7551
-    5               c_0                0               c_3                 3    0.7551
-    6               c_1                1               c_3                 3    0.6837
-    7               c_0                0               c_4                 4    0.6837
-    8               c_4                4             c_str                 5    0.6837
-    9               c_3                3               c_4                 4    0.6053
-    10              c_2                2               c_3                 3    0.6053
-    11              c_1                1             c_str                 5    0.6053
-    12              c_0                0               c_2                 2    0.6053
-    13              c_2                2               c_4                 4    0.5169
-    14              c_1                1               c_2                 2    0.4122
-    >>> pd.reset_option('display.width')
-    >>> pd.reset_option('display.max_columns')
-    >>> pd.reset_option('display.precision')
-
-    This is an example of the Pearson correlation coefficient:
-    >>> # Compute the correlations
-    >>> correlations = df.corr(method="pearson", min_periods=1, numeric_only=True)
-    >>> correlations
-            c_0     c_1     c_2     c_3     c_4
-    c_0  1.0000  0.1123 -0.0578  0.3212 -0.3202
-    c_1  0.1123  1.0000 -0.4986 -0.1887  0.1597
-    c_2 -0.0578 -0.4986  1.0000  0.1757 -0.2885
-    c_3  0.3212 -0.1887  0.1757  1.0000 -0.0150
-    c_4 -0.3202  0.1597 -0.2885 -0.0150  1.0000
-    >>> correlations = (correlations.stack().reset_index().set_axis
-    ...     (["left", "right", "pearson"], axis=1))
-    >>> correlations.head()
-      left  right pearson
-    0  c_0   c_0   1.0000
-    1  c_0   c_1   0.1123
-    2  c_0   c_2  -0.0578
-    3  c_0   c_3   0.3212
-    4  c_0   c_4  -0.3202
-    >>> associations = pd.merge(
-    ...     associations,
-    ...     correlations,
-    ...     left_on=["left_column_name", "right_column_name"],
-    ...     right_on=["left", "right"],
-    ...     how="left",
-    ... ).drop(columns=["left", "right"])
     >>> pd.reset_option('display.width')
     >>> pd.reset_option('display.max_columns')
     >>> pd.reset_option('display.precision')
     """
-    associations_table = _stack_symmetric_associations(_cramer_v_matrix(df), df)
-    return _compute_pearsons(associations_table)
+    cramer_v_table = _stack_symmetric_associations(_cramer_v_matrix(df), df)
+    pearson_c_table = _compute_pearsons(df)
+    stats = pd.merge(
+        cramer_v_table,
+        pearson_c_table,
+        left_on=["left_column_name", "right_column_name"],
+        right_on=["left", "right"],
+        how="left",
+    ).drop(columns=["left", "right"])
+    return stats
 
 
 def _stack_symmetric_associations(associations, df):
@@ -282,7 +256,7 @@ def _compute_cramer(table, n_samples):
     return stat
 
 
-def _compute_pearsons(table):
+def _compute_pearsons(df):
     """Compute the Pearson correlation coefficient statistic given a
     contingency table / pandas dataframe.
 
@@ -293,18 +267,10 @@ def _compute_pearsons(table):
     i, j contains the statistic for column i x column j.
 
     """
-    associations = table
-    correlations = table.corr(method="pearson", min_periods=1, numeric_only=True)
-    correlations = (
+    correlations = df.corr(method="pearson", min_periods=1, numeric_only=True)
+    stat = (
         correlations.stack()
         .reset_index()
         .set_axis(["left", "right", "pearson"], axis=1)
     )
-    stats = pd.merge(
-        associations,
-        correlations,
-        left_on=["left_column_name", "right_column_name"],
-        right_on=["left", "right"],
-        how="left",
-    ).drop(columns=["left", "right"])
-    return stats
+    return stat