From 9d8d16945cc010c69303a995c30b446465c40129 Mon Sep 17 00:00:00 2001 From: Reshama Shaikh Date: Sat, 7 Dec 2024 07:27:46 -0500 Subject: [PATCH 1/3] start pearsons --- skrub/_column_associations.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/skrub/_column_associations.py b/skrub/_column_associations.py index 6b6aba696..4367964aa 100644 --- a/skrub/_column_associations.py +++ b/skrub/_column_associations.py @@ -13,8 +13,8 @@ def column_associations(df): """Get measures of statistical associations between all pairs of columns. - At the moment, the only reported metric is Cramer's V statistic. More may - be added in the future. + Reported metrics include Cramer's V statistic and Pearson's Correlation + Coefficient. More may be added in the future. The result is returned as a dataframe with columns: @@ -247,3 +247,20 @@ def _compute_cramer(table, n_samples): stat = np.sqrt(chi_stat / (n_samples * np.maximum(min_dim, 1))) stat[min_dim == 0] = 0.0 return stat + + +def _compute_pearsons(table, n_samples): + """Compute the Pearson correlation coefficient statistic given a + contingency table / pandas dataframe. + + The input is the table computed by ``_contingency_table`` with shape + (n cols, n cols, n bins, n bins). + + This returns the symmetric matrix with shape (n cols, n cols) where entry + i, j contains the statistic for column i x column j. 
+ + NOTE: get correct number + """ + stats = table.corr(method="pearson", min_periods=1, numeric_only=True) + + return stats From 7f50701ee3019abec57bf84b3356d31d47338294 Mon Sep 17 00:00:00 2001 From: Reshama Shaikh Date: Sun, 15 Dec 2024 16:30:16 -0500 Subject: [PATCH 2/3] fix line wrap --- skrub/_column_associations.py | 54 +++++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/skrub/_column_associations.py b/skrub/_column_associations.py index 4367964aa..a31fe8737 100644 --- a/skrub/_column_associations.py +++ b/skrub/_column_associations.py @@ -2,6 +2,7 @@ import warnings import numpy as np +import pandas as pd from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder from . import _dataframe as sbd @@ -70,6 +71,7 @@ def column_associations(df): 2 9.0810 9.4011 1.9257 5.7429 6.2358 val 2 3 2.5425 2.9678 9.7801 9.9879 6.0709 val 3 4 5.8878 9.3223 5.3840 7.2006 2.1494 val 4 + >>> # Compute the associations >>> associations = skrub.column_associations(df) >>> associations # doctest: +SKIP left_column_name left_column_idx right_column_name right_column_idx cramer_v @@ -91,8 +93,39 @@ def column_associations(df): >>> pd.reset_option('display.width') >>> pd.reset_option('display.max_columns') >>> pd.reset_option('display.precision') + + This is an example of the Pearson correlation coefficient: + >>> # Compute the correlations + >>> correlations = df.corr(method="pearson", min_periods=1, numeric_only=True) + >>> correlations + c_0 c_1 c_2 c_3 c_4 + c_0 1.0000 0.1123 -0.0578 0.3212 -0.3202 + c_1 0.1123 1.0000 -0.4986 -0.1887 0.1597 + c_2 -0.0578 -0.4986 1.0000 0.1757 -0.2885 + c_3 0.3212 -0.1887 0.1757 1.0000 -0.0150 + c_4 -0.3202 0.1597 -0.2885 -0.0150 1.0000 + >>> correlations = (correlations.stack().reset_index().set_axis + ... 
(["left", "right", "pearson"], axis=1)) + >>> correlations.head() + left right pearson + 0 c_0 c_0 1.0000 + 1 c_0 c_1 0.1123 + 2 c_0 c_2 -0.0578 + 3 c_0 c_3 0.3212 + 4 c_0 c_4 -0.3202 + >>> associations = pd.merge( + ... associations, + ... correlations, + ... left_on=["left_column_name", "right_column_name"], + ... right_on=["left", "right"], + ... how="left", + ... ).drop(columns=["left", "right"]) + >>> pd.reset_option('display.width') + >>> pd.reset_option('display.max_columns') + >>> pd.reset_option('display.precision') """ - return _stack_symmetric_associations(_cramer_v_matrix(df), df) + associations_table = _stack_symmetric_associations(_cramer_v_matrix(df), df) + return _compute_pearsons(associations_table) def _stack_symmetric_associations(associations, df): @@ -249,7 +282,7 @@ def _compute_cramer(table, n_samples): return stat -def _compute_pearsons(table, n_samples): +def _compute_pearsons(table): """Compute the Pearson correlation coefficient statistic given a contingency table / pandas dataframe. @@ -259,8 +292,19 @@ def _compute_pearsons(table, n_samples): This returns the symmetric matrix with shape (n cols, n cols) where entry i, j contains the statistic for column i x column j. 
- NOTE: get correct number """ - stats = table.corr(method="pearson", min_periods=1, numeric_only=True) - + associations = table + correlations = table.corr(method="pearson", min_periods=1, numeric_only=True) + correlations = ( + correlations.stack() + .reset_index() + .set_axis(["left", "right", "pearson"], axis=1) + ) + stats = pd.merge( + associations, + correlations, + left_on=["left_column_name", "right_column_name"], + right_on=["left", "right"], + how="left", + ).drop(columns=["left", "right"]) return stats From 69d72fc3ecec19ed72ba8cc578bf7fd7305a4d10 Mon Sep 17 00:00:00 2001 From: Reshama Shaikh Date: Sun, 15 Dec 2024 17:02:53 -0500 Subject: [PATCH 3/3] rm example output for now --- skrub/_column_associations.py | 92 +++++++++++------------------------ 1 file changed, 29 insertions(+), 63 deletions(-) diff --git a/skrub/_column_associations.py b/skrub/_column_associations.py index a31fe8737..9a44ef663 100644 --- a/skrub/_column_associations.py +++ b/skrub/_column_associations.py @@ -20,12 +20,11 @@ def column_associations(df): The result is returned as a dataframe with columns: ``['left_column_name', 'left_column_idx', 'right_column_name', - 'right_column_idx', 'cramer_v']`` + 'right_column_idx', 'cramer_v', 'pearson']`` As the function is commutative, each pair of columns appears only once (either ``col_1``, ``col_2`` or ``col_2``, ``col_1`` but not both). - The results are sorted - from most associated to least associated. + The results are sorted from most associated to least associated. To compute the Cramer's V statistic, all columns are discretized. Numeric columns are binned with 10 bins. For categorical columns, only the 10 most @@ -34,6 +33,10 @@ def column_associations(df): associations between the values of 2 columns or between their missingness patterns may be captured. + To compute the Pearson's Correlation Coefficient, only numeric columns are + considered. The correlation is computed using the Pearson method used in + pandas. 
+ Parameters ---------- df : dataframe @@ -51,6 +54,15 @@ def column_associations(df): * `Cramer's V <https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V>`_ + Pearson's Correlation Coefficient is a measure of the linear correlation + between two variables, giving a value between -1 and +1 (inclusive). + + * `Pearson's Correlation Coefficient + <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_ + * `pandas.DataFrame.corr + <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.corr.html>`_ + + Examples -------- >>> import numpy as np @@ -74,58 +86,20 @@ def column_associations(df): >>> # Compute the associations >>> associations = skrub.column_associations(df) >>> associations # doctest: +SKIP - left_column_name left_column_idx right_column_name right_column_idx cramer_v - 0 c_3 3 c_str 5 0.8215 - 1 c_1 1 c_4 4 0.8215 - 2 c_0 0 c_1 1 0.8215 - 3 c_2 2 c_str 5 0.7551 - 4 c_0 0 c_str 5 0.7551 - 5 c_0 0 c_3 3 0.7551 - 6 c_1 1 c_3 3 0.6837 - 7 c_0 0 c_4 4 0.6837 - 8 c_4 4 c_str 5 0.6837 - 9 c_3 3 c_4 4 0.6053 - 10 c_2 2 c_3 3 0.6053 - 11 c_1 1 c_str 5 0.6053 - 12 c_0 0 c_2 2 0.6053 - 13 c_2 2 c_4 4 0.5169 - 14 c_1 1 c_2 2 0.4122 - >>> pd.reset_option('display.width') - >>> pd.reset_option('display.max_columns') - >>> pd.reset_option('display.precision') - - This is an example of the Pearson correlation coefficient: - >>> # Compute the correlations - >>> correlations = df.corr(method="pearson", min_periods=1, numeric_only=True) - >>> correlations - c_0 c_1 c_2 c_3 c_4 - c_0 1.0000 0.1123 -0.0578 0.3212 -0.3202 - c_1 0.1123 1.0000 -0.4986 -0.1887 0.1597 - c_2 -0.0578 -0.4986 1.0000 0.1757 -0.2885 - c_3 0.3212 -0.1887 0.1757 1.0000 -0.0150 - c_4 -0.3202 0.1597 -0.2885 -0.0150 1.0000 - >>> correlations = (correlations.stack().reset_index().set_axis - ... (["left", "right", "pearson"], axis=1)) - >>> correlations.head() - left right pearson - 0 c_0 c_0 1.0000 - 1 c_0 c_1 0.1123 - 2 c_0 c_2 -0.0578 - 3 c_0 c_3 0.3212 - 4 c_0 c_4 -0.3202 - >>> associations = pd.merge( - ... associations, - ... correlations, - ... left_on=["left_column_name", "right_column_name"], - ... right_on=["left", "right"], - ... how="left", - ... 
).drop(columns=["left", "right"]) >>> pd.reset_option('display.width') >>> pd.reset_option('display.max_columns') >>> pd.reset_option('display.precision') """ - associations_table = _stack_symmetric_associations(_cramer_v_matrix(df), df) - return _compute_pearsons(associations_table) + cramer_v_table = _stack_symmetric_associations(_cramer_v_matrix(df), df) + pearson_c_table = _compute_pearsons(df) + stats = pd.merge( + cramer_v_table, + pearson_c_table, + left_on=["left_column_name", "right_column_name"], + right_on=["left", "right"], + how="left", + ).drop(columns=["left", "right"]) + return stats def _stack_symmetric_associations(associations, df): @@ -282,7 +256,7 @@ def _compute_cramer(table, n_samples): return stat -def _compute_pearsons(table): +def _compute_pearsons(df): """Compute the Pearson correlation coefficient statistic given a contingency table / pandas dataframe. @@ -293,18 +267,10 @@ def _compute_pearsons(table): i, j contains the statistic for column i x column j. """ - associations = table - correlations = table.corr(method="pearson", min_periods=1, numeric_only=True) - correlations = ( + correlations = df.corr(method="pearson", min_periods=1, numeric_only=True) + stat = ( correlations.stack() .reset_index() .set_axis(["left", "right", "pearson"], axis=1) ) - stats = pd.merge( - associations, - correlations, - left_on=["left_column_name", "right_column_name"], - right_on=["left", "right"], - how="left", - ).drop(columns=["left", "right"]) - return stats + return stat