From e61b8d7f4b4ca5c26db77a02d60a4d62c56036b9 Mon Sep 17 00:00:00 2001 From: Reshama Shaikh Date: Sat, 7 Dec 2024 08:44:19 -0500 Subject: [PATCH 1/8] add example for Cramer --- skrub/_column_associations.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/skrub/_column_associations.py b/skrub/_column_associations.py index e39b9dd8a..4647ad9b6 100644 --- a/skrub/_column_associations.py +++ b/skrub/_column_associations.py @@ -18,8 +18,8 @@ def column_associations(df): The result is returned as a dataframe with columns: - ['left_column_name', 'left_column_idx', 'right_column_name', - 'right_column_idx', 'cramer_v'] + `['left_column_name', 'left_column_idx', 'right_column_name', + 'right_column_idx', 'cramer_v']` As the function is commutative, each pair of columns appears only once (either col_1, col_2 or col_2, col_1 but not both). The results are sorted @@ -180,6 +180,31 @@ def _compute_cramer(table, n_samples): This returns the symmetric matrix with shape (n cols, n cols) where entry i, j contains the statistic for column i x column j. + + Returns + ------- + dataframe + of computed associations. + + Notes + ----- + Cramér's V is a measure of association between two nominal variables, + giving a value between 0 and +1 (inclusive). 
+ * `Cramer V <https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V>`_ + + Examples + -------- + >>> import numpy as np + >>> import pandas as pd + >>> import skrub + >>> pd.set_option('display.precision', 4) + >>> rng = np.random.default_rng() + >>> df = pd.DataFrame({f"c_{i}": rng.random(size=20)*10 for i in range(5)}) + >>> df["c_str"] = [f"val {i}" for i in range(df.shape[0])] + >>> df.shape() + >>> df.head() + >>> associations = skrub.column_associations(df) + >>> associations """ marginal_0 = table.sum(axis=-2) marginal_1 = table.sum(axis=-1) From 262fde61a5148391149f1f9bddb64543d96db47c Mon Sep 17 00:00:00 2001 From: Reshama Shaikh Date: Sat, 7 Dec 2024 09:38:35 -0500 Subject: [PATCH 2/8] mv example to top function --- skrub/_column_associations.py | 44 ++++++++++++++++------------------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/skrub/_column_associations.py b/skrub/_column_associations.py index 4647ad9b6..922ca6970 100644 --- a/skrub/_column_associations.py +++ b/skrub/_column_associations.py @@ -41,6 +41,26 @@ def column_associations(df): ------- dataframe The computed associations. + + Notes + ----- + Cramér's V is a measure of association between two nominal variables, + giving a value between 0 and +1 (inclusive). + * `Cramer V <https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V>`_ + + Examples + -------- + >>> import numpy as np + >>> import pandas as pd + >>> import skrub + >>> pd.set_option('display.precision', 4) + >>> rng = np.random.default_rng() + >>> df = pd.DataFrame({f"c_{i}": rng.random(size=20)*10 for i in range(5)}) + >>> df["c_str"] = [f"val {i}" for i in range(df.shape[0])] + >>> df.shape + >>> df.head() + >>> associations = skrub.column_associations(df) + >>> associations """ return _stack_symmetric_associations(_cramer_v_matrix(df), df) @@ -181,30 +201,6 @@ def _compute_cramer(table, n_samples): This returns the symmetric matrix with shape (n cols, n cols) where entry i, j contains the statistic for column i x column j. - Returns - ------- - dataframe - of computed associations.
- - Notes - ----- - Cramér's V is a measure of association between two nominal variables, - giving a value between 0 and +1 (inclusive). - * `Cramer V <https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V>`_ - - Examples - -------- - >>> import numpy as np - >>> import pandas as pd - >>> import skrub - >>> pd.set_option('display.precision', 4) - >>> rng = np.random.default_rng() - >>> df = pd.DataFrame({f"c_{i}": rng.random(size=20)*10 for i in range(5)}) - >>> df["c_str"] = [f"val {i}" for i in range(df.shape[0])] - >>> df.shape() - >>> df.head() - >>> associations = skrub.column_associations(df) - >>> associations """ marginal_0 = table.sum(axis=-2) marginal_1 = table.sum(axis=-1) From dd43ba89a3e910c5e9a43d74fc48836d26e43273 Mon Sep 17 00:00:00 2001 From: Reshama Shaikh Date: Sat, 7 Dec 2024 11:02:21 -0500 Subject: [PATCH 3/8] fixes: add output in example --- skrub/_column_associations.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/skrub/_column_associations.py b/skrub/_column_associations.py index 922ca6970..50cd82c5e 100644 --- a/skrub/_column_associations.py +++ b/skrub/_column_associations.py @@ -54,13 +54,36 @@ def column_associations(df): >>> import pandas as pd >>> import skrub >>> pd.set_option('display.precision', 4) - >>> rng = np.random.default_rng() + >>> rng = np.random.default_rng(33) >>> df = pd.DataFrame({f"c_{i}": rng.random(size=20)*10 for i in range(5)}) >>> df["c_str"] = [f"val {i}" for i in range(df.shape[0])] >>> df.shape + (20, 6) >>> df.head() + c_0 c_1 c_2 c_3 c_4 c_str + 0 4.4364 4.0114 6.9271 7.0970 4.8913 val 0 + 1 5.6849 0.7192 7.6430 4.6441 2.5116 val 1 + 2 9.0810 9.4011 1.9257 5.7429 6.2358 val 2 + 3 2.5425 2.9678 9.7801 9.9879 6.0709 val 3 + 4 5.8878 9.3223 5.3840 7.2006 2.1494 val 4 >>> associations = skrub.column_associations(df) >>> associations + left_column_name left_column_idx right_column_name right_column_idx cramer_v + 0 c_3 3 c_str 5 0.8215 + 1 c_1 1 c_4 4 0.8215 + 2 c_0 0 c_1 1 0.8215 + 3 c_2 2 c_str 5 0.7551 + 4 c_0 0 c_str
5 0.7551 + 5 c_0 0 c_3 3 0.7551 + 6 c_1 1 c_3 3 0.6837 + 7 c_0 0 c_4 4 0.6837 + 8 c_4 4 c_str 5 0.6837 + 9 c_3 3 c_4 4 0.6053 + 10 c_2 2 c_3 3 0.6053 + 11 c_1 1 c_str 5 0.6053 + 12 c_0 0 c_2 2 0.6053 + 13 c_2 2 c_4 4 0.5169 + 14 c_1 1 c_2 2 0.4122 """ return _stack_symmetric_associations(_cramer_v_matrix(df), df) From 26b073e8f792a66844751261c60715427227ae85 Mon Sep 17 00:00:00 2001 From: Reshama Shaikh Date: Sat, 7 Dec 2024 11:36:02 -0500 Subject: [PATCH 4/8] add blank line under Notes bullet point --- skrub/_column_associations.py | 1 + 1 file changed, 1 insertion(+) diff --git a/skrub/_column_associations.py b/skrub/_column_associations.py index 50cd82c5e..a4229a88a 100644 --- a/skrub/_column_associations.py +++ b/skrub/_column_associations.py @@ -46,6 +46,7 @@ def column_associations(df): ----- Cramér's V is a measure of association between two nominal variables, giving a value between 0 and +1 (inclusive). + * `Cramer V <https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V>`_ Examples From f28299b8550bace79e4221dab4aabbbf235030fa Mon Sep 17 00:00:00 2001 From: Reshama Shaikh Date: Sat, 7 Dec 2024 11:39:04 -0500 Subject: [PATCH 5/8] consistent use of Cramer's V --- skrub/_column_associations.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/skrub/_column_associations.py b/skrub/_column_associations.py index a4229a88a..8f3744822 100644 --- a/skrub/_column_associations.py +++ b/skrub/_column_associations.py @@ -22,10 +22,10 @@ def column_associations(df): 'right_column_idx', 'cramer_v']` As the function is commutative, each pair of columns appears only once - (either col_1, col_2 or col_2, col_1 but not both). The results are sorted + (either `col_1`, `col_2` or `col_2`, `col_1` but not both). The results are sorted from most associated to least associated. - To compute the Cramer V statistic, all columns are discretized. Numeric + To compute the Cramer's V statistic, all columns are discretized. Numeric columns are binned with 10 bins.
For categorical columns, only the 10 most frequent categories are considered. In both cases, nulls are treated as a separate category, ie a separate row in the contingency table. Thus @@ -47,7 +47,7 @@ def column_associations(df): Cramér's V is a measure of association between two nominal variables, giving a value between 0 and +1 (inclusive). - * `Cramer V <https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V>`_ + * `Cramer's V <https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V>`_ Examples -------- @@ -217,7 +217,7 @@ def _contingency_table(encoded): def _compute_cramer(table, n_samples): - """Compute the Cramer V statistic given a contingency table. + """Compute the Cramer's V statistic given a contingency table. The input is the table computed by ``_contingency_table`` with shape (n cols, n cols, n bins, n bins). From a28677bf2ca8172df8db4e01a0d33ee21e157adc Mon Sep 17 00:00:00 2001 From: Reshama Shaikh Date: Sat, 7 Dec 2024 12:09:03 -0500 Subject: [PATCH 6/8] update pandas display settings --- skrub/_column_associations.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/skrub/_column_associations.py b/skrub/_column_associations.py index 8f3744822..e9f637a94 100644 --- a/skrub/_column_associations.py +++ b/skrub/_column_associations.py @@ -54,6 +54,8 @@ def column_associations(df): >>> import numpy as np >>> import pandas as pd >>> import skrub + >>> pd.set_option('display.width', 200) + >>> pd.set_option('display.max_columns', 10) >>> pd.set_option('display.precision', 4) >>> rng = np.random.default_rng(33) >>> df = pd.DataFrame({f"c_{i}": rng.random(size=20)*10 for i in range(5)}) @@ -85,6 +87,9 @@ def column_associations(df): 12 c_0 0 c_2 2 0.6053 13 c_2 2 c_4 4 0.5169 14 c_1 1 c_2 2 0.4122 + >>> pd.reset_option('display.width') + >>> pd.reset_option('display.max_columns') + >>> pd.reset_option('display.precision') """ return _stack_symmetric_associations(_cramer_v_matrix(df), df) From 882676563a70a6d44e92b2282b6b5ba332c69d7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Dock=C3=A8s?= Date: Sat, 7 Dec 2024 18:19:30 +0100 Subject: [PATCH 7/8]
Update skrub/_column_associations.py --- skrub/_column_associations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrub/_column_associations.py b/skrub/_column_associations.py index e9f637a94..46cc5fc42 100644 --- a/skrub/_column_associations.py +++ b/skrub/_column_associations.py @@ -70,7 +70,7 @@ def column_associations(df): 3 2.5425 2.9678 9.7801 9.9879 6.0709 val 3 4 5.8878 9.3223 5.3840 7.2006 2.1494 val 4 >>> associations = skrub.column_associations(df) - >>> associations + >>> associations # doctest: +SKIP left_column_name left_column_idx right_column_name right_column_idx cramer_v 0 c_3 3 c_str 5 0.8215 1 c_1 1 c_4 4 0.8215 From b48168d43957d4839231515de240886fe798cfd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Dock=C3=A8s?= Date: Sat, 7 Dec 2024 18:34:16 +0100 Subject: [PATCH 8/8] Apply suggestions from code review --- skrub/_column_associations.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/skrub/_column_associations.py b/skrub/_column_associations.py index 46cc5fc42..6b6aba696 100644 --- a/skrub/_column_associations.py +++ b/skrub/_column_associations.py @@ -18,11 +18,12 @@ def column_associations(df): The result is returned as a dataframe with columns: - `['left_column_name', 'left_column_idx', 'right_column_name', - 'right_column_idx', 'cramer_v']` + ``['left_column_name', 'left_column_idx', 'right_column_name', + 'right_column_idx', 'cramer_v']`` As the function is commutative, each pair of columns appears only once - (either `col_1`, `col_2` or `col_2`, `col_1` but not both). The results are sorted + (either ``col_1``, ``col_2`` or ``col_2``, ``col_1`` but not both). + The results are sorted from most associated to least associated. To compute the Cramer's V statistic, all columns are discretized. 
Numeric @@ -229,7 +230,6 @@ def _compute_cramer(table, n_samples): This returns the symmetric matrix with shape (n cols, n cols) where entry i, j contains the statistic for column i x column j. - """ marginal_0 = table.sum(axis=-2) marginal_1 = table.sum(axis=-1)