From e61b8d7f4b4ca5c26db77a02d60a4d62c56036b9 Mon Sep 17 00:00:00 2001
From: Reshama Shaikh <reshama.stat@gmail.com>
Date: Sat, 7 Dec 2024 08:44:19 -0500
Subject: [PATCH 1/8] add example for Cramer

---
 skrub/_column_associations.py | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/skrub/_column_associations.py b/skrub/_column_associations.py
index e39b9dd8a..4647ad9b6 100644
--- a/skrub/_column_associations.py
+++ b/skrub/_column_associations.py
@@ -18,8 +18,8 @@ def column_associations(df):
 
     The result is returned as a dataframe with columns:
 
-    ['left_column_name', 'left_column_idx', 'right_column_name',
-    'right_column_idx', 'cramer_v']
+    `['left_column_name', 'left_column_idx', 'right_column_name',
+    'right_column_idx', 'cramer_v']`
 
     As the function is commutative, each pair of columns appears only once
     (either col_1, col_2 or col_2, col_1 but not both). The results are sorted
@@ -180,6 +180,31 @@ def _compute_cramer(table, n_samples):
 
     This returns the symmetric matrix with shape (n cols, n cols) where entry
     i, j contains the statistic for column i x column j.
+
+    Returns
+    -------
+    dataframe
+        of computed associations.
+
+    Notes
+    -----
+    Cramér's V is a measure of association between two nominal variables,
+    giving a value between 0 and +1 (inclusive).
+    * `Cramer V <https://en.wikipedia.org/wiki/Cramér%27s_V>`_
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import pandas as pd
+    >>> import skrub
+    >>> pd.set_option('display.precision', 4)
+    >>> rng = np.random.default_rng()
+    >>> df = pd.DataFrame({f"c_{i}": rng.random(size=20)*10 for i in range(5)})
+    >>> df["c_str"] = [f"val {i}" for i in range(df.shape[0])]
+    >>> df.shape()
+    >>> df.head()
+    >>> associations = skrub.column_associations(df)
+    >>> associations
     """
     marginal_0 = table.sum(axis=-2)
     marginal_1 = table.sum(axis=-1)

From 262fde61a5148391149f1f9bddb64543d96db47c Mon Sep 17 00:00:00 2001
From: Reshama Shaikh <reshama.stat@gmail.com>
Date: Sat, 7 Dec 2024 09:38:35 -0500
Subject: [PATCH 2/8] move example from _compute_cramer to column_associations

---
 skrub/_column_associations.py | 44 ++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 24 deletions(-)

diff --git a/skrub/_column_associations.py b/skrub/_column_associations.py
index 4647ad9b6..922ca6970 100644
--- a/skrub/_column_associations.py
+++ b/skrub/_column_associations.py
@@ -41,6 +41,26 @@ def column_associations(df):
     -------
     dataframe
         The computed associations.
+
+    Notes
+    -----
+    Cramér's V is a measure of association between two nominal variables,
+    giving a value between 0 and +1 (inclusive).
+    * `Cramer V <https://en.wikipedia.org/wiki/Cramér%27s_V>`_
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import pandas as pd
+    >>> import skrub
+    >>> pd.set_option('display.precision', 4)
+    >>> rng = np.random.default_rng()
+    >>> df = pd.DataFrame({f"c_{i}": rng.random(size=20)*10 for i in range(5)})
+    >>> df["c_str"] = [f"val {i}" for i in range(df.shape[0])]
+    >>> df.shape
+    >>> df.head()
+    >>> associations = skrub.column_associations(df)
+    >>> associations
     """
     return _stack_symmetric_associations(_cramer_v_matrix(df), df)
 
@@ -181,30 +201,6 @@ def _compute_cramer(table, n_samples):
     This returns the symmetric matrix with shape (n cols, n cols) where entry
     i, j contains the statistic for column i x column j.
 
-    Returns
-    -------
-    dataframe
-        of computed associations.
-
-    Notes
-    -----
-    Cramér's V is a measure of association between two nominal variables,
-    giving a value between 0 and +1 (inclusive).
-    * `Cramer V <https://en.wikipedia.org/wiki/Cramér%27s_V>`_
-
-    Examples
-    --------
-    >>> import numpy as np
-    >>> import pandas as pd
-    >>> import skrub
-    >>> pd.set_option('display.precision', 4)
-    >>> rng = np.random.default_rng()
-    >>> df = pd.DataFrame({f"c_{i}": rng.random(size=20)*10 for i in range(5)})
-    >>> df["c_str"] = [f"val {i}" for i in range(df.shape[0])]
-    >>> df.shape()
-    >>> df.head()
-    >>> associations = skrub.column_associations(df)
-    >>> associations
     """
     marginal_0 = table.sum(axis=-2)
     marginal_1 = table.sum(axis=-1)

From dd43ba89a3e910c5e9a43d74fc48836d26e43273 Mon Sep 17 00:00:00 2001
From: Reshama Shaikh <reshama.stat@gmail.com>
Date: Sat, 7 Dec 2024 11:02:21 -0500
Subject: [PATCH 3/8] fixes: add output in example

---
 skrub/_column_associations.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/skrub/_column_associations.py b/skrub/_column_associations.py
index 922ca6970..50cd82c5e 100644
--- a/skrub/_column_associations.py
+++ b/skrub/_column_associations.py
@@ -54,13 +54,36 @@ def column_associations(df):
     >>> import pandas as pd
     >>> import skrub
     >>> pd.set_option('display.precision', 4)
-    >>> rng = np.random.default_rng()
+    >>> rng = np.random.default_rng(33)
     >>> df = pd.DataFrame({f"c_{i}": rng.random(size=20)*10 for i in range(5)})
     >>> df["c_str"] = [f"val {i}" for i in range(df.shape[0])]
     >>> df.shape
+    (20, 6)
     >>> df.head()
+          c_0     c_1     c_2     c_3     c_4  c_str
+    0  4.4364  4.0114  6.9271  7.0970  4.8913  val 0
+    1  5.6849  0.7192  7.6430  4.6441  2.5116  val 1
+    2  9.0810  9.4011  1.9257  5.7429  6.2358  val 2
+    3  2.5425  2.9678  9.7801  9.9879  6.0709  val 3
+    4  5.8878  9.3223  5.3840  7.2006  2.1494  val 4
     >>> associations = skrub.column_associations(df)
     >>> associations
+       left_column_name  left_column_idx right_column_name  right_column_idx  cramer_v
+    0               c_3                3             c_str                 5    0.8215
+    1               c_1                1               c_4                 4    0.8215
+    2               c_0                0               c_1                 1    0.8215
+    3               c_2                2             c_str                 5    0.7551
+    4               c_0                0             c_str                 5    0.7551
+    5               c_0                0               c_3                 3    0.7551
+    6               c_1                1               c_3                 3    0.6837
+    7               c_0                0               c_4                 4    0.6837
+    8               c_4                4             c_str                 5    0.6837
+    9               c_3                3               c_4                 4    0.6053
+    10              c_2                2               c_3                 3    0.6053
+    11              c_1                1             c_str                 5    0.6053
+    12              c_0                0               c_2                 2    0.6053
+    13              c_2                2               c_4                 4    0.5169
+    14              c_1                1               c_2                 2    0.4122
     """
     return _stack_symmetric_associations(_cramer_v_matrix(df), df)
 

From 26b073e8f792a66844751261c60715427227ae85 Mon Sep 17 00:00:00 2001
From: Reshama Shaikh <reshama.stat@gmail.com>
Date: Sat, 7 Dec 2024 11:36:02 -0500
Subject: [PATCH 4/8] add blank line under Notes bullet point

---
 skrub/_column_associations.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/skrub/_column_associations.py b/skrub/_column_associations.py
index 50cd82c5e..a4229a88a 100644
--- a/skrub/_column_associations.py
+++ b/skrub/_column_associations.py
@@ -46,6 +46,7 @@ def column_associations(df):
     -----
     Cramér's V is a measure of association between two nominal variables,
     giving a value between 0 and +1 (inclusive).
+
     * `Cramer V <https://en.wikipedia.org/wiki/Cramér%27s_V>`_
 
     Examples

From f28299b8550bace79e4221dab4aabbbf235030fa Mon Sep 17 00:00:00 2001
From: Reshama Shaikh <reshama.stat@gmail.com>
Date: Sat, 7 Dec 2024 11:39:04 -0500
Subject: [PATCH 5/8] consistent use of Cramer's V

---
 skrub/_column_associations.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/skrub/_column_associations.py b/skrub/_column_associations.py
index a4229a88a..8f3744822 100644
--- a/skrub/_column_associations.py
+++ b/skrub/_column_associations.py
@@ -22,10 +22,10 @@ def column_associations(df):
     'right_column_idx', 'cramer_v']`
 
     As the function is commutative, each pair of columns appears only once
-    (either col_1, col_2 or col_2, col_1 but not both). The results are sorted
+    (either `col_1`, `col_2` or `col_2`, `col_1` but not both). The results are sorted
     from most associated to least associated.
 
-    To compute the Cramer V statistic, all columns are discretized. Numeric
+    To compute the Cramer's V statistic, all columns are discretized. Numeric
     columns are binned with 10 bins. For categorical columns, only the 10 most
     frequent categories are considered. In both cases, nulls are treated as a
     separate category, ie a separate row in the contingency table. Thus
@@ -47,7 +47,7 @@ def column_associations(df):
     Cramér's V is a measure of association between two nominal variables,
     giving a value between 0 and +1 (inclusive).
 
-    * `Cramer V <https://en.wikipedia.org/wiki/Cramér%27s_V>`_
+    * `Cramer's V <https://en.wikipedia.org/wiki/Cramér%27s_V>`_
 
     Examples
     --------
@@ -217,7 +217,7 @@ def _contingency_table(encoded):
 
 
 def _compute_cramer(table, n_samples):
-    """Compute the Cramer V statistic given a contingency table.
+    """Compute the Cramer's V statistic given a contingency table.
 
     The input is the table computed by ``_contingency_table`` with shape
     (n cols, n cols, n bins, n bins).

From a28677bf2ca8172df8db4e01a0d33ee21e157adc Mon Sep 17 00:00:00 2001
From: Reshama Shaikh <reshama.stat@gmail.com>
Date: Sat, 7 Dec 2024 12:09:03 -0500
Subject: [PATCH 6/8] update pandas display settings

---
 skrub/_column_associations.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/skrub/_column_associations.py b/skrub/_column_associations.py
index 8f3744822..e9f637a94 100644
--- a/skrub/_column_associations.py
+++ b/skrub/_column_associations.py
@@ -54,6 +54,8 @@ def column_associations(df):
     >>> import numpy as np
     >>> import pandas as pd
     >>> import skrub
+    >>> pd.set_option('display.width', 200)
+    >>> pd.set_option('display.max_columns', 10)
     >>> pd.set_option('display.precision', 4)
     >>> rng = np.random.default_rng(33)
     >>> df = pd.DataFrame({f"c_{i}": rng.random(size=20)*10 for i in range(5)})
@@ -85,6 +87,9 @@ def column_associations(df):
     12              c_0                0               c_2                 2    0.6053
     13              c_2                2               c_4                 4    0.5169
     14              c_1                1               c_2                 2    0.4122
+    >>> pd.reset_option('display.width')
+    >>> pd.reset_option('display.max_columns')
+    >>> pd.reset_option('display.precision')
     """
     return _stack_symmetric_associations(_cramer_v_matrix(df), df)
 

From 882676563a70a6d44e92b2282b6b5ba332c69d7e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Dock=C3=A8s?= <jerome@dockes.org>
Date: Sat, 7 Dec 2024 18:19:30 +0100
Subject: [PATCH 7/8] skip doctest for the associations output in the example

---
 skrub/_column_associations.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/skrub/_column_associations.py b/skrub/_column_associations.py
index e9f637a94..46cc5fc42 100644
--- a/skrub/_column_associations.py
+++ b/skrub/_column_associations.py
@@ -70,7 +70,7 @@ def column_associations(df):
     3  2.5425  2.9678  9.7801  9.9879  6.0709  val 3
     4  5.8878  9.3223  5.3840  7.2006  2.1494  val 4
     >>> associations = skrub.column_associations(df)
-    >>> associations
+    >>> associations  # doctest: +SKIP
        left_column_name  left_column_idx right_column_name  right_column_idx  cramer_v
     0               c_3                3             c_str                 5    0.8215
     1               c_1                1               c_4                 4    0.8215

From b48168d43957d4839231515de240886fe798cfd6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Dock=C3=A8s?= <jerome@dockes.org>
Date: Sat, 7 Dec 2024 18:34:16 +0100
Subject: [PATCH 8/8] use double-backtick literals in docstring and tidy whitespace

---
 skrub/_column_associations.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/skrub/_column_associations.py b/skrub/_column_associations.py
index 46cc5fc42..6b6aba696 100644
--- a/skrub/_column_associations.py
+++ b/skrub/_column_associations.py
@@ -18,11 +18,12 @@ def column_associations(df):
 
     The result is returned as a dataframe with columns:
 
-    `['left_column_name', 'left_column_idx', 'right_column_name',
-    'right_column_idx', 'cramer_v']`
+    ``['left_column_name', 'left_column_idx', 'right_column_name',
+    'right_column_idx', 'cramer_v']``
 
     As the function is commutative, each pair of columns appears only once
-    (either `col_1`, `col_2` or `col_2`, `col_1` but not both). The results are sorted
+    (either ``col_1``, ``col_2`` or ``col_2``, ``col_1`` but not both).
+    The results are sorted
     from most associated to least associated.
 
     To compute the Cramer's V statistic, all columns are discretized. Numeric
@@ -229,7 +230,6 @@ def _compute_cramer(table, n_samples):
 
     This returns the symmetric matrix with shape (n cols, n cols) where entry
     i, j contains the statistic for column i x column j.
-
     """
     marginal_0 = table.sum(axis=-2)
     marginal_1 = table.sum(axis=-1)