skrub-data · Vincent-Maladiere · Nov 8, 2023 · Oct 19, 2023 · Oct 20, 2023 · Oct 20, 2023
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -58,6 +58,10 @@ Major changes
 Minor changes
 -------------
 
+* Scaling of ``matching_score`` in :func:`fuzzy_join` is now between 0 and 1; it used to be between 0.5 and 1. Moreover, the division by 0 error that occurred when all rows had a perfect match has been fixed. :pr:`802` by :user:`Jérôme Dockès <jeromedockes>`.
+
+* :class:`TableVectorizer` is now able to apply parallelism at the column level rather than the transformer level. This is the default for univariate transformers, such as :class:`MinHashEncoder` and :class:`GapEncoder`.
+  :pr:`592` by :user:`Leo Grinsztajn <LeoGrin>`
 
 * ``inverse_transform`` in :class:`SimilarityEncoder` now works as expected; it used to raise an exception. :pr:`801` by :user:`Jérôme Dockès <jeromedockes>`.
 
@@ -66,6 +70,7 @@ Minor changes
   :pr:`761` by :user:`Leo Grinsztajn <LeoGrin>`, :user:`Guillaume Lemaitre <glemaitre>`,
   and :user:`Jerome Dockes <jeromedockes>`.
 
+
 * Parallelized the :func:`deduplicate` function. Parameter `n_jobs`
   added to the signature. :pr:`618` by :user:`Jovan Stojanovic <jovan-stojanovic>`
   and :user:`Lilian Boulard <LilianBoulard>`

diff --git a/examples/04_fuzzy_joining.py b/examples/04_fuzzy_joining.py
@@ -128,7 +128,7 @@
 )
 
 df1.tail(20)
-# We merged the first WB table to our initial one.
+# We merged the first World Bank table to our initial one.
 
 ###############################################################################
 # .. topic:: Note:
@@ -175,7 +175,7 @@
     gdppc,
     left_on="Country",
     right_on="Country Name",
-    match_score=0.35,
+    match_score=0.1,
     return_score=True,
 )
 df1.sort_values("matching_score").head(4)
@@ -189,7 +189,7 @@
     gdppc,
     left_on="Country",
     right_on="Country Name",
-    match_score=0.35,
+    match_score=0.1,
     drop_unmatched=True,
 )
 
@@ -232,7 +232,7 @@
     life_exp,
     left_on="Country",
     right_on="Country Name",
-    match_score=0.45,
+    match_score=0.1,
 )
 
 df2.drop(columns=["Country Name"], inplace=True)
@@ -268,7 +268,7 @@
     legal_rights,
     left_on="Country",
     right_on="Country Name",
-    match_score=0.45,
+    match_score=0.1,
 )
 
 df3.drop(columns=["Country Name"], inplace=True)
@@ -303,8 +303,8 @@
 #
 # We now separate our covariates (X), from the target (or exogenous)
 # variables: y
-X = df3.drop("Happiness score", axis=1).select_dtypes(exclude=object)
 y = df3["Happiness score"]
+X = df3.drop(["Happiness score", "Country"], axis=1)
 
 ###################################################################
 # Let us now define the model that will be used to predict the happiness score:
@@ -313,10 +313,10 @@
 from sklearn.model_selection import KFold
 
 hgdb = HistGradientBoostingRegressor(random_state=0)
-cv = KFold(n_splits=2, shuffle=True, random_state=0)
+cv = KFold(n_splits=5, shuffle=True, random_state=0)
 
 #################################################################
-# To evaluate our model, we will apply a `4-fold cross-validation`.
+# To evaluate our model, we will apply a `5-fold cross-validation`.
 # We evaluate our model using the `R2` score.
 #
 # Let's finally assess the results of our models:
@@ -326,10 +326,10 @@
 
 cv_r2_t = cv_results_t["test_score"]
 
-print(f"Mean R2 score is {cv_r2_t.mean():.2f} +- {cv_r2_t.std():.2f}")
+print(f"Mean R² score is {cv_r2_t.mean():.2f} +- {cv_r2_t.std():.2f}")
 
 #################################################################
-# We have a satisfying first result: an R2 of 0.66!
+# We have a satisfying first result: an R² of 0.63!
 #
 # Data cleaning varies from dataset to dataset: there are as
 # many ways to clean a table as there are errors. |fj|
@@ -391,33 +391,15 @@
 
 # We will test four possible values of match_score:
 params = {
-    "joiner-1__match_score": [0.2, 0.9],
-    "joiner-2__match_score": [0.2, 0.9],
-    "joiner-3__match_score": [0.2, 0.9],
+    "joiner-1__match_score": [0.1, 0.9],
+    "joiner-2__match_score": [0.1, 0.9],
+    "joiner-3__match_score": [0.1, 0.9],
 }
 
-grid = GridSearchCV(pipeline, param_grid=params)
+grid = GridSearchCV(pipeline, param_grid=params, cv=cv)
 grid.fit(df, y)
 
-print(grid.best_params_)
+print("Best parameters:", grid.best_params_)
 
-##########################################################################
-# The grid searching gave us the best value of 0.5 for the parameter
-# ``match_score``. Let's use this value in our regression:
-#
-
-print(f"Mean R2 score with pipeline is {grid.score(df, y):.2f}")
-
-##########################################################################
-#
-# .. topic:: Note:
-#
-#    Here, ``grid.score()`` takes directly the best model
-#    (with ``match_score=0.5``) that was found during the grid search.
-#    Thus, it is equivalent to fixing the ``match_score`` to 0.5 and
-#    refitting the pipeline on the data.
-#
-#
-# Great, by evaluating the correct ``match_score`` we improved our
-# results significantly!
-#
+# The grid search selects a stricter threshold on the ``matching_score`` than
+# what we had set manually for the GDP and legal rights joins.
diff --git a/skrub/_fuzzy_join.py b/skrub/_fuzzy_join.py
@@ -183,9 +183,10 @@ def _nearest_matches(
     neigh.fit(aux_array)
     distance, neighbors = neigh.kneighbors(main_array, return_distance=True)
     idx_closest = np.ravel(neighbors)
-    distance = distance / np.max(distance)
-    # Normalizing distance between 0 and 1:
-    matching_score = 1 - (distance / 2)
+    max_dist = distance.max()
+    if max_dist != 0:
+        distance /= max_dist
+    matching_score = 1 - distance
     return idx_closest, matching_score
 
 
@@ -349,7 +350,7 @@ def fuzzy_join(
         a_x  b   a_y     c  matching_score
     0   ana  1   ana     7             1.0
     1  lala  2  lala     6             1.0
-    2  nana  3  <NA>  <NA>             0.5
+    2  nana  3  <NA>  <NA>             0.0
 
     As expected, the category "nana" has no exact match (`match_score=1`).
     """

diff --git a/skrub/tests/test_fuzzy_join.py b/skrub/tests/test_fuzzy_join.py
@@ -1,8 +1,10 @@
+import warnings
 from typing import Literal
 
 import numpy as np
 import pandas as pd
 import pytest
+from numpy.testing import assert_array_equal
 from pandas.testing import assert_frame_equal
 from sklearn.feature_extraction.text import HashingVectorizer
 
@@ -26,7 +28,7 @@ def test_fuzzy_join(analyzer: Literal["char", "char_wb", "word"]) -> None:
         right=df2,
         left_on="a1",
         right_on="a2",
-        match_score=0.45,
+        match_score=0.0,
         return_score=True,
         analyzer=analyzer,
     )
@@ -41,7 +43,7 @@ def test_fuzzy_join(analyzer: Literal["char", "char_wb", "word"]) -> None:
         how="left",
         left_on="a2",
         right_on="a1",
-        match_score=0.35,
+        match_score=0.0,
         return_score=True,
         analyzer=analyzer,
     )
@@ -54,7 +56,7 @@ def test_fuzzy_join(analyzer: Literal["char", "char_wb", "word"]) -> None:
         how="right",
         right_on=["a2"],
         left_on=["a1"],
-        match_score=0.35,
+        match_score=0.0,
         return_score=True,
         analyzer=analyzer,
     )
@@ -80,6 +82,26 @@ def test_fuzzy_join(analyzer: Literal["char", "char_wb", "word"]) -> None:
     assert ("a1l" and "a1r") in df.columns
 
 
+def test_match_score():
+    left = pd.DataFrame({"A": ["aa", "bb"]})
+    right = pd.DataFrame({"A": ["aa", "ba"], "B": [1, 2]})
+    join = fuzzy_join(left, right, on="A", suffixes=("l", "r"))
+    assert join["B"].to_list() == [1, 2]
+    join = fuzzy_join(left, right, on="A", suffixes=("l", "r"), match_score=0.5)
+    assert join["B"].fillna(-1).to_list() == [1, -1]
+
+
+def test_perfect_matches():
+    # non-regression test for https://github.com/skrub-data/skrub/issues/764
+    # fuzzy_join when all rows had a perfect match used to trigger a division by 0
+    df = pd.DataFrame({"A": [0, 1]})
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
+        warnings.filterwarnings("ignore", message="This feature is still experimental")
+        join = fuzzy_join(df, df, on="A", return_score=True)
+    assert_array_equal(join["matching_score"].to_numpy(), [1.0, 1.0])
+
+
 def test_fuzzy_join_dtypes() -> None:
     """
     Test that the dtypes of dataframes are maintained after join
@@ -144,16 +166,16 @@ def test_drop_unmatched() -> None:
     a = pd.DataFrame({"col1": ["aaaa", "bbb", "ddd dd"], "col2": [1, 2, 3]})
     b = pd.DataFrame({"col1": ["aaa_", "bbb_", "cc ccc"], "col3": [1, 2, 3]})
 
-    c1 = fuzzy_join(a, b, on="col1", match_score=0.6, drop_unmatched=True)
+    c1 = fuzzy_join(a, b, on="col1", match_score=0.1, drop_unmatched=True)
     assert c1.shape == (2, 4)
 
-    c2 = fuzzy_join(a, b, on="col1", match_score=0.6)
+    c2 = fuzzy_join(a, b, on="col1", match_score=0.1)
     assert sum(c2["col3"].isna()) > 0
 
-    c3 = fuzzy_join(a, b, on="col1", how="right", match_score=0.6)
+    c3 = fuzzy_join(a, b, on="col1", how="right", match_score=0.1)
     assert sum(c3["col3"].isna()) > 0
 
-    c4 = fuzzy_join(a, b, on="col1", how="right", match_score=0.6, drop_unmatched=True)
+    c4 = fuzzy_join(a, b, on="col1", how="right", match_score=0.1, drop_unmatched=True)
     assert c4.shape == (2, 4)
 
 
@@ -301,7 +323,7 @@ def test_numerical_column() -> None:
         left,
         right,
         on="int",
-        match_score=0.8,
+        match_score=0.4,
         drop_unmatched=True,
     )
     assert fj_num3.shape == (2, n_cols)