skrub-data · Vincent-Maladiere · Nov 8, 2023 · Oct 19, 2023 · Oct 20, 2023 · Oct 20, 2023
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -53,6 +53,8 @@ Major changes
 Minor changes
 -------------
 
+* The ``matching_score`` computed by :func:`fuzzy_join` now ranges from 0 to 1; it used to range from 0.5 to 1. The division-by-zero error that occurred when all rows had a perfect match has also been fixed. :pr:`802` by :user:`Jérôme Dockès <jeromedockes>`.
+
 * :class:`TableVectorizer` is now able to apply parallelism at the column level rather than the transformer level. This is the default for univariate transformers, like :class:`MinHashEncoder`, and :class:`GapEncoder`.
   :pr:`592` by :user:`Leo Grinsztajn <LeoGrin>`
 

diff --git a/examples/04_fuzzy_joining.py b/examples/04_fuzzy_joining.py
@@ -180,7 +180,7 @@
     gdppc,
     left_on="Country",
     right_on="Country Name",
-    match_score=0.35,
+    match_score=0.1,
     return_score=True,
 )
 df1.sort_values("matching_score").head(4)
@@ -194,7 +194,7 @@
     gdppc,
     left_on="Country",
     right_on="Country Name",
-    match_score=0.35,
+    match_score=0.1,
     drop_unmatched=True,
 )
 
@@ -237,7 +237,7 @@
     life_exp,
     left_on="Country",
     right_on="Country Name",
-    match_score=0.45,
+    match_score=0.1,
 )
 
 df2.drop(columns=["Country Name"], inplace=True)
@@ -273,7 +273,7 @@
     legal_rights,
     left_on="Country",
     right_on="Country Name",
-    match_score=0.45,
+    match_score=0.1,
 )
 
 df3.drop(columns=["Country Name"], inplace=True)
@@ -397,9 +397,9 @@
 
 # We will test four possible values of match_score:
 params = {
-    "joiner-1__match_score": [0.2, 0.9],
-    "joiner-2__match_score": [0.2, 0.9],
-    "joiner-3__match_score": [0.2, 0.9],
+    "joiner-1__match_score": [0.1, 0.9],
+    "joiner-2__match_score": [0.1, 0.9],
+    "joiner-3__match_score": [0.1, 0.9],
 }
 
 grid = GridSearchCV(pipeline, param_grid=params)

diff --git a/skrub/_fuzzy_join.py b/skrub/_fuzzy_join.py
@@ -183,9 +183,10 @@ def _nearest_matches(
     neigh.fit(aux_array)
     distance, neighbors = neigh.kneighbors(main_array, return_distance=True)
     idx_closest = np.ravel(neighbors)
-    distance = distance / np.max(distance)
-    # Normalizing distance between 0 and 1:
-    matching_score = 1 - (distance / 2)
+    max_dist = distance.max()
+    if max_dist != 0:
+        distance /= max_dist
+    matching_score = 1 - distance
     return idx_closest, matching_score
 
 
@@ -349,7 +350,7 @@ def fuzzy_join(
         a_x  b   a_y     c  matching_score
     0   ana  1   ana     7             1.0
     1  lala  2  lala     6             1.0
-    2  nana  3  <NA>  <NA>             0.5
+    2  nana  3  <NA>  <NA>             0.0
 
     As expected, the category "nana" has no exact match (`match_score=1`).
     """

diff --git a/skrub/tests/test_fuzzy_join.py b/skrub/tests/test_fuzzy_join.py
@@ -1,8 +1,10 @@
+import warnings
 from typing import Literal
 
 import numpy as np
 import pandas as pd
 import pytest
+from numpy.testing import assert_array_equal
 from pandas.testing import assert_frame_equal
 from sklearn.feature_extraction.text import HashingVectorizer
 
@@ -26,7 +28,7 @@ def test_fuzzy_join(analyzer: Literal["char", "char_wb", "word"]) -> None:
         right=df2,
         left_on="a1",
         right_on="a2",
-        match_score=0.45,
+        match_score=0.0,
         return_score=True,
         analyzer=analyzer,
     )
@@ -41,7 +43,7 @@ def test_fuzzy_join(analyzer: Literal["char", "char_wb", "word"]) -> None:
         how="left",
         left_on="a2",
         right_on="a1",
-        match_score=0.35,
+        match_score=0.0,
         return_score=True,
         analyzer=analyzer,
     )
@@ -54,7 +56,7 @@ def test_fuzzy_join(analyzer: Literal["char", "char_wb", "word"]) -> None:
         how="right",
         right_on=["a2"],
         left_on=["a1"],
-        match_score=0.35,
+        match_score=0.0,
         return_score=True,
         analyzer=analyzer,
     )
@@ -80,6 +82,26 @@ def test_fuzzy_join(analyzer: Literal["char", "char_wb", "word"]) -> None:
     assert ("a1l" and "a1r") in df.columns
 
 
+def test_match_score():
+    left = pd.DataFrame({"A": ["aa", "bb"]})
+    right = pd.DataFrame({"A": ["aa", "ba"], "B": [1, 2]})
+    join = fuzzy_join(left, right, on="A", suffixes=("l", "r"))  # default match_score
+    assert join["B"].to_list() == [1, 2]  # "bb" is paired with "ba"
+    join = fuzzy_join(left, right, on="A", suffixes=("l", "r"), match_score=0.5)
+    assert join["B"].fillna(-1).to_list() == [1, -1]  # "bb" has no match (missing B)
+
+
+def test_perfect_matches():
+    # Non-regression test for https://github.com/skrub-data/skrub/issues/764
+    # fuzzy_join used to trigger a division by 0 when every row had a perfect match.
+    df = pd.DataFrame({"A": [0, 1]})
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")  # any other warning raised fails the test
+        warnings.filterwarnings("ignore", message="This feature is still experimental")
+        join = fuzzy_join(df, df, on="A", return_score=True)
+    assert_array_equal(join["matching_score"].to_numpy(), [1.0, 1.0])
+
+
 def test_fuzzy_join_dtypes() -> None:
     """
     Test that the dtypes of dataframes are maintained after join
@@ -144,16 +166,16 @@ def test_drop_unmatched() -> None:
     a = pd.DataFrame({"col1": ["aaaa", "bbb", "ddd dd"], "col2": [1, 2, 3]})
     b = pd.DataFrame({"col1": ["aaa_", "bbb_", "cc ccc"], "col3": [1, 2, 3]})
 
-    c1 = fuzzy_join(a, b, on="col1", match_score=0.6, drop_unmatched=True)
+    c1 = fuzzy_join(a, b, on="col1", match_score=0.1, drop_unmatched=True)
     assert c1.shape == (2, 4)
 
-    c2 = fuzzy_join(a, b, on="col1", match_score=0.6)
+    c2 = fuzzy_join(a, b, on="col1", match_score=0.1)
     assert sum(c2["col3"].isna()) > 0
 
-    c3 = fuzzy_join(a, b, on="col1", how="right", match_score=0.6)
+    c3 = fuzzy_join(a, b, on="col1", how="right", match_score=0.1)
     assert sum(c3["col3"].isna()) > 0
 
-    c4 = fuzzy_join(a, b, on="col1", how="right", match_score=0.6, drop_unmatched=True)
+    c4 = fuzzy_join(a, b, on="col1", how="right", match_score=0.1, drop_unmatched=True)
     assert c4.shape == (2, 4)
 
 
@@ -301,7 +323,7 @@ def test_numerical_column() -> None:
         left,
         right,
         on="int",
-        match_score=0.8,
+        match_score=0.4,
         drop_unmatched=True,
     )
     assert fj_num3.shape == (2, n_cols)