From 77b1ccc43e3d60c60a38f8ce10d5325e6bb286b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Dock=C3=A8s?= Date: Wed, 8 Nov 2023 16:42:08 +0100 Subject: [PATCH] [MRG] Fix the match score scaling (#802) Let's move this PR forward since it's on the critical path to releasing --- CHANGES.rst | 5 ++++ examples/04_fuzzy_joining.py | 52 +++++++++++----------------------- skrub/_fuzzy_join.py | 9 +++--- skrub/tests/test_fuzzy_join.py | 38 +++++++++++++++++++------ 4 files changed, 57 insertions(+), 47 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 5d547cfd6..c366c247c 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -58,6 +58,10 @@ Major changes Minor changes ------------- +* Scaling of ``matching_score`` in :func:`fuzzy_join` is now between 0 and 1; it used to be between 0.5 and 1. Moreover, the division by 0 error that occurred when all rows had a perfect match has been fixed. :pr:`802` by :user:`Jérôme Dockès `. + +* :class:`TableVectorizer` is now able to apply parallelism at the column level rather than the transformer level. This is the default for univariate transformers, like :class:`MinHashEncoder`, and :class:`GapEncoder`. + :pr:`592` by :user:`Leo Grinsztajn ` * ``inverse_transform`` in :class:`SimilarityEncoder` now works as expected; it used to raise an exception. :pr:`801` by :user:`Jérôme Dockès `. @@ -66,6 +70,7 @@ Minor changes :pr:`761` by :user:`Leo Grinsztajn `, :user:`Guillaume Lemaitre `, and :user:`Jerome Dockes `. + * Parallelized the :func:`deduplicate` function. Parameter `n_jobs` added to the signature. :pr:`618` by :user:`Jovan Stojanovic ` and :user:`Lilian Boulard ` diff --git a/examples/04_fuzzy_joining.py b/examples/04_fuzzy_joining.py index e56dbfd94..1ac375d26 100644 --- a/examples/04_fuzzy_joining.py +++ b/examples/04_fuzzy_joining.py @@ -128,7 +128,7 @@ ) df1.tail(20) -# We merged the first WB table to our initial one. +# We merged the first World Bank table to our initial one. ############################################################################### # .. topic:: Note: @@ -175,7 +175,7 @@ gdppc, left_on="Country", right_on="Country Name", - match_score=0.35, + match_score=0.1, return_score=True, ) df1.sort_values("matching_score").head(4) @@ -189,7 +189,7 @@ gdppc, left_on="Country", right_on="Country Name", - match_score=0.35, + match_score=0.1, drop_unmatched=True, ) @@ -232,7 +232,7 @@ life_exp, left_on="Country", right_on="Country Name", - match_score=0.45, + match_score=0.1, ) df2.drop(columns=["Country Name"], inplace=True) @@ -268,7 +268,7 @@ legal_rights, left_on="Country", right_on="Country Name", - match_score=0.45, + match_score=0.1, ) df3.drop(columns=["Country Name"], inplace=True) @@ -303,8 +303,8 @@ # # We now separate our covariates (X), from the target (or exogenous) # variables: y -X = df3.drop("Happiness score", axis=1).select_dtypes(exclude=object) y = df3["Happiness score"] +X = df3.drop(["Happiness score", "Country"], axis=1) ################################################################### # Let us now define the model that will be used to predict the happiness score: @@ -313,10 +313,10 @@ from sklearn.model_selection import KFold hgdb = HistGradientBoostingRegressor(random_state=0) -cv = KFold(n_splits=2, shuffle=True, random_state=0) +cv = KFold(n_splits=5, shuffle=True, random_state=0) ################################################################# -# To evaluate our model, we will apply a `4-fold cross-validation`. +# To evaluate our model, we will apply a `5-fold cross-validation`. # We evaluate our model using the `R2` score. # # Let's finally assess the results of our models: @@ -326,10 +326,10 @@ cv_r2_t = cv_results_t["test_score"] -print(f"Mean R2 score is {cv_r2_t.mean():.2f} +- {cv_r2_t.std():.2f}") +print(f"Mean R² score is {cv_r2_t.mean():.2f} +- {cv_r2_t.std():.2f}") ################################################################# -# We have a satisfying first result: an R2 of 0.66! +# We have a satisfying first result: an R² of 0.63! # # Data cleaning varies from dataset to dataset: there are as # many ways to clean a table as there are errors. |fj| @@ -391,33 +391,15 @@ # We will test four possible values of match_score: params = { - "joiner-1__match_score": [0.2, 0.9], - "joiner-2__match_score": [0.2, 0.9], - "joiner-3__match_score": [0.2, 0.9], + "joiner-1__match_score": [0.1, 0.9], + "joiner-2__match_score": [0.1, 0.9], + "joiner-3__match_score": [0.1, 0.9], } -grid = GridSearchCV(pipeline, param_grid=params) +grid = GridSearchCV(pipeline, param_grid=params, cv=cv) grid.fit(df, y) -print(grid.best_params_) +print("Best parameters:", grid.best_params_) -########################################################################## -# The grid searching gave us the best value of 0.5 for the parameter -# ``match_score``. Let's use this value in our regression: -# - -print(f"Mean R2 score with pipeline is {grid.score(df, y):.2f}") - -########################################################################## -# -# .. topic:: Note: -# -# Here, ``grid.score()`` takes directly the best model -# (with ``match_score=0.5``) that was found during the grid search. -# Thus, it is equivalent to fixing the ``match_score`` to 0.5 and -# refitting the pipeline on the data. -# -# -# Great, by evaluating the correct ``match_score`` we improved our -# results significantly! -# +# The gridsearch selects a stricter threshold on the matching_score than what +# we had set manually for the GDP and legal rights joins. diff --git a/skrub/_fuzzy_join.py b/skrub/_fuzzy_join.py index e984e21bd..29b924c7c 100644 --- a/skrub/_fuzzy_join.py +++ b/skrub/_fuzzy_join.py @@ -183,9 +183,10 @@ def _nearest_matches( neigh.fit(aux_array) distance, neighbors = neigh.kneighbors(main_array, return_distance=True) idx_closest = np.ravel(neighbors) - distance = distance / np.max(distance) - # Normalizing distance between 0 and 1: - matching_score = 1 - (distance / 2) + max_dist = distance.max() + if max_dist != 0: + distance /= max_dist + matching_score = 1 - distance return idx_closest, matching_score @@ -349,7 +350,7 @@ def fuzzy_join( a_x b a_y c matching_score 0 ana 1 ana 7 1.0 1 lala 2 lala 6 1.0 - 2 nana 3 0.5 + 2 nana 3 0.0 As expected, the category "nana" has no exact match (`match_score=1`). """ diff --git a/skrub/tests/test_fuzzy_join.py b/skrub/tests/test_fuzzy_join.py index d4b4297e8..850166b1a 100644 --- a/skrub/tests/test_fuzzy_join.py +++ b/skrub/tests/test_fuzzy_join.py @@ -1,8 +1,10 @@ +import warnings from typing import Literal import numpy as np import pandas as pd import pytest +from numpy.testing import assert_array_equal from pandas.testing import assert_frame_equal from sklearn.feature_extraction.text import HashingVectorizer @@ -26,7 +28,7 @@ def test_fuzzy_join(analyzer: Literal["char", "char_wb", "word"]) -> None: right=df2, left_on="a1", right_on="a2", - match_score=0.45, + match_score=0.0, return_score=True, analyzer=analyzer, ) @@ -41,7 +43,7 @@ def test_fuzzy_join(analyzer: Literal["char", "char_wb", "word"]) -> None: how="left", left_on="a2", right_on="a1", - match_score=0.35, + match_score=0.0, return_score=True, analyzer=analyzer, ) @@ -54,7 +56,7 @@ def test_fuzzy_join(analyzer: Literal["char", "char_wb", "word"]) -> None: how="right", right_on=["a2"], left_on=["a1"], - match_score=0.35, + match_score=0.0, return_score=True, analyzer=analyzer, ) @@ -80,6 +82,26 @@ def test_fuzzy_join(analyzer: Literal["char", "char_wb", "word"]) -> None: assert ("a1l" and "a1r") in df.columns +def test_match_score(): + left = pd.DataFrame({"A": ["aa", "bb"]}) + right = pd.DataFrame({"A": ["aa", "ba"], "B": [1, 2]}) + join = fuzzy_join(left, right, on="A", suffixes=("l", "r")) + assert join["B"].to_list() == [1, 2] + join = fuzzy_join(left, right, on="A", suffixes=("l", "r"), match_score=0.5) + assert join["B"].fillna(-1).to_list() == [1, -1] + + +def test_perfect_matches(): + # non-regression test for https://github.com/skrub-data/skrub/issues/764 + # fuzzy_join when all rows had a perfect match used to trigger a division by 0 + df = pd.DataFrame({"A": [0, 1]}) + with warnings.catch_warnings(): + warnings.simplefilter("error") + warnings.filterwarnings("ignore", message="This feature is still experimental") + join = fuzzy_join(df, df, on="A", return_score=True) + assert_array_equal(join["matching_score"].to_numpy(), [1.0, 1.0]) + + def test_fuzzy_join_dtypes() -> None: """ Test that the dtypes of dataframes are maintained after join @@ -144,16 +166,16 @@ def test_drop_unmatched() -> None: a = pd.DataFrame({"col1": ["aaaa", "bbb", "ddd dd"], "col2": [1, 2, 3]}) b = pd.DataFrame({"col1": ["aaa_", "bbb_", "cc ccc"], "col3": [1, 2, 3]}) - c1 = fuzzy_join(a, b, on="col1", match_score=0.6, drop_unmatched=True) + c1 = fuzzy_join(a, b, on="col1", match_score=0.1, drop_unmatched=True) assert c1.shape == (2, 4) - c2 = fuzzy_join(a, b, on="col1", match_score=0.6) + c2 = fuzzy_join(a, b, on="col1", match_score=0.1) assert sum(c2["col3"].isna()) > 0 - c3 = fuzzy_join(a, b, on="col1", how="right", match_score=0.6) + c3 = fuzzy_join(a, b, on="col1", how="right", match_score=0.1) assert sum(c3["col3"].isna()) > 0 - c4 = fuzzy_join(a, b, on="col1", how="right", match_score=0.6, drop_unmatched=True) + c4 = fuzzy_join(a, b, on="col1", how="right", match_score=0.1, drop_unmatched=True) assert c4.shape == (2, 4) @@ -301,7 +323,7 @@ def test_numerical_column() -> None: left, right, on="int", - match_score=0.8, + match_score=0.4, drop_unmatched=True, ) assert fj_num3.shape == (2, n_cols)