[MRG] Fix the match score scaling (skrub-data#802)
Let's move this PR forward since it's on the critical path to releasing
jeromedockes authored Nov 8, 2023
1 parent 930c7aa commit 77b1ccc
Showing 4 changed files with 57 additions and 47 deletions.
5 changes: 5 additions & 0 deletions CHANGES.rst
@@ -58,6 +58,10 @@ Major changes
Minor changes
-------------

* Scaling of ``matching_score`` in :func:`fuzzy_join` is now between 0 and 1; it used to be between 0.5 and 1. Moreover, the division by 0 error that occurred when all rows had a perfect match has been fixed. :pr:`802` by :user:`Jérôme Dockès <jeromedockes>`.

* :class:`TableVectorizer` is now able to apply parallelism at the column level rather than the transformer level. This is the default for univariate transformers such as :class:`MinHashEncoder` and :class:`GapEncoder` (see the usage sketch after this changelog excerpt).
  :pr:`592` by :user:`Leo Grinsztajn <LeoGrin>`

* ``inverse_transform`` in :class:`SimilarityEncoder` now works as expected; it used to raise an exception. :pr:`801` by :user:`Jérôme Dockès <jeromedockes>`.

@@ -66,6 +70,7 @@ Minor changes
:pr:`761` by :user:`Leo Grinsztajn <LeoGrin>`, :user:`Guillaume Lemaitre <glemaitre>`,
and :user:`Jerome Dockes <jeromedockes>`.


* Parallelized the :func:`deduplicate` function. Parameter `n_jobs`
added to the signature. :pr:`618` by :user:`Jovan Stojanovic <jovan-stojanovic>`
and :user:`Lilian Boulard <LilianBoulard>`
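
As a quick illustration of the :class:`TableVectorizer` entry above — a hypothetical usage sketch; the ``n_jobs`` value and the toy dataframe are assumptions made here for runnability, not part of this diff:

```python
import pandas as pd
from skrub import TableVectorizer

df = pd.DataFrame(
    {
        "city": ["Paris", "London", "Madrid"],
        "population": [2_165_000, 8_982_000, 3_223_000],
    }
)

# With n_jobs set (per PR skrub-data#592), univariate transformers such as
# MinHashEncoder can be dispatched per column rather than per transformer.
vectorizer = TableVectorizer(n_jobs=4)
X_trans = vectorizer.fit_transform(df)
```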
52 changes: 17 additions & 35 deletions examples/04_fuzzy_joining.py
@@ -128,7 +128,7 @@
)

df1.tail(20)
-# We merged the first WB table to our initial one.
+# We merged the first World Bank table to our initial one.

###############################################################################
# .. topic:: Note:
@@ -175,7 +175,7 @@
gdppc,
left_on="Country",
right_on="Country Name",
-    match_score=0.35,
+    match_score=0.1,
return_score=True,
)
df1.sort_values("matching_score").head(4)
@@ -189,7 +189,7 @@
gdppc,
left_on="Country",
right_on="Country Name",
-    match_score=0.35,
+    match_score=0.1,
drop_unmatched=True,
)

@@ -232,7 +232,7 @@
life_exp,
left_on="Country",
right_on="Country Name",
-    match_score=0.45,
+    match_score=0.1,
)

df2.drop(columns=["Country Name"], inplace=True)
@@ -268,7 +268,7 @@
legal_rights,
left_on="Country",
right_on="Country Name",
-    match_score=0.45,
+    match_score=0.1,
)

df3.drop(columns=["Country Name"], inplace=True)
@@ -303,8 +303,8 @@
#
# We now separate our covariates (X), from the target (or exogenous)
# variables: y
X = df3.drop("Happiness score", axis=1).select_dtypes(exclude=object)
y = df3["Happiness score"]
X = df3.drop(["Happiness score", "Country"], axis=1)

###################################################################
# Let us now define the model that will be used to predict the happiness score:
@@ -313,10 +313,10 @@
from sklearn.model_selection import KFold

hgdb = HistGradientBoostingRegressor(random_state=0)
-cv = KFold(n_splits=2, shuffle=True, random_state=0)
+cv = KFold(n_splits=5, shuffle=True, random_state=0)

#################################################################
-# To evaluate our model, we will apply a `4-fold cross-validation`.
+# To evaluate our model, we will apply a `5-fold cross-validation`.
# We evaluate our model using the `R2` score.
#
# Let's finally assess the results of our models:
@@ -326,10 +326,10 @@

cv_r2_t = cv_results_t["test_score"]

print(f"Mean R2 score is {cv_r2_t.mean():.2f} +- {cv_r2_t.std():.2f}")
print(f"Mean score is {cv_r2_t.mean():.2f} +- {cv_r2_t.std():.2f}")

#################################################################
-# We have a satisfying first result: an R2 of 0.66!
+# We have a satisfying first result: an R² of 0.63!
#
# Data cleaning varies from dataset to dataset: there are as
# many ways to clean a table as there are errors. |fj|
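
For reference, a self-contained sketch of the evaluation loop this hunk arrives at; the synthetic ``X`` and ``y`` below stand in for the joined happiness table and are an assumption made here so the snippet runs on its own:

```python
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import KFold, cross_validate

rng = np.random.default_rng(0)
X = rng.normal(size=(150, 3))  # stand-in for the joined covariates
y = X @ np.array([0.5, -0.2, 0.1]) + rng.normal(scale=0.1, size=150)

hgdb = HistGradientBoostingRegressor(random_state=0)
cv = KFold(n_splits=5, shuffle=True, random_state=0)

# cross_validate scores each fold with the estimator's default metric,
# which is R² for regressors.
cv_results = cross_validate(hgdb, X, y, cv=cv)
scores = cv_results["test_score"]
print(f"Mean R² score is {scores.mean():.2f} +- {scores.std():.2f}")
```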
@@ -391,33 +391,15 @@

# We will test four possible values of match_score:
params = {
"joiner-1__match_score": [0.2, 0.9],
"joiner-2__match_score": [0.2, 0.9],
"joiner-3__match_score": [0.2, 0.9],
"joiner-1__match_score": [0.1, 0.9],
"joiner-2__match_score": [0.1, 0.9],
"joiner-3__match_score": [0.1, 0.9],
}

-grid = GridSearchCV(pipeline, param_grid=params)
+grid = GridSearchCV(pipeline, param_grid=params, cv=cv)
grid.fit(df, y)

-print(grid.best_params_)
+print("Best parameters:", grid.best_params_)

##########################################################################
-# The grid searching gave us the best value of 0.5 for the parameter
-# ``match_score``. Let's use this value in our regression:
-#
-
-print(f"Mean R2 score with pipeline is {grid.score(df, y):.2f}")
-
-##########################################################################
-#
-# .. topic:: Note:
-#
-#    Here, ``grid.score()`` takes directly the best model
-#    (with ``match_score=0.5``) that was found during the grid search.
-#    Thus, it is equivalent to fixing the ``match_score`` to 0.5 and
-#    refitting the pipeline on the data.
-#
-#
-# Great, by evaluating the correct ``match_score`` we improved our
-# results significantly!
-#
+# The gridsearch selects a stricter threshold on the matching_score than what
+# we had set manually for the GDP and legal rights joins.
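
The ``joiner-1__match_score`` keys above follow scikit-learn's ``<step name>__<parameter>`` convention for addressing a pipeline step's parameters in a grid search. A minimal, generic illustration of that convention — the pipeline, step names, and parameter grid here are placeholders, not skrub's API:

```python
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_regression(n_samples=100, n_features=4, random_state=0)

pipeline = Pipeline([("scale", StandardScaler()), ("ridge", Ridge())])

# "<step name>__<parameter>" targets one step's parameter, exactly as
# "joiner-1__match_score" targets the match_score of the first joiner above.
params = {"ridge__alpha": [0.1, 1.0, 10.0]}

grid = GridSearchCV(pipeline, param_grid=params, cv=5)
grid.fit(X, y)
print("Best parameters:", grid.best_params_)
```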
9 changes: 5 additions & 4 deletions skrub/_fuzzy_join.py
@@ -183,9 +183,10 @@ def _nearest_matches(
neigh.fit(aux_array)
distance, neighbors = neigh.kneighbors(main_array, return_distance=True)
idx_closest = np.ravel(neighbors)
-    distance = distance / np.max(distance)
-    # Normalizing distance between 0 and 1:
-    matching_score = 1 - (distance / 2)
+    max_dist = distance.max()
+    if max_dist != 0:
+        distance /= max_dist
+    matching_score = 1 - distance
return idx_closest, matching_score


@@ -349,7 +350,7 @@ def fuzzy_join(
a_x b a_y c matching_score
0 ana 1 ana 7 1.0
1 lala 2 lala 6 1.0
-2 nana 3 <NA> <NA> 0.5
+2 nana 3 <NA> <NA> 0.0
As expected, the category "nana" has no exact match (`match_score=1`).
"""
38 changes: 30 additions & 8 deletions skrub/tests/test_fuzzy_join.py
@@ -1,8 +1,10 @@
+import warnings
from typing import Literal

import numpy as np
import pandas as pd
import pytest
+from numpy.testing import assert_array_equal
from pandas.testing import assert_frame_equal
from sklearn.feature_extraction.text import HashingVectorizer

@@ -26,7 +28,7 @@ def test_fuzzy_join(analyzer: Literal["char", "char_wb", "word"]) -> None:
right=df2,
left_on="a1",
right_on="a2",
-        match_score=0.45,
+        match_score=0.0,
return_score=True,
analyzer=analyzer,
)
@@ -41,7 +43,7 @@ def test_fuzzy_join(analyzer: Literal["char", "char_wb", "word"]) -> None:
how="left",
left_on="a2",
right_on="a1",
-        match_score=0.35,
+        match_score=0.0,
return_score=True,
analyzer=analyzer,
)
@@ -54,7 +56,7 @@ def test_fuzzy_join(analyzer: Literal["char", "char_wb", "word"]) -> None:
how="right",
right_on=["a2"],
left_on=["a1"],
-        match_score=0.35,
+        match_score=0.0,
return_score=True,
analyzer=analyzer,
)
@@ -80,6 +82,26 @@ def test_fuzzy_join(analyzer: Literal["char", "char_wb", "word"]) -> None:
assert ("a1l" and "a1r") in df.columns


+def test_match_score():
+    left = pd.DataFrame({"A": ["aa", "bb"]})
+    right = pd.DataFrame({"A": ["aa", "ba"], "B": [1, 2]})
+    join = fuzzy_join(left, right, on="A", suffixes=("l", "r"))
+    assert join["B"].to_list() == [1, 2]
+    join = fuzzy_join(left, right, on="A", suffixes=("l", "r"), match_score=0.5)
+    assert join["B"].fillna(-1).to_list() == [1, -1]
+
+
+def test_perfect_matches():
+    # non-regression test for https://github.com/skrub-data/skrub/issues/764
+    # fuzzy_join when all rows had a perfect match used to trigger a division by 0
+    df = pd.DataFrame({"A": [0, 1]})
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
+        warnings.filterwarnings("ignore", message="This feature is still experimental")
+        join = fuzzy_join(df, df, on="A", return_score=True)
+    assert_array_equal(join["matching_score"].to_numpy(), [1.0, 1.0])


def test_fuzzy_join_dtypes() -> None:
"""
Test that the dtypes of dataframes are maintained after join
@@ -144,16 +166,16 @@ def test_drop_unmatched() -> None:
a = pd.DataFrame({"col1": ["aaaa", "bbb", "ddd dd"], "col2": [1, 2, 3]})
b = pd.DataFrame({"col1": ["aaa_", "bbb_", "cc ccc"], "col3": [1, 2, 3]})

c1 = fuzzy_join(a, b, on="col1", match_score=0.6, drop_unmatched=True)
c1 = fuzzy_join(a, b, on="col1", match_score=0.1, drop_unmatched=True)
assert c1.shape == (2, 4)

-    c2 = fuzzy_join(a, b, on="col1", match_score=0.6)
+    c2 = fuzzy_join(a, b, on="col1", match_score=0.1)
assert sum(c2["col3"].isna()) > 0

-    c3 = fuzzy_join(a, b, on="col1", how="right", match_score=0.6)
+    c3 = fuzzy_join(a, b, on="col1", how="right", match_score=0.1)
assert sum(c3["col3"].isna()) > 0

-    c4 = fuzzy_join(a, b, on="col1", how="right", match_score=0.6, drop_unmatched=True)
+    c4 = fuzzy_join(a, b, on="col1", how="right", match_score=0.1, drop_unmatched=True)
assert c4.shape == (2, 4)


@@ -301,7 +323,7 @@ def test_numerical_column() -> None:
left,
right,
on="int",
-        match_score=0.8,
+        match_score=0.4,
drop_unmatched=True,
)
assert fj_num3.shape == (2, n_cols)
