From 9744913b5a2fedd2f61125316226413330aea133 Mon Sep 17 00:00:00 2001 From: ADBond <48208438+ADBond@users.noreply.github.com> Date: Thu, 18 Jan 2024 12:27:38 +0000 Subject: [PATCH 1/4] adjust tests for DateComparison --- tests/test_comparison_template_lib.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/tests/test_comparison_template_lib.py b/tests/test_comparison_template_lib.py index ff8e849fea..7d97bdd13c 100644 --- a/tests/test_comparison_template_lib.py +++ b/tests/test_comparison_template_lib.py @@ -2,6 +2,7 @@ import pytest import splink.comparison_template_library as ctl +from splink.column_expression import ColumnExpression from .decorator import mark_with_dialects_excluding @@ -18,7 +19,7 @@ def test_date_comparison_run(dialect): @mark_with_dialects_excluding("postgres", "sqlite") def test_date_comparison_dl_run(dialect): ctl.DateComparison( - "date", levenshtein_thresholds=[1], damerau_levenshtein_thresholds=[] + "date", fuzzy_thresholds=[1], fuzzy_metric="levenshtein" ).get_comparison(dialect) @@ -80,17 +81,26 @@ def test_datediff_levels(dialect, test_helpers, test_gamma_assert): # Generate our various settings objs settings = { "link_type": "dedupe_only", - "comparisons": [ctl.DateComparison("dob", cast_strings_to_date=True)], + "comparisons": [ + ctl.DateComparison( + # TODO: revert to default damerau_levenshtein metric + ColumnExpression("dob").try_parse_date(), fuzzy_metric="levenshtein", fuzzy_thresholds=[2] + ) + ], } # We need to put our column in datetime format for this to work df = helper.convert_frame(df) linker = helper.Linker(df, settings, **helper.extra_linker_args()) + linker.debug_mode = True linker_output = linker.predict().as_pandas_dataframe() # # Dict key: {gamma_level value: size} - size_gamma_lookup = {0: 8, 1: 15, 2: 5, 3: 5, 4: 1, 5: 2} + # 0 - else, 1 - 10 years, 2 - 1 year, 3 - 1 month, 4 - fuzzy, 5 - date match + size_gamma_lookup = {0: 8, 1: 15, 2: 5, 3: 3, 4: 3, 5: 2} + # Dam-lev version - difference between 1-months and fuzzy level + # size_gamma_lookup = {0: 8, 1: 15, 2: 6, 3: 5, 4: 1, 5: 2} # Check gamma sizes are as expected for gamma, expected_size in size_gamma_lookup.items(): @@ -457,7 +467,7 @@ def test_email_comparison_levels(dialect, test_helpers, test_gamma_assert): ctl.EmailComparison( col_name=col_name, invalid_emails_as_null=True, - thresholds=[2], + fuzzy_thresholds=[2], fuzzy_metric="damerau_levenshtein", include_domain_match_level=True, ) From b44da759d61181007bd4f8965b72069ef882b3e0 Mon Sep 17 00:00:00 2001 From: ADBond <48208438+ADBond@users.noreply.github.com> Date: Thu, 18 Jan 2024 13:00:19 +0000 Subject: [PATCH 2/4] adjust test for NameComparison (with new defaults) --- tests/test_comparison_template_lib.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/tests/test_comparison_template_lib.py b/tests/test_comparison_template_lib.py index 7d97bdd13c..30cffb73f0 100644 --- a/tests/test_comparison_template_lib.py +++ b/tests/test_comparison_template_lib.py @@ -84,7 +84,9 @@ def test_datediff_levels(dialect, test_helpers, test_gamma_assert): "comparisons": [ ctl.DateComparison( # TODO: revert to default damerau_levenshtein metric - ColumnExpression("dob").try_parse_date(), fuzzy_metric="levenshtein", fuzzy_thresholds=[2] + ColumnExpression("dob").try_parse_date(), + fuzzy_metric="levenshtein", + fuzzy_thresholds=[2], ) ], } @@ -216,29 +218,24 @@ def test_name_comparison_levels(dialect, test_helpers): linker_output = linker.predict().as_pandas_dataframe() # # Dict key: {gamma_level value: size} - size_gamma_lookup = {0: 6, 1: 4, 2: 0, 3: 2, 4: 2, 5: 1} - # 5: exact_match - # 4: dmetaphone exact match - # 3: damerau_levenshtein <= 1 + size_gamma_lookup = {0: 6, 1: 6, 2: 0, 3: 2, 4: 1} + # 4: exact_match + # 3: dmetaphone exact match # 2: jaro_winkler > 0.9 # 1: jaro_winkler > 0.8 # 0: else # Check gamma sizes are as expected for gamma, expected_size in size_gamma_lookup.items(): - assert ( - sum(linker_output["gamma_custom_first_name_first_name_metaphone"] == gamma) - == expected_size - ) + assert sum(linker_output["gamma_first_name"] == gamma) == expected_size # Check individual IDs are assigned to the correct gamma values # Dict key: {gamma_value: tuple of ID pairs} size_gamma_lookup = { - 5: [[1, 6]], - 4: [(2, 3), (4, 5)], - 3: [(4, 6)], + 4: [[1, 6]], + 3: [(2, 3), (4, 5)], 2: [], - 1: [(1, 2), (2, 6)], + 1: [(1, 2), (2, 6), (4, 6)], 0: [(2, 4), (5, 6)], } @@ -248,7 +245,7 @@ def test_name_comparison_levels(dialect, test_helpers): linker_output.loc[ (linker_output.unique_id_l == left) & (linker_output.unique_id_r == right) - ]["gamma_custom_first_name_first_name_metaphone"].values[0] + ]["gamma_first_name"].values[0] == gamma ) From 1eda05360dcc1bf0827087615f64e648a384ae87 Mon Sep 17 00:00:00 2001 From: ADBond <48208438+ADBond@users.noreply.github.com> Date: Thu, 18 Jan 2024 13:37:32 +0000 Subject: [PATCH 3/4] postcode comparison test update name --- tests/test_comparison_template_lib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_comparison_template_lib.py b/tests/test_comparison_template_lib.py index 30cffb73f0..c320ee5183 100644 --- a/tests/test_comparison_template_lib.py +++ b/tests/test_comparison_template_lib.py @@ -350,7 +350,7 @@ def test_forename_surname_comparison_levels(dialect, test_helpers): ) -# postcode_comparison +# PostcodeComparison @mark_with_dialects_excluding("postgres", "sqlite") @@ -409,7 +409,7 @@ def test_postcode_comparison_levels(dialect, test_helpers, test_gamma_assert): settings = { "link_type": "dedupe_only", "comparisons": [ - ctl.postcode_comparison( + ctl.PostcodeComparison( col_name=col_name, lat_col="lat", long_col="long", From aa5c5516f5a510a15b84e3c8d317e0583f290c75 Mon Sep 17 00:00:00 2001 From: ADBond <48208438+ADBond@users.noreply.github.com> Date: Thu, 18 Jan 2024 16:26:17 +0000 Subject: [PATCH 4/4] fornamesurname test adjust gamma name --- tests/test_comparison_template_lib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_comparison_template_lib.py b/tests/test_comparison_template_lib.py index c320ee5183..6e6495616e 100644 --- a/tests/test_comparison_template_lib.py +++ b/tests/test_comparison_template_lib.py @@ -322,7 +322,7 @@ def test_forename_surname_comparison_levels(dialect, test_helpers): # Check gamma sizes are as expected for gamma, expected_size in size_gamma_lookup.items(): - gamma_matches = linker_output.filter(like="gamma_custom") == gamma + gamma_matches = linker_output.filter(like="gamma_forename_surname") == gamma gamma_matches_size = gamma_matches.sum().values[0] assert gamma_matches_size == expected_size @@ -344,7 +344,7 @@ def test_forename_surname_comparison_levels(dialect, test_helpers): (linker_output.unique_id_l == left) & (linker_output.unique_id_r == right) ] - .filter(like="gamma_custom") + .filter(like="gamma_forename_surname") .values[0][0] == gamma )