moj-analytical-services · ADBond · Jan 19, 2024 · Jan 18, 2024 · Jan 18, 2024 · Jan 18, 2024
diff --git a/tests/test_comparison_template_lib.py b/tests/test_comparison_template_lib.py
@@ -2,6 +2,7 @@
 import pytest
 
 import splink.comparison_template_library as ctl
+from splink.column_expression import ColumnExpression
 
 from .decorator import mark_with_dialects_excluding
 
@@ -18,7 +19,7 @@ def test_date_comparison_run(dialect):
 @mark_with_dialects_excluding("postgres", "sqlite")
 def test_date_comparison_dl_run(dialect):
     ctl.DateComparison(
-        "date", levenshtein_thresholds=[1], damerau_levenshtein_thresholds=[]
+        "date", fuzzy_thresholds=[1], fuzzy_metric="levenshtein"
     ).get_comparison(dialect)
 
 
@@ -80,17 +81,28 @@ def test_datediff_levels(dialect, test_helpers, test_gamma_assert):
     # Generate our various settings objs
     settings = {
         "link_type": "dedupe_only",
-        "comparisons": [ctl.DateComparison("dob", cast_strings_to_date=True)],
+        "comparisons": [
+            ctl.DateComparison(
+                # TODO: revert to default damerau_levenshtein metric
+                ColumnExpression("dob").try_parse_date(),
+                fuzzy_metric="levenshtein",
+                fuzzy_thresholds=[2],
+            )
+        ],
     }
 
     # We need to put our column in datetime format for this to work
 
     df = helper.convert_frame(df)
     linker = helper.Linker(df, settings, **helper.extra_linker_args())
+    linker.debug_mode = True
     linker_output = linker.predict().as_pandas_dataframe()
 
     # # Dict key: {gamma_level value: size}
-    size_gamma_lookup = {0: 8, 1: 15, 2: 5, 3: 5, 4: 1, 5: 2}
+    # 0 - else, 1 - 10 years, 2 - 1 year, 3 - 1 month, 4 - fuzzy, 5 - date match
+    size_gamma_lookup = {0: 8, 1: 15, 2: 5, 3: 3, 4: 3, 5: 2}
+    # Damerau-Levenshtein version - differs only in the 1-month and fuzzy levels
+    # size_gamma_lookup = {0: 8, 1: 15, 2: 5, 3: 5, 4: 1, 5: 2}
 
     # Check gamma sizes are as expected
     for gamma, expected_size in size_gamma_lookup.items():
@@ -206,29 +218,24 @@ def test_name_comparison_levels(dialect, test_helpers):
     linker_output = linker.predict().as_pandas_dataframe()
 
     # # Dict key: {gamma_level value: size}
-    size_gamma_lookup = {0: 6, 1: 4, 2: 0, 3: 2, 4: 2, 5: 1}
-    # 5: exact_match
-    # 4: dmetaphone exact match
-    # 3: damerau_levenshtein <= 1
+    size_gamma_lookup = {0: 6, 1: 6, 2: 0, 3: 2, 4: 1}
+    # 4: exact_match
+    # 3: dmetaphone exact match
     # 2: jaro_winkler > 0.9
     # 1: jaro_winkler > 0.8
     # 0: else
 
     # Check gamma sizes are as expected
     for gamma, expected_size in size_gamma_lookup.items():
-        assert (
-            sum(linker_output["gamma_custom_first_name_first_name_metaphone"] == gamma)
-            == expected_size
-        )
+        assert sum(linker_output["gamma_first_name"] == gamma) == expected_size
 
     # Check individual IDs are assigned to the correct gamma values
     # Dict key: {gamma_value: tuple of ID pairs}
     size_gamma_lookup = {
-        5: [[1, 6]],
-        4: [(2, 3), (4, 5)],
-        3: [(4, 6)],
+        4: [[1, 6]],
+        3: [(2, 3), (4, 5)],
         2: [],
-        1: [(1, 2), (2, 6)],
+        1: [(1, 2), (2, 6), (4, 6)],
         0: [(2, 4), (5, 6)],
     }
 
@@ -238,7 +245,7 @@ def test_name_comparison_levels(dialect, test_helpers):
                 linker_output.loc[
                     (linker_output.unique_id_l == left)
                     & (linker_output.unique_id_r == right)
-                ]["gamma_custom_first_name_first_name_metaphone"].values[0]
+                ]["gamma_first_name"].values[0]
                 == gamma
             )
 
@@ -315,7 +322,7 @@ def test_forename_surname_comparison_levels(dialect, test_helpers):
 
     # Check gamma sizes are as expected
     for gamma, expected_size in size_gamma_lookup.items():
-        gamma_matches = linker_output.filter(like="gamma_custom") == gamma
+        gamma_matches = linker_output.filter(like="gamma_forename_surname") == gamma
         gamma_matches_size = gamma_matches.sum().values[0]
         assert gamma_matches_size == expected_size
 
@@ -337,13 +344,13 @@ def test_forename_surname_comparison_levels(dialect, test_helpers):
                     (linker_output.unique_id_l == left)
                     & (linker_output.unique_id_r == right)
                 ]
-                .filter(like="gamma_custom")
+                .filter(like="gamma_forename_surname")
                 .values[0][0]
                 == gamma
             )
 
 
-# postcode_comparison
+# PostcodeComparison
 
 
 @mark_with_dialects_excluding("postgres", "sqlite")
@@ -402,7 +409,7 @@ def test_postcode_comparison_levels(dialect, test_helpers, test_gamma_assert):
     settings = {
         "link_type": "dedupe_only",
         "comparisons": [
-            ctl.postcode_comparison(
+            ctl.PostcodeComparison(
                 col_name=col_name,
                 lat_col="lat",
                 long_col="long",
@@ -457,7 +464,7 @@ def test_email_comparison_levels(dialect, test_helpers, test_gamma_assert):
             ctl.EmailComparison(
                 col_name=col_name,
                 invalid_emails_as_null=True,
-                thresholds=[2],
+                fuzzy_thresholds=[2],
                 fuzzy_metric="damerau_levenshtein",
                 include_domain_match_level=True,
             )