From a3a86d8d5d64943b36efb6d7ca709dffe12b8c9d Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Thu, 12 Sep 2024 21:14:28 +0100 Subject: [PATCH 1/3] add absolutedifferencelevel --- splink/internals/comparison_level_library.py | 34 ++++++++++ tests/test_comparison_level_lib.py | 71 ++++++++++++++++++++ 2 files changed, 105 insertions(+) diff --git a/splink/internals/comparison_level_library.py b/splink/internals/comparison_level_library.py index 788d987063..33a149faab 100644 --- a/splink/internals/comparison_level_library.py +++ b/splink/internals/comparison_level_library.py @@ -843,3 +843,37 @@ def create_label_for_charts(self) -> str: f"Percentage difference of '{col.label}' " f"within {self.percentage_threshold:,.2%}" ) + + +class AbsoluteDifferenceLevel(ComparisonLevelCreator): + def __init__( + self, + col_name: Union[str, ColumnExpression], + difference_threshold: Union[int, float], + ): + """ + Represents a comparison level where the absolute difference between two + numerical values is within a specified threshold. + + Args: + col_name (str | ColumnExpression): Input column name or ColumnExpression. + difference_threshold (int | float): The maximum allowed absolute difference + between the two values. + """ + self.col_expression = ColumnExpression.instantiate_if_str(col_name) + self.difference_threshold = validate_numeric_parameter( + lower_bound=0, + upper_bound=float("inf"), + parameter_value=difference_threshold, + level_name=self.__class__.__name__, + parameter_name="difference_threshold", + ) + + def create_sql(self, sql_dialect: SplinkDialect) -> str: + self.col_expression.sql_dialect = sql_dialect + col = self.col_expression + return f"ABS({col.name_l} - {col.name_r}) <= {self.difference_threshold}" + + def create_label_for_charts(self) -> str: + col = self.col_expression + return f"Absolute difference of '{col.label}' <= {self.difference_threshold}" diff --git a/tests/test_comparison_level_lib.py b/tests/test_comparison_level_lib.py index 91c73b0c0d..dd6de7e3bb 100644 --- a/tests/test_comparison_level_lib.py +++ b/tests/test_comparison_level_lib.py @@ -292,3 +292,74 @@ def test_damerau_levenshtein_level(test_helpers, dialect): ] run_comparison_vector_value_tests(test_cases, db_api) + + +@mark_with_dialects_excluding() +def test_absolute_difference(test_helpers, dialect): + helper = test_helpers[dialect] + db_api = helper.extra_linker_args()["db_api"] + + abs_comparison = cl.CustomComparison( + comparison_description="amount", + comparison_levels=[ + cll.NullLevel("amount"), + cll.AbsoluteDifferenceLevel("amount", 0), # 5 + cll.AbsoluteDifferenceLevel("amount", 5), # 4 + cll.AbsoluteDifferenceLevel("amount", 10), # 3 + cll.AbsoluteDifferenceLevel("amount", 20), # 2 + cll.AbsoluteDifferenceLevel("amount", 50), # 1 + cll.ElseLevel(), + ], + ) + + test_cases = [ + { + "comparison": abs_comparison, + "inputs": [ + { + "amount_l": 100, + "amount_r": 100, + "expected_value": 5, + "expected_label": "Absolute difference of 'amount' <= 0", + }, + { + "amount_l": 100, + "amount_r": 103, + "expected_value": 4, + "expected_label": "Absolute difference of 'amount' <= 5", + }, + { + "amount_l": 100, + "amount_r": 108, + "expected_value": 3, + "expected_label": "Absolute difference of 'amount' <= 10", + }, + { + "amount_l": 100, + "amount_r": 115, + "expected_value": 2, + "expected_label": "Absolute difference of 'amount' <= 20", + }, + { + "amount_l": 100, + "amount_r": 140, + "expected_value": 1, + "expected_label": "Absolute difference of 'amount' <= 50", + }, + { + "amount_l": 100, + "amount_r": 200, + "expected_value": 0, + "expected_label": "All other comparisons", + }, + { + "amount_l": None, + "amount_r": 100, + "expected_value": -1, + "expected_label": "amount is NULL", + }, + ], + }, + ] + + run_comparison_vector_value_tests(test_cases, db_api) From f06ca2a081ac5120fae26f7758d23506bd93a967 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Thu, 12 Sep 2024 21:15:19 +0100 Subject: [PATCH 2/3] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 15930b8109..f45ad7e84d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Match weight and m and u probabilities charts now have improved tooltips ([#2392](https://github.com/moj-analytical-services/splink/pull/2392)) +- Added new `AbsoluteDifferenceLevel` comparison level for numerical columns ([#2398](https://github.com/moj-analytical-services/splink/pull/2398)) ### Fixed From 34a394594a9a240b8af21dd50f3635e5697c324e Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Thu, 12 Sep 2024 21:29:45 +0100 Subject: [PATCH 3/3] expose publicly --- splink/comparison_level_library.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/splink/comparison_level_library.py b/splink/comparison_level_library.py index 5e849144bc..364148c9d4 100644 --- a/splink/comparison_level_library.py +++ b/splink/comparison_level_library.py @@ -1,5 +1,6 @@ from splink.internals.comparison_level_library import ( AbsoluteDateDifferenceLevel, + AbsoluteDifferenceLevel, AbsoluteTimeDifferenceLevel, And, ArrayIntersectLevel, @@ -39,6 +40,7 @@ "DistanceInKMLevel", "ArrayIntersectLevel", "PercentageDifferenceLevel", + "AbsoluteDifferenceLevel", "And", "Not", "Or",