diff --git a/.gitignore b/.gitignore index f041eeb..0a288cf 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,6 @@ __pycache__/ dist valentine.egg-info build -.vscode/ \ No newline at end of file +.vscode/ +valentine.sublime-workspace +valentine.sublime-project diff --git a/README.md b/README.md index d90a1b1..4e369fa 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ After selecting one of the 5 matching methods, the user can initiate the pairwis matches = valentine_match(df1, df2, matcher, df1_name, df2_name) ``` -where df1 and df2 are the two pandas DataFrames for which we want to find matches and matcher is one of Coma, Cupid, DistributionBased, JaccardLevenMatcher or SimilarityFlooding. The user can also input a name for each DataFrame (defaults are "table\_1" and "table\_2"). Function ```valentine_match``` returns a dictionary storing as keys column pairs from the two DataFrames and as values the corresponding similarity scores. +where df1 and df2 are the two pandas DataFrames for which we want to find matches and matcher is one of Coma, Cupid, DistributionBased, JaccardLevenMatcher or SimilarityFlooding. The user can also input a name for each DataFrame (defaults are "table\_1" and "table\_2"). Function ```valentine_match``` returns a MatcherResults object, which is a dictionary with additional convenience methods, such as `one_to_one`, `take_top_percent`, `get_metrics` and more. It stores as keys column pairs from the two DataFrames and as values the corresponding similarity scores. ### Matching DataFrame Batch @@ -86,23 +86,48 @@ After selecting one of the 5 matching methods, the user can initiate the batch m matches = valentine_match_batch(df_iter_1, df_iter_2, matcher, df_iter_1_names, df_iter_2_names) ``` -where df_iter_1 and df_iter_2 are the two iterable structures containing pandas DataFrames for which we want to find matches and matcher is one of Coma, Cupid, DistributionBased, JaccardLevenMatcher or SimilarityFlooding. The user can also input an iterable with names for each DataFrame. Function ```valentine_match_batch``` returns a dictionary storing as keys column pairs from the DataFrames and as values the corresponding similarity scores. +where df_iter_1 and df_iter_2 are the two iterable structures containing pandas DataFrames for which we want to find matches and matcher is one of Coma, Cupid, DistributionBased, JaccardLevenMatcher or SimilarityFlooding. The user can also input an iterable with names for each DataFrame. Function ```valentine_match_batch``` returns a MatcherResults object, which is a dictionary with additional convenience methods, such as `one_to_one`, `take_top_percent`, `get_metrics` and more. It stores as keys column pairs from the two DataFrames and as values the corresponding similarity scores. -### Measuring effectiveness -Based on the matches retrieved by calling `valentine_match` the user can use +### MatcherResults instance +The `MatcherResults` instance has some convenience methods that the user can use to either obtain a subset of the data or to transform the data. This instance is a dictionary and is sorted upon instantiation, from high similarity to low similarity. 
+
+```python
+top_n_matches = matches.take_top_n(5)
+
+top_n_percent_matches = matches.take_top_percent(25)
+
+one_to_one_matches = matches.one_to_one()
+```
+
+
+### Measuring effectiveness
+The MatcherResults instance that is returned by `valentine_match` or `valentine_match_batch` also has a `get_metrics` method that the user can use
 
 ```python
-metrics = valentine_metrics.all_metrics(matches, ground_truth)
+metrics = matches.get_metrics(ground_truth)
 ```
 
-in order to get all effectiveness metrics, such as Precision, Recall, F1-score and others as described in the original Valentine paper. In order to do so, the user needs to also input the ground truth of matches based on which the metrics will be calculated. The ground truth can be given as a list of tuples representing column matches that should hold.
+in order to get all effectiveness metrics, such as Precision, Recall, F1-score and others as described in the original Valentine paper. In order to do so, the user needs to also input the ground truth of matches based on which the metrics will be calculated. The ground truth can be given as a list of tuples representing column matches that should hold (see example below).
+
+By default, all the core metrics will be used for this with default parameters, but the user can also customize which metrics to run with what parameters, and implement their own custom metrics by extending from the `Metric` base class. Some predefined sets of metrics are available as well.
+
+```python
+from valentine.metrics import F1Score, PrecisionTopNPercent, METRICS_PRECISION_INCREASING_N
+metrics_custom = matches.get_metrics(ground_truth, metrics={F1Score(one_to_one=False), PrecisionTopNPercent(n=70)})
+metrics_predefined_set = matches.get_metrics(ground_truth, metrics=METRICS_PRECISION_INCREASING_N)
+
+```
 
 ### Example
-The following block of code shows: 1) how to run a matcher from Valentine on two DataFrames storing information about authors and their publications, and then 2) how to assess its effectiveness based on a given ground truth (as found in [`valentine_example.py`](https://github.com/delftdata/valentine/blob/master/examples/valentine_example.py)):
+The following block of code shows: 1) how to run a matcher from Valentine on two DataFrames storing information about authors and their publications, and then 2) how to assess its effectiveness based on a given ground truth (a more extensive example is shown in [`valentine_example.py`](https://github.com/delftdata/valentine/blob/master/examples/valentine_example.py)):
 
 ```python
+import os
+import pandas as pd
+from valentine import valentine_match
+from valentine.algorithms import Coma
+
 # Load data using pandas
 d1_path = os.path.join('data', 'authors1.csv')
 d2_path = os.path.join('data', 'authors2.csv')
@@ -120,7 +145,7 @@ ground_truth = [('Cited by', 'Cited by'),
                 ('Authors', 'Authors'),
                 ('EID', 'EID')]
 
-metrics = valentine_metrics.all_metrics(matches, ground_truth)
+metrics = matches.get_metrics(ground_truth)
 
 print(metrics)
 ```
@@ -128,17 +153,18 @@ print(metrics)
 The output of the above code block is:
 
 ```
-{(('table_1', 'Cited by'), ('table_2', 'Cited by')): 0.8374313,
-(('table_1', 'Authors'), ('table_2', 'Authors')): 0.83498037,
-(('table_1', 'EID'), ('table_2', 'EID')): 0.8214057}
-{'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0,
-'precision_at_10_percent': 1.0,
-'precision_at_30_percent': 1.0,
-'precision_at_50_percent': 1.0,
-'precision_at_70_percent': 1.0,
-'precision_at_90_percent': 1.0,
-'recall_at_sizeof_ground_truth': 1.0}
-
+{
+    (('table_1', 'Cited by'), ('table_2', 'Cited by')): 0.86994505,
+    (('table_1', 'Authors'), ('table_2', 'Authors')): 0.8679843,
+    (('table_1', 'EID'), ('table_2', 'EID')): 0.8571245
+}
+{
+    'Recall': 1.0,
+    'F1Score': 1.0,
+    'RecallAtSizeofGroundTruth': 1.0,
+    'Precision': 1.0,
+    'PrecisionTop10Percent': 1.0
+}
 ```
 
 ## Cite Valentine
diff --git a/examples/valentine_example.py b/examples/valentine_example.py
index df3fd84..cb090db 100644
--- a/examples/valentine_example.py
+++ b/examples/valentine_example.py
@@ -1,8 +1,10 @@
 import os
 import pandas as pd
-from valentine import valentine_match, valentine_metrics
-from valentine.algorithms import Coma
+from valentine.metrics import F1Score, PrecisionTopNPercent
+from valentine import valentine_match
+from valentine.algorithms import JaccardDistanceMatcher
 import pprint
+pp = pprint.PrettyPrinter(indent=4, sort_dicts=False)
 
 
 def main():
@@ -13,28 +15,40 @@ def main():
     df2 = pd.read_csv(d2_path)
 
     # Instantiate matcher and run
-    # Coma requires java to be installed on your machine
-    # If java is not an option, all the other algorithms are in Python (e.g., Cupid)
-    matcher = Coma(use_instances=False)
+    matcher = JaccardDistanceMatcher()
     matches = valentine_match(df1, df2, matcher)
 
+    # MatcherResults is a wrapper object that has several useful
+    # utility/transformation functions
+    print("Found the following matches:")
+    pp.pprint(matches)
+
+    print("\nGetting the one-to-one matches:")
+    pp.pprint(matches.one_to_one())
+
     # If ground truth available valentine could calculate the metrics
     ground_truth = [('Cited by', 'Cited by'),
                     ('Authors', 'Authors'),
                     ('EID', 'EID')]
 
-    metrics = valentine_metrics.all_metrics(matches, ground_truth)
-
-    pp = pprint.PrettyPrinter(indent=4)
-    print("Found the following matches:")
-    pp.pprint(matches)
+    metrics = matches.get_metrics(ground_truth)
 
     print("\nAccording to the ground truth:")
     pp.pprint(ground_truth)
 
-    print("\nThese are the scores of the matcher:")
+    print("\nThese are the scores of the default metrics for the matcher:")
     pp.pprint(metrics)
 
+    print("\nYou can also get specific metric scores:")
+    pp.pprint(matches.get_metrics(ground_truth, metrics={
+        PrecisionTopNPercent(n=80),
+        F1Score()
+    }))
+
+    print("\nThe MatcherResults object is a dict and can be treated as such:")
+    for match in matches:
+        print(f"{str(match): <60} {matches[match]}")
+
 
 if __name__ == '__main__':
     main()
diff --git a/tests/test_matcher_results.py b/tests/test_matcher_results.py
new file mode 100644
index 0000000..99a2860
--- /dev/null
+++ b/tests/test_matcher_results.py
@@ -0,0 +1,86 @@
+import unittest
+import math
+
+from tests import df1, df2
+from valentine.algorithms.matcher_results import MatcherResults
+from valentine.algorithms import JaccardDistanceMatcher
+from valentine.metrics import Precision
+from valentine import valentine_match
+
+
+class TestMatcherResults(unittest.TestCase):
+    def setUp(self):
+        self.matches = valentine_match(df1, df2, JaccardDistanceMatcher())
+        self.ground_truth = [
+            ('Cited by', 'Cited by'),
+            ('Authors', 'Authors'),
+            ('EID', 'EID')
+        ]
+
+    def test_dict(self):
+        assert isinstance(self.matches, dict)
+
+    def test_get_metrics(self):
+        metrics = self.matches.get_metrics(self.ground_truth)
+        assert all([x in metrics for x in {"Precision", "Recall", "F1Score"}])
+
+        metrics_specific = self.matches.get_metrics(self.ground_truth, metrics={Precision()})
+        assert "Precision" in metrics_specific
+
+    def test_one_to_one(self):
+        m = self.matches
+
+        # Add multiple matches per column
+        pairs = list(m.keys())
+        for (ta, ca), (tb, cb) in
pairs: + m[((ta, ca), (tb, cb + 'foo'))] = m[((ta, ca), (tb, cb))] / 2 + + # Verify that len gets corrected from 6 to 3 + m_one_to_one = m.one_to_one() + assert len(m_one_to_one) == 3 and len(m) == 6 + + # Verify that none of the lower similarity "foo" entries made it + for (ta, ca), (tb, cb) in pairs: + assert ((ta, ca), (tb, cb + 'foo')) not in m_one_to_one + + # Verify that the cache resets on a new MatcherResults instance + m_entry = MatcherResults(m) + assert m_entry._cached_one_to_one is None + + # Add one new entry with lower similarity + m_entry[(('table_1', 'BLA'), ('table_2', 'BLA'))] = 0.7214057 + + # Verify that the new one_to_one is different from the old one + m_entry_one_to_one = m_entry.one_to_one() + assert m_one_to_one != m_entry_one_to_one + + # Verify that all remaining values are above the median + median = sorted(list(m_entry.values()), reverse=True)[math.ceil(len(m_entry)/2)] + for k in m_entry_one_to_one: + assert m_entry_one_to_one[k] >= median + + def test_take_top_percent(self): + take_0_percent = self.matches.take_top_percent(0) + assert len(take_0_percent) == 0 + + take_40_percent = self.matches.take_top_percent(40) + assert len(take_40_percent) == 2 + + take_100_percent = self.matches.take_top_percent(100) + assert len(take_100_percent) == len(self.matches) + + def test_take_top_n(self): + take_none = self.matches.take_top_n(0) + assert len(take_none) == 0 + + take_some = self.matches.take_top_n(2) + assert len(take_some) == 2 + + take_all = self.matches.take_top_n(len(self.matches)) + assert len(take_all) == len(self.matches) + + take_more_than_all = self.matches.take_top_n(len(self.matches)+1) + assert len(take_more_than_all) == len(self.matches) + + def test_copy(self): + assert self.matches.get_copy() is not self.matches \ No newline at end of file diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 4fd55c6..3099b2a 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -1,47 +1,76 @@ import unittest +from valentine.metrics import * +from valentine.algorithms.matcher_results import MatcherResults +from valentine.metrics.metric_helpers import get_fp, get_tp_fn -import math -from valentine.metrics.metrics import one_to_one_matches -from copy import deepcopy +class TestMetrics(unittest.TestCase): + def setUp(self): + self.matches = MatcherResults({ + (('table_1', 'Cited by'), ('table_2', 'Cited by')): 0.8374313, + (('table_1', 'Authors'), ('table_2', 'Authors')): 0.83498037, + (('table_1', 'EID'), ('table_2', 'EID')): 0.8214057, + (('table_1', 'Title'), ('table_2', 'DUMMY1')): 0.8214057, + (('table_1', 'Title'), ('table_2', 'DUMMY2')): 0.8114057, + }) + self.ground_truth = [ + ('Cited by', 'Cited by'), + ('Authors', 'Authors'), + ('EID', 'EID'), + ('Title', 'Title'), + ('DUMMY3', 'DUMMY3') -matches = { - (('table_1', 'Cited by'), ('table_2', 'Cited by')): 0.8374313, - (('table_1', 'Authors'), ('table_2', 'Authors')): 0.83498037, - (('table_1', 'EID'), ('table_2', 'EID')): 0.8214057, -} + ] -ground_truth = [ - ('Cited by', 'Cited by'), - ('Authors', 'Authors'), - ('EID', 'EID') -] + def test_precision(self): + precision = self.matches.get_metrics(self.ground_truth, metrics={Precision()}) + assert 'Precision' in precision and precision['Precision'] == 0.75 + precision_not_one_to_one = self.matches.get_metrics(self.ground_truth, metrics={Precision(one_to_one=False)}) + assert 'Precision' in precision_not_one_to_one and precision_not_one_to_one['Precision'] == 0.6 -class TestMetrics(unittest.TestCase): + def test_recall(self): + recall = 
self.matches.get_metrics(self.ground_truth, metrics={Recall()}) + assert 'Recall' in recall and recall['Recall'] == 0.6 + + recall_not_one_to_one = self.matches.get_metrics(self.ground_truth, metrics={Recall(one_to_one=False)}) + assert 'Recall' in recall_not_one_to_one and recall_not_one_to_one['Recall'] == 0.6 + + def test_f1(self): + f1 = self.matches.get_metrics(self.ground_truth, metrics={F1Score()}) + assert 'F1Score' in f1 and round(100*f1['F1Score']) == 67 + + f1_not_one_to_one = self.matches.get_metrics(self.ground_truth, metrics={F1Score(one_to_one=False)}) + assert 'F1Score' in f1_not_one_to_one and f1_not_one_to_one['F1Score'] == 0.6 + + def test_precision_top_n_percent(self): + precision_0 = self.matches.get_metrics(self.ground_truth, metrics={PrecisionTopNPercent(n=0)}) + assert 'PrecisionTop0Percent' in precision_0 and precision_0['PrecisionTop0Percent'] == 0 - def test_one_to_one(self): - m = deepcopy(matches) + precision_50 = self.matches.get_metrics(self.ground_truth, metrics={PrecisionTopNPercent(n=50)}) + assert 'PrecisionTop50Percent' in precision_50 and precision_50['PrecisionTop50Percent'] == 1.0 - # Add multiple matches per column - pairs = list(m.keys()) - for (ta, ca), (tb, cb) in pairs: - m[((ta, ca), (tb, cb + 'foo'))] = m[((ta, ca), (tb, cb))] / 2 + precision = self.matches.get_metrics(self.ground_truth, metrics={Precision()}) + precision_100 = self.matches.get_metrics(self.ground_truth, metrics={PrecisionTopNPercent(n=100)}) + assert 'PrecisionTop100Percent' in precision_100 and precision_100['PrecisionTop100Percent'] == precision['Precision'] - # Verify that len gets corrected to 3 - m_one_to_one = one_to_one_matches(m) - assert len(m_one_to_one) == 3 and len(m) == 6 + precision_70_not_one_to_one = self.matches.get_metrics(self.ground_truth, metrics={PrecisionTopNPercent(n=70, one_to_one=False)}) + assert 'PrecisionTop70Percent' in precision_70_not_one_to_one and precision_70_not_one_to_one['PrecisionTop70Percent'] == 0.75 - # Verify that none of the lower similarity "foo" entries made it - for (ta, ca), (tb, cb) in pairs: - assert ((ta, ca), (tb, cb + 'foo')) not in m_one_to_one + def test_recall_at_size_of_ground_truth(self): + recall = self.matches.get_metrics(self.ground_truth, metrics={RecallAtSizeofGroundTruth()}) + assert 'RecallAtSizeofGroundTruth' in recall and recall['RecallAtSizeofGroundTruth'] == 0.6 - # Add one new entry with lower similarity - m_entry = deepcopy(matches) - m_entry[(('table_1', 'BLA'), ('table_2', 'BLA'))] = 0.7214057 + def test_metric_helpers(self): + limit = 2 + tp, fn = get_tp_fn(self.matches, self.ground_truth, n=limit) + assert tp <= len(self.ground_truth) and fn <= len(self.ground_truth) - m_entry_one_to_one = one_to_one_matches(m_entry) + fp = get_fp(self.matches, self.ground_truth, n=limit) + assert fp <= limit + assert tp == 2 and fn == 3 # Since we limit to 2 of the matches + assert fp == 0 - # Verify that all remaining values are above the median - median = sorted(set(m_entry.values()), reverse=True)[math.ceil(len(m_entry)/2)] - for k in m_entry_one_to_one: - assert m_entry_one_to_one[k] >= median + def test_metric_equals(self): + assert PrecisionTopNPercent(n=10, one_to_one=False) == PrecisionTopNPercent(n=10, one_to_one=False) + assert PrecisionTopNPercent(n=10, one_to_one=False) != PrecisionTopNPercent(n=10, one_to_one=True) + assert PrecisionTopNPercent(n=10, one_to_one=False) != Precision() diff --git a/tests/test_valentine.py b/tests/test_valentine.py index 3614ee6..f4a7e07 100644 --- a/tests/test_valentine.py 
+++ b/tests/test_valentine.py @@ -2,9 +2,9 @@ from valentine.data_sources import DataframeTable -from valentine import valentine_match, valentine_match_batch, valentine_metrics, NotAValentineMatcher +from valentine import valentine_match, valentine_match_batch, NotAValentineMatcher from tests import df1, df2 -from valentine.algorithms import Coma, DistributionBased +from valentine.algorithms import JaccardDistanceMatcher class TestValentine(unittest.TestCase): @@ -12,7 +12,7 @@ class TestValentine(unittest.TestCase): def test_match(self): assert not DataframeTable(df1, name='df1_name').is_empty assert not DataframeTable(df2, name='df2_name').is_empty - matches = valentine_match(df1, df2, Coma(use_instances=True)) + matches = valentine_match(df1, df2, JaccardDistanceMatcher()) assert len(matches) > 0 try: valentine_match(df1, df2, None) @@ -21,14 +21,6 @@ def test_match(self): else: assert False - def test_metrics(self): - matches = valentine_match(df1, df2, Coma(use_instances=True)) - golden_standard = [('Cited by', 'Cited by'), - ('Authors', 'Authors'), - ('EID', 'EID')] - metrics = valentine_metrics.all_metrics(matches, golden_standard) - assert metrics['recall_at_sizeof_ground_truth'] == 1.0 - def test_batch_generator(self): n = 3 @@ -40,9 +32,13 @@ def generate_df2(): for _ in range(n): yield df2 - matches = valentine_match_batch(generate_df1(), generate_df2(), DistributionBased()) + matches = valentine_match_batch(generate_df1(), generate_df2(), JaccardDistanceMatcher()) assert len(matches) > 0 def test_batch_list(self): - matches = valentine_match_batch([df1, df1, df1], [df2, df2, df2], DistributionBased()) + matches = valentine_match_batch([df1, df1, df1], [df2, df2, df2], JaccardDistanceMatcher()) assert len(matches) > 0 + + def test_batch_names(self): + matches = valentine_match_batch([df1, df1], [df2, df2], JaccardDistanceMatcher(), ['ta1', 'tb1'], ['ta2', 'tb2']) + assert len(matches) > 0 \ No newline at end of file diff --git a/valentine/__init__.py b/valentine/__init__.py index efb266a..79c8cc0 100644 --- a/valentine/__init__.py +++ b/valentine/__init__.py @@ -1,11 +1,11 @@ -from typing import Iterable, List, Union - import pandas as pd -import valentine.metrics as valentine_metrics import valentine.algorithms import valentine.data_sources +from typing import Iterable, List, Union +from valentine.algorithms.matcher_results import MatcherResults + class NotAValentineMatcher(Exception): pass @@ -13,7 +13,7 @@ class NotAValentineMatcher(Exception): def validate_matcher(matcher): if not isinstance(matcher, valentine.algorithms.BaseMatcher): - raise NotAValentineMatcher('The method that you selected is not supported by Valentine') + raise NotAValentineMatcher('Please provide a valid matcher') def valentine_match(df1: pd.DataFrame, @@ -26,10 +26,9 @@ def valentine_match(df1: pd.DataFrame, table_1 = valentine.data_sources.DataframeTable(df1, name=df1_name) table_2 = valentine.data_sources.DataframeTable(df2, name=df2_name) - matches = dict(sorted(matcher.get_matches(table_1, table_2).items(), - key=lambda item: item[1], reverse=True)) + matches = matcher.get_matches(table_1, table_2) - return matches + return MatcherResults(matches) def valentine_match_batch(df_iter_1: Iterable[pd.DataFrame], @@ -50,6 +49,4 @@ def valentine_match_batch(df_iter_1: Iterable[pd.DataFrame], table_2 = valentine.data_sources.DataframeTable(df2, name=table_2_name) matches.update(matcher.get_matches(table_1, table_2)) - matches = dict(sorted(matches.items(), key=lambda item: item[1], reverse=True)) - - 
return matches
+    return MatcherResults(matches)
diff --git a/valentine/algorithms/match.py b/valentine/algorithms/match.py
index 53733d8..edc251d 100644
--- a/valentine/algorithms/match.py
+++ b/valentine/algorithms/match.py
@@ -1,20 +1,25 @@
-class Match(object):
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Dict, Tuple
+
+
+@dataclass
+class Match:
     """
-    Class representing a match of two columns target is the one we want to find the matches of, source an other
-    that exists in the database and the similarity between the two.
+    Class representing a match of two columns. target is the column we want
+    to find matches for, source is another column that exists in the
+    database, and similarity is the similarity score between the two.
 
-    NOTE: Use the to_dict method when you want to append a match to a list of matches
+    NOTE: Use the to_dict method when you want to append a match to a list of
+    matches
     """
-    def __init__(self, target_table_name: str, target_column_name: str,
-                 source_table_name: str, source_column_name: str,
-                 similarity: float):
-        self.target_table_name = target_table_name
-        self.target_column_name = target_column_name
-        self.source_table_name = source_table_name
-        self.source_column_name = source_column_name
-        self.similarity = similarity
+    target_table_name: str
+    target_column_name: str
+    source_table_name: str
+    source_column_name: str
+    similarity: float
 
     @property
-    def to_dict(self) -> dict:
+    def to_dict(self: Match) -> Dict[Tuple[Tuple[str, str], Tuple[str, str]], float]:
         return {((self.source_table_name, self.source_column_name),
                  (self.target_table_name, self.target_column_name)): self.similarity}
diff --git a/valentine/algorithms/matcher_results.py b/valentine/algorithms/matcher_results.py
new file mode 100644
index 0000000..f37d58c
--- /dev/null
+++ b/valentine/algorithms/matcher_results.py
@@ -0,0 +1,159 @@
+from __future__ import annotations
+import math
+from ..metrics import METRICS_CORE
+from ..metrics.base_metric import Metric
+
+from typing import Dict, Tuple, List, Any, Set
+
+
+class MatcherResults(dict):
+    """This is a dictionary with additional valentine-specific functionality.
+    This class is the result of a matcher's `get_matches` method.
+
+    Certain transformations such as "one_to_one" get cached, since they do not
+    differ from call to call and are required by many metrics.
+
+    The assumption is that the results are sorted from high similarity to low
+    similarity. This is also enforced upon creation through sorting, as
+    dictionaries preserve their insertion order as of Python 3.6.
+
+    Aside from transformations, one can also obtain metric scores based on the
+    results, which can be imported from the metrics module. The metrics come in
+    handy predefined sets as well, e.g. METRICS_CORE, which is the default.
+    """
+
+    def __init__(self: MatcherResults, res: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float], *args, **kwargs):
+        self._cached_one_to_one = None
+        sorted_res = {k: res[k] for k in sorted(res, key=res.get, reverse=True)}
+        dict.__init__(self, sorted_res, *args, **kwargs)
+
+    def one_to_one(self: MatcherResults) -> MatcherResults:
+        """A filter that takes a dict of column matches and returns a dict of
+        1 to 1 matches. The filter works in the following way: At first it
+        gets the median similarity of the set of the values and removes all
+        matches that have a similarity lower than that. Then from what
+        remained it matches columns from the highest similarity to the lowest
+        till the columns have at most one match.
+ + Once calculated, the one-to-one matches are cached, to avoid redundant + calculations for metrics. + + Returns + ------- + MatcherResults + MatcherResults with one-to-one matches. + """ + if self._cached_one_to_one is not None: + return MatcherResults(self._cached_one_to_one.copy()) + + matches_dict = self.get_copy() + + set_match_values = set(matches_dict.values()) + + if len(set_match_values) < 2: + self._cached_one_to_one = matches_dict + return MatcherResults(matches_dict) + + matched = dict() + + for key in matches_dict.keys(): + matched[key[0]] = False + matched[key[1]] = False + + median = sorted(set_match_values, reverse=True)[ + math.ceil(len(set_match_values)/2)] + + matches1to1_dict = dict() + + for key in matches_dict.keys(): + if (not matched[key[0]]) and (not matched[key[1]]): + similarity = matches_dict.get(key) + if similarity is not None and similarity >= median: + matches1to1_dict[key] = similarity + matched[key[0]] = True + matched[key[1]] = True + else: + break + + self._cached_one_to_one = matches1to1_dict + return MatcherResults(matches1to1_dict) + + def take_top_percent(self: MatcherResults, percent: int) -> MatcherResults: + """Summary + Takes the top 'percent' of matches and returns a new MatcherResults + containing only these matches. + + Parameters + ---------- + percent : int + Percentage of matches to keep. + + Returns + ------- + MatcherResults + Matcher results containing only the + top 'percent' of matches. + """ + matches = self.get_copy() + number_to_keep = int( + math.ceil((percent / 100) * len(matches.keys()))) + matches = dict(sorted(matches.items(), + key=lambda x: x[1], + reverse=True)[:number_to_keep]) + return MatcherResults(matches) + + def take_top_n(self: MatcherResults, n: int) -> MatcherResults: + """Summary + Takes the top 'n' matches and returns a new MatcherResults + containing only these matches. + + Parameters + ---------- + n : int + Number of matches to keep. + + Returns + ------- + MatcherResults + Matcher results containing only the + top 'n' matches. + """ + matches = self.get_copy() + matches = dict(sorted(matches.items(), + key=lambda x: x[1], reverse=True)[:n]) + return MatcherResults(matches) + + def get_metrics(self: MatcherResults, ground_truth: List[Tuple[str, str]], metrics: Set[Metric] = METRICS_CORE) -> Dict[str, Any]: + """Summary + Given ground truth column matches and a set of metric instances, this + method will calculate scores for these metrics. Metrics can be imported + from the 'metrics' module, which also contains predefined sets of + metrics. + + Parameters + ---------- + ground_truth : List[Tuple[str, str]] + The ground truth column matches as a list of column name tuples. + metrics : Set[Metric], optional + The set of metric instances. + + Returns + ------- + Dict[str, Any] + A dictionary with metric scores. + """ + res = {} + for metric in metrics: + res.update(metric.apply(self, ground_truth)) + return res + + def get_copy(self: MatcherResults) -> MatcherResults: + """Summary + Returns a copy of this instance. + + Returns + ------- + MatcherResults + A copy of this MatcherResults instance. 
+ """ + return MatcherResults(self.copy()) diff --git a/valentine/metrics/__init__.py b/valentine/metrics/__init__.py index 6b3a088..b05135d 100644 --- a/valentine/metrics/__init__.py +++ b/valentine/metrics/__init__.py @@ -1,23 +1,8 @@ -from valentine.metrics import metrics as metrics_module -from typing import List, Dict, Tuple - -metrics = {"names": ["precision", "recall", "f1_score", "precision_at_n_percent", "recall_at_sizeof_ground_truth"], - "args": { - "n": [10, 30, 50, 70, 90] - }} - - -def all_metrics(matches: List[Dict[Tuple[Tuple[str, str], Tuple[str, str]], float]], - golden_standard): - # load and print the specified metrics - metric_fns = [getattr(metrics_module, met) for met in metrics['names']] - - final_metrics = dict() - - for metric in metric_fns: - if metric.__name__ != "precision_at_n_percent": - final_metrics[metric.__name__] = metric(matches, golden_standard) - else: - for n in metrics['args']['n']: - final_metrics[metric.__name__.replace('_n_', '_' + str(n) + '_')] = metric(matches, golden_standard, n) - return final_metrics +from valentine.metrics.base_metric import Metric +from .metrics import * + +# Some predefined sets of metrics +METRICS_ALL = {metric() for metric in Metric.__subclasses__()} # Note: will also catch newly defined metrics +METRICS_CORE = {Precision(), Recall(), F1Score(), PrecisionTopNPercent(), RecallAtSizeofGroundTruth()} +METRICS_PRECISION_RECALL = {Precision(), Recall()} +METRICS_PRECISION_INCREASING_N = {PrecisionTopNPercent(n=x + 10) for x in range(0, 100, 10)} diff --git a/valentine/metrics/base_metric.py b/valentine/metrics/base_metric.py new file mode 100644 index 0000000..c3b33b3 --- /dev/null +++ b/valentine/metrics/base_metric.py @@ -0,0 +1,66 @@ +"""Provides the base metric class, that can be inherited from to implement +metrics. +""" +from __future__ import annotations +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from ..algorithms.matcher_results import MatcherResults +from abc import ABC, abstractmethod +from typing import Dict, Tuple, List, Any, final +from dataclasses import dataclass + + +@dataclass(eq=True, frozen=True) +class Metric(ABC): + """Base class for a metric. Metrics can be prepared with parameters by + instantiating them, their application is deferred to a later moment this + way, which can be implemented by overriding the `apply` method. + """ + + @abstractmethod + def apply(self: Metric, matches: MatcherResults, ground_truth: List[Tuple[str, str]]) -> Dict[str, Any]: + """Applies the metric to a `MatcherResults` instance, given ground + truth. + + Parameters + ---------- + matches : MatcherResults + The `MatcherResults` instance, obtained from `valentine_match`. + + ground_truth : List[Tuple[str, str]] + The ground truth column match pairs, by column name. + e.g. [("col1_tab_A", "col1_tab_B"), ...etc...] + + Raises + ------ + NotImplementedError + Override this method in concrete implementations. + """ + pass + + def name(self: Metric) -> str: + """The name of the metric, as it appears in the metric results. + + Returns + ------- + str + The name of the metric. + """ + return self.__class__.__name__ + + @final + def return_format(self: Metric, value: Any) -> Dict[str, Any]: + """The return format of the `apply` method. + + Parameters + ---------- + value : Any + The metric value or score. + + Returns + ------- + Dict[str, Any] + The formatted metric value or score. 
+ """ + return {self.name(): value} diff --git a/valentine/metrics/metric_helpers.py b/valentine/metrics/metric_helpers.py new file mode 100644 index 0000000..036b91d --- /dev/null +++ b/valentine/metrics/metric_helpers.py @@ -0,0 +1,81 @@ +from __future__ import annotations +from typing import TYPE_CHECKING +if TYPE_CHECKING: + from ..algorithms.matcher_results import MatcherResults +from typing import Tuple, List + + +def get_tp_fn(matches: MatcherResults, + ground_truth: List[Tuple[str, str]], + n: int | None = None): + """Counts the amount of true positives and the amount of false + negatives among the matches in the given MatcherResults. + + Parameters + ---------- + matches : MatcherResults + A MatcherResults object that is obtained from a matcher. + ground_truth : list + A list with tuples that correspond to the ground truth matches. + e.g. [("col1_tab_A", "col1_tab_B"), ...etc...] + n : int, optional + The percentage of matches to consider. + e.g. (90) for 90% of the matches + + Returns + ------- + (int, int) + Amount of true positives and amount of false negatives. + """ + tp = 0 + fn = 0 + + matches_dict = matches.get_copy() + all_matches = [(m[0][1], m[1][1]) for m in matches_dict.keys()] + + if n is not None: + all_matches = all_matches[:n] + + for expected_match in ground_truth: + if expected_match in all_matches: + tp += 1 + else: + fn += 1 + + return tp, fn + + +def get_fp(matches: MatcherResults, + ground_truth: List[Tuple[str, str]], + n: int | None = None): + """Counts the amount of false positives among the matches in the + given MatcherResults. + + Parameters + ---------- + matches : MatcherResults + A MatcherResults object that is obtained from a matcher. + ground_truth : list + A list with tuples that correspond to the ground truth matches. + e.g. [("col1_tab_A", "col1_tab_B"), ...etc...] + n : int, optional + The percentage of matches to consider. + e.g. (90) for 90% of the matches + + Returns + ------- + int + Amount of false positives. + """ + fp = 0 + matches_dict = matches.get_copy() + all_matches = [(m[0][1], m[1][1]) for m in matches_dict.keys()] + + if n is not None: + all_matches = all_matches[:n] + + for possible_match in all_matches: + if possible_match not in ground_truth: + fp += 1 + + return fp diff --git a/valentine/metrics/metrics.py b/valentine/metrics/metrics.py index 5af75bb..b184842 100644 --- a/valentine/metrics/metrics.py +++ b/valentine/metrics/metrics.py @@ -1,256 +1,128 @@ -import math -from typing import Dict, Tuple, List +"""Here one can find some common metric implementations. Custom metrics can be +made by subclassing the `Metric` ABC. Marking them with the dataclass decorator +allows for proper hashing/equals without the boilerplate. +""" +from .base_metric import Metric +from .metric_helpers import * +from dataclasses import dataclass -def one_to_one_matches(matches: dict): - """ - A filter that takes a dict of column matches and returns a dict of 1 to 1 matches. The filter works in the following - way: At first it gets the median similarity of the set of the values and removes all matches - that have a similarity lower than that. Then from what remained it matches columns for me highest similarity - to the lowest till the columns have at most one match. - Parameters +@dataclass(eq=True, frozen=True) +class Precision(Metric): + """Metric for calculating precision. 
+ + Attributes ---------- - matches : dict - The ranked list of matches - Returns - ------- - dict - The ranked list of matches after the 1 to 1 filter + one_to_one : bool + Whether to apply the one-to-one filter to the MatcherResults first. """ - set_match_values = set(matches.values()) - - if len(set_match_values) < 2: - return matches - - matched = dict() + one_to_one: bool = True - for key in matches.keys(): - matched[key[0]] = False - matched[key[1]] = False + def apply(self, matches, ground_truth): + if self.one_to_one: + matches = matches.one_to_one() - median = sorted(set_match_values, reverse=True)[math.ceil(len(set_match_values)/2)] + tp, _ = get_tp_fn(matches, ground_truth) + fp = get_fp(matches, ground_truth) + precision = 0 + if tp + fp > 0: + precision = tp / (tp + fp) - matches1to1 = dict() + return self.return_format(precision) - for key in matches.keys(): - if (not matched[key[0]]) and (not matched[key[1]]): - similarity = matches.get(key) - if similarity >= median: - matches1to1[key] = similarity - matched[key[0]] = True - matched[key[1]] = True - else: - break - return matches1to1 +@dataclass(eq=True, frozen=True) +class Recall(Metric): + """Metric for calculating recall. -def get_tp_fn(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float], - golden_standard: List[Tuple[str, str]], - n: int = None): - """ - Calculate the true positive and false negative numbers of the given matches - - Parameters + Attributes ---------- - matches : dict - Ranked list of matches from the match with higher similarity to lower - golden_standard : list - A list that contains the golden standard - n : int, optional - The percentage number that we want to consider from the ranked list (matches) - e.g. (90) for 90% of the matches - - Returns - ------- - (int, int) - True positive and false negative counts + one_to_one : bool + Whether to apply the one-to-one filter to the MatcherResults first. """ - tp = 0 - fn = 0 + one_to_one: bool = True - all_matches = [(m[0][1], m[1][1]) for m in matches.keys()] + def apply(self, matches, ground_truth): + if self.one_to_one: + matches = matches.one_to_one() - if n is not None: - all_matches = all_matches[:n] + tp, fn = get_tp_fn(matches, ground_truth) + recall = 0 + if tp + fn > 0: + recall = tp / (tp + fn) - for expected_match in golden_standard: - if expected_match in all_matches: - tp = tp + 1 - else: - fn = fn + 1 - return tp, fn + return self.return_format(recall) -def get_fp(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float], - golden_standard: List[Tuple[str, str]], - n: int = None): - """ - Calculate the false positive number of the given matches +@dataclass(eq=True, frozen=True) +class F1Score(Metric): + """Metric for calculating f1 score. - Parameters + Attributes ---------- - matches : dict - Ranked list of matches from the match with higher similarity to lower - golden_standard : list - A list that contains the golden standard - n : int, optional - The percentage number that we want to consider from the ranked list (matches) - e.g. (90) for 90% of the matches - - Returns - ------- - int - False positive + one_to_one : bool + Whether to apply the one-to-one filter to the MatcherResults first. 
""" - fp = 0 - - all_matches = [(m[0][1], m[1][1]) for m in matches.keys()] - - if n is not None: - all_matches = all_matches[:n] - - for possible_match in all_matches: - if possible_match not in golden_standard: - fp = fp + 1 - return fp + one_to_one: bool = True + def apply(self, matches, ground_truth): + if self.one_to_one: + matches = matches.one_to_one() -def recall(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float], - golden_standard: List[Tuple[str, str]], - one_to_one=True): - """ - Function that calculates the recall of the matches against the golden standard. If one_to_one is set to true, it - also performs an 1-1 match filer. Meaning that each column will match only with another one. - - Parameters - ---------- - matches : dict - Ranked list of matches from the match with higher similarity to lower - golden_standard : list - A list that contains the golden standard - one_to_one : bool, optional - If to perform the 1-1 match filter + tp, fn = get_tp_fn(matches, ground_truth) + fp = get_fp(matches, ground_truth) + f1 = 0 + if tp > 0: + pr = tp / (tp + fp) + re = tp / (tp + fn) + f1 = 2 * ((pr * re) / (pr + re)) - Returns - ------- - float - The recall - """ - if one_to_one: - matches = one_to_one_matches(matches) - tp, fn = get_tp_fn(matches, golden_standard) - if tp + fn == 0: - return 0 - return tp / (tp + fn) + return self.return_format(f1) -def precision(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float], - golden_standard: List[Tuple[str, str]], - one_to_one=True): - """ - Function that calculates the precision of the matches against the golden standard. If one_to_one is set to true, it - also performs an 1-1 match filer. Meaning that each column will match only with another one. +@dataclass(eq=True, frozen=True) +class PrecisionTopNPercent(Metric): + """Metric for calculating precision of the top N percent of matches. - Parameters + Attributes ---------- - matches : dict - Ranked list of matches from the match with higher similarity to lower - golden_standard : list - A list that contains the golden standard - one_to_one : bool, optional - If to perform the 1-1 match filter - - Returns - ------- - float - The precision + one_to_one : bool + Whether to apply the one-to-one filter to the MatcherResults first. + n : int + The percent of matches to consider. """ - if one_to_one: - matches = one_to_one_matches(matches) - tp, _ = get_tp_fn(matches, golden_standard) - fp = get_fp(matches, golden_standard) - if tp + fp == 0: - return 0 - return tp / (tp + fp) + one_to_one: bool = True + n: int = 10 + def name(self): + return super().name().replace('N', str(self.n)) -def f1_score(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float], - golden_standard: List[Tuple[str, str]], - one_to_one=True): - """ - Function that calculates the F1 score of the matches against the golden standard. If one_to_one is set to true, it - also performs an 1-1 match filer. Meaning that each column will match only with another one. 
+ def apply(self, matches, ground_truth): + if self.one_to_one: + matches = matches.one_to_one() - Parameters - ---------- - matches : dict - Ranked list of matches from the match with higher similarity to lower - golden_standard : list - A list that contains the golden standard - one_to_one : bool, optional - If to perform the 1-1 match filter + n_matches = matches.take_top_percent(self.n) - Returns - ------- - float - The f1_score - """ - pr = precision(matches, golden_standard, one_to_one) - re = recall(matches, golden_standard, one_to_one) - if pr + re == 0: - return 0 - return 2 * ((pr * re) / (pr + re)) + tp, _ = get_tp_fn(n_matches, ground_truth) + fp = get_fp(n_matches, ground_truth) + precision_top_n_percent = 0 + if tp + fp > 0: + precision_top_n_percent = tp / (tp + fp) + return self.return_format(precision_top_n_percent) -def precision_at_n_percent(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float], - golden_standard: List[Tuple[str, str]], - n: int): - """ - Function that calculates the precision at n % - e.g. if n is 10 then only the first 10% of the matches will be considered for the precision calculation - Parameters - ---------- - matches : dict - Ranked list of matches from the match with higher similarity to lower - golden_standard : list - A list that contains the golden standard - n : int - The integer percentage number - - Returns - ------- - float - The precision at n % +@dataclass(eq=True, frozen=True) +class RecallAtSizeofGroundTruth(Metric): + """Metric for calculating recall at the size of the ground truth. """ - number_to_keep = int(math.ceil((n / 100) * len(matches.keys()))) - tp, _ = get_tp_fn(matches, golden_standard, number_to_keep) - fp = get_fp(matches, golden_standard, number_to_keep) - if tp + fp == 0: - return 0 - return tp / (tp + fp) + def apply(self, matches, ground_truth): + n_matches = matches.take_top_n(len(ground_truth)) -def recall_at_sizeof_ground_truth(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float], - golden_standard: List[Tuple[str, str]],): - """ - Function that calculates the recall at the size of the ground truth. - e.g. if the size of ground truth size is 10 then only the first 10 matches will be considered for - the recall calculation + tp, fn = get_tp_fn(n_matches, ground_truth) + recall = 0 + if tp + fn > 0: + recall = tp / (tp + fn) - Parameters - ---------- - matches : dict - Ranked list of matches from the match with higher similarity to lower - golden_standard : list - A list that contains the golden standard - - Returns - ------- - float - The recall at the size of ground truth - """ - tp, fn = get_tp_fn(matches, golden_standard, len(golden_standard)) - if tp + fn == 0: - return 0 - return tp / (tp + fn) + return self.return_format(recall)
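
For illustration of the extension point these changes describe, the sketch below defines a custom metric on top of the new `Metric` base class, the `metric_helpers` module, and `MatcherResults.take_top_n`, and passes it to `get_metrics`. The `RecallTopN` class, its `n` parameter, and the toy DataFrames are hypothetical and serve only as an example; they are not part of this changeset.

```python
import pandas as pd
from dataclasses import dataclass

from valentine import valentine_match
from valentine.algorithms import JaccardDistanceMatcher
from valentine.metrics import Metric
from valentine.metrics.metric_helpers import get_tp_fn


@dataclass(eq=True, frozen=True)
class RecallTopN(Metric):
    """Hypothetical example metric: recall over only the top `n` matches."""
    n: int = 5

    def name(self):
        # Mirror PrecisionTopNPercent: encode the parameter in the reported name
        return super().name().replace('N', str(self.n))

    def apply(self, matches, ground_truth):
        top_matches = matches.take_top_n(self.n)
        tp, fn = get_tp_fn(top_matches, ground_truth)
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        return self.return_format(recall)


# Toy DataFrames, only to keep the sketch self-contained
df1 = pd.DataFrame({'Cited by': [5, 10], 'Authors': ['A B', 'C D'], 'EID': ['e1', 'e2']})
df2 = pd.DataFrame({'Cited by': [3], 'Authors': ['E F'], 'EID': ['e3']})

matches = valentine_match(df1, df2, JaccardDistanceMatcher())
ground_truth = [('Cited by', 'Cited by'), ('Authors', 'Authors'), ('EID', 'EID')]

# Prints something like {'RecallTop3': ...}, since name() embeds n
print(matches.get_metrics(ground_truth, metrics={RecallTopN(n=3)}))
```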