diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ca5b7cf..ffab98b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -21,6 +21,9 @@ jobs: - name: install coverage run: pip install coverage + - name: install pytest + run: pip install pytest==8.2.0 + - name: run tests run: coverage run --source=valentine -m unittest discover tests diff --git a/.github/workflows/build_all_os.yml b/.github/workflows/build_all_os.yml index f46bebb..1d75774 100755 --- a/.github/workflows/build_all_os.yml +++ b/.github/workflows/build_all_os.yml @@ -25,5 +25,7 @@ jobs: java-version: '11' - name: Install valentine run: pip install . - - name: run tests + - name: Install test dependencies + run: pip install pytest==8.2.0 + - name: Run tests run: python -m unittest discover tests diff --git a/requirements.txt b/requirements.txt index bce155d..4c70343 100755 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,6 @@ jellyfish==1.0.1 PuLP==2.7.0 pyemd==1.0.0 # data loading -python-dateutil==2.8.2 \ No newline at end of file +python-dateutil==2.8.2 +# testing +pytest~=8.2.0 \ No newline at end of file diff --git a/tests/test_algorithms.py b/tests/test_algorithms.py index 63d683f..697cf4e 100644 --- a/tests/test_algorithms.py +++ b/tests/test_algorithms.py @@ -1,125 +1,79 @@ -import unittest +import pytest from tests import df1, df2 from valentine.algorithms import Coma, JaccardDistanceMatcher, DistributionBased, SimilarityFlooding, Cupid -from valentine.data_sources import DataframeTable from valentine.algorithms.jaccard_distance import StringDistanceFunction +from valentine.data_sources import DataframeTable d1 = DataframeTable(df1, name='authors1') d2 = DataframeTable(df2, name='authors2') -class TestAlgorithms(unittest.TestCase): +def test_coma(): + # Test the schema variant of coma + coma_matcher_schema = Coma(use_instances=False) + matches_coma_matcher_schema = coma_matcher_schema.get_matches(d1, d2) + # Check that it actually produced output + assert len(matches_coma_matcher_schema) > 0 + # Test the instance variant of coma + coma_matcher_instances = Coma(use_instances=True) + matches_coma_matcher_instances = coma_matcher_instances.get_matches(d1, d2) + # Check that it actually produced output + assert len(matches_coma_matcher_instances) > 0 + # Assume the Schema and instance should provide different results + assert matches_coma_matcher_schema != matches_coma_matcher_instances - def test_coma(self): - # Test the schema variant of coma - coma_matcher_schema = Coma(use_instances=False) - matches_coma_matcher_schema = coma_matcher_schema.get_matches(d1, d2) - # Check that it actually produced output - assert len(matches_coma_matcher_schema) > 0 - # Test the instance variant of coma - coma_matcher_instances = Coma(use_instances=True) - matches_coma_matcher_instances = coma_matcher_instances.get_matches(d1, d2) - # Check that it actually produced output - assert len(matches_coma_matcher_instances) > 0 - # Assume the Schema and instance should provide different results - assert matches_coma_matcher_schema != matches_coma_matcher_instances - def test_cupid(self): - # Test the CUPID matcher - cu_matcher = Cupid() - matches_cu_matcher = cu_matcher.get_matches(d1, d2) - # Check that it actually produced output - assert len(matches_cu_matcher) > 0 - cu_matcher = Cupid(parallelism=2) - matches_cu_matcher = cu_matcher.get_matches(d1, d2) - # Check that it actually produced output - assert len(matches_cu_matcher) > 0 +def test_cupid(): + # Test the CUPID matcher + cu_matcher = Cupid() + matches_cu_matcher = cu_matcher.get_matches(d1, d2) + # Check that it actually produced output + assert len(matches_cu_matcher) > 0 + cu_matcher = Cupid(parallelism=2) + matches_cu_matcher = cu_matcher.get_matches(d1, d2) + # Check that it actually produced output + assert len(matches_cu_matcher) > 0 - def test_distribution_based(self): - # Test the Distribution based matcher - distribution_based_matcher = DistributionBased() - matches_db_matcher = distribution_based_matcher.get_matches(d1, d2) - # Check that it actually produced output - assert len(matches_db_matcher) > 0 - distribution_based_matcher = DistributionBased(process_num=2) - matches_db_matcher = distribution_based_matcher.get_matches(d1, d2) - # Check that it actually produced output - assert len(matches_db_matcher) > 0 - def test_jaccard(self): - # Test the Jaccard matcher with exact string similarity - jd_matcher = JaccardDistanceMatcher(distance_fun=StringDistanceFunction.Exact) - matches_jd_matcher = jd_matcher.get_matches(d1, d2) - # Check that it actually produced output - assert len(matches_jd_matcher) > 0 +def test_distribution_based(): + # Test the Distribution based matcher + distribution_based_matcher = DistributionBased() + matches_db_matcher = distribution_based_matcher.get_matches(d1, d2) + # Check that it actually produced output + assert len(matches_db_matcher) > 0 + distribution_based_matcher = DistributionBased(process_num=2) + matches_db_matcher = distribution_based_matcher.get_matches(d1, d2) + # Check that it actually produced output + assert len(matches_db_matcher) > 0 - def test_jaccard_hamming(self): - # Test the Jaccard matcher with Hamming distance - jd_matcher = JaccardDistanceMatcher(distance_fun=StringDistanceFunction.Hamming) - matches_jd_matcher = jd_matcher.get_matches(d1, d2) - # Check that it actually produced output - assert len(matches_jd_matcher) > 0 - jd_matcher = JaccardDistanceMatcher(threshold_dist=0.5, - process_num=2, - distance_fun=StringDistanceFunction.Hamming) - matches_jd_matcher = jd_matcher.get_matches(d1, d2) - # Check that it actually produced output - assert len(matches_jd_matcher) > 0 - def test_jaccard_levenshtein(self): - # Test the Jaccard matcher with Levenshtein distance - jd_matcher = JaccardDistanceMatcher(distance_fun=StringDistanceFunction.Levenshtein) - matches_jd_matcher = jd_matcher.get_matches(d1, d2) - # Check that it actually produced output - assert len(matches_jd_matcher) > 0 - jd_matcher = JaccardDistanceMatcher(threshold_dist=0.5, - process_num=2, - distance_fun=StringDistanceFunction.Levenshtein) - matches_jd_matcher = jd_matcher.get_matches(d1, d2) - # Check that it actually produced output - assert len(matches_jd_matcher) > 0 +def test_jaccard(): + # Test the Jaccard matcher with exact string similarity + jd_matcher = JaccardDistanceMatcher(distance_fun=StringDistanceFunction.Exact) + matches_jd_matcher = jd_matcher.get_matches(d1, d2) + # Check that it actually produced output + assert len(matches_jd_matcher) > 0 - def test_jaccard_damerau_levenshtein(self): - # Test the Jaccard matcher with Damerau-Levenshtein distance - jd_matcher = JaccardDistanceMatcher(distance_fun=StringDistanceFunction.DamerauLevenshtein) - matches_jd_matcher = jd_matcher.get_matches(d1, d2) - # Check that it actually produced output - assert len(matches_jd_matcher) > 0 - jd_matcher = JaccardDistanceMatcher(threshold_dist=0.5, - process_num=2, - distance_fun=StringDistanceFunction.DamerauLevenshtein) - matches_jd_matcher = jd_matcher.get_matches(d1, d2) - # Check that it actually produced output - assert len(matches_jd_matcher) > 0 - def test_jaccard_jaro_winkler(self): - # Test the Jaccard matcher with Jaro-Winkler distance - jd_matcher = JaccardDistanceMatcher(distance_fun=StringDistanceFunction.JaroWinkler) - matches_jd_matcher = jd_matcher.get_matches(d1, d2) - # Check that it actually produced output - assert len(matches_jd_matcher) > 0 - jd_matcher = JaccardDistanceMatcher(threshold_dist=0.5, - process_num=2, - distance_fun=StringDistanceFunction.JaroWinkler) - matches_jd_matcher = jd_matcher.get_matches(d1, d2) - # Check that it actually produced output - assert len(matches_jd_matcher) > 0 +@pytest.mark.parametrize("distance_function", [StringDistanceFunction.Hamming, StringDistanceFunction.Levenshtein, + StringDistanceFunction.DamerauLevenshtein, + StringDistanceFunction.JaroWinkler, StringDistanceFunction.Jaro]) +def test_jaccard_distance_function(distance_function): + # Test the Jaccard matcher with different distance functions + jd_matcher = JaccardDistanceMatcher(distance_fun=distance_function) + matches_jd_matcher = jd_matcher.get_matches(d1, d2) + # Check that it actually produced output + assert len(matches_jd_matcher) > 0 + jd_matcher = JaccardDistanceMatcher(threshold_dist=0.5, process_num=2, distance_fun=distance_function) + matches_jd_matcher = jd_matcher.get_matches(d1, d2) + # Check that it actually produced output + assert len(matches_jd_matcher) > 0 - def test_jaccard_jaro(self): - # Test the Jaccard matcher with Jaro distance - jd_matcher = JaccardDistanceMatcher(distance_fun=StringDistanceFunction.Jaro) - matches_jd_matcher = jd_matcher.get_matches(d1, d2) - # Check that it actually produced output - assert len(matches_jd_matcher) > 0 - jd_matcher = JaccardDistanceMatcher(threshold_dist=0.5, process_num=2, distance_fun=StringDistanceFunction.Jaro) - matches_jd_matcher = jd_matcher.get_matches(d1, d2) - # Check that it actually produced output - assert len(matches_jd_matcher) > 0 - def test_similarity_flooding(self): - # Test the Similarity flooding matcher - sf_matcher = SimilarityFlooding() - matches_sf_matcher = sf_matcher.get_matches(d1, d2) - # Check that it actually produced output - assert len(matches_sf_matcher) > 0 +def test_similarity_flooding(): + # Test the Similarity flooding matcher + sf_matcher = SimilarityFlooding() + matches_sf_matcher = sf_matcher.get_matches(d1, d2) + # Check that it actually produced output + assert len(matches_sf_matcher) > 0