feat: implement ValidationErrCounter()

This counter counts validation errors by category without losing information about the individual errors.
dandi · Jan 13, 2025 · 6aaf511 · 6aaf511
1 parent ab355b4
commit 6aaf511
Show file tree

Hide file tree

Showing 2 changed files with 242 additions and 0 deletions.
diff --git a/src/dandisets_linkml_status_tools/tools/validation_err_counter.py b/src/dandisets_linkml_status_tools/tools/validation_err_counter.py
@@ -0,0 +1,71 @@
+from collections import Counter, defaultdict
+from collections.abc import Callable, Iterable
+
+
+class ValidationErrCounter:
+
+    def __init__(self, err_categorizer: Callable[[tuple], tuple]):
+        """
+        Initialize the validation error counter
+
+        :param err_categorizer: A function that categorizes validation errors,
+            represented by tuples, into categories, also represented by tuples
+        """
+        self._err_categorizer = err_categorizer
+
+        # A dictionary with keys being the categories of the errors and values being
+        # `Counter` objects with keys being individual errors counted in the
+        # corresponding category.
+        self._err_ctrs_by_cat: dict[tuple, Counter[tuple]] = defaultdict(Counter)
+
+    def count(self, errs: Iterable[tuple]) -> None:
+        """
+        Count the validation errors
+
+        :param errs: An iterable of tuples representing validation errors
+        """
+        for err in errs:
+            cat = self._err_categorizer(err)
+
+            # Count the error in the corresponding category
+            self._err_ctrs_by_cat[cat].update([err])
+
+    @property
+    def counts_by_cat(self) -> dict[tuple, int]:
+        """
+        Get the counts of validation errors keyed by their categories
+
+        :return: A dictionary with keys being tuples representing categories of the
+            errors and values being the counts of the errors in the corresponding
+            categories
+        """
+        return {cat: ctr.total() for cat, ctr in self._err_ctrs_by_cat.items()}
+
+    def cats(self) -> set[tuple]:
+        """
+        Get the categories of the validation errors
+
+        :return: The set of all categories of the validation errors represented
+            by tuples
+        """
+        return set(self._err_ctrs_by_cat.keys())
+
+    def __getitem__(self, cat: tuple) -> Counter[tuple]:
+        """
+        Get the `Counter` corresponding to a category of the validation errors
+
+        :param cat: The category of the validation errors
+        :return: The `Counter` object
+        """
+        return self._err_ctrs_by_cat[cat].copy()
+
+    def items(self) -> list[tuple[tuple, Counter[tuple]]]:
+        """
+        Get the items of the counter
+
+        :return: A list of tuples, each consisting of a tuple representing a category of
+            validation errors and a `Counter` object representing the counts of the
+            errors in the that category
+        """
+
+        return [(cat, ctr.copy()) for cat, ctr in self._err_ctrs_by_cat.items()]
diff --git a/tests/test_tools/test_validation_err_counter.py b/tests/test_tools/test_validation_err_counter.py
@@ -0,0 +1,171 @@
+from collections import Counter
+
+import pytest
+
+from dandisets_linkml_status_tools.tools.validation_err_counter import (
+    ValidationErrCounter,
+)
+
+
+def simple_categorizer(err: tuple) -> tuple:
+    """
+    A simple categorizer function that:
+      - Uses the first element of the error tuple to categorize errors by type
+      - Returns a tuple representing the category. For example, ('TypeError',)
+    """
+    # In practice, you might analyze multiple elements of `err`
+    return (err[0],)
+
+
+@pytest.fixture
+def err_counter():
+    """
+    Returns a ValidationErrCounter instance using our simple_categorizer above.
+    """
+    return ValidationErrCounter(err_categorizer=simple_categorizer)
+
+
+def test_init(err_counter):
+    """
+    Test that the ValidationErrCounter is initialized properly.
+    """
+    assert isinstance(err_counter._err_ctrs_by_cat, dict)
+    assert len(err_counter._err_ctrs_by_cat) == 0
+
+
+def test_count_single(err_counter):
+    """
+    Test counting a single error.
+    """
+    errors = [("ValueError", "Some message")]
+    err_counter.count(errors)
+
+    err_ctrs_by_cat = err_counter._err_ctrs_by_cat
+
+    assert set(err_ctrs_by_cat.keys()) == {("ValueError",)}
+    assert err_ctrs_by_cat[("ValueError",)] == {("ValueError", "Some message"): 1}
+
+
+def test_count_multiple_same_category(err_counter):
+    """
+    Test counting multiple errors that fall into the same category.
+    """
+    errors = [
+        ("TypeError", "Message 1"),
+        ("TypeError", "Message 2"),
+        ("TypeError", "Message 1"),  # repeated error
+    ]
+    err_counter.count(errors)
+
+    err_ctrs_by_cat = err_counter._err_ctrs_by_cat
+
+    assert set(err_ctrs_by_cat.keys()) == {("TypeError",)}
+
+    # "Message 1" appears twice, "Message 2" appears once
+    assert err_ctrs_by_cat[("TypeError",)] == {
+        ("TypeError", "Message 1"): 2,
+        ("TypeError", "Message 2"): 1,
+    }
+
+
+def test_count_multiple_different_categories(err_counter):
+    """
+    Test counting multiple errors that fall into different categories.
+    """
+    errors = [
+        ("TypeError", "Message A"),
+        ("ValueError", "Message B"),
+        ("ValueError", "Message B"),
+        ("KeyError", "Message C"),
+    ]
+    err_counter.count(errors)
+
+    err_ctrs_by_cat = err_counter._err_ctrs_by_cat
+
+    assert set(err_ctrs_by_cat.keys()) == {
+        ("TypeError",),
+        ("ValueError",),
+        ("KeyError",),
+    }
+
+    assert err_ctrs_by_cat[("TypeError",)] == {("TypeError", "Message A"): 1}
+    assert err_ctrs_by_cat[("ValueError",)] == {("ValueError", "Message B"): 2}
+    assert err_ctrs_by_cat[("KeyError",)] == {("KeyError", "Message C"): 1}
+
+
+def test_counts_by_cat(err_counter):
+    """
+    Test the counts_by_cat property, which returns the sum of errors in each category.
+    """
+    errors = [
+        ("TypeError", "Message A"),
+        ("TypeError", "Message B"),
+        ("KeyError", "Message C"),
+        ("KeyError", "Message C"),
+        ("KeyError", "Message C"),
+    ]
+    err_counter.count(errors)
+
+    counts = err_counter.counts_by_cat
+    # There are 2 'TypeError' errors and 3 'KeyError' error
+    assert counts[("TypeError",)] == 2
+    assert counts[("KeyError",)] == 3
+
+
+def test_cats(err_counter):
+    """
+    Test the cats method, which returns the set of error categories.
+    """
+    errors = [
+        ("TypeError", "Message 1"),
+        ("ValueError", "Message 2"),
+    ]
+    err_counter.count(errors)
+    categories = err_counter.cats()
+
+    # Should contain exactly ("TypeError",) and ("ValueError",)
+    assert categories == {("TypeError",), ("ValueError",)}
+
+
+def test_getitem(err_counter):
+    """
+    Test the __getitem__ method, which returns a copy of the Counter for a given category.
+    """
+    errors = [("ValueError", "Some message")]
+    err_counter.count(errors)
+
+    value_error_counter = err_counter[("ValueError",)]
+
+    assert isinstance(value_error_counter, Counter)
+    # value_error_counter should be a copy of the original Counter
+    assert value_error_counter is not err_counter._err_ctrs_by_cat[("ValueError",)]
+    assert value_error_counter == {("ValueError", "Some message"): 1}
+
+
+def test_items(err_counter):
+    """
+    Test the items method, which returns list of (category, Counter) pairs.
+    """
+    errors = [
+        ("TypeError", "Message 1"),
+        ("ValueError", "Message 2"),
+    ]
+    err_counter.count(errors)
+
+    items = err_counter.items()
+    # We expect two pairs: ((TypeError,), Counter) and ((ValueError,), Counter)
+    assert len(items) == 2
+
+    # Sort so we can reliably test
+    items_sorted = sorted(items, key=lambda x: x[0])
+    assert items_sorted[0][0] == ("TypeError",)
+    assert items_sorted[1][0] == ("ValueError",)
+
+    # Check that the counters inside match
+    assert items_sorted[0][1] == {("TypeError", "Message 1"): 1}
+    assert items_sorted[1][1] == {("ValueError", "Message 2"): 1}
+
+    # Test the Counter in each pair is a copy
+    for _, counter in items:
+        for counter_original in err_counter._err_ctrs_by_cat.values():
+            assert counter is not counter_original