Skip to content

Commit

Permalink
feat: implement ValidationErrCounter()
Browse files Browse the repository at this point in the history
This counter allows counting errors by categories
without losing information of the errors
  • Loading branch information
candleindark committed Jan 13, 2025
1 parent ab355b4 commit 6aaf511
Show file tree
Hide file tree
Showing 2 changed files with 242 additions and 0 deletions.
71 changes: 71 additions & 0 deletions src/dandisets_linkml_status_tools/tools/validation_err_counter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from collections import Counter, defaultdict
from collections.abc import Callable, Iterable


class ValidationErrCounter:

def __init__(self, err_categorizer: Callable[[tuple], tuple]):
"""
Initialize the validation error counter
:param err_categorizer: A function that categorizes validation errors,
represented by tuples, into categories, also represented by tuples
"""
self._err_categorizer = err_categorizer

# A dictionary with keys being the categories of the errors and values being
# `Counter` objects with keys being individual errors counted in the
# corresponding category.
self._err_ctrs_by_cat: dict[tuple, Counter[tuple]] = defaultdict(Counter)

def count(self, errs: Iterable[tuple]) -> None:
"""
Count the validation errors
:param errs: An iterable of tuples representing validation errors
"""
for err in errs:
cat = self._err_categorizer(err)

# Count the error in the corresponding category
self._err_ctrs_by_cat[cat].update([err])

@property
def counts_by_cat(self) -> dict[tuple, int]:
"""
Get the counts of validation errors keyed by their categories
:return: A dictionary with keys being tuples representing categories of the
errors and values being the counts of the errors in the corresponding
categories
"""
return {cat: ctr.total() for cat, ctr in self._err_ctrs_by_cat.items()}

def cats(self) -> set[tuple]:
"""
Get the categories of the validation errors
:return: The set of all categories of the validation errors represented
by tuples
"""
return set(self._err_ctrs_by_cat.keys())

def __getitem__(self, cat: tuple) -> Counter[tuple]:
"""
Get the `Counter` corresponding to a category of the validation errors
:param cat: The category of the validation errors
:return: The `Counter` object
"""
return self._err_ctrs_by_cat[cat].copy()

def items(self) -> list[tuple[tuple, Counter[tuple]]]:
"""
Get the items of the counter
:return: A list of tuples, each consisting of a tuple representing a category of
validation errors and a `Counter` object representing the counts of the
errors in the that category
"""

return [(cat, ctr.copy()) for cat, ctr in self._err_ctrs_by_cat.items()]
171 changes: 171 additions & 0 deletions tests/test_tools/test_validation_err_counter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
from collections import Counter

import pytest

from dandisets_linkml_status_tools.tools.validation_err_counter import (
ValidationErrCounter,
)


def simple_categorizer(err: tuple) -> tuple:
    """
    Categorize an error tuple by its first element.

    The category is a one-element tuple holding that first element, e.g. an error
    starting with 'TypeError' is categorized as ('TypeError',).
    """
    # A real categorizer might inspect more than one element of `err`
    err_type = err[0]
    return (err_type,)


@pytest.fixture
def err_counter():
    """
    Provide a ValidationErrCounter that categorizes errors with simple_categorizer.
    """
    counter = ValidationErrCounter(err_categorizer=simple_categorizer)
    return counter


def test_init(err_counter):
    """
    Verify that a new ValidationErrCounter starts with an empty, dict-based store
    of per-category counters.
    """
    store = err_counter._err_ctrs_by_cat
    assert isinstance(store, dict)
    assert len(store) == 0


def test_count_single(err_counter):
    """
    Verify that counting one error records it once under its category.
    """
    err = ("ValueError", "Some message")
    err_counter.count([err])

    by_cat = err_counter._err_ctrs_by_cat

    # Exactly one category exists, and it holds the single error with count 1
    assert set(by_cat.keys()) == {("ValueError",)}
    assert by_cat[("ValueError",)] == {err: 1}


def test_count_multiple_same_category(err_counter):
    """
    Verify that several errors sharing a category are tallied individually within
    that one category.
    """
    msg_1 = ("TypeError", "Message 1")
    msg_2 = ("TypeError", "Message 2")

    # `msg_1` is deliberately counted twice
    err_counter.count([msg_1, msg_2, msg_1])

    by_cat = err_counter._err_ctrs_by_cat

    assert set(by_cat.keys()) == {("TypeError",)}

    # Two occurrences of "Message 1" and one of "Message 2"
    expected = {msg_1: 2, msg_2: 1}
    assert by_cat[("TypeError",)] == expected


def test_count_multiple_different_categories(err_counter):
    """
    Verify that errors of different categories end up in separate per-category
    counters.
    """
    err_counter.count(
        [
            ("TypeError", "Message A"),
            ("ValueError", "Message B"),
            ("ValueError", "Message B"),
            ("KeyError", "Message C"),
        ]
    )

    by_cat = err_counter._err_ctrs_by_cat

    # Expected per-category tallies, keyed by category
    expected = {
        ("TypeError",): {("TypeError", "Message A"): 1},
        ("ValueError",): {("ValueError", "Message B"): 2},
        ("KeyError",): {("KeyError", "Message C"): 1},
    }

    assert set(by_cat.keys()) == set(expected)

    for cat, expected_ctr in expected.items():
        assert by_cat[cat] == expected_ctr


def test_counts_by_cat(err_counter):
    """
    Verify that the counts_by_cat property reports the total number of errors
    counted in each category.
    """
    type_errs = [("TypeError", "Message A"), ("TypeError", "Message B")]
    key_errs = [("KeyError", "Message C")] * 3
    err_counter.count(type_errs + key_errs)

    totals = err_counter.counts_by_cat

    # Two 'TypeError' errors and three 'KeyError' errors were counted
    assert totals[("TypeError",)] == 2
    assert totals[("KeyError",)] == 3


def test_cats(err_counter):
    """
    Verify that the cats method returns the set of categories of the counted errors.
    """
    err_counter.count(
        [
            ("TypeError", "Message 1"),
            ("ValueError", "Message 2"),
        ]
    )

    # Exactly the two categories of the counted errors are expected
    expected_cats = {("TypeError",), ("ValueError",)}
    assert err_counter.cats() == expected_cats


def test_getitem(err_counter):
    """
    Verify that indexing by category yields a copy of that category's Counter.
    """
    cat = ("ValueError",)
    err_counter.count([("ValueError", "Some message")])

    fetched = err_counter[cat]

    assert isinstance(fetched, Counter)
    # The returned object must be distinct from the internally stored Counter
    internal = err_counter._err_ctrs_by_cat[cat]
    assert fetched is not internal
    assert fetched == {("ValueError", "Some message"): 1}


def test_items(err_counter):
    """
    Verify that the items method returns a list of (category, Counter) pairs whose
    Counters are copies of the internally stored ones.
    """
    err_counter.count(
        [
            ("TypeError", "Message 1"),
            ("ValueError", "Message 2"),
        ]
    )

    pairs = err_counter.items()
    # One pair per category: (TypeError,) and (ValueError,)
    assert len(pairs) == 2

    # Order by category so the positions checked below are deterministic
    (first_cat, first_ctr), (second_cat, second_ctr) = sorted(
        pairs, key=lambda pair: pair[0]
    )
    assert first_cat == ("TypeError",)
    assert second_cat == ("ValueError",)

    # The counters carry the expected tallies
    assert first_ctr == {("TypeError", "Message 1"): 1}
    assert second_ctr == {("ValueError", "Message 2"): 1}

    # No returned Counter may be the same object as any internally stored Counter
    originals = list(err_counter._err_ctrs_by_cat.values())
    for _, returned_ctr in pairs:
        assert all(returned_ctr is not original for original in originals)

0 comments on commit 6aaf511

Please sign in to comment.