-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: implement
ValidationErrCounter()
This counter allows counting errors by category without losing information about the individual errors
- Loading branch information
1 parent
ab355b4
commit 6aaf511
Showing
2 changed files
with
242 additions
and
0 deletions.
There are no files selected for viewing
71 changes: 71 additions & 0 deletions
71
src/dandisets_linkml_status_tools/tools/validation_err_counter.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
from collections import Counter, defaultdict | ||
from collections.abc import Callable, Iterable | ||
|
||
|
||
class ValidationErrCounter: | ||
|
||
def __init__(self, err_categorizer: Callable[[tuple], tuple]): | ||
""" | ||
Initialize the validation error counter | ||
:param err_categorizer: A function that categorizes validation errors, | ||
represented by tuples, into categories, also represented by tuples | ||
""" | ||
self._err_categorizer = err_categorizer | ||
|
||
# A dictionary with keys being the categories of the errors and values being | ||
# `Counter` objects with keys being individual errors counted in the | ||
# corresponding category. | ||
self._err_ctrs_by_cat: dict[tuple, Counter[tuple]] = defaultdict(Counter) | ||
|
||
def count(self, errs: Iterable[tuple]) -> None: | ||
""" | ||
Count the validation errors | ||
:param errs: An iterable of tuples representing validation errors | ||
""" | ||
for err in errs: | ||
cat = self._err_categorizer(err) | ||
|
||
# Count the error in the corresponding category | ||
self._err_ctrs_by_cat[cat].update([err]) | ||
|
||
@property | ||
def counts_by_cat(self) -> dict[tuple, int]: | ||
""" | ||
Get the counts of validation errors keyed by their categories | ||
:return: A dictionary with keys being tuples representing categories of the | ||
errors and values being the counts of the errors in the corresponding | ||
categories | ||
""" | ||
return {cat: ctr.total() for cat, ctr in self._err_ctrs_by_cat.items()} | ||
|
||
def cats(self) -> set[tuple]: | ||
""" | ||
Get the categories of the validation errors | ||
:return: The set of all categories of the validation errors represented | ||
by tuples | ||
""" | ||
return set(self._err_ctrs_by_cat.keys()) | ||
|
||
def __getitem__(self, cat: tuple) -> Counter[tuple]: | ||
""" | ||
Get the `Counter` corresponding to a category of the validation errors | ||
:param cat: The category of the validation errors | ||
:return: The `Counter` object | ||
""" | ||
return self._err_ctrs_by_cat[cat].copy() | ||
|
||
def items(self) -> list[tuple[tuple, Counter[tuple]]]: | ||
""" | ||
Get the items of the counter | ||
:return: A list of tuples, each consisting of a tuple representing a category of | ||
validation errors and a `Counter` object representing the counts of the | ||
errors in the that category | ||
""" | ||
|
||
return [(cat, ctr.copy()) for cat, ctr in self._err_ctrs_by_cat.items()] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,171 @@ | ||
from collections import Counter | ||
|
||
import pytest | ||
|
||
from dandisets_linkml_status_tools.tools.validation_err_counter import ( | ||
ValidationErrCounter, | ||
) | ||
|
||
|
||
def simple_categorizer(err: tuple) -> tuple:
    """
    Categorize an error tuple by its first element.

    The category is a one-element tuple wrapping that first element, e.g.
    ('TypeError', 'some message') is categorized as ('TypeError',).
    """
    # A real categorizer could inspect more than the leading element of `err`
    err_type = err[0]
    return (err_type,)
|
||
|
||
@pytest.fixture
def err_counter():
    """
    Provide a ValidationErrCounter instance that categorizes errors with
    simple_categorizer.
    """
    counter = ValidationErrCounter(err_categorizer=simple_categorizer)
    return counter
|
||
|
||
def test_init(err_counter):
    """
    A newly created ValidationErrCounter holds an empty dict of per-category counters.
    """
    ctrs_by_cat = err_counter._err_ctrs_by_cat

    assert isinstance(ctrs_by_cat, dict)
    assert len(ctrs_by_cat) == 0
|
||
|
||
def test_count_single(err_counter):
    """
    Counting one error yields one category holding that error with a count of 1.
    """
    err = ("ValueError", "Some message")
    cat = ("ValueError",)

    err_counter.count([err])
    ctrs_by_cat = err_counter._err_ctrs_by_cat

    # Check the categories first so the lookup below hits an existing key
    assert set(ctrs_by_cat) == {cat}
    assert ctrs_by_cat[cat] == {err: 1}
|
||
|
||
def test_count_multiple_same_category(err_counter):
    """
    Several errors sharing one category are tallied individually inside it.
    """
    cat = ("TypeError",)
    msg_1 = ("TypeError", "Message 1")
    msg_2 = ("TypeError", "Message 2")

    # `msg_1` is deliberately fed twice
    err_counter.count([msg_1, msg_2, msg_1])
    ctrs_by_cat = err_counter._err_ctrs_by_cat

    assert set(ctrs_by_cat) == {cat}

    # The repeated error is counted twice, the other one once
    expected = {msg_1: 2, msg_2: 1}
    assert ctrs_by_cat[cat] == expected
|
||
|
||
def test_count_multiple_different_categories(err_counter):
    """
    Errors of different categories end up in separate per-category counters.
    """
    err_counter.count(
        [
            ("TypeError", "Message A"),
            ("ValueError", "Message B"),
            ("ValueError", "Message B"),
            ("KeyError", "Message C"),
        ]
    )
    ctrs_by_cat = err_counter._err_ctrs_by_cat

    # Expected tally of individual errors for each category
    expected = {
        ("TypeError",): {("TypeError", "Message A"): 1},
        ("ValueError",): {("ValueError", "Message B"): 2},
        ("KeyError",): {("KeyError", "Message C"): 1},
    }

    assert set(ctrs_by_cat) == set(expected)

    for cat, expected_ctr in expected.items():
        assert ctrs_by_cat[cat] == expected_ctr
|
||
|
||
def test_counts_by_cat(err_counter):
    """
    The counts_by_cat property reports the total number of errors per category.
    """
    type_errs = [("TypeError", "Message A"), ("TypeError", "Message B")]
    key_errs = [("KeyError", "Message C")] * 3

    err_counter.count(type_errs + key_errs)
    totals = err_counter.counts_by_cat

    # Two 'TypeError' errors and three 'KeyError' errors were counted
    assert totals[("TypeError",)] == 2
    assert totals[("KeyError",)] == 3
|
||
|
||
def test_cats(err_counter):
    """
    The cats method returns the set of categories of the counted errors.
    """
    err_counter.count(
        [
            ("TypeError", "Message 1"),
            ("ValueError", "Message 2"),
        ]
    )

    # Exactly one category per distinct error type that was fed in
    expected_cats = {("TypeError",), ("ValueError",)}
    assert err_counter.cats() == expected_cats
|
||
|
||
def test_getitem(err_counter):
    """
    Indexing the counter by category returns a copy of that category's Counter.
    """
    err = ("ValueError", "Some message")
    cat = ("ValueError",)
    err_counter.count([err])

    fetched = err_counter[cat]

    assert isinstance(fetched, Counter)
    # The returned object must be distinct from the internally stored Counter
    assert fetched is not err_counter._err_ctrs_by_cat[cat]
    assert fetched == {err: 1}
|
||
|
||
def test_items(err_counter):
    """
    The items method returns a list of (category, Counter) pairs holding copies.
    """
    err_counter.count(
        [
            ("TypeError", "Message 1"),
            ("ValueError", "Message 2"),
        ]
    )

    pairs = err_counter.items()
    # One pair per category: ((TypeError,), Counter) and ((ValueError,), Counter)
    assert len(pairs) == 2

    # Order by category so the pairs can be checked positionally
    first, second = sorted(pairs, key=lambda pair: pair[0])
    assert first[0] == ("TypeError",)
    assert second[0] == ("ValueError",)

    # Each pair carries the tally of its own category
    assert first[1] == {("TypeError", "Message 1"): 1}
    assert second[1] == {("ValueError", "Message 2"): 1}

    # None of the returned Counters may be one of the internally stored objects
    for _, returned_ctr in pairs:
        for stored_ctr in err_counter._err_ctrs_by_cat.values():
            assert returned_ctr is not stored_ctr