Skip to content

Commit

Permalink
feat: Implement Pydantic validation err count table using `Validation…
Browse files Browse the repository at this point in the history
…ErrCounter()`
  • Loading branch information
candleindark committed Jan 13, 2025
1 parent 614b342 commit e5e6c66
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 196 deletions.
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import logging
from itertools import chain
from pathlib import Path
from typing import Annotated
from typing import TYPE_CHECKING, Annotated, cast

if TYPE_CHECKING:
from collections.abc import Iterable

from jsondiff import diff
from pydantic import Field
Expand Down Expand Up @@ -32,7 +35,10 @@
gen_diff_cell,
gen_pydantic_validation_errs_cell,
gen_row,
pydantic_validation_err_count_table,
validation_err_count_table,
)
from dandisets_linkml_status_tools.tools.validation_err_counter import (
ValidationErrCounter,
)

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -312,24 +318,65 @@ def _output_dandiset_validation_diff_reports(
logger.info("Creating dandiset validation diff report directory %s", output_dir)
output_dir.mkdir(parents=True)

err1_rep_iters: list[Iterable[tuple[str, str, tuple[str | int], Path]]] = []
err2_rep_iters: list[Iterable[tuple[str, str, tuple[str | int], Path]]] = []
for r in reports:
p = Path(r.dandiset_identifier, r.dandiset_version)

# Tuple representation of the Pydantic validation errors
err1_rep_iters.append(
(e["type"], e["msg"], tuple(e["loc"]), p)
for e in r.pydantic_validation_errs1
)
err2_rep_iters.append(
(e["type"], e["msg"], tuple(e["loc"]), p)
for e in r.pydantic_validation_errs2
)

err1_reps: Iterable[tuple[str, str, tuple[str | int, ...], Path]] = (
chain.from_iterable(err1_rep_iters)
)
err2_reps: Iterable[tuple[str, str, tuple[str | int, ...], Path]] = (
chain.from_iterable(err2_rep_iters)
)

def err_categorizer(err: tuple) -> tuple[str, str, tuple[str, ...]]:
"""
Categorize a Pydantic validation error represented as a tuple using the same
tuple without the path component to the dandiset at a particular version and
with a generalized "loc" with all array indices replaced by "[*]"
:param err: The tuple representing the Pydantic validation error
:return: The tuple representing the category that the error belongs to
"""
err = cast(tuple[str, str, tuple[str | int, ...], Path], err)
type_, msg = err[0], err[1]

# Generalize the "loc" by replacing all array indices with "[*]"
loc = cast(
tuple[str, ...], tuple("[*]" if isinstance(v, int) else v for v in err[2])
)

return type_, msg, loc

pydantic_validation_errs1_ctr = ValidationErrCounter(err_categorizer)
pydantic_validation_errs2_ctr = ValidationErrCounter(err_categorizer)

pydantic_validation_errs1_ctr.count(err1_reps)
pydantic_validation_errs2_ctr.count(err2_reps)

with (output_dir / summary_file_name).open("w") as summary_f:
# === Output counts of different categories of Pydantic validation errors for
# validations done with separate schemas ===
summary_f.write("### Pydantic errs 1 counts\n\n")
summary_f.write(
pydantic_validation_err_count_table(
chain.from_iterable(r.pydantic_validation_errs1 for r in reports),
compress=True,
)
validation_err_count_table(pydantic_validation_errs1_ctr.counts_by_cat)
)

summary_f.write("\n")
summary_f.write("### Pydantic errs 2 counts\n\n")
summary_f.write(
pydantic_validation_err_count_table(
chain.from_iterable(r.pydantic_validation_errs2 for r in reports),
compress=True,
)
validation_err_count_table(pydantic_validation_errs2_ctr.counts_by_cat)
)

# Write the header and alignment rows of the summary table
Expand Down
30 changes: 0 additions & 30 deletions src/dandisets_linkml_status_tools/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,33 +551,3 @@ def get_validation_reports_entries(
for dandiset_version in reports_of_specific_dandiset_id:
entries.add((dandiset_id, dandiset_version))
return entries


def count_pydantic_validation_errs(
errs: Iterable[dict[str, Any]], *, compress: bool = False
) -> Counter[tuple[str, str, tuple[int | str, ...]]]:
"""
Count an iterable of Pydantic validation errors each represented by a dictionary
:param errs: The iterable of Pydantic validation errors
:param compress: A boolean indicating whether to compress the counts by considering
all index values in the location of the errors the same. These values are to be
represented by the string "[*]" in the keys of the returning counter.
:return: The `Counter` object that counts the errors by categories identified by
the error type, message, and location. I.e., each key in the counter is a tuple,
consisting of the error type ("type"), message ("msg"),
and location ("loc" as a tuple) of the errors counted in that category.
"""
if not compress:
c = Counter((err["type"], err["msg"], tuple(err["loc"])) for err in errs)
else:
c = Counter(
(
err["type"],
err["msg"],
tuple("[*]" if isinstance(v, int) else v for v in err["loc"]),
)
for err in errs
)

return c
21 changes: 0 additions & 21 deletions src/dandisets_linkml_status_tools/tools/md.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

from collections.abc import Iterable
from pathlib import Path
from typing import Any

from dandisets_linkml_status_tools.models import PydanticValidationErrsType
from dandisets_linkml_status_tools.tools.typing import Stringable
Expand Down Expand Up @@ -85,26 +84,6 @@ def validation_err_count_table(c: dict[tuple, int]) -> str:
)


def pydantic_validation_err_count_table(
    errs: Iterable[dict[str, Any]], *, compress: bool = False
) -> str:
    """
    Render the counts of the given Pydantic validation errors as a Markdown table

    :param errs: The iterable of Pydantic validation errors, each represented by
        a dictionary
    :param compress: Whether to treat all index values in an error's location as
        the same when counting. This flag is passed through unchanged to
        `count_pydantic_validation_errs`.
    :return: The string presenting the table in Markdown format
    """
    # Imported inside the function rather than at module level
    # (presumably to avoid a circular import with the `tools` package; confirm)
    from dandisets_linkml_status_tools.tools import count_pydantic_validation_errs

    counts_by_category = count_pydantic_validation_errs(errs, compress=compress)
    return validation_err_count_table(counts_by_category)


# The set of special Markdown characters that need to be escaped
# This set doesn't include (<, >, |) because they are HTML-sensitive characters
BASE_SPECIAL_CHARS = set(r"\`*_{}[]()#+-.!")
Expand Down
136 changes: 1 addition & 135 deletions tests/test_tools/test__init__.py
Original file line number Diff line number Diff line change
@@ -1,144 +1,10 @@
from collections import Counter

import pytest
from jsonschema import ValidationError
from linkml.validator.report import Severity, ValidationResult

from dandisets_linkml_status_tools.models import JsonschemaValidationErrorType
from dandisets_linkml_status_tools.tools import (
count_pydantic_validation_errs,
get_linkml_err_counts,
)


# Inputs shared by the uncompressed and compressed variant of each scenario.
# The function under test only reads them, so sharing the objects is safe.
_SINGLE_ERR = [{"type": "value_error", "msg": "Invalid value", "loc": ["field_a"]}]
_DISTINCT_ERRS = [
    {"type": "value_error", "msg": "Invalid value", "loc": ["field_a", 0]},
    {"type": "type_error", "msg": "Wrong type", "loc": ["field_b", 2]},
    {"type": "missing_field", "msg": "Field required", "loc": ["field_c"]},
]
_REPEATED_ERRS = [
    {"type": "value_error", "msg": "Invalid value", "loc": ["field_a"]},
    {"type": "value_error", "msg": "Invalid value", "loc": ["field_a"]},
    {"type": "value_error", "msg": "Invalid value", "loc": ["field_a"]},
]
_MULTI_INDEX_ERRS = [
    {"type": "value_error", "msg": "Invalid value", "loc": ["field_a", 0]},
    {"type": "value_error", "msg": "Invalid value", "loc": ["field_a", 1]},
    {"type": "type_error", "msg": "Wrong type", "loc": ["field_b", 2, "subfield"]},
]


@pytest.mark.parametrize(
    ("errs", "compress", "expected"),
    [
        # Empty input
        ([], False, Counter()),
        ([], True, Counter()),
        # Single error
        (
            _SINGLE_ERR,
            False,
            Counter({("value_error", "Invalid value", ("field_a",)): 1}),
        ),
        (
            _SINGLE_ERR,
            True,
            # Unchanged by compression: there is no integer index in "loc"
            Counter({("value_error", "Invalid value", ("field_a",)): 1}),
        ),
        # Multiple distinct errors
        (
            _DISTINCT_ERRS,
            False,
            Counter(
                {
                    ("value_error", "Invalid value", ("field_a", 0)): 1,
                    ("type_error", "Wrong type", ("field_b", 2)): 1,
                    ("missing_field", "Field required", ("field_c",)): 1,
                }
            ),
        ),
        (
            _DISTINCT_ERRS,
            True,
            # Integer indices in "loc" become "[*]"
            Counter(
                {
                    ("value_error", "Invalid value", ("field_a", "[*]")): 1,
                    ("type_error", "Wrong type", ("field_b", "[*]")): 1,
                    ("missing_field", "Field required", ("field_c",)): 1,
                }
            ),
        ),
        # Repeated identical errors
        (
            _REPEATED_ERRS,
            False,
            Counter({("value_error", "Invalid value", ("field_a",)): 3}),
        ),
        (
            _REPEATED_ERRS,
            True,
            # Unchanged by compression: there is no integer index in "loc"
            Counter({("value_error", "Invalid value", ("field_a",)): 3}),
        ),
        # Multiple integer indices
        (
            _MULTI_INDEX_ERRS,
            False,
            Counter(
                {
                    ("value_error", "Invalid value", ("field_a", 0)): 1,
                    ("value_error", "Invalid value", ("field_a", 1)): 1,
                    ("type_error", "Wrong type", ("field_b", 2, "subfield")): 1,
                }
            ),
        ),
        (
            _MULTI_INDEX_ERRS,
            True,
            # Errors differing only by index collapse into one "[*]" category
            Counter(
                {
                    ("value_error", "Invalid value", ("field_a", "[*]")): 2,
                    ("type_error", "Wrong type", ("field_b", "[*]", "subfield")): 1,
                }
            ),
        ),
    ],
)
def test_count_pydantic_validation_errs(errs, compress, expected):
    """
    Check `count_pydantic_validation_errs` against the expected counter for each
    parametrized combination of input errors (with "loc" given as a list rather
    than a tuple) and `compress` setting.
    """
    actual = count_pydantic_validation_errs(errs, compress=compress)
    assert actual == expected
from dandisets_linkml_status_tools.tools import get_linkml_err_counts


@pytest.mark.parametrize(
Expand Down

0 comments on commit e5e6c66

Please sign in to comment.