feat: Implement Pydantic validation err count table using `ValidationErrCounter()`
dandi · Jan 13, 2025 · e5e6c66 · e5e6c66
1 parent 614b342
commit e5e6c66
Show file tree

Hide file tree

Showing 4 changed files with 58 additions and 196 deletions.
diff --git a/src/dandisets_linkml_status_tools/cmd_funcs/diff_manifests_reports.py b/src/dandisets_linkml_status_tools/cmd_funcs/diff_manifests_reports.py
@@ -1,7 +1,10 @@
 import logging
 from itertools import chain
 from pathlib import Path
-from typing import Annotated
+from typing import TYPE_CHECKING, Annotated, cast
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable
 
 from jsondiff import diff
 from pydantic import Field
@@ -32,7 +35,10 @@
     gen_diff_cell,
     gen_pydantic_validation_errs_cell,
     gen_row,
-    pydantic_validation_err_count_table,
+    validation_err_count_table,
+)
+from dandisets_linkml_status_tools.tools.validation_err_counter import (
+    ValidationErrCounter,
 )
 
 logger = logging.getLogger(__name__)
@@ -312,24 +318,65 @@ def _output_dandiset_validation_diff_reports(
     logger.info("Creating dandiset validation diff report directory %s", output_dir)
     output_dir.mkdir(parents=True)
 
+    err1_rep_iters: list[Iterable[tuple[str, str, tuple[str | int, ...], Path]]] = []
+    err2_rep_iters: list[Iterable[tuple[str, str, tuple[str | int, ...], Path]]] = []
+    for r in reports:
+        p = Path(r.dandiset_identifier, r.dandiset_version)
+        errs1, errs2 = r.pydantic_validation_errs1, r.pydantic_validation_errs2
+
+        # Tuple representation of the Pydantic validation errors, built eagerly: a
+        # generator would look up `p` only when consumed, i.e., after this loop
+        err1_rep_iters.append(
+            [(e["type"], e["msg"], tuple(e["loc"]), p) for e in errs1]
+        )
+        err2_rep_iters.append(
+            [(e["type"], e["msg"], tuple(e["loc"]), p) for e in errs2]
+        )
+
+    err1_reps: Iterable[tuple[str, str, tuple[str | int, ...], Path]] = (
+        chain.from_iterable(err1_rep_iters)
+    )
+    err2_reps: Iterable[tuple[str, str, tuple[str | int, ...], Path]] = (
+        chain.from_iterable(err2_rep_iters)
+    )
+
+    def err_categorizer(err: tuple) -> tuple[str, str, tuple[str, ...]]:
+        """
+        Categorize a Pydantic validation error, given as a `(type, msg, loc, path)`
+        tuple, by dropping the dandiset version path and generalizing "loc", i.e.,
+        replacing every array index (any `int` element) in it with "[*]"
+
+        :param err: The tuple representing the Pydantic validation error
+        :return: The tuple representing the category that the error belongs to
+        """
+        err = cast(tuple[str, str, tuple[str | int, ...], Path], err)
+        type_, msg = err[0], err[1]
+
+        # Generalize the "loc" by replacing all array indices with "[*]"
+        loc = cast(
+            tuple[str, ...], tuple("[*]" if isinstance(v, int) else v for v in err[2])
+        )
+
+        return type_, msg, loc
+
+    pydantic_validation_errs1_ctr = ValidationErrCounter(err_categorizer)
+    pydantic_validation_errs2_ctr = ValidationErrCounter(err_categorizer)
+
+    pydantic_validation_errs1_ctr.count(err1_reps)
+    pydantic_validation_errs2_ctr.count(err2_reps)
+
     with (output_dir / summary_file_name).open("w") as summary_f:
         # === Output counts of different categories of Pydantic validation errors for
         # validations done with separate schemas ===
         summary_f.write("### Pydantic errs 1 counts\n\n")
         summary_f.write(
-            pydantic_validation_err_count_table(
-                chain.from_iterable(r.pydantic_validation_errs1 for r in reports),
-                compress=True,
-            )
+            validation_err_count_table(pydantic_validation_errs1_ctr.counts_by_cat)
         )
 
         summary_f.write("\n")
         summary_f.write("### Pydantic errs 2 counts\n\n")
         summary_f.write(
-            pydantic_validation_err_count_table(
-                chain.from_iterable(r.pydantic_validation_errs2 for r in reports),
-                compress=True,
-            )
+            validation_err_count_table(pydantic_validation_errs2_ctr.counts_by_cat)
         )
 
         # Write the header and alignment rows of the summary table

diff --git a/src/dandisets_linkml_status_tools/tools/__init__.py b/src/dandisets_linkml_status_tools/tools/__init__.py
@@ -551,33 +551,3 @@ def get_validation_reports_entries(
         for dandiset_version in reports_of_specific_dandiset_id:
             entries.add((dandiset_id, dandiset_version))
     return entries
-
-
-def count_pydantic_validation_errs(
-    errs: Iterable[dict[str, Any]], *, compress: bool = False
-) -> Counter[tuple[str, str, tuple[int | str, ...]]]:
-    """
-    Count an iterable of Pydantic validation errors each represented by a dictionary
-
-    :param errs: The iterable of Pydantic validation errors
-    :param compress: A boolean indicating whether to compress the counts by considering
-        all index values in the location of the errors the same. These values are to be
-        represented by the string "[*]" in the keys of the returning counter.
-    :return: The `Counter` object that counts the errors by categories identified by
-        the error type, message, and location. I.e., each key in the counter is a tuple,
-        consisting of the error type ("type"), message ("msg"),
-        and location ("loc" as a tuple) of the errors counted in that category.
-    """
-    if not compress:
-        c = Counter((err["type"], err["msg"], tuple(err["loc"])) for err in errs)
-    else:
-        c = Counter(
-            (
-                err["type"],
-                err["msg"],
-                tuple("[*]" if isinstance(v, int) else v for v in err["loc"]),
-            )
-            for err in errs
-        )
-
-    return c
diff --git a/src/dandisets_linkml_status_tools/tools/md.py b/src/dandisets_linkml_status_tools/tools/md.py
@@ -2,7 +2,6 @@
 
 from collections.abc import Iterable
 from pathlib import Path
-from typing import Any
 
 from dandisets_linkml_status_tools.models import PydanticValidationErrsType
 from dandisets_linkml_status_tools.tools.typing import Stringable
@@ -85,26 +84,6 @@ def validation_err_count_table(c: dict[tuple, int]) -> str:
     )
 
 
-def pydantic_validation_err_count_table(
-    errs: Iterable[dict[str, Any]], *, compress: bool = False
-) -> str:
-    """
-    Generate a table of Pydantic validation error counts from an iterable of Pydantic
-    validation errors each represented by a dictionary
-
-    :param errs: The iterable of Pydantic validation errors
-    :param compress: A boolean indicating whether to compress the counts by considering
-        all index values in the location of the errors the same. These values are to be
-        represented by the string "[*]" in the categories of the errors.
-    :return: The string presenting the table in Markdown format
-    """
-    from dandisets_linkml_status_tools.tools import count_pydantic_validation_errs
-
-    return validation_err_count_table(
-        count_pydantic_validation_errs(errs, compress=compress)
-    )
-
-
 # The set of special Markdown characters that need to be escaped
 # This set doesn't include (<, >, |) because they are HTML-sensitive characters
 BASE_SPECIAL_CHARS = set(r"\`*_{}[]()#+-.!")

diff --git a/tests/test_tools/test__init__.py b/tests/test_tools/test__init__.py
@@ -1,144 +1,10 @@
-from collections import Counter
 
 import pytest
 from jsonschema import ValidationError
 from linkml.validator.report import Severity, ValidationResult
 
 from dandisets_linkml_status_tools.models import JsonschemaValidationErrorType
-from dandisets_linkml_status_tools.tools import (
-    count_pydantic_validation_errs,
-    get_linkml_err_counts,
-)
-
-
-@pytest.mark.parametrize(
-    ("errs", "compress", "expected"),
-    [
-        # ──────────────────────────
-        # Empty input
-        # ──────────────────────────
-        ([], False, Counter()),
-        ([], True, Counter()),
-        # ──────────────────────────
-        # Single error
-        # ──────────────────────────
-        (
-            [{"type": "value_error", "msg": "Invalid value", "loc": ["field_a"]}],
-            False,
-            Counter({("value_error", "Invalid value", ("field_a",)): 1}),
-        ),
-        (
-            [{"type": "value_error", "msg": "Invalid value", "loc": ["field_a"]}],
-            True,
-            # Same as above, because there's no integer index to compress
-            Counter({("value_error", "Invalid value", ("field_a",)): 1}),
-        ),
-        # ──────────────────────────
-        # Multiple distinct errors
-        # ──────────────────────────
-        (
-            [
-                {"type": "value_error", "msg": "Invalid value", "loc": ["field_a", 0]},
-                {"type": "type_error", "msg": "Wrong type", "loc": ["field_b", 2]},
-                {"type": "missing_field", "msg": "Field required", "loc": ["field_c"]},
-            ],
-            False,
-            Counter(
-                {
-                    ("value_error", "Invalid value", ("field_a", 0)): 1,
-                    ("type_error", "Wrong type", ("field_b", 2)): 1,
-                    ("missing_field", "Field required", ("field_c",)): 1,
-                }
-            ),
-        ),
-        (
-            [
-                {"type": "value_error", "msg": "Invalid value", "loc": ["field_a", 0]},
-                {"type": "type_error", "msg": "Wrong type", "loc": ["field_b", 2]},
-                {"type": "missing_field", "msg": "Field required", "loc": ["field_c"]},
-            ],
-            True,
-            # Integer indices in loc become "[*]"
-            Counter(
-                {
-                    ("value_error", "Invalid value", ("field_a", "[*]")): 1,
-                    ("type_error", "Wrong type", ("field_b", "[*]")): 1,
-                    ("missing_field", "Field required", ("field_c",)): 1,
-                }
-            ),
-        ),
-        # ──────────────────────────
-        # Repeated identical errors
-        # ──────────────────────────
-        (
-            [
-                {"type": "value_error", "msg": "Invalid value", "loc": ["field_a"]},
-                {"type": "value_error", "msg": "Invalid value", "loc": ["field_a"]},
-                {"type": "value_error", "msg": "Invalid value", "loc": ["field_a"]},
-            ],
-            False,
-            Counter({("value_error", "Invalid value", ("field_a",)): 3}),
-        ),
-        (
-            [
-                {"type": "value_error", "msg": "Invalid value", "loc": ["field_a"]},
-                {"type": "value_error", "msg": "Invalid value", "loc": ["field_a"]},
-                {"type": "value_error", "msg": "Invalid value", "loc": ["field_a"]},
-            ],
-            True,
-            # Same as above, because there's no integer index to compress
-            Counter({("value_error", "Invalid value", ("field_a",)): 3}),
-        ),
-        # ──────────────────────────
-        # Multiple integer indices
-        # ──────────────────────────
-        (
-            [
-                {"type": "value_error", "msg": "Invalid value", "loc": ["field_a", 0]},
-                {"type": "value_error", "msg": "Invalid value", "loc": ["field_a", 1]},
-                {
-                    "type": "type_error",
-                    "msg": "Wrong type",
-                    "loc": ["field_b", 2, "subfield"],
-                },
-            ],
-            False,
-            Counter(
-                {
-                    ("value_error", "Invalid value", ("field_a", 0)): 1,
-                    ("value_error", "Invalid value", ("field_a", 1)): 1,
-                    ("type_error", "Wrong type", ("field_b", 2, "subfield")): 1,
-                }
-            ),
-        ),
-        (
-            [
-                {"type": "value_error", "msg": "Invalid value", "loc": ["field_a", 0]},
-                {"type": "value_error", "msg": "Invalid value", "loc": ["field_a", 1]},
-                {
-                    "type": "type_error",
-                    "msg": "Wrong type",
-                    "loc": ["field_b", 2, "subfield"],
-                },
-            ],
-            True,
-            # Index locations are replaced by "[*]"
-            Counter(
-                {
-                    ("value_error", "Invalid value", ("field_a", "[*]")): 2,
-                    ("type_error", "Wrong type", ("field_b", "[*]", "subfield")): 1,
-                }
-            ),
-        ),
-    ],
-)
-def test_count_pydantic_validation_errs(errs, compress, expected):
-    """
-    Test the count_pydantic_validation_errs function with 'loc' as a list rather than
-    a tuple, under multiple scenarios of input errors and compression settings.
-    """
-    result = count_pydantic_validation_errs(errs, compress=compress)
-    assert result == expected
+from dandisets_linkml_status_tools.tools import get_linkml_err_counts
 
 
 @pytest.mark.parametrize(