Skip to content

Commit

Permalink
Merge pull request #22 from reworkd/schema-validation
Browse files Browse the repository at this point in the history
More additions to schema validation
  • Loading branch information
snshn authored May 22, 2024
2 parents 0cd1c6d + 59c3e40 commit 619c1cb
Show file tree
Hide file tree
Showing 4 changed files with 202 additions and 70 deletions.
11 changes: 2 additions & 9 deletions harambe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
ElementHandle,
TimeoutError as PlaywrightTimeoutError,
)
from pydantic import ValidationError

from harambe.handlers import (
ResourceRequestHandler,
Expand All @@ -26,7 +25,7 @@
DuplicateHandler,
ObservationTrigger,
)
from harambe.parser.parser import PydanticSchemaParser, SchemaValidationError
from harambe.parser.parser import PydanticSchemaParser
from harambe.tracker import FileDataTracker
from harambe.types import (
URL,
Expand Down Expand Up @@ -102,13 +101,7 @@ async def save_data(self, *data: ScrapeResult) -> None:
url = self.page.url
for d in data:
if self._validator is not None:
try:
self._validator.validate(d)
except ValidationError:
raise SchemaValidationError(
data=d,
schema=self._validator.schema,
)
self._validator.validate(d)
d["__url"] = url
await self._notify_observers("on_save_data", d)

Expand Down
54 changes: 48 additions & 6 deletions harambe/parser/parser.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from abc import ABC, abstractmethod
from typing import Any, Dict, Type, Optional
from typing import Any, Dict, List, Optional, Type

from pydantic import BaseModel, create_model, Field, AnyUrl, Extra
from pydantic import BaseModel, create_model, Field, AnyUrl, Extra, ValidationError
from harambe.types import Schema


Expand All @@ -16,10 +16,10 @@ def validate(self, data: Dict[str, Any]) -> None:


class SchemaValidationError(Exception):
def __init__(self, schema, data):
def __init__(self, schema, data, message):
super().__init__(
"Data {data} does not match schema {schema}".format(
data=data, schema=schema
"Data {data} does not match schema {schema}. {message}".format(
data=data, schema=schema, message=message
)
)

Expand All @@ -34,7 +34,39 @@ def __init__(self, schema: Schema):
self.model = _schema_to_pydantic_model(schema)

def validate(self, data: Dict[str, Any]) -> None:
self.model(**data)
try:
self.model(**data)
except ValidationError as validation_error:
raise SchemaValidationError(
data=data, schema=self.schema, message=validation_error
)


def _items_schema_to_python_type(
    items_info: Schema, model_name: str = "DynamicModelItem"
) -> Type:
    """
    Convert a JSON schema's items property to a Python type

    :param items_info: Schema describing a single element of a list field;
        its "type" key selects which branch below is taken.
    :param model_name: Name prefix handed on to any model generated for
        object items, including objects reached through nested lists.
    :return: The Python type used for each element of the list.
    """
    item_type = items_info.get("type")

    if item_type == OBJECT_TYPE:
        # Object items are delegated to the schema-to-model converter
        return _schema_to_pydantic_model(
            items_info.get("properties", {}),
            model_name=f"{model_name}Object",
        )

    if item_type == LIST_TYPE:
        # Lists can't be null.
        # Pass the model name down so that objects nested inside inner lists
        # keep a name derived from their parent field instead of falling back
        # to the default prefix.
        return List[
            _items_schema_to_python_type(
                items_info.get("items", {}),
                model_name=f"{model_name}Item",
            )
        ]

    # Non-complex types aren't optional when they're within a list
    return get_type(item_type)


def _schema_to_pydantic_model(
Expand All @@ -54,6 +86,14 @@ def _schema_to_pydantic_model(
field_info.get("properties", {}),
model_name=f"{model_name}{field_name.capitalize()}",
)
elif field_type == LIST_TYPE:
# Lists can't be null
python_type = List[
_items_schema_to_python_type(
field_info.get("items", {}),
model_name=f"{model_name}Item",
)
]
else:
# Non complex types should be optional
python_type = Optional[get_type(field_type)]
Expand Down Expand Up @@ -86,6 +126,8 @@ def get_type(field: str) -> Type:
"number": float,
"float": float,
"double": float,
LIST_TYPE: List,
OBJECT_TYPE: Dict[str, Any],
# TODO: Add support for date and datetime types
# TODO: The URL type should have a custom validator to handle relative URLs
"url": AnyUrl,
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "harambe-sdk"
version = "0.10.0"
version = "0.11.0"
description = "Data extraction SDK for Playwright 🐒🍌"
authors = ["awtkns <[email protected]>"]
readme = "README.md"
Expand Down
205 changes: 151 additions & 54 deletions tests/parser/test_parser.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
from typing import Any

import pytest
from pydantic import ValidationError

from harambe.parser.parser import PydanticSchemaParser
from harambe.parser.parser import PydanticSchemaParser, SchemaValidationError
import tests.parser.schemas as schemas
from harambe.types import Schema

Expand Down Expand Up @@ -56,37 +55,69 @@
"address": {"street": None, "city": None, "zip": None},
},
),
# TODO: Support lists and nested objects
# (documents_schema, {"documents": []}),
#
# (documents_schema, {
# "documents": [
# {"title": "Document One", "document_url": "http://example.com/doc1"},
# ]
# }),
#
# (list_of_strings_schema, {"tags": ["python", "pydantic", "typing"]}),
#
# (list_of_objects_schema,
# {"users": [{"name": "Alice", "email": "[email protected]"}, {"name": "Bob", "email": "[email protected]"}]}),
#
# (object_with_list_schema, {"team": {"name": "Developers", "members": ["Alice", "Bob"]}}),
#
# (list_of_lists_schema, {"matrix": [[1, 2], [3, 4]]}),
#
# (nested_lists_and_objects_schema, {
# "departments": [
# {
# "name": "Engineering",
# "teams": [
# {
# "team_name": "Backend",
# "members": ["Alice", "Bob"]
# }
# ]
# }
# ]
# }),
(
# Schema
schemas.documents_schema,
# Data
{"documents": []},
),
(
# Schema
schemas.documents_schema,
# Data
{
"documents": [
{
"title": "Document One",
"document_url": "http://example.com/doc1",
},
]
},
),
(
# Schema
schemas.list_of_strings_schema,
# Data
{"tags": ["python", "pydantic", "typing"]},
),
(
# Schema
schemas.list_of_objects_schema,
# Data
{
"users": [
{"name": "Alice", "email": "[email protected]"},
{"name": "Bob", "email": "[email protected]"},
]
},
),
(
# Schema
schemas.object_with_list_schema,
# Data
{"team": {"name": "Developers", "members": ["Alice", "Bob"]}},
),
(
# Schema
schemas.list_of_lists_schema,
# Data
{"matrix": [[1, 2], [3, 4]]},
),
(
# Schema
schemas.nested_lists_and_objects_schema,
# Data
{
"departments": [
{
"name": "Engineering",
"teams": [
{"team_name": "Backend", "members": ["Alice", "Bob"]}
],
}
]
},
),
],
)
def test_pydantic_schema_validator_success(
Expand Down Expand Up @@ -153,31 +184,97 @@ def test_pydantic_schema_validator_success(
"address": None, # ❌ No sub-fields
},
),
# TODO: Support lists and nested objects
# (documents_schema, {"documents": None}), # Null list
# (documents_schema, {"documents": [None]}), # Null item in list
# (list_of_strings_schema, {"tags": [None, "pydantic", "typing"]}), # None in list of strings
# (list_of_objects_schema, {"users": [{"name": "Alice", "email": 12345}]}), # Invalid email type
# (object_with_list_schema, {"team": {"name": "Developers", "members": [None]}}), # None in sub-list
# (list_of_lists_schema, {"matrix": [[1, "a"], [3, 4]]}), # Invalid type in nested list
# (nested_lists_and_objects_schema, {
# "departments": [
# {
# "name": "Engineering",
# "teams": [
# {
# "team_name": "Backend",
# "members": ["Alice", None] # None in nested object list
# }
# ]
# }
# ]
# }),
(
# Schema
schemas.documents_schema,
# Data
{
"documents": None # ❌ Null list
},
),
(
# Schema
schemas.documents_schema,
# Data
{
"documents": [
None # ❌ Null item in list
]
},
),
(
# Schema
schemas.list_of_strings_schema,
# Data
{
"tags": [
None, # ❌ None in list of strings
"pydantic",
"typing",
]
},
),
(
# Schema
schemas.list_of_objects_schema,
# Data
{
"users": [
{
"name": "Alice",
"email": 12345, # ❌ Invalid email type
}
]
},
),
(
# Schema
schemas.object_with_list_schema,
# Data
{
"team": {
"name": "Developers",
"members": [None], # ❌ None in sub-list
}
},
),
(
# Schema
schemas.list_of_lists_schema,
# Data
{
"matrix": [
[1, "a"], # ❌ Invalid type in nested list
[3, 4],
]
},
),
(
# Schema
schemas.nested_lists_and_objects_schema,
# Data
{
"departments": [
{
"name": "Engineering",
"teams": [
{
"team_name": "Backend",
"members": [
"Alice",
None, # ❌ None in nested object list
],
}
],
}
]
},
),
],
)
def test_pydantic_schema_validator_error(schema: Schema, data: dict[str, Any]) -> None:
validator = PydanticSchemaParser(schema)
with pytest.raises(ValidationError):
with pytest.raises(SchemaValidationError):
validator.validate(data)


Expand Down

0 comments on commit 619c1cb

Please sign in to comment.