From 24f4da4a2bda2eed5c1bc86818ba8036fb5a5de6 Mon Sep 17 00:00:00 2001 From: awtkns <32209255+awtkns@users.noreply.github.com> Date: Wed, 13 Nov 2024 12:12:30 -0800 Subject: [PATCH 1/5] =?UTF-8?q?=F0=9F=8C=B3=20Fix=20output=20parsing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../parser/expression/functions.py | 2 +- core/harambe_core/parser/parser.py | 49 ++++++++-------- core/pyproject.toml | 2 +- core/test/parser/test_null_values.py | 58 ++++++++++++++++++- core/test/parser/test_parser.py | 4 +- sdk/pyproject.toml | 4 +- sdk/test/test_e2e.py | 14 ++--- sdk/uv.lock | 2 +- 8 files changed, 97 insertions(+), 38 deletions(-) diff --git a/core/harambe_core/parser/expression/functions.py b/core/harambe_core/parser/expression/functions.py index 5f6377a..6b3bb5f 100644 --- a/core/harambe_core/parser/expression/functions.py +++ b/core/harambe_core/parser/expression/functions.py @@ -1,6 +1,6 @@ from typing import Any -from slugify import slugify as python_slugify +# from slugify import slugify as python_slugify from harambe_core.parser.expression.evaluator import ExpressionEvaluator diff --git a/core/harambe_core/parser/parser.py b/core/harambe_core/parser/parser.py index f929562..974dadf 100644 --- a/core/harambe_core/parser/parser.py +++ b/core/harambe_core/parser/parser.py @@ -51,10 +51,6 @@ def validate(self, data: dict[str, Any], base_url: str) -> dict[str, Any]: self.field_types = self._get_field_types(base_url) model = self._schema_to_pydantic_model(self.schema) - if self._all_fields_empty(data): - raise SchemaValidationError( - message="All fields are null or empty.", - ) try: res = model(**data).model_dump() if self._pk_expression: @@ -245,25 +241,6 @@ def _get_type(self, field: SchemaFieldType, required: bool | None) -> Type[Any]: field_type = Optional[field_type] return field_type - def _all_fields_empty(self, data: dict[str, Any]) -> bool: - """ - Recursively check if all fields in the data are either None or empty. - This includes handling nested dictionaries and lists. - """ - - def is_empty(value: Any) -> bool: - if value is None: - return True - if isinstance(value, dict): - return all(is_empty(v) for v in value.values()) - if isinstance(value, list): - return all(is_empty(v) for v in value) - if isinstance(value, str): - return not value.strip() - return False - - return all(is_empty(value) for value in data.values()) - def base_model_factory( config: ConfigDict, computed_fields: dict[str, str], evaluator: ExpressionEvaluator @@ -304,6 +281,32 @@ def evaluate_computed_fields(self) -> Self: for field, expression in computed_fields.items(): res = evaluator.evaluate(expression, self) setattr(self, field, res) + + if _all_fields_empty(self.model_dump()): + raise SchemaValidationError( + message="All fields are null or empty.", + ) + return self return PreValidatedBaseModel + + +def _all_fields_empty(data: dict[str, Any]) -> bool: + """ + Recursively check if all fields in the data are either None or empty. + This includes handling nested dictionaries and lists. + """ + + def is_empty(value: Any) -> bool: + if value is None: + return True + if isinstance(value, dict): + return all(is_empty(v) for v in value.values()) + if isinstance(value, list): + return all(is_empty(v) for v in value) + if isinstance(value, str): + return not value.strip() + return False + + return all(is_empty(value) for value in data.values()) diff --git a/core/pyproject.toml b/core/pyproject.toml index 9b11629..0c4e28d 100644 --- a/core/pyproject.toml +++ b/core/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "harambe-core" -version = "0.50.0" +version = "0.50.1" description = "Core types for harambe SDK 🐒🍌" authors = [ { name = "Adam Watkins", email = "adam@reworkd.ai" } diff --git a/core/test/parser/test_null_values.py b/core/test/parser/test_null_values.py index 38a87af..0e62d0a 100644 --- a/core/test/parser/test_null_values.py +++ b/core/test/parser/test_null_values.py @@ -118,7 +118,7 @@ def test_pydantic_schema_validation_error_fail(data: Dict[str, Any]) -> None: "code_type": "", "code": "", "code_description": "", - "description": "", + "description": "Somthing", } ], }, @@ -127,3 +127,59 @@ def test_pydantic_schema_validation_error_fail(data: Dict[str, Any]) -> None: def test_pydantic_schema_validation_success(data: Dict[str, Any]): validator = SchemaParser(government_contracts) validator.validate(data, base_url="http://example.com") + + +@pytest.mark.parametrize( + "data", + [ + {"group": "Team", "members": [{}]}, + { + "group": "Team", + "members": [ + { + "name": "", + "age": None, + }, + ], + }, + ], +) +def test_with_emtpy_objects(data): + schema = { + "group": {"type": "string"}, + "members": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "integer"}, + }, + }, + }, + } + + with pytest.raises(SchemaValidationError): + validator = SchemaParser(schema) + validator.validate(data, base_url="http://example.com") + + +@pytest.mark.parametrize( + "strings", + [ + ( + ["", None, None], + [None, None], + ["a", " "], + ["a", "b", "c", ""], + ) + ], +) +def test_with_empty_literals(strings): + schema = { + "strings": {"type": "array", "items": {"type": "integer"}}, + } + + with pytest.raises(SchemaValidationError): + validator = SchemaParser(schema) + validator.validate({"strings": strings}, base_url="http://example.com") diff --git a/core/test/parser/test_parser.py b/core/test/parser/test_parser.py index 08dfe37..fc21250 100644 --- a/core/test/parser/test_parser.py +++ b/core/test/parser/test_parser.py @@ -37,8 +37,8 @@ ( load_schema("contact"), { - "name": {"first_name": None, "last_name": None}, - "address": {"street": None, "city": None, "zip": None}, + "name": {"first_name": "Adam", "last_name": None}, + "address": {"street": None, "city": None, "zip": "9104"}, "phone_numbers": [{"type": "mobile", "number": "+1 (628) 555-3456"}], }, ), diff --git a/sdk/pyproject.toml b/sdk/pyproject.toml index aeeac41..07cdc64 100644 --- a/sdk/pyproject.toml +++ b/sdk/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "harambe-sdk" -version = "0.50.0" +version = "0.50.1" description = "Data extraction SDK for Playwright 🐒🍌" authors = [ { name = "Adam Watkins", email = "adam@reworkd.ai" } @@ -8,7 +8,7 @@ authors = [ requires-python = ">=3.11,<4.0" readme = "README.md" dependencies = [ - "harambe_core==0.50.0", + "harambe_core==0.50.1", "pydantic==2.9.2", "playwright==1.47.0", "setuptools==73.0.0", diff --git a/sdk/test/test_e2e.py b/sdk/test/test_e2e.py index 48de12a..e345cde 100644 --- a/sdk/test/test_e2e.py +++ b/sdk/test/test_e2e.py @@ -249,10 +249,10 @@ async def scraper(sdk: SDK, *args, **kwargs): assert observer.data[0]["page_content"] == observer.data[1]["table_content"] for text in ["Apple", "Orange", "Banana"]: assert ( - text in observer.data[0]["page_content"] + text in observer.data[0]["page_content"] ), f"{text} not in {observer.data[0]['page_content']}" assert ( - text in observer.data[1]["table_content"] + text in observer.data[1]["table_content"] ), f"{text} not in {observer.data[1]['table_content']}" @@ -361,13 +361,13 @@ async def scraper(sdk: SDK, *args, **kwargs): ({"key1": "value1", "key2": 2}, '{"key1": "value1", "key2": 2}'), # Nested structure ( - {"list": [1, 2, {"nested": "value"}]}, - '{"list": [1, 2, {"nested": "value"}]}', + {"list": [1, 2, {"nested": "value"}]}, + '{"list": [1, 2, {"nested": "value"}]}', ), ], ) async def test_load_local_storage( - server, observer, harness, test_value, expected_value + server, observer, harness, test_value, expected_value ): local_storage_entry_1 = { "domain": "asim-shrestha.com", @@ -541,8 +541,8 @@ async def scrape(sdk: SDK, url, context) -> None: assert observer.data[0]["solicitation_id"] == "6100062375" assert observer.data[0]["title"] == "23SW SGL 111 Conn Road" assert ( - observer.data[0]["description"] - == "The State of Pennsylvania is seeking proposals for IT services" + observer.data[0]["description"] + == "The State of Pennsylvania is seeking proposals for IT services" ) assert observer.data[0]["status"] == "Open" assert len(observer.data[0]["attachments"]) == 4 diff --git a/sdk/uv.lock b/sdk/uv.lock index 2073d14..1f75869 100644 --- a/sdk/uv.lock +++ b/sdk/uv.lock @@ -461,7 +461,7 @@ dev = [ [[package]] name = "harambe-sdk" -version = "0.50.0" +version = "0.45.2" source = { virtual = "." } dependencies = [ { name = "aiohttp" }, From 544d6825f3a927a485fd20ceb9351a7ca3015e07 Mon Sep 17 00:00:00 2001 From: Adam Watkins Date: Wed, 13 Nov 2024 12:19:48 -0800 Subject: [PATCH 2/5] Update core/harambe_core/parser/expression/functions.py --- core/harambe_core/parser/expression/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/harambe_core/parser/expression/functions.py b/core/harambe_core/parser/expression/functions.py index 6b3bb5f..5f6377a 100644 --- a/core/harambe_core/parser/expression/functions.py +++ b/core/harambe_core/parser/expression/functions.py @@ -1,6 +1,6 @@ from typing import Any -# from slugify import slugify as python_slugify +from slugify import slugify as python_slugify from harambe_core.parser.expression.evaluator import ExpressionEvaluator From 706755fca3f710b30aec927749cb874ce4458f31 Mon Sep 17 00:00:00 2001 From: awtkns <32209255+awtkns@users.noreply.github.com> Date: Wed, 13 Nov 2024 12:24:07 -0800 Subject: [PATCH 3/5] =?UTF-8?q?=F0=9F=90=9B=20Fix=20formatting?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- core/uv.lock | 2 +- sdk/test/test_e2e.py | 14 +++++++------- sdk/uv.lock | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/core/uv.lock b/core/uv.lock index 06f920c..c5889c0 100644 --- a/core/uv.lock +++ b/core/uv.lock @@ -141,7 +141,7 @@ wheels = [ [[package]] name = "harambe-core" -version = "0.50.0" +version = "0.50.1" source = { virtual = "." } dependencies = [ { name = "dateparser" }, diff --git a/sdk/test/test_e2e.py b/sdk/test/test_e2e.py index e345cde..48de12a 100644 --- a/sdk/test/test_e2e.py +++ b/sdk/test/test_e2e.py @@ -249,10 +249,10 @@ async def scraper(sdk: SDK, *args, **kwargs): assert observer.data[0]["page_content"] == observer.data[1]["table_content"] for text in ["Apple", "Orange", "Banana"]: assert ( - text in observer.data[0]["page_content"] + text in observer.data[0]["page_content"] ), f"{text} not in {observer.data[0]['page_content']}" assert ( - text in observer.data[1]["table_content"] + text in observer.data[1]["table_content"] ), f"{text} not in {observer.data[1]['table_content']}" @@ -361,13 +361,13 @@ async def scraper(sdk: SDK, *args, **kwargs): ({"key1": "value1", "key2": 2}, '{"key1": "value1", "key2": 2}'), # Nested structure ( - {"list": [1, 2, {"nested": "value"}]}, - '{"list": [1, 2, {"nested": "value"}]}', + {"list": [1, 2, {"nested": "value"}]}, + '{"list": [1, 2, {"nested": "value"}]}', ), ], ) async def test_load_local_storage( - server, observer, harness, test_value, expected_value + server, observer, harness, test_value, expected_value ): local_storage_entry_1 = { "domain": "asim-shrestha.com", @@ -541,8 +541,8 @@ async def scrape(sdk: SDK, url, context) -> None: assert observer.data[0]["solicitation_id"] == "6100062375" assert observer.data[0]["title"] == "23SW SGL 111 Conn Road" assert ( - observer.data[0]["description"] - == "The State of Pennsylvania is seeking proposals for IT services" + observer.data[0]["description"] + == "The State of Pennsylvania is seeking proposals for IT services" ) assert observer.data[0]["status"] == "Open" assert len(observer.data[0]["attachments"]) == 4 diff --git a/sdk/uv.lock b/sdk/uv.lock index 1f75869..5194ec9 100644 --- a/sdk/uv.lock +++ b/sdk/uv.lock @@ -428,7 +428,7 @@ wheels = [ [[package]] name = "harambe-core" -version = "0.50.0" +version = "0.50.1" source = { editable = "../core" } dependencies = [ { name = "dateparser" }, @@ -461,7 +461,7 @@ dev = [ [[package]] name = "harambe-sdk" -version = "0.45.2" +version = "0.50.1" source = { virtual = "." } dependencies = [ { name = "aiohttp" }, From 3954ec1e59de2bbfffd4cbcfecd477a91f2e1308 Mon Sep 17 00:00:00 2001 From: asim <50181239+asim-shrestha@users.noreply.github.com> Date: Wed, 13 Nov 2024 12:25:16 -0800 Subject: [PATCH 4/5] Update core/test/parser/test_null_values.py --- core/test/parser/test_null_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/test/parser/test_null_values.py b/core/test/parser/test_null_values.py index 0e62d0a..6709c43 100644 --- a/core/test/parser/test_null_values.py +++ b/core/test/parser/test_null_values.py @@ -118,7 +118,7 @@ def test_pydantic_schema_validation_error_fail(data: Dict[str, Any]) -> None: "code_type": "", "code": "", "code_description": "", - "description": "Somthing", + "description": "Something", } ], }, From 8ff1fce52bbdac817159cda7532f28386ce2b416 Mon Sep 17 00:00:00 2001 From: awtkns <32209255+awtkns@users.noreply.github.com> Date: Wed, 13 Nov 2024 12:28:41 -0800 Subject: [PATCH 5/5] =?UTF-8?q?=F0=9F=AB=A1=20Add=20another=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- core/test/parser/test_null_values.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/core/test/parser/test_null_values.py b/core/test/parser/test_null_values.py index 0e62d0a..3f53765 100644 --- a/core/test/parser/test_null_values.py +++ b/core/test/parser/test_null_values.py @@ -142,6 +142,19 @@ def test_pydantic_schema_validation_success(data: Dict[str, Any]): }, ], }, + { + "group": "Team", + "members": [ + { + "name": "Adam", + "age": 29, + }, + { + "name": "", + "age": None, + }, + ], + }, ], ) def test_with_emtpy_objects(data):