From be898250e39ddb5954f3cfdef8e2b6093a4064c2 Mon Sep 17 00:00:00 2001 From: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> Date: Wed, 29 Nov 2023 17:16:39 +0100 Subject: [PATCH] Update `ElasticSearchDocumentStore` to use latest `haystack-ai` version (#63) * Update haystack version * Update imports * Support new filters * Update ElasticSearchDocumentStore * Update tests * Fix corner cases when filter value is None * Convert legacy filters if used * Fix linting --- integrations/elasticsearch/pyproject.toml | 4 +- .../elasticsearch_haystack/bm25_retriever.py | 4 +- .../elasticsearch_haystack/document_store.py | 105 +----- .../embedding_retriever.py | 4 +- .../src/elasticsearch_haystack/filters.py | 307 ++++++++++++------ .../tests/test_bm25_retriever.py | 10 +- .../tests/test_document_store.py | 229 ++++--------- .../tests/test_embedding_retriever.py | 10 +- .../elasticsearch/tests/test_filters.py | 258 +++++++-------- 9 files changed, 428 insertions(+), 503 deletions(-) diff --git a/integrations/elasticsearch/pyproject.toml b/integrations/elasticsearch/pyproject.toml index c54be02f2..f67e9cc35 100644 --- a/integrations/elasticsearch/pyproject.toml +++ b/integrations/elasticsearch/pyproject.toml @@ -24,10 +24,8 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ - # we distribute the preview version of Haystack 2.0 under the package "haystack-ai" - "haystack-ai==0.143.0", + "haystack-ai", "elasticsearch>=8,<9", - "typing_extensions", # This is not a direct dependency, but `haystack-ai` is missing it cause `canals` is missing it ] [project.urls] diff --git a/integrations/elasticsearch/src/elasticsearch_haystack/bm25_retriever.py b/integrations/elasticsearch/src/elasticsearch_haystack/bm25_retriever.py index 017860a9a..804e8db15 100644 --- a/integrations/elasticsearch/src/elasticsearch_haystack/bm25_retriever.py +++ b/integrations/elasticsearch/src/elasticsearch_haystack/bm25_retriever.py @@ -3,8 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 from typing import Any, Dict, List, Optional -from haystack.preview import component, default_from_dict, default_to_dict -from haystack.preview.dataclasses import Document +from haystack import component, default_from_dict, default_to_dict +from haystack.dataclasses import Document from elasticsearch_haystack.document_store import ElasticsearchDocumentStore diff --git a/integrations/elasticsearch/src/elasticsearch_haystack/document_store.py b/integrations/elasticsearch/src/elasticsearch_haystack/document_store.py index d131cdf01..6dae14341 100644 --- a/integrations/elasticsearch/src/elasticsearch_haystack/document_store.py +++ b/integrations/elasticsearch/src/elasticsearch_haystack/document_store.py @@ -9,11 +9,10 @@ # There are no import stubs for elastic_transport and elasticsearch so mypy fails from elastic_transport import NodeConfig # type: ignore[import-not-found] from elasticsearch import Elasticsearch, helpers # type: ignore[import-not-found] -from haystack.preview import default_from_dict, default_to_dict -from haystack.preview.dataclasses import Document -from haystack.preview.document_stores.decorator import document_store -from haystack.preview.document_stores.errors import DocumentStoreError, DuplicateDocumentError -from haystack.preview.document_stores.protocols import DuplicatePolicy +from haystack import default_from_dict, default_to_dict +from haystack.dataclasses import Document +from haystack.document_stores import DocumentStoreError, DuplicateDocumentError, DuplicatePolicy, 
document_store +from haystack.utils.filters import convert from elasticsearch_haystack.filters import _normalize_filters @@ -130,103 +129,29 @@ def _search_documents(self, **kwargs) -> List[Document]: return documents def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]: - """ - Returns the documents that match the filters provided. - - Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical operator (`"$and"`, - `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `$ne`, `"$in"`, `$nin`, `"$gt"`, `"$gte"`, `"$lt"`, - `"$lte"`) or a metadata field name. - - Logical operator keys take a dictionary of metadata field names and/or logical operators as value. Metadata - field names take a dictionary of comparison operators as value. Comparison operator keys take a single value or - (in case of `"$in"`) a list of values as value. If no logical operator is provided, `"$and"` is used as default - operation. If no comparison operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used - as default operation. - - Example: - - ```python - filters = { - "$and": { - "type": {"$eq": "article"}, - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - "$or": { - "genre": {"$in": ["economy", "politics"]}, - "publisher": {"$eq": "nytimes"} - } - } - } - # or simpler using default operators - filters = { - "type": "article", - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - "$or": { - "genre": ["economy", "politics"], - "publisher": "nytimes" - } - } - ``` - - To use the same logical operator multiple times on the same level, logical operators can take a list of - dictionaries as value. - - Example: - - ```python - filters = { - "$or": [ - { - "$and": { - "Type": "News Paper", - "Date": { - "$lt": "2019-01-01" - } - } - }, - { - "$and": { - "Type": "Blog Post", - "Date": { - "$gte": "2019-01-01" - } - } - } - ] - } - ``` + if filters and "operator" not in filters and "conditions" not in filters: + filters = convert(filters) - :param filters: the filters to apply to the document list. - :return: a list of Documents that match the given filters. - """ query = {"bool": {"filter": _normalize_filters(filters)}} if filters else None documents = self._search_documents(query=query) return documents - def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.FAIL) -> None: + def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int: """ - Writes (or overwrites) documents into the store. - - :param documents: a list of documents. - :param policy: documents with the same ID count as duplicates. When duplicates are met, - the store can: - - skip: keep the existing document and ignore the new one. - - overwrite: remove the old document and write the new one. - - fail: an error is raised - - :raises ValueError: if 'documents' parameter is not a list of Document objects - :raises DuplicateDocumentError: Exception trigger on duplicate document if `policy=DuplicatePolicy.FAIL` - :raises DocumentStoreError: Exception trigger on any other error when writing documents - :return: None + Writes Documents to Elasticsearch. + If policy is not specified or set to DuplicatePolicy.NONE, it will raise an exception if a document with the + same ID already exists in the document store. 
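+ + :param documents: List of Documents to write to the document store. + :param policy: Policy to determine how duplicates are handled. + :raises ValueError: If `documents` is not a list of Document objects. + :raises DuplicateDocumentError: If a document with the same ID already exists and policy is DuplicatePolicy.FAIL (or DuplicatePolicy.NONE, which behaves the same). + :raises DocumentStoreError: If any other error occurs when writing documents. + :return: Number of documents written to Elasticsearch.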
""" if len(documents) > 0: if not isinstance(documents[0], Document): msg = "param 'documents' must contain a list of objects of type Document" raise ValueError(msg) + if policy == DuplicatePolicy.NONE: + policy = DuplicatePolicy.FAIL + action = "index" if policy == DuplicatePolicy.OVERWRITE else "create" - _, errors = helpers.bulk( + documents_written, errors = helpers.bulk( client=self._client, actions=( { @@ -262,6 +187,8 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D msg = f"Failed to write documents to Elasticsearch. Errors:\n{other_errors}" raise DocumentStoreError(msg) + return documents_written + def _deserialize_document(self, hit: Dict[str, Any]) -> Document: """ Creates a Document from the search hit provided. diff --git a/integrations/elasticsearch/src/elasticsearch_haystack/embedding_retriever.py b/integrations/elasticsearch/src/elasticsearch_haystack/embedding_retriever.py index 3bb4576ec..2aaba382d 100644 --- a/integrations/elasticsearch/src/elasticsearch_haystack/embedding_retriever.py +++ b/integrations/elasticsearch/src/elasticsearch_haystack/embedding_retriever.py @@ -3,8 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 from typing import Any, Dict, List, Optional -from haystack.preview import component, default_from_dict, default_to_dict -from haystack.preview.dataclasses import Document +from haystack import component, default_from_dict, default_to_dict +from haystack.dataclasses import Document from elasticsearch_haystack.document_store import ElasticsearchDocumentStore diff --git a/integrations/elasticsearch/src/elasticsearch_haystack/filters.py b/integrations/elasticsearch/src/elasticsearch_haystack/filters.py index 78adae585..bb5b15311 100644 --- a/integrations/elasticsearch/src/elasticsearch_haystack/filters.py +++ b/integrations/elasticsearch/src/elasticsearch_haystack/filters.py @@ -1,123 +1,218 @@ -from typing import Any, Dict, List, Union +from datetime import datetime +from typing import Any, Dict, List -from haystack.preview.errors import FilterError +from haystack.errors import FilterError from pandas import DataFrame -def _normalize_filters(filters: Union[List[Dict], Dict], logical_condition="") -> Dict[str, Any]: +def _normalize_filters(filters: Dict[str, Any]) -> Dict[str, Any]: """ Converts Haystack filters in ElasticSearch compatible filters. 
""" - if not isinstance(filters, dict) and not isinstance(filters, list): - msg = "Filters must be either a dictionary or a list" + if not isinstance(filters, dict): + msg = "Filters must be a dictionary" raise FilterError(msg) - conditions = [] - if isinstance(filters, dict): - filters = [filters] - for filter_ in filters: - for operator, value in filter_.items(): - if operator in ["$not", "$and", "$or"]: - # Logical operators - conditions.append(_normalize_filters(value, operator)) - else: - # Comparison operators - conditions.extend(_parse_comparison(operator, value)) + if "field" in filters: + return {"bool": {"must": _parse_comparison_condition(filters)}} + return _parse_logical_condition(filters) + + +def _parse_logical_condition(condition: Dict[str, Any]) -> Dict[str, Any]: + if "operator" not in condition: + msg = f"'operator' key missing in {condition}" + raise FilterError(msg) + if "conditions" not in condition: + msg = f"'conditions' key missing in {condition}" + raise FilterError(msg) + + operator = condition["operator"] + conditions = [_parse_comparison_condition(c) for c in condition["conditions"]] if len(conditions) > 1: conditions = _normalize_ranges(conditions) + if operator == "AND": + return {"bool": {"must": conditions}} + elif operator == "OR": + return {"bool": {"should": conditions}} + elif operator == "NOT": + return {"bool": {"must_not": [{"bool": {"must": conditions}}]}} else: - # mypy is complaining we're assigning a dict to a list of dicts. - # We're ok with this as we're returning right after this. - conditions = conditions[0] # type: ignore[assignment] + msg = f"Unknown logical operator '{operator}'" + raise FilterError(msg) - if logical_condition == "$not": - return {"bool": {"must_not": conditions}} - elif logical_condition == "$or": - return {"bool": {"should": conditions}} - # If no logical condition is specified we default to "$and" - return {"bool": {"must": conditions}} - - -def _parse_comparison(field: str, comparison: Union[Dict, List, str, float]) -> List: - result: List[Dict[str, Any]] = [] - if isinstance(comparison, dict): - for comparator, val in comparison.items(): - if isinstance(val, DataFrame): - # Ruff is complaining we're overriding the loop variable `var` - # but we actually want to override it. So we ignore the error. 
- val = val.to_json() # noqa: PLW2901 - if comparator == "$eq": - if isinstance(val, list): - result.append( - { - "terms_set": { - field: { - "terms": val, - "minimum_should_match_script": { - "source": f"Math.max(params.num_terms, doc['{field}'].size())" - }, - } - } - } - ) - result.append({"term": {field: val}}) - elif comparator == "$ne": - if isinstance(val, list): - result.append({"bool": {"must_not": {"terms": {field: val}}}}) - else: - result.append( - {"bool": {"must_not": {"match": {field: {"query": val, "minimum_should_match": "100%"}}}}} - ) - elif comparator == "$in": - if not isinstance(val, list): - msg = f"{field}'s value must be a list when using '{comparator}' comparator" - raise FilterError(msg) - result.append({"terms": {field: val}}) - elif comparator == "$nin": - if not isinstance(val, list): - msg = f"{field}'s value must be a list when using '{comparator}' comparator" - raise FilterError(msg) - result.append({"bool": {"must_not": {"terms": {field: val}}}}) - elif comparator in ["$gt", "$gte", "$lt", "$lte"]: - if not isinstance(val, str) and not isinstance(val, int) and not isinstance(val, float): - msg = f"{field}'s value must be 'str', 'int', 'float' types when using '{comparator}' comparator" - raise FilterError(msg) - result.append({"range": {field: {comparator[1:]: val}}}) - elif comparator in ["$not", "$or"]: - if isinstance(val, list): - # This handles corner cases like this: - # `{"name": {"$or": [{"$eq": "name_0"}, {"$eq": "name_1"}]}}` - # If we don't handle it like this we'd lose the "name" field and the - # generated query would be wrong and return unexpected results. - comparisons = [_parse_comparison(field, v)[0] for v in val] - if comparator == "$not": - result.append({"bool": {"must_not": comparisons}}) - elif comparator == "$or": - result.append({"bool": {"should": comparisons}}) - else: - result.append(_normalize_filters(val, comparator)) - elif comparator == "$and" and isinstance(val, list): - # We're assuming there are no duplicate items in the list - flat_filters = {k: v for d in val for k, v in d.items()} - result.extend(_parse_comparison(field, flat_filters)) - elif comparator == "$and": - result.append(_normalize_filters({field: val}, comparator)) - else: - msg = f"Unknown comparator '{comparator}'" - raise FilterError(msg) - elif isinstance(comparison, list): - result.append({"terms": {field: comparison}}) - elif isinstance(comparison, DataFrame): - result.append({"match": {field: {"query": comparison.to_json(), "minimum_should_match": "100%"}}}) - elif isinstance(comparison, str): - # We can't use "term" for text fields as ElasticSearch changes the value of text. - # More info here: - # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-term-query.html#query-dsl-term-query - result.append({"match": {field: {"query": comparison, "minimum_should_match": "100%"}}}) - else: - result.append({"term": {field: comparison}}) - return result +def _equal(field: str, value: Any) -> Dict[str, Any]: + if value is None: + return {"bool": {"must_not": {"exists": {"field": field}}}} + + if isinstance(value, list): + return { + "terms_set": { + field: { + "terms": value, + "minimum_should_match_script": {"source": f"Math.max(params.num_terms, doc['{field}'].size())"}, + } + } + } + if field in ["text", "dataframe"]: + # We want to fully match the text field. 
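+ # We can't use "term" for text fields as ElasticSearch changes the value of text. + # More info here: + # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-term-query.html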
+ return {"match": {field: {"query": value, "minimum_should_match": "100%"}}} + return {"term": {field: value}} + + +def _not_equal(field: str, value: Any) -> Dict[str, Any]: + if value is None: + return {"exists": {"field": field}} + + if isinstance(value, list): + return {"bool": {"must_not": {"terms": {field: value}}}} + if field in ["text", "dataframe"]: + # We want to fully match the text field. + return {"bool": {"must_not": {"match": {field: {"query": value, "minimum_should_match": "100%"}}}}} + + return {"bool": {"must_not": {"term": {field: value}}}} + + +def _greater_than(field: str, value: Any) -> Dict[str, Any]: + if value is None: + # When the value is None and '>' is used we create a filter that would return a Document + # if it has a field set and not set at the same time. + # This will cause the filter to match no Document. + # This way we keep the behavior consistent with other Document Stores. + return {"bool": {"must": [{"exists": {"field": field}}, {"bool": {"must_not": {"exists": {"field": field}}}}]}} + if isinstance(value, str): + try: + datetime.fromisoformat(value) + except (ValueError, TypeError) as exc: + msg = ( + "Can't compare strings using operators '>', '>=', '<', '<='. " + "Strings are only comparable if they are ISO formatted dates." + ) + raise FilterError(msg) from exc + if type(value) in [list, DataFrame]: + msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='" + raise FilterError(msg) + return {"range": {field: {"gt": value}}} + + +def _greater_than_equal(field: str, value: Any) -> Dict[str, Any]: + if value is None: + # When the value is None and '>=' is used we create a filter that would return a Document + # if it has a field set and not set at the same time. + # This will cause the filter to match no Document. + # This way we keep the behavior consistent with other Document Stores. + return {"bool": {"must": [{"exists": {"field": field}}, {"bool": {"must_not": {"exists": {"field": field}}}}]}} + if isinstance(value, str): + try: + datetime.fromisoformat(value) + except (ValueError, TypeError) as exc: + msg = ( + "Can't compare strings using operators '>', '>=', '<', '<='. " + "Strings are only comparable if they are ISO formatted dates." + ) + raise FilterError(msg) from exc + if type(value) in [list, DataFrame]: + msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='" + raise FilterError(msg) + return {"range": {field: {"gte": value}}} + + +def _less_than(field: str, value: Any) -> Dict[str, Any]: + if value is None: + # When the value is None and '<' is used we create a filter that would return a Document + # if it has a field set and not set at the same time. + # This will cause the filter to match no Document. + # This way we keep the behavior consistent with other Document Stores. + return {"bool": {"must": [{"exists": {"field": field}}, {"bool": {"must_not": {"exists": {"field": field}}}}]}} + if isinstance(value, str): + try: + datetime.fromisoformat(value) + except (ValueError, TypeError) as exc: + msg = ( + "Can't compare strings using operators '>', '>=', '<', '<='. " + "Strings are only comparable if they are ISO formatted dates." 
+ ) + raise FilterError(msg) from exc + if type(value) in [list, DataFrame]: + msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='" + raise FilterError(msg) + return {"range": {field: {"lt": value}}} + + +def _less_than_equal(field: str, value: Any) -> Dict[str, Any]: + if value is None: + # When the value is None and '<=' is used we create a filter that would return a Document + # if it has a field set and not set at the same time. + # This will cause the filter to match no Document. + # This way we keep the behavior consistent with other Document Stores. + return {"bool": {"must": [{"exists": {"field": field}}, {"bool": {"must_not": {"exists": {"field": field}}}}]}} + if isinstance(value, str): + try: + datetime.fromisoformat(value) + except (ValueError, TypeError) as exc: + msg = ( + "Can't compare strings using operators '>', '>=', '<', '<='. " + "Strings are only comparable if they are ISO formatted dates." + ) + raise FilterError(msg) from exc + if type(value) in [list, DataFrame]: + msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='" + raise FilterError(msg) + return {"range": {field: {"lte": value}}} + + +def _in(field: str, value: Any) -> Dict[str, Any]: + if not isinstance(value, list): + msg = f"{field}'s value must be a list when using 'in' or 'not in' comparators" + raise FilterError(msg) + return {"terms": {field: value}} + + +def _not_in(field: str, value: Any) -> Dict[str, Any]: + if not isinstance(value, list): + msg = f"{field}'s value must be a list when using 'in' or 'not in' comparators" + raise FilterError(msg) + return {"bool": {"must_not": {"terms": {field: value}}}} + + +COMPARISON_OPERATORS = { + "==": _equal, + "!=": _not_equal, + ">": _greater_than, + ">=": _greater_than_equal, + "<": _less_than, + "<=": _less_than_equal, + "in": _in, + "not in": _not_in, +} + + +def _parse_comparison_condition(condition: Dict[str, Any]) -> Dict[str, Any]: + if "field" not in condition: + # 'field' key is only found in comparison dictionaries. + # We assume this is a logic dictionary since it's not present. + return _parse_logical_condition(condition) + field: str = condition["field"] + + if field.startswith("meta."): + # Remove the "meta." prefix if present. + # Documents are flattened when using the ElasticSearchDocumentStore + # so we don't need to specify the "meta." prefix. + # Instead of raising an error we handle it gracefully. 
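+ # For example "meta.genre" becomes just "genre".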
+ field = field[5:] + + if "operator" not in condition: + msg = f"'operator' key missing in {condition}" + raise FilterError(msg) + if "value" not in condition: + msg = f"'value' key missing in {condition}" + raise FilterError(msg) + operator: str = condition["operator"] + value: Any = condition["value"] + if isinstance(value, DataFrame): + value = value.to_json() + + return COMPARISON_OPERATORS[operator](field, value) def _normalize_ranges(conditions: List[Dict[str, Any]]) -> List[Dict[str, Any]]: diff --git a/integrations/elasticsearch/tests/test_bm25_retriever.py b/integrations/elasticsearch/tests/test_bm25_retriever.py index 9139368d9..8f19c8897 100644 --- a/integrations/elasticsearch/tests/test_bm25_retriever.py +++ b/integrations/elasticsearch/tests/test_bm25_retriever.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from unittest.mock import Mock, patch -from haystack.preview.dataclasses import Document +from haystack.dataclasses import Document from elasticsearch_haystack.bm25_retriever import ElasticsearchBM25Retriever from elasticsearch_haystack.document_store import ElasticsearchDocumentStore @@ -24,7 +24,7 @@ def test_to_dict(_mock_elasticsearch_client): retriever = ElasticsearchBM25Retriever(document_store=document_store) res = retriever.to_dict() assert res == { - "type": "ElasticsearchBM25Retriever", + "type": "elasticsearch_haystack.bm25_retriever.ElasticsearchBM25Retriever", "init_parameters": { "document_store": { "init_parameters": { @@ -32,7 +32,7 @@ def test_to_dict(_mock_elasticsearch_client): "index": "default", "embedding_similarity_function": "cosine", }, - "type": "ElasticsearchDocumentStore", + "type": "elasticsearch_haystack.document_store.ElasticsearchDocumentStore", }, "filters": {}, "fuzziness": "AUTO", @@ -45,11 +45,11 @@ def test_to_dict(_mock_elasticsearch_client): @patch("elasticsearch_haystack.document_store.Elasticsearch") def test_from_dict(_mock_elasticsearch_client): data = { - "type": "ElasticsearchBM25Retriever", + "type": "elasticsearch_haystack.bm25_retriever.ElasticsearchBM25Retriever", "init_parameters": { "document_store": { "init_parameters": {"hosts": "some fake host", "index": "default"}, - "type": "ElasticsearchDocumentStore", + "type": "elasticsearch_haystack.document_store.ElasticsearchDocumentStore", }, "filters": {}, "fuzziness": "AUTO", diff --git a/integrations/elasticsearch/tests/test_document_store.py b/integrations/elasticsearch/tests/test_document_store.py index e71603126..d6428e762 100644 --- a/integrations/elasticsearch/tests/test_document_store.py +++ b/integrations/elasticsearch/tests/test_document_store.py @@ -6,13 +6,12 @@ from typing import List from unittest.mock import patch -import pandas as pd import pytest from elasticsearch.exceptions import BadRequestError # type: ignore[import-not-found] -from haystack.preview.dataclasses.document import Document -from haystack.preview.document_stores.errors import DocumentStoreError, DuplicateDocumentError -from haystack.preview.document_stores.protocols import DuplicatePolicy -from haystack.preview.testing.document_store import DocumentStoreBaseTests +from haystack.dataclasses.document import Document +from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError +from haystack.document_stores.protocols import DuplicatePolicy +from haystack.testing.document_store import DocumentStoreBaseTests from elasticsearch_haystack.document_store import ElasticsearchDocumentStore @@ -24,7 +23,7 @@ class TestDocumentStore(DocumentStoreBaseTests): """ 
@pytest.fixture - def docstore(self, request): + def document_store(self, request): """ This is the most basic requirement for the child class: provide an instance of this document store so the base class can use it. @@ -43,12 +42,37 @@ def docstore(self, request): yield store store._client.options(ignore_status=[400, 404]).indices.delete(index=index) + def assert_documents_are_equal(self, received: List[Document], expected: List[Document]): + """ + The ElasticsearchDocumentStore.filter_documents() method returns Documents with their score set. + We don't want to compare the score, so we set it to None before comparing the documents. + """ + received_meta = [] + for doc in received: + r = { + "number": doc.meta.get("number"), + "name": doc.meta.get("name"), + } + received_meta.append(r) + + expected_meta = [] + for doc in expected: + r = { + "number": doc.meta.get("number"), + "name": doc.meta.get("name"), + } + expected_meta.append(r) + for doc in received: + doc.score = None + + super().assert_documents_are_equal(received, expected) + @patch("elasticsearch_haystack.document_store.Elasticsearch") def test_to_dict(self, _mock_elasticsearch_client): document_store = ElasticsearchDocumentStore(hosts="some hosts") res = document_store.to_dict() assert res == { - "type": "ElasticsearchDocumentStore", + "type": "elasticsearch_haystack.document_store.ElasticsearchDocumentStore", "init_parameters": { "hosts": "some hosts", "index": "default", @@ -59,7 +83,7 @@ def test_to_dict(self, _mock_elasticsearch_client): @patch("elasticsearch_haystack.document_store.Elasticsearch") def test_from_dict(self, _mock_elasticsearch_client): data = { - "type": "ElasticsearchDocumentStore", + "type": "elasticsearch_haystack.document_store.ElasticsearchDocumentStore", "init_parameters": { "hosts": "some hosts", "index": "default", @@ -71,8 +95,14 @@ def test_from_dict(self, _mock_elasticsearch_client): assert document_store._index == "default" assert document_store._embedding_similarity_function == "cosine" - def test_bm25_retrieval(self, docstore: ElasticsearchDocumentStore): - docstore.write_documents( + def test_write_documents(self, document_store: ElasticsearchDocumentStore): + docs = [Document(id="1")] + assert document_store.write_documents(docs) == 1 + with pytest.raises(DuplicateDocumentError): + document_store.write_documents(docs, DuplicatePolicy.FAIL) + + def test_bm25_retrieval(self, document_store: ElasticsearchDocumentStore): + document_store.write_documents( [ Document(content="Haskell is a functional programming language"), Document(content="Lisp is a functional programming language"), @@ -88,17 +118,17 @@ def test_bm25_retrieval(self, docstore: ElasticsearchDocumentStore): ] ) - res = docstore._bm25_retrieval("functional", top_k=3) + res = document_store._bm25_retrieval("functional", top_k=3) assert len(res) == 3 assert "functional" in res[0].content assert "functional" in res[1].content assert "functional" in res[2].content - def test_bm25_retrieval_pagination(self, docstore: ElasticsearchDocumentStore): + def test_bm25_retrieval_pagination(self, document_store: ElasticsearchDocumentStore): """ Test that handling of pagination works as expected, when the matching documents are > 10. 
""" - docstore.write_documents( + document_store.write_documents( [ Document(content="Haskell is a functional programming language"), Document(content="Lisp is a functional programming language"), @@ -118,12 +148,12 @@ def test_bm25_retrieval_pagination(self, docstore: ElasticsearchDocumentStore): ] ) - res = docstore._bm25_retrieval("programming", top_k=11) + res = document_store._bm25_retrieval("programming", top_k=11) assert len(res) == 11 assert all("programming" in doc.content for doc in res) - def test_bm25_retrieval_with_fuzziness(self, docstore: ElasticsearchDocumentStore): - docstore.write_documents( + def test_bm25_retrieval_with_fuzziness(self, document_store: ElasticsearchDocumentStore): + document_store.write_documents( [ Document(content="Haskell is a functional programming language"), Document(content="Lisp is a functional programming language"), @@ -141,161 +171,30 @@ def test_bm25_retrieval_with_fuzziness(self, docstore: ElasticsearchDocumentStor query_with_typo = "functinal" # Query without fuzziness to search for the exact match - res = docstore._bm25_retrieval(query_with_typo, top_k=3, fuzziness="0") + res = document_store._bm25_retrieval(query_with_typo, top_k=3, fuzziness="0") # Nothing is found as the query contains a typo assert res == [] # Query with fuzziness with the same query - res = docstore._bm25_retrieval(query_with_typo, top_k=3, fuzziness="1") + res = document_store._bm25_retrieval(query_with_typo, top_k=3, fuzziness="1") assert len(res) == 3 assert "functional" in res[0].content assert "functional" in res[1].content assert "functional" in res[2].content - def test_write_duplicate_fail(self, docstore: ElasticsearchDocumentStore): - """ - Verify `DuplicateDocumentError` is raised when trying to write duplicate files. - - `DocumentStoreBaseTests` declares this test but we override it since we return - a different error message that it expects. - """ - doc = Document(content="test doc") - docstore.write_documents([doc]) - with pytest.raises(DuplicateDocumentError): - docstore.write_documents(documents=[doc], policy=DuplicatePolicy.FAIL) - assert docstore.filter_documents(filters={"id": doc.id}) == [doc] - - def test_delete_not_empty(self, docstore: ElasticsearchDocumentStore): - """ - Verifies delete properly deletes specified document. - - `DocumentStoreBaseTests` declares this test but we override it since we - want `delete_documents` to be idempotent. - """ - doc = Document(content="test doc") - docstore.write_documents([doc]) - - docstore.delete_documents([doc.id]) - - res = docstore.filter_documents(filters={"id": doc.id}) - assert res == [] - - def test_delete_empty(self, docstore: ElasticsearchDocumentStore): - """ - Verifies delete doesn't raises when trying to delete a non-existing document. - - `DocumentStoreBaseTests` declares this test but we override it since we - want `delete_documents` to be idempotent. - """ - docstore.delete_documents(["test"]) - - def test_delete_not_empty_nonexisting(self, docstore: ElasticsearchDocumentStore): - """ - `DocumentStoreBaseTests` declares this test but we override it since we - want `delete_documents` to be idempotent. 
- """ - doc = Document(content="test doc") - docstore.write_documents([doc]) - - docstore.delete_documents(["non_existing"]) - - assert docstore.filter_documents(filters={"id": doc.id}) == [doc] - - @pytest.mark.skip(reason="Not supported") - def test_in_filter_table(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - pass - - @pytest.mark.skip(reason="Not supported") - def test_in_filter_embedding(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - pass - - @pytest.mark.skip(reason="Not supported") - def test_nin_filter_table(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - pass - - @pytest.mark.skip(reason="Not supported") - def test_nin_filter_embedding(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - pass - - @pytest.mark.skip(reason="Not supported") - def test_eq_filter_embedding(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - """ - If the embedding field is a dense vector (as expected), raise the following error: - - elasticsearch.BadRequestError: BadRequestError(400, 'search_phase_execution_exception', - "failed to create query: Field [embedding] of type [dense_vector] doesn't support term queries") - """ - pass - - @pytest.mark.skip(reason="Not supported") - def test_ne_filter_embedding(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - """ - If the embedding field is a dense vector (as expected), raise the following error: - - elasticsearch.BadRequestError: BadRequestError(400, 'search_phase_execution_exception', - "failed to create query: Field [embedding] of type [dense_vector] doesn't support term queries") - """ - pass - - def test_gt_filter_non_numeric(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - docstore.write_documents(filterable_docs) - result = docstore.filter_documents(filters={"page": {"$gt": "100"}}) - assert self.contains_same_docs( - result, [d for d in filterable_docs if "page" in d.meta and d.meta["page"] > "100"] - ) - - def test_gt_filter_table(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - docstore.write_documents(filterable_docs) - result = docstore.filter_documents(filters={"dataframe": {"$gt": pd.DataFrame([[1, 2, 3], [-1, -2, -3]])}}) - assert result == [] - - def test_gte_filter_non_numeric(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - docstore.write_documents(filterable_docs) - result = docstore.filter_documents(filters={"page": {"$gte": "100"}}) - assert self.contains_same_docs( - result, [d for d in filterable_docs if "page" in d.meta and d.meta["page"] >= "100"] - ) - - def test_gte_filter_table(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - docstore.write_documents(filterable_docs) - result = docstore.filter_documents(filters={"dataframe": {"$gte": pd.DataFrame([[1, 2, 3], [-1, -2, -3]])}}) - assert result == [] - - def test_lt_filter_non_numeric(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - docstore.write_documents(filterable_docs) - result = docstore.filter_documents(filters={"page": {"$lt": "100"}}) - assert result == [] - - def test_lt_filter_table(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - docstore.write_documents(filterable_docs) - result = docstore.filter_documents(filters={"dataframe": {"$lt": pd.DataFrame([[1, 2, 3], [-1, -2, -3]])}}) - assert 
self.contains_same_docs(result, [d for d in filterable_docs if d.dataframe is not None]) - - def test_lte_filter_non_numeric(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - docstore.write_documents(filterable_docs) - result = docstore.filter_documents(filters={"page": {"$lte": "100"}}) - assert self.contains_same_docs( - result, [d for d in filterable_docs if "page" in d.meta and d.meta["page"] <= "100"] - ) - - def test_lte_filter_table(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - docstore.write_documents(filterable_docs) - result = docstore.filter_documents(filters={"dataframe": {"$lte": pd.DataFrame([[1, 2, 3], [-1, -2, -3]])}}) - assert self.contains_same_docs(result, [d for d in filterable_docs if d.dataframe is not None]) - - def test_embedding_retrieval(self, docstore: ElasticsearchDocumentStore): + def test_embedding_retrieval(self, document_store: ElasticsearchDocumentStore): docs = [ Document(content="Most similar document", embedding=[1.0, 1.0, 1.0, 1.0]), Document(content="2nd best document", embedding=[0.8, 0.8, 0.8, 1.0]), Document(content="Not very similar document", embedding=[0.0, 0.8, 0.3, 0.9]), ] - docstore.write_documents(docs) - results = docstore._embedding_retrieval(query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=2, filters={}) + document_store.write_documents(docs) + results = document_store._embedding_retrieval(query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=2, filters={}) assert len(results) == 2 assert results[0].content == "Most similar document" assert results[1].content == "2nd best document" - def test_embedding_retrieval_w_filters(self, docstore: ElasticsearchDocumentStore): + def test_embedding_retrieval_with_filters(self, document_store: ElasticsearchDocumentStore): docs = [ Document(content="Most similar document", embedding=[1.0, 1.0, 1.0, 1.0]), Document(content="2nd best document", embedding=[0.8, 0.8, 0.8, 1.0]), @@ -305,14 +204,14 @@ def test_embedding_retrieval_w_filters(self, docstore: ElasticsearchDocumentStor meta={"meta_field": "custom_value"}, ), ] - docstore.write_documents(docs) + document_store.write_documents(docs) - filters = {"meta_field": {"$eq": "custom_value"}} - results = docstore._embedding_retrieval(query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=2, filters=filters) + filters = {"field": "meta_field", "operator": "==", "value": "custom_value"} + results = document_store._embedding_retrieval(query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=2, filters=filters) assert len(results) == 1 assert results[0].content == "Not very similar document with meta field" - def test_embedding_retrieval_pagination(self, docstore: ElasticsearchDocumentStore): + def test_embedding_retrieval_pagination(self, document_store: ElasticsearchDocumentStore): """ Test that handling of pagination works as expected, when the matching documents are > 10. 
""" @@ -322,21 +221,23 @@ def test_embedding_retrieval_pagination(self, docstore: ElasticsearchDocumentSto for i in range(20) ] - docstore.write_documents(docs) - results = docstore._embedding_retrieval(query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=11, filters={}) + document_store.write_documents(docs) + results = document_store._embedding_retrieval(query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=11, filters={}) assert len(results) == 11 - def test_embedding_retrieval_query_documents_different_embedding_sizes(self, docstore: ElasticsearchDocumentStore): + def test_embedding_retrieval_query_documents_different_embedding_sizes( + self, document_store: ElasticsearchDocumentStore + ): """ Test that the retrieval fails if the query embedding and the documents have different embedding sizes. """ docs = [Document(content="Hello world", embedding=[0.1, 0.2, 0.3, 0.4])] - docstore.write_documents(docs) + document_store.write_documents(docs) with pytest.raises(BadRequestError): - docstore._embedding_retrieval(query_embedding=[0.1, 0.1]) + document_store._embedding_retrieval(query_embedding=[0.1, 0.1]) - def test_write_documents_different_embedding_sizes_fail(self, docstore: ElasticsearchDocumentStore): + def test_write_documents_different_embedding_sizes_fail(self, document_store: ElasticsearchDocumentStore): """ Test that write_documents fails if the documents have different embedding sizes. """ @@ -346,4 +247,4 @@ def test_write_documents_different_embedding_sizes_fail(self, docstore: Elastics ] with pytest.raises(DocumentStoreError): - docstore.write_documents(docs) + document_store.write_documents(docs) diff --git a/integrations/elasticsearch/tests/test_embedding_retriever.py b/integrations/elasticsearch/tests/test_embedding_retriever.py index b16e28830..fd60b0940 100644 --- a/integrations/elasticsearch/tests/test_embedding_retriever.py +++ b/integrations/elasticsearch/tests/test_embedding_retriever.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from unittest.mock import Mock, patch -from haystack.preview.dataclasses import Document +from haystack.dataclasses import Document from elasticsearch_haystack.document_store import ElasticsearchDocumentStore from elasticsearch_haystack.embedding_retriever import ElasticsearchEmbeddingRetriever @@ -24,7 +24,7 @@ def test_to_dict(_mock_elasticsearch_client): retriever = ElasticsearchEmbeddingRetriever(document_store=document_store) res = retriever.to_dict() assert res == { - "type": "ElasticsearchEmbeddingRetriever", + "type": "elasticsearch_haystack.embedding_retriever.ElasticsearchEmbeddingRetriever", "init_parameters": { "document_store": { "init_parameters": { @@ -32,7 +32,7 @@ def test_to_dict(_mock_elasticsearch_client): "index": "default", "embedding_similarity_function": "cosine", }, - "type": "ElasticsearchDocumentStore", + "type": "elasticsearch_haystack.document_store.ElasticsearchDocumentStore", }, "filters": {}, "top_k": 10, @@ -44,11 +44,11 @@ def test_to_dict(_mock_elasticsearch_client): @patch("elasticsearch_haystack.document_store.Elasticsearch") def test_from_dict(_mock_elasticsearch_client): data = { - "type": "ElasticsearchEmbeddingRetriever", + "type": "elasticsearch_haystack.embedding_retriever.ElasticsearchEmbeddingRetriever", "init_parameters": { "document_store": { "init_parameters": {"hosts": "some fake host", "index": "default"}, - "type": "ElasticsearchDocumentStore", + "type": "elasticsearch_haystack.document_store.ElasticsearchDocumentStore", }, "filters": {}, "top_k": 10, diff --git 
a/integrations/elasticsearch/tests/test_filters.py b/integrations/elasticsearch/tests/test_filters.py index efaa168b0..6db6a0dd2 100644 --- a/integrations/elasticsearch/tests/test_filters.py +++ b/integrations/elasticsearch/tests/test_filters.py @@ -1,108 +1,99 @@ import pytest -from haystack.preview.errors import FilterError +from haystack.errors import FilterError from elasticsearch_haystack.filters import _normalize_filters, _normalize_ranges filters_data = [ ( { - "$and": { - "type": {"$eq": "article"}, - "$or": {"genre": {"$in": ["economy", "politics"]}, "publisher": {"$eq": "nytimes"}}, - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - } + "operator": "AND", + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "article"}, + { + "operator": "OR", + "conditions": [ + {"field": "meta.genre", "operator": "in", "value": ["economy", "politics"]}, + {"field": "meta.publisher", "operator": "==", "value": "nytimes"}, + ], + }, + {"field": "meta.date", "operator": ">=", "value": "2015-01-01"}, + {"field": "meta.date", "operator": "<", "value": "2021-01-01"}, + {"field": "meta.rating", "operator": ">=", "value": 3}, + ], }, { "bool": { - "must": { - "bool": { - "must": [ - {"term": {"type": "article"}}, - { - "bool": { - "should": [ - {"terms": {"genre": ["economy", "politics"]}}, - {"term": {"publisher": "nytimes"}}, - ] - } - }, - {"range": {"date": {"gte": "2015-01-01", "lt": "2021-01-01"}}}, - {"range": {"rating": {"gte": 3}}}, - ] - } - } - } - }, - ), - ( - { - "$or": [ - {"Type": "News Paper", "Date": {"$lt": "2019-01-01"}}, - {"Type": "Blog Post", "Date": {"$gte": "2019-01-01"}}, - ] - }, - { - "bool": { - "must": { - "bool": { - "should": [ - {"match": {"Type": {"query": "News Paper", "minimum_should_match": "100%"}}}, - {"match": {"Type": {"query": "Blog Post", "minimum_should_match": "100%"}}}, - {"range": {"Date": {"lt": "2019-01-01", "gte": "2019-01-01"}}}, - ] - } - } + "must": [ + {"term": {"type": "article"}}, + { + "bool": { + "should": [ + {"terms": {"genre": ["economy", "politics"]}}, + {"term": {"publisher": "nytimes"}}, + ] + } + }, + {"range": {"date": {"gte": "2015-01-01", "lt": "2021-01-01"}}}, + {"range": {"rating": {"gte": 3}}}, + ] } }, ), ( { - "$and": { - "type": {"$eq": "article"}, - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - "$or": {"genre": {"$in": ["economy", "politics"]}, "publisher": {"$eq": "nytimes"}}, - } + "operator": "OR", + "conditions": [ + { + "operator": "AND", + "conditions": [ + {"field": "meta.Type", "operator": "==", "value": "News Paper"}, + {"field": "meta.Date", "operator": "<", "value": "2020-01-01"}, + ], + }, + { + "operator": "AND", + "conditions": [ + {"field": "meta.Type", "operator": "==", "value": "Blog Post"}, + {"field": "meta.Date", "operator": ">=", "value": "2019-01-01"}, + ], + }, + ], }, { "bool": { - "must": { - "bool": { - "must": [ - {"term": {"type": "article"}}, - { - "bool": { - "should": [ - {"terms": {"genre": ["economy", "politics"]}}, - {"term": {"publisher": "nytimes"}}, - ] - } - }, - {"range": {"date": {"gte": "2015-01-01", "lt": "2021-01-01"}}}, - {"range": {"rating": {"gte": 3}}}, - ] - } - } + "should": [ + {"bool": {"must": [{"term": {"Type": "News Paper"}}, {"range": {"Date": {"lt": "2020-01-01"}}}]}}, + {"bool": {"must": [{"term": {"Type": "Blog Post"}}, {"range": {"Date": {"gte": "2019-01-01"}}}]}}, + ] } }, ), ( { - "type": "article", - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - "$or": 
{"genre": ["economy", "politics"], "publisher": "nytimes"}, + "operator": "AND", + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "article"}, + {"field": "meta.date", "operator": ">=", "value": "2015-01-01"}, + {"field": "meta.date", "operator": "<", "value": "2021-01-01"}, + {"field": "meta.rating", "operator": ">=", "value": 3}, + { + "operator": "OR", + "conditions": [ + {"field": "meta.genre", "operator": "in", "value": ["economy", "politics"]}, + {"field": "meta.publisher", "operator": "==", "value": "nytimes"}, + ], + }, + ], }, { "bool": { "must": [ - {"match": {"type": {"query": "article", "minimum_should_match": "100%"}}}, + {"term": {"type": "article"}}, { "bool": { "should": [ {"terms": {"genre": ["economy", "politics"]}}, - {"match": {"publisher": {"query": "nytimes", "minimum_should_match": "100%"}}}, + {"term": {"publisher": "nytimes"}}, ] } }, @@ -113,75 +104,72 @@ }, ), ( - {"text": "A Foo Document 1"}, - {"bool": {"must": {"match": {"text": {"query": "A Foo Document 1", "minimum_should_match": "100%"}}}}}, + {"operator": "AND", "conditions": [{"field": "text", "operator": "==", "value": "A Foo Document 1"}]}, + {"bool": {"must": [{"match": {"text": {"query": "A Foo Document 1", "minimum_should_match": "100%"}}}]}}, ), ( - {"$or": {"name": {"$or": [{"$eq": "name_0"}, {"$eq": "name_1"}]}, "number": {"$lt": 1.0}}}, + { + "operator": "OR", + "conditions": [ + { + "operator": "OR", + "conditions": [ + {"field": "meta.name", "operator": "==", "value": "name_0"}, + {"field": "meta.name", "operator": "==", "value": "name_1"}, + ], + }, + {"field": "meta.number", "operator": "<", "value": 1.0}, + ], + }, { "bool": { - "must": { - "bool": { - "should": [ - { - "bool": { - "should": [ - {"term": {"name": "name_0"}}, - {"term": {"name": "name_1"}}, - ] - } - }, - {"range": {"number": {"lt": 1.0}}}, - ] - } - } + "should": [ + {"bool": {"should": [{"term": {"name": "name_0"}}, {"term": {"name": "name_1"}}]}}, + {"range": {"number": {"lt": 1.0}}}, + ] } }, ), ( - {"$and": {"number": {"$and": {"$lte": 2, "$gte": 0}}, "name": {"$in": ["name_0", "name_1"]}}}, { - "bool": { - "must": { - "bool": { - "must": [ - {"bool": {"must": [{"range": {"number": {"lte": 2, "gte": 0}}}]}}, - {"terms": {"name": ["name_0", "name_1"]}}, - ] - } - } - } + "operator": "AND", + "conditions": [ + {"field": "meta.number", "operator": "<=", "value": 2}, + {"field": "meta.number", "operator": ">=", "value": 0}, + {"field": "meta.name", "operator": "in", "value": ["name_0", "name_1"]}, + ], }, + {"bool": {"must": [{"terms": {"name": ["name_0", "name_1"]}}, {"range": {"number": {"lte": 2, "gte": 0}}}]}}, ), ( - {"number": {"$lte": 2, "$gte": 0}, "name": ["name_0", "name_1"]}, { - "bool": { - "must": [ - {"terms": {"name": ["name_0", "name_1"]}}, - {"range": {"number": {"lte": 2, "gte": 0}}}, - ] - } + "operator": "AND", + "conditions": [ + {"field": "meta.number", "operator": "<=", "value": 2}, + {"field": "meta.number", "operator": ">=", "value": 0}, + ], }, + {"bool": {"must": [{"range": {"number": {"lte": 2, "gte": 0}}}]}}, ), ( - {"number": {"$and": [{"$lte": 2}, {"$gte": 0}]}}, - {"bool": {"must": [{"range": {"number": {"lte": 2, "gte": 0}}}]}}, + { + "operator": "OR", + "conditions": [ + {"field": "meta.name", "operator": "==", "value": "name_0"}, + {"field": "meta.name", "operator": "==", "value": "name_1"}, + ], + }, + {"bool": {"should": [{"term": {"name": "name_0"}}, {"term": {"name": "name_1"}}]}}, ), ( - {"name": {"$or": [{"$eq": "name_0"}, {"$eq": "name_1"}]}}, { - "bool": { - 
"must": { - "bool": { - "should": [ - {"term": {"name": "name_0"}}, - {"term": {"name": "name_1"}}, - ] - } - } - } + "operator": "NOT", + "conditions": [ + {"field": "meta.number", "operator": "==", "value": 100}, + {"field": "meta.name", "operator": "==", "value": "name_0"}, + ], }, + {"bool": {"must_not": [{"bool": {"must": [{"term": {"number": 100}}, {"term": {"name": "name_0"}}]}}]}}, ), ] @@ -192,15 +180,31 @@ def test_normalize_filters(filters, expected): assert result == expected -def test_normalize_filters_raises_with_malformed_filters(): +def test_normalize_filters_invalid_operator(): + with pytest.raises(FilterError): + _normalize_filters({"operator": "INVALID", "conditions": []}) + + +def test_normalize_filters_malformed(): + # Missing operator + with pytest.raises(FilterError): + _normalize_filters({"conditions": []}) + + # Missing conditions + with pytest.raises(FilterError): + _normalize_filters({"operator": "AND"}) + + # Missing comparison field with pytest.raises(FilterError): - _normalize_filters("not a filter") + _normalize_filters({"operator": "AND", "conditions": [{"operator": "==", "value": "article"}]}) + # Missing comparison operator with pytest.raises(FilterError): - _normalize_filters({"number": {"page": "100"}}) + _normalize_filters({"operator": "AND", "conditions": [{"field": "meta.type", "operator": "=="}]}) + # Missing comparison value with pytest.raises(FilterError): - _normalize_filters({"number": {"page": {"chapter": "intro"}}}) + _normalize_filters({"operator": "AND", "conditions": [{"field": "meta.type", "value": "article"}]}) def test_normalize_ranges():