diff --git a/.github/workflows/pinecone.yml b/.github/workflows/pinecone.yml index 8f91c4a71..d42330849 100644 --- a/.github/workflows/pinecone.yml +++ b/.github/workflows/pinecone.yml @@ -25,8 +25,9 @@ jobs: strategy: fail-fast: false matrix: + # Pinecone tests are time expensive, so the matrix is limited to Python 3.9 and 3.10 os: [ubuntu-latest] - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10"] steps: - uses: actions/checkout@v4 diff --git a/integrations/pinecone/src/pinecone_haystack/__init__.py b/integrations/pinecone/src/pinecone_haystack/__init__.py index dbd6664ea..dbfb60832 100644 --- a/integrations/pinecone/src/pinecone_haystack/__init__.py +++ b/integrations/pinecone/src/pinecone_haystack/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2023-present John Doe +# SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 from pinecone_haystack.document_store import PineconeDocumentStore diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index ae812e7ec..af295e0c0 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -181,7 +181,7 @@ def _embedding_retrieval( self, query_embedding: List[float], *, - filters: Optional[Dict[str, Any]] = None, # noqa: ARG002 + filters: Optional[Dict[str, Any]] = None, # noqa: ARG002 (filters to be implemented) top_k: int = 10, ) -> List[Document]: """ diff --git a/integrations/pinecone/tests/__init__.py b/integrations/pinecone/tests/__init__.py index 7eda7517e..e873bc332 100644 --- a/integrations/pinecone/tests/__init__.py +++ b/integrations/pinecone/tests/__init__.py @@ -1,3 +1,3 @@ -# SPDX-FileCopyrightText: 2023-present John Doe +# SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/pinecone/tests/conftest.py 
b/integrations/pinecone/tests/conftest.py new file mode 100644 index 000000000..63ec94819 --- /dev/null +++ b/integrations/pinecone/tests/conftest.py @@ -0,0 +1,46 @@ +import time +from random import randint + +import pytest + +from pinecone_haystack.document_store import PineconeDocumentStore + +# Approximate time (in seconds) it takes for written documents to become available +SLEEP_TIME = 17 + + +@pytest.fixture() +def sleep_time(): + return SLEEP_TIME + + +@pytest.fixture +def document_store(request): + """ + Provide a PineconeDocumentStore that uses a per-test namespace (test name plus a random + suffix); everything in that namespace is deleted after the test. + """ + environment = "gcp-starter" + index = "default" + # Use a different namespace for each test so we can run them in parallel + namespace = f"{request.node.name}-{randint(0, 1000)}" # noqa: S311 Ruff complains about using random numbers for cryptographic purposes + dimension = 10 + + store = PineconeDocumentStore( + environment=environment, + index=index, + namespace=namespace, + dimension=dimension, + ) + + # Override the count_documents method to wait for the documents to be available + original_count_documents = store.count_documents + + def count_documents_sleep(): + time.sleep(SLEEP_TIME) + return original_count_documents() + + store.count_documents = count_documents_sleep + + yield store + store._index.delete(delete_all=True, namespace=namespace) diff --git a/integrations/pinecone/tests/pinecone_mock.py b/integrations/pinecone/tests/pinecone_mock.py deleted file mode 100644 index 215b0e428..000000000 --- a/integrations/pinecone/tests/pinecone_mock.py +++ /dev/null @@ -1,329 +0,0 @@ -import logging -from typing import Any, Dict, List, Optional, Union - -logger = logging.getLogger(__name__) - - -# Mock Pinecone instance -CONFIG: dict = {"api_key": None, "environment": None, "indexes": {}} - - -# Mock Pinecone Index instance -class IndexObject: - def __init__( - self, - index: str, - api_key: Optional[str] = None, - environment: 
Optional[str] = None, - dimension: Optional[int] = None, - metric: Optional[str] = None, - replicas: Optional[int] = None, - shards: Optional[int] = None, - metadata_config: Optional[dict] = None, - ): - self.index = index - self.api_key = api_key - self.environment = environment - self.dimension = dimension - self.metric = metric - self.replicas = replicas - self.shards = shards - self.metadata_config = metadata_config - self.namespaces: dict = {} - - -# Mock the Pinecone Index class -class Index: - def __init__(self, index: str): - self.index = index - self.index_config = CONFIG["indexes"][index] - - def upsert(self, vectors: List[tuple], namespace: str = ""): - if namespace not in self.index_config.namespaces: - self.index_config.namespaces[namespace] = {} - upsert_count = 0 - for record in vectors: - # Extract info from tuple - _id = record[0] - vector = record[1] - metadata = record[2] - # Checks - assert type(_id) is str - assert type(vector) is list - assert len(vector) == self.index_config.dimension - assert type(metadata) is dict - # Create record (eg document) - new_record: dict = {"id": _id, "values": vector, "metadata": metadata} - self.index_config.namespaces[namespace][_id] = new_record - upsert_count += 1 - return {"upserted_count": upsert_count} - - def update(self, namespace: str, id: str, set_metadata: dict): - # Get existing item metadata - meta = self.index_config.namespaces[namespace][id]["metadata"] - # Add new metadata to existing item metadata - self.index_config.namespaces[namespace][id]["metadata"] = {**meta, **set_metadata} - - def describe_index_stats(self, filter=None): - namespaces = {} - for namespace in self.index_config.namespaces.items(): - records = self.index_config.namespaces[namespace[0]] - if filter: - filtered_records = [] - for record in records.values(): - if self._filter(metadata=record["metadata"], filters=filter, top_level=True): - filtered_records.append(record) - records = filtered_records - namespaces[namespace[0]] = 
{"vector_count": len(records)} - return {"dimension": self.index_config.dimension, "index_fullness": 0.0, "namespaces": namespaces} - - def query( - self, - vector: List[float], - top_k: int, - namespace: str = "", - include_values: bool = False, - include_metadata: bool = False, - filter: Optional[dict] = None, - ): - return self.query_filter( - vector=vector, - top_k=top_k, - namespace=namespace, - include_values=include_values, - include_metadata=include_metadata, - filter=filter, - ) - - def query_filter( - self, - vector: List[float], - top_k: int, - namespace: str = "", - include_values: bool = False, - include_metadata: bool = False, - filter: Optional[dict] = None, - ): - assert len(vector) == self.index_config.dimension - response: dict = {"matches": []} - if namespace not in self.index_config.namespaces: - return response - else: - records = self.index_config.namespaces[namespace] - namespace_ids = list(records.keys())[:top_k] - - for _id in namespace_ids: - match = {"id": _id} - if include_values: - match["values"] = records[_id]["values"].copy() - if include_metadata: - match["metadata"] = records[_id]["metadata"].copy() - match["score"] = 0.0 - - if filter is None or ( - filter is not None and self._filter(records[_id]["metadata"], filter, top_level=True) - ): - # filter if needed - response["matches"].append(match) - return response - - def fetch(self, ids: List[str], namespace: str = ""): - response: dict = {"namespace": namespace, "vectors": {}} - if namespace not in self.index_config.namespaces: - # If we query an empty/non-existent namespace, Pinecone will just return an empty response - logger.warning("No namespace called '%s'", namespace) - return response - records = self.index_config.namespaces[namespace] - namespace_ids = records.keys() - for _id in namespace_ids: - if _id in ids.copy(): - response["vectors"][_id] = { - "id": _id, - "metadata": records[_id]["metadata"].copy(), - "values": records[_id]["values"].copy(), - } - return response - 
- def _filter( - self, - metadata: dict, - filters: Dict[str, Any] = None, - mode: Optional[str] = "$and", - top_level=False, - ) -> dict: - """ - Mock filtering function - """ - # This function has a very high McCabe cyclomatic complexity score of 38 - # (recommended is 10) and contains 55 branches (recommended is 12). - bools = [] - if type(filters) is list: - list_bools = [] - for _filter in filters: - res = self._filter(metadata, _filter, mode=mode) - for key, value in res.items(): - if key == "$and": - list_bools.append(all(value)) - else: - list_bools.append(any(value)) - if mode == "$and": - bools.append(all(list_bools)) - elif mode == "$or": - bools.append(any(list_bools)) - else: - for field, potential_value in filters.items(): - if field in ["$and", "$or"]: - bools.append(self._filter(metadata, potential_value, mode=field)) - mode = field - cond = field - else: - if type(potential_value) is dict: - sub_bool = [] - for cond, value in potential_value.items(): - if len(potential_value.keys()) > 1: - sub_filter = {field: {cond: value}} - bools.append(self._filter(metadata, sub_filter)) - if len(sub_bool) > 1: - if field == "$or": - bools.append(any(sub_bool)) - else: - bools.append(all(sub_bool)) - elif type(potential_value) is list: - cond = "$in" - value = potential_value - else: - cond = "$eq" - value = potential_value - # main chunk of condition checks - if cond == "$eq": - if field in metadata and metadata[field] == value: - bools.append(True) - else: - bools.append(False) - elif cond == "$ne": - if field in metadata and metadata[field] != value: - bools.append(True) - else: - bools.append(False) - elif cond == "$in": - if field in metadata and metadata[field] in value: - bools.append(True) - else: - bools.append(False) - elif cond == "$nin": - if field in metadata and metadata[field] not in value: - bools.append(True) - else: - bools.append(False) - elif cond == "$gt": - if field in metadata and metadata[field] > value: - bools.append(True) - else: - 
bools.append(False) - elif cond == "$lt": - if field in metadata and metadata[field] < value: - bools.append(True) - else: - bools.append(False) - elif cond == "$gte": - if field in metadata and metadata[field] >= value: - bools.append(True) - else: - bools.append(False) - elif cond == "$lte": - if field in metadata and metadata[field] <= value: - bools.append(True) - else: - bools.append(False) - if top_level: - final = [] - for item in bools: - if type(item) is dict: - for key, value in item.items(): - if key == "$and": - final.append(all(value)) - else: - final.append(any(value)) - else: - final.append(item) - if mode == "$and": - bools = all(final) - else: - bools = any(final) - else: - if mode == "$and": - return {"$and": bools} - else: - return {"$or": bools} - return bools - - def delete( - self, - ids: Optional[List[str]] = None, - namespace: str = "", - filters: Dict[str, Any] = None, - delete_all: bool = False, - ): - if filters: - # Get a filtered list of IDs - matches = self.query(filters=filters, namespace=namespace, include_values=False, include_metadata=False)[ - "vectors" - ] - filter_ids: List[str] = matches.keys() # .keys() returns an object that supports set operators already - elif delete_all: - self.index_config.namespaces[namespace] = {} - - if namespace not in self.index_config.namespaces: - pass - elif ids is not None: - id_list: List[str] = ids - if filters: - # We find the intersect between the IDs and filtered IDs - id_list = set(id_list).intersection(filter_ids) - records = self.index_config.namespaces[namespace] - for _id in list(records.keys()): # list() is needed to be able to del below - if _id in id_list: - del records[_id] - else: - # Delete all - self.index_config.namespaces[namespace] = {} - return {} - - def _get_config(self): - return self.index_config - - -# Mock core Pinecone client functions -def init(api_key: Optional[str] = None, environment: Optional[str] = None): - CONFIG["api_key"] = api_key - CONFIG["environment"] = 
environment - CONFIG["indexes"] = {} - - -def list_indexes(): - return list(CONFIG["indexes"].keys()) - - -def create_index( - name: str, - dimension: int, - metric: str = "cosine", - replicas: int = 1, - shards: int = 1, - metadata_config: Optional[dict] = None, -): - index_object = IndexObject( - api_key=CONFIG["api_key"], - environment=CONFIG["environment"], - index=name, - dimension=dimension, - metric=metric, - replicas=replicas, - shards=shards, - metadata_config=metadata_config, - ) - CONFIG["indexes"][name] = index_object - - -def delete_index(index: str): - del CONFIG["indexes"][index] diff --git a/integrations/pinecone/tests/test_count.py b/integrations/pinecone/tests/test_count.py new file mode 100644 index 000000000..02462d422 --- /dev/null +++ b/integrations/pinecone/tests/test_count.py @@ -0,0 +1,7 @@ +from haystack.testing.document_store import ( + CountDocumentsTest, +) + + +class TestCountDocuments(CountDocumentsTest): + ... diff --git a/integrations/pinecone/tests/test_delete.py b/integrations/pinecone/tests/test_delete.py new file mode 100644 index 000000000..88b145704 --- /dev/null +++ b/integrations/pinecone/tests/test_delete.py @@ -0,0 +1,7 @@ +from haystack.testing.document_store import ( + DeleteDocumentsTest, +) + + +class TestDeleteDocuments(DeleteDocumentsTest): + ... 
diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py new file mode 100644 index 000000000..74315aad2 --- /dev/null +++ b/integrations/pinecone/tests/test_document_store.py @@ -0,0 +1,20 @@ +import time + +from haystack import Document + +from pinecone_haystack.document_store import PineconeDocumentStore + + +class TestDocumentStore: + def test_embedding_retrieval(self, document_store: PineconeDocumentStore, sleep_time): + docs = [ + Document(content="Most similar document", embedding=[1.0] * 10), + Document(content="2nd best document", embedding=[0.8, 0.8, 0.8, 0.8, 0.5, 0.8, 0.8, 0.8, 0.8, 0.5]), + Document(content="Not very similar document", embedding=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]), + ] + document_store.write_documents(docs) + time.sleep(sleep_time) + results = document_store._embedding_retrieval(query_embedding=[0.1] * 10, top_k=2, filters={}) + assert len(results) == 2 + assert results[0].content == "Most similar document" + assert results[1].content == "2nd best document" diff --git a/integrations/pinecone/tests/test_pinecone_document_store.py b/integrations/pinecone/tests/test_pinecone_document_store.py deleted file mode 100644 index 8d0f1b097..000000000 --- a/integrations/pinecone/tests/test_pinecone_document_store.py +++ /dev/null @@ -1,335 +0,0 @@ -import os -from inspect import getmembers, isclass, isfunction -from typing import Any, Dict, List, Union -from unittest.mock import MagicMock - -import numpy as np -import pinecone -import pytest -from haystack.preview.dataclasses import Document -from haystack.preview.testing.document_store import DocumentStoreBaseTests - -from pinecone_haystack.document_store import PineconeDocumentStore -from pinecone_haystack.errors import ( - PineconeDocumentStoreError, - PineconeDocumentStoreFilterError, -) -from tests import pinecone_mock - - -class TestPineconeDocumentStore(DocumentStoreBaseTests): - @pytest.fixture - def ds(self, 
monkeypatch, request) -> PineconeDocumentStore: - """ - This fixture provides an empty document store and takes care of cleaning up after each test - """ - - for fname, function in getmembers(pinecone_mock, isfunction): - monkeypatch.setattr(f"pinecone.{fname}", function, raising=False) - for cname, class_ in getmembers(pinecone_mock, isclass): - monkeypatch.setattr(f"pinecone.{cname}", class_, raising=False) - - return PineconeDocumentStore( - api_key=os.environ.get("PINECONE_API_KEY") or "pinecone-test-key", - embedding_dim=768, - embedding_field="embedding", - index="haystack_tests", - similarity="cosine", - recreate_index=True, - ) - - @pytest.fixture - def doc_store_with_docs(self, ds: PineconeDocumentStore) -> PineconeDocumentStore: - """ - This fixture provides a pre-populated document store and takes care of cleaning up after each test - """ - documents = [ - Document( - content="Lloyds to cut 945 jobs as part of 3-year restructuring plan, Last month we added to our $GILD position and started a new one in $BWLD We see slow, steady, unspectacular growth going forward near term. Lloyds Banking Group's share price lifts amid reports bank is poised to axe hundreds of UK jobs", - meta={ - "target": "Lloyds", - "sentiment_score": -0.532, - "format": "headline", - }, - ), - Document( - content="FTSE 100 drops 2.5 pct on Glencore, metals price fears. Glencore sees Tripoli-based NOC as sole legal seller of Libyan oil. Glencore Studies Possible IPO for Agricultural Trading Business. Glencore chief blames rivals' overproduction for share price fall.", - meta={ - "target": "Glencore", - "sentiment_score": 0.037, - "format": "headline", - }, - ), - Document( - content="Shell's $70 Billion BG Deal Meets Shareholder Skepticism. Shell and BG Shareholders to Vote on Deal at End of January. EU drops Shell, BP, Statoil from ethanol benchmark investigation. 
Shell challenges Exxon dominance with 47 billion-pound bid for BG", - meta={ - "target": "Shell", - "sentiment_score": -0.345, - "format": "headline", - }, - ), - Document( - content="$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. Shorts continue to pile in.", - meta={ - "target": "TSLA", - "sentiment_score": 0.318, - "format": "post", - }, - ), - Document( - content="HSBC appoints business leaders to board. HSBC Says Unit to Book $585 Million Charge on Settlement. HSBC Hit by Fresh Details of Tax Evasion Claims. HSBC Hit by Fresh Details of Tax Evasion Claims. Goldman Sachs, Barclays, HSBC downplay Brexit threat.", - meta={ - "target": "HSBC", - "sentiment_score": 0.154, - "format": "post", - }, - ), - # Without meta - Document( - content="Aspen to Buy Anaesthetics From AstraZeneca for $520 Million. AstraZeneca wins FDA approval for key new lung cancer pill. AstraZeneca boosts respiratory unit with $575 mln Takeda deal. AstraZeneca Acquires ZS Pharma in $2.7 Billion Deal." - ), - Document( - content="Anheuser-Busch InBev Increases Offer for Rival SABMiller. Australia clears AB Inbev's $100 billion SABMiller buyout plan.Australia clears AB Inbev's $100 billion SABMiller buyout plan." - ), - Document( - content="The Coca-Cola Company and Coca-Cola FEMSA to Acquire AdeS Soy-Based Beverage Business From Unilever." 
- ), - ] - ds.write_documents(documents) - return ds - - @pytest.fixture - def mocked_ds(self): - class DSMock(PineconeDocumentStore): - pass - - pinecone.init = MagicMock() - DSMock._create_index = MagicMock() - mocked_ds = DSMock(api_key="MOCK") - - return mocked_ds - - def docs_all_formats(self) -> List[Union[Document, Dict[str, Any]]]: - return [ - # Document object - Document( - content="Lloyds to cut 945 jobs as part of 3-year restructuring plan, Last month we added to our $GILD position and started a new one in $BWLD We see slow, steady, unspectacular growth going forward near term. Lloyds Banking Group's share price lifts amid reports bank is poised to axe hundreds of UK jobs", - meta={ - "target": "Lloyds", - "sentiment_score": -0.532, - "format": "headline", - }, - ), - Document( - content="FTSE 100 drops 2.5 pct on Glencore, metals price fears. Glencore sees Tripoli-based NOC as sole legal seller of Libyan oil. Glencore Studies Possible IPO for Agricultural Trading Business. Glencore chief blames rivals' overproduction for share price fall.", - meta={ - "target": "Glencore", - "sentiment_score": 0.037, - "format": "headline", - }, - ), - Document( - content="Shell's $70 Billion BG Deal Meets Shareholder Skepticism. Shell and BG Shareholders to Vote on Deal at End of January. EU drops Shell, BP, Statoil from ethanol benchmark investigation. Shell challenges Exxon dominance with 47 billion-pound bid for BG", - meta={ - "target": "Shell", - "sentiment_score": -0.345, - "format": "headline", - }, - ), - Document( - content="$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. Shorts continue to pile in.", - meta={ - "target": "TSLA", - "sentiment_score": 0.318, - "format": "post", - }, - ), - Document( - content="HSBC appoints business leaders to board. HSBC Says Unit to Book $585 Million Charge on Settlement. 
HSBC Hit by Fresh Details of Tax Evasion Claims. HSBC Hit by Fresh Details of Tax Evasion Claims. Goldman Sachs, Barclays, HSBC downplay Brexit threat.", - meta={ - "target": "HSBC", - "sentiment_score": 0.154, - "format": "post", - }, - ), - # Without meta - Document( - content="Aspen to Buy Anaesthetics From AstraZeneca for $520 Million. AstraZeneca wins FDA approval for key new lung cancer pill. AstraZeneca boosts respiratory unit with $575 mln Takeda deal. AstraZeneca Acquires ZS Pharma in $2.7 Billion Deal." - ), - Document( - content="Anheuser-Busch InBev Increases Offer for Rival SABMiller. Australia clears AB Inbev's $100 billion SABMiller buyout plan.Australia clears AB Inbev's $100 billion SABMiller buyout plan." - ), - Document( - content="The Coca-Cola Company and Coca-Cola FEMSA to Acquire AdeS Soy-Based Beverage Business From Unilever." - ), - ] - - @pytest.mark.integration - def test_ne_filters(self, ds, documents): - ds.write_documents(documents) - - result = ds.get_filter_documents(filters={"format": {"$ne": "headline"}}) - assert len(result) == 2 - - @pytest.mark.integration - def test_filter_documents_with_extended_filter_eq(self, doc_store_with_docs: PineconeDocumentStore): - eq_docs = doc_store_with_docs.filter_documents(filters={"type": {"$eq": "article"}}) - normal_docs = doc_store_with_docs.filter_documents(filters={"type": "article"}) - assert eq_docs == normal_docs - - @pytest.mark.integration - def test_filter_documents_ids_extended_filter_ne(self, doc_store_with_docs: PineconeDocumentStore): - retrieved_docs = doc_store_with_docs.filter_documents(filters={"target": {"$ne": "Glencore"}}) - assert all(d.meta.get("metadata", None) != "Glencore" for d in retrieved_docs) - - @pytest.mark.integration - def test_filter_documents_extended_filter_nin(self, doc_store_with_docs: PineconeDocumentStore): - retrieved_docs = doc_store_with_docs.filter_documents(filters={"format": {"$nin": ["target", "post"]}}) - assert {"target", 
"post"}.isdisjoint({d.meta.get("metadata", None) for d in retrieved_docs}) - - @pytest.mark.integration - def test_filter_documents_extended_filter_gt(self, doc_store_with_docs: PineconeDocumentStore): - retrieved_docs = doc_store_with_docs.filter_documents(filters={"sentiment_score": {"$gt": 3.0}}) - assert all(d.meta["sentiment_score"] > 3.0 for d in retrieved_docs) - - @pytest.mark.integration - def test_filter_documents_extended_filter_gte(self, doc_store_with_docs: PineconeDocumentStore): - retrieved_docs = doc_store_with_docs.filter_documents(filters={"sentiment_score": {"$gte": 3.0}}) - assert all(d.meta["sentiment_score"] >= 3.0 for d in retrieved_docs) - - @pytest.mark.integration - def test_filter_documents_extended_filter_compound_and_other_field_simplified( - self, doc_store_with_docs: PineconeDocumentStore - ): - filters_simplified = { - "sentiment_score": {"$lte": 0.2, "$gte": 0.4}, - "target": ["Shell", "Glencore", "HSBC", "Lloyds", "TSLA"], - } - - with pytest.raises( - PineconeDocumentStoreFilterError, - match=r"Comparison value for '\$[l|g]te' operation must be a float or int.", - ): - doc_store_with_docs.filter_documents(filters=filters_simplified) - - @pytest.mark.integration - def test_filter_documents_extended_filter_compound_and_or_explicit( - self, doc_store_with_docs: PineconeDocumentStore - ): - filters = { - "$and": { - "sentiment_score": {"$lte": 0.2, "$gte": 0.3}, - "target": { - "name": {"$in": ["HSBC", "Lloyds"]}, - "sentiment_score": {"$lte": 5.0}, - }, - } - } - - with pytest.raises( - PineconeDocumentStoreFilterError, - match=r"Comparison value for '\$[l|g]te' operation must be a float or int.", - ): - doc_store_with_docs.filter_documents(filters=filters) - - @pytest.mark.integration - def test_filter_documents_extended_filter_and_or_simplified(self, doc_store_with_docs: PineconeDocumentStore): - filters_simplified = { - "sentiment_score": {"$lte": 0.2, "$gte": 0.3}, - "$or": {"format": ["headline", "post"], "sentiment_score": 
{"0.318"}}, - } - - with pytest.raises( - PineconeDocumentStoreFilterError, - match=r"Comparison value for '\$[l|g]te' operation must be a float or int.", - ): - doc_store_with_docs.filter_documents(filters=filters_simplified) - - @pytest.mark.integration - def test_filter_documents_extended_filter_and_or_and_not_explicit(self, doc_store_with_docs: PineconeDocumentStore): - filters = { - "$and": { - "sentiment_score": {"$gte": 0.037}, - "$or": { - "target": {"$in": ["LLyods", "Glencore", "HSBC", "TSLA", "Shell"]}, - "$and": {"format": {"$in": ["headline", "post"]}}, - }, - } - } - with pytest.raises( - PineconeDocumentStoreFilterError, - match=r"Comparison value for '\$[l|g]te' operation must be a float or int.", - ): - doc_store_with_docs.filter_documents(filters=filters) - - @pytest.mark.integration - def test_filter_documents_extended_filter_and_or_and_not_simplified( - self, doc_store_with_docs: PineconeDocumentStore - ): - filters_simplified = { - "sentiment_score": {"$lte": "0.037"}, - "$or": { - "target": ["LLyods", "Glencore"], - "$and": {"format": {"$lte": "headline"}, "$not": {"format": "post"}}, - }, - } - with pytest.raises( - PineconeDocumentStoreFilterError, - match=r"Comparison value for '\$[l|g]te' operation must be a float or int.", - ): - doc_store_with_docs.filter_documents(filters=filters_simplified) - - @pytest.mark.integration - def test_filter_documents_extended_filter_compound_nested_not(self, doc_store_with_docs: PineconeDocumentStore): - # Test nested logical operations within "$not". 
- filters = { - "$not": { - "$or": { - "$and": {"target": {"Lloyds"}}, - "$not": {"format": {"healdine"}}, - } - } - } - with pytest.raises( - PineconeDocumentStoreFilterError, - match=r"Comparison value for '\$[l|g]t' operation must be a float or int.", - ): - doc_store_with_docs.filter_documents(filters=filters) - - @pytest.mark.integration - def test_filter_documents_extended_filter_compound_same_level_not(self, doc_store_with_docs: PineconeDocumentStore): - # Test same logical operator twice on the same level. - filters = { - "$or": [ - { - "$and": { - "target": ["LLyods", "Glencore", "TSLA", "Shell"], - "format": {"$in": ["post"]}, - } - }, - { - "$and": { - "target": ["LLyods", "Glencore", "HSBC", "TSLA", "Shell"], - "format": {"$in": ["headline"]}, - } - }, - ] - } - - with pytest.raises( - PineconeDocumentStoreFilterError, - match=r"Comparison value for '\$[l|g]te' operation must be a float or int.", - ): - doc_store_with_docs.filter_documents(filters=filters) - - def test_get_embedding_count(self, doc_store_with_docs: PineconeDocumentStore): - """ - We expect 1 doc with an embeddings because all documents in already written in doc_store_with_docs contain no - embeddings. 
- """ - doc = Document( - content="Doc with embedding", - embedding=np.random.rand(768).astype(np.float32), - ) - doc_store_with_docs.write_documents([doc]) - assert doc_store_with_docs.get_embedding_count() == 1 diff --git a/integrations/pinecone/tests/test_retriever.py b/integrations/pinecone/tests/test_retriever.py deleted file mode 100644 index 4f21d357a..000000000 --- a/integrations/pinecone/tests/test_retriever.py +++ /dev/null @@ -1,93 +0,0 @@ -import os -from inspect import getmembers, isclass, isfunction -from typing import Any, Dict, List, Union -from unittest.mock import MagicMock, Mock, patch - -import numpy as np -import pinecone -import pytest -from haystack.preview import ( - DeserializationError, - Document, - component, - default_from_dict, - default_to_dict, -) -from haystack.preview.dataclasses import Document - -from pinecone_haystack.document_store import PineconeDocumentStore -from pinecone_haystack.retriever import PineconeRetriever -from tests import pinecone_mock - - -class TestPineconeRetriever: - @pytest.mark.unit - def test_init(self): - mock_store = Mock(spec=PineconeDocumentStore) - retriever = PineconeRetriever(document_store=mock_store) - assert retriever.document_store == mock_store - assert retriever.filters == None - assert retriever.top_k == 10 - assert retriever.scale_score == True - assert retriever.return_embedding == False - - @pytest.mark.unit - def test_run(self): - mock_store = Mock(spec=PineconeDocumentStore) - mock_store.query_by_embedding.return_value = [ - Document( - content="$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. 
Shorts continue to pile in.", - meta={ - "target": "TSLA", - "sentiment_score": 0.318, - "format": "post", - }, - ) - ] - - retriever = PineconeRetriever(document_store=mock_store) - results = retriever.run(["How many cars is TSLA recalling?"]) - - assert len(results["documents"]) == 1 - assert ( - results["documents"][0].content - == "$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. Shorts continue to pile in." - ) - - @pytest.mark.integration - def test_to_dict(self): - document_store = PineconeDocumentStore("pinecone-test-key") - retriever = PineconeRetriever(document_store=document_store) - doc_dict = retriever.to_dict() - assert doc_dict == { - "init_parameters": { - "document_store": "test_document_store", - "filters": None, - "top_k": 10, - "scale_score": "True", - "return_embedding": False, - } - } - - @pytest.mark.integration - def test_from_dict(self): - """ - Test deserialization of this component from a dictionary, using default initialization parameters. 
- """ - retriever_component_dict = { - "type": "PineconeRetriever", - "init_parameters": { - "document_store": "test_document_store", - "filters": None, - "top_k": 10, - "scale_score": True, - "return_embedding": False, - }, - } - retriever = PineconeRetriever.from_dict(retriever_component_dict) - - assert retriever.document_store == "test_document_store" - assert retriever.filters is None - assert retriever.top_k == 10 - assert retriever.scale_score is True - assert retriever.return_embedding is False diff --git a/integrations/pinecone/tests/test_write.py b/integrations/pinecone/tests/test_write.py new file mode 100644 index 000000000..25641f7a4 --- /dev/null +++ b/integrations/pinecone/tests/test_write.py @@ -0,0 +1,40 @@ +import time + +import pytest +from haystack import Document +from haystack.document_stores import DuplicatePolicy +from haystack.testing.document_store import ( + WriteDocumentsTest, +) + +from pinecone_haystack.document_store import PineconeDocumentStore + + +class TestWriteDocuments(WriteDocumentsTest): + def test_write_documents(self, document_store: PineconeDocumentStore): + docs = [Document(id="1")] + assert document_store.write_documents(docs) == 1 + + # overridden to wait for Pinecone to be updated + def test_write_documents_duplicate_overwrite(self, document_store: PineconeDocumentStore, sleep_time): + """ + Test write_documents() overwrites stored Document when trying to write one with same id + using DuplicatePolicy.OVERWRITE. 
+ """ + doc1 = Document(id="1", content="test doc 1") + doc2 = Document(id="1", content="test doc 2") + + assert document_store.write_documents([doc2], policy=DuplicatePolicy.OVERWRITE) == 1 + time.sleep(sleep_time) + self.assert_documents_are_equal(document_store.filter_documents(), [doc2]) + assert document_store.write_documents(documents=[doc1], policy=DuplicatePolicy.OVERWRITE) == 1 + time.sleep(sleep_time) + self.assert_documents_are_equal(document_store.filter_documents(), [doc1]) + + @pytest.mark.skip(reason="Pinecone only supports UPSERT operations") + def test_write_documents_duplicate_fail(self, document_store: PineconeDocumentStore): + ... + + @pytest.mark.skip(reason="Pinecone only supports UPSERT operations") + def test_write_documents_duplicate_skip(self, document_store: PineconeDocumentStore): + ...