Add Pinecone as a vector store backend (VECTOR_DB_TYPE == "pinecone").

Review notes for this revision of the patch:
  * store.py: get_all_ids() and similarity_search_with_score_by_vector()
    call the Pinecone index with keyword arguments, list pages are
    flattened into a single list, and the caller's metadata filter is
    forwarded to the query.
  * store_factory.py: the "<namespace>@<api_key>" connection string is
    split on the first "@" only.
  * The "index" lines for store.py and store_factory.py still carry the
    post-image hashes of the earlier revision of this patch.

diff --git a/.gitignore b/.gitignore
index 4649d05a..c608b6b8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ uploads/
 myenv/
 venv/
 *.pyc
+.history
diff --git a/config.py b/config.py
index fc20e0d4..08b2de59 100644
--- a/config.py
+++ b/config.py
@@ -49,7 +49,8 @@ def get_env_variable(
 MONGO_VECTOR_COLLECTION = get_env_variable(
     "MONGO_VECTOR_COLLECTION", "vector_collection"
 )
-
+PINECONE_API_KEY = get_env_variable("PINECONE_API_KEY", "")
+PINECONE_NAMESPACE = get_env_variable("PINECONE_NAMESPACE", "rag_api")
 CHUNK_SIZE = int(get_env_variable("CHUNK_SIZE", "1500"))
 CHUNK_OVERLAP = int(get_env_variable("CHUNK_OVERLAP", "100"))
 
@@ -224,6 +225,13 @@ def init_embeddings(provider, model):
         collection_name=COLLECTION_NAME,
         mode="async",
     )
+elif VECTOR_DB_TYPE == "pinecone":
+    vector_store = get_vector_store(
+        connection_string=f'{PINECONE_NAMESPACE}@{PINECONE_API_KEY}',
+        embeddings=embeddings,
+        collection_name=COLLECTION_NAME,
+        mode="pinecone",
+    )
 elif VECTOR_DB_TYPE == "atlas-mongo":
     # atlas-mongo vector:
     vector_store = get_vector_store(
diff --git a/main.py b/main.py
index d4c288b7..0dd15443 100644
--- a/main.py
+++ b/main.py
@@ -249,6 +249,7 @@ async def store_data_in_vector_db(
         return {"message": "Documents added successfully", "ids": ids}
 
     except Exception as e:
+        logger.error(e)
         return {"message": "An error occurred while adding documents.", "error": str(e)}
 
 
diff --git a/requirements.txt b/requirements.txt
index 605e09db..4eca1c75 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,27 +1,123 @@
-langchain==0.1.12
-langchain_community==0.0.34
-langchain_openai==0.0.8
-langchain_core==0.1.45
-sqlalchemy==2.0.28
-python-dotenv==1.0.1
+aiofiles==23.2.1
+aiohttp==3.9.5
+aiosignal==1.3.1
+annotated-types==0.6.0
+anyio==4.3.0
+async-timeout==4.0.3
+asyncpg==0.29.0
+attrs==23.2.0
+backoff==2.2.1
+beautifulsoup4==4.12.3
+certifi==2024.2.2
+chardet==5.2.0
+charset-normalizer==3.3.2
+click==8.1.7
+coloredlogs==15.0.1
+dataclasses-json==0.6.4
+dataclasses-json-speakeasy==0.5.11
+distro==1.9.0
+dnspython==2.6.1
+docx2txt==0.8
+ecdsa==0.19.0
+emoji==2.10.1
+et-xmlfile==1.1.0
+exceptiongroup==1.2.1
 fastapi==0.110.0
-psycopg2-binary==2.9.9
-pgvector==0.2.5
-uvicorn==0.28.0
-pypdf==4.1.0
-unstructured==0.12.6
-markdown==3.6
+filelock==3.14.0
+filetype==1.2.0
+flatbuffers==24.3.25
+frozenlist==1.4.1
+fsspec==2024.5.0
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.0
+huggingface-hub==0.23.0
+humanfriendly==10.0
+idna==3.6
+importlib_metadata==7.1.0
+Jinja2==3.1.4
+joblib==1.3.2
+jsonpatch==1.33
+jsonpath-python==1.0.6
+jsonpointer==2.4
+langchain==0.1.12
+langchain-community==0.0.34
+langchain-core==0.1.52
+langchain-mongodb==0.1.3
+langchain-openai==0.0.8
+langchain-pinecone==0.1.1
+langchain-text-splitters==0.0.2
+langdetect==1.0.9
+langsmith==0.1.59
+lxml==5.1.0
+Markdown==3.6
+MarkupSafe==2.1.5
+marshmallow==3.20.2
+mpmath==1.3.0
+multidict==6.0.5
+mypy-extensions==1.0.0
 networkx==3.2.1
-pandas==2.2.1
+nltk==3.8.1
+numpy==1.26.4
+onnxruntime==1.17.3
+openai==1.30.1
+opencv-python==4.9.0.80
+opencv-python-headless==4.9.0.80
 openpyxl==3.1.2
-docx2txt==0.8
+orjson==3.10.3
+packaging==23.2
+pandas==2.2.1
+pgvector==0.2.5
+pillow==10.3.0
+pinecone-client==3.2.2
+protobuf==5.26.1
+psycopg2-binary==2.9.9
+pyasn1==0.6.0
+pyclipper==1.3.0.post5
+pydantic==2.7.1
+pydantic_core==2.18.2
+pymongo==4.6.3
 pypandoc==1.13
+pypdf==4.1.0
+python-dateutil==2.8.2
+python-dotenv==1.0.1
+python-iso639==2024.2.7
 python-jose==3.3.0
-asyncpg==0.29.0
+python-magic==0.4.27
 python-multipart==0.0.9
-sentence_transformers==2.5.1
-aiofiles==23.2.1
+pytz==2024.1
+PyYAML==6.0.1
+rapidfuzz==3.6.1
 rapidocr-onnxruntime==1.3.17
-opencv-python-headless==4.9.0.80
-pymongo==4.6.3
-langchain-mongodb==0.1.3
+regex==2023.12.25
+requests==2.31.0
+rsa==4.9
+safetensors==0.4.3
+scikit-learn==1.4.2
+scipy==1.13.0
+sentence-transformers==2.5.1
+shapely==2.0.4
+six==1.16.0
+sniffio==1.3.1
+soupsieve==2.5
+SQLAlchemy==2.0.28
+starlette==0.36.3
+sympy==1.12
+tabulate==0.9.0
+tenacity==8.3.0
+threadpoolctl==3.5.0
+tiktoken==0.7.0
+tokenizers==0.19.1
+torch==2.3.0
+tqdm==4.66.2
+transformers==4.40.2
+typing-inspect==0.9.0
+typing_extensions==4.9.0
+tzdata==2024.1
+unstructured==0.12.6
+unstructured-client==0.18.0
+urllib3==1.26.18
+uvicorn==0.28.0
+wrapt==1.16.0
+yarl==1.9.4
+zipp==3.18.2
diff --git a/store.py b/store.py
index f6fd1e0e..edb6e6aa 100644
--- a/store.py
+++ b/store.py
@@ -4,8 +4,10 @@
 from langchain_core.documents import Document
 from langchain_core.runnables.config import run_in_executor
 from sqlalchemy.orm import Session
-
+import pinecone
+from langchain_pinecone._utilities import DistanceStrategy
 from langchain_mongodb import MongoDBAtlasVectorSearch
+from langchain_pinecone import PineconeVectorStore, Pinecone
 from langchain_core.embeddings import Embeddings
 from typing import (
     List,
@@ -142,3 +144,85 @@ def delete(self, ids: Optional[list[str]] = None) -> None:
         # implement the deletion of documents by file_id in self._collection
         if ids is not None:
             self._collection.delete_many({"file_id": {"$in": ids}})
+
+
+class PineconeVector(PineconeVectorStore):
+    """PineconeVectorStore extended with ID listing, fetch-by-ID and delete helpers."""
+
+    @property
+    def embedding_function(self) -> Embeddings:
+        """Expose ``self.embeddings`` under the name ``embedding_function``."""
+        return self.embeddings
+
+    def __init__(
+        self,
+        embedding: Embeddings,
+        api_key: str,
+        index_name: str,
+        namespace: Optional[str] = None,
+    ):
+        self.index_name = index_name
+        self.namespace = namespace
+        super().__init__(
+            index_name=self.index_name,
+            embedding=embedding,
+            text_key="text",
+            namespace=namespace,
+            distance_strategy=DistanceStrategy.COSINE,
+            pinecone_api_key=api_key,
+        )
+
+    def get_all_ids(self) -> List[str]:
+        """Retrieve all vector IDs stored in this store's namespace."""
+        # Index.list() accepts keyword arguments only and yields one list of
+        # IDs per result page, so the pages are flattened into a single list.
+        return [
+            vector_id
+            for page in self._index.list(namespace=self.namespace)
+            for vector_id in page
+        ]
+
+    def get_documents_by_ids(self, ids: List[str]) -> List[Document]:
+        """Retrieve documents by their IDs from the Pinecone index."""
+        if not ids:
+            # Nothing to look up, so no fetch request is issued.
+            return []
+        results = self._index.fetch(ids, namespace=self.namespace)
+        documents = []
+        for result in results["vectors"].values():
+            metadata = result["metadata"]
+            doc = Document(page_content=metadata["text"], metadata=metadata)
+            documents.append(doc)
+        return documents
+
+    def delete(self, ids: Optional[List[str]] = None) -> None:
+        """Delete vectors by their IDs; a missing or empty list is a no-op."""
+        if ids:
+            self._index.delete(ids, namespace=self.namespace)
+
+    def similarity_search_with_score_by_vector(
+        self,
+        embedding: List[float],
+        k: int = 4,
+        filter: Optional[dict] = None,
+        **kwargs: Any,
+    ) -> List[Tuple[Document, float]]:
+        """Return the ``k`` nearest documents to ``embedding`` with their scores."""
+        # The pinecone-client 3.x query() takes keyword arguments only, so the
+        # vector is passed by name; the caller's filter is forwarded as well.
+        query_results = self._index.query(
+            vector=embedding,
+            top_k=k,
+            include_metadata=True,
+            namespace=self.namespace,
+            filter=filter,
+            **kwargs,
+        )
+        processed_documents = []
+        for match in query_results["matches"]:
+            metadata = match["metadata"]
+            if "metadata" in metadata and "_id" in metadata["metadata"]:
+                del metadata["metadata"]["_id"]
+            doc = Document(page_content=metadata["text"], metadata=metadata)
+            processed_documents.append((doc, match["score"]))
+        return processed_documents
diff --git a/store_factory.py b/store_factory.py
index 35b77fbd..5251f33b 100644
--- a/store_factory.py
+++ b/store_factory.py
@@ -1,6 +1,6 @@
 from langchain_community.embeddings import OpenAIEmbeddings
 
-from store import AsyncPgVector, ExtendedPgVector
+from store import AsyncPgVector, ExtendedPgVector, PineconeVector
 from store import AtlasMongoVector
 from pymongo import MongoClient
 
@@ -26,7 +26,16 @@ def get_vector_store(
         mongo_db = MongoClient(connection_string).get_database()
         mong_collection = mongo_db[collection_name]
         return AtlasMongoVector(collection=mong_collection, embedding=embeddings)
-
+    elif mode == "pinecone":
+        # The connection string is "<namespace>@<api_key>"; split on the first
+        # "@" only so an "@" inside the key does not break unpacking.
+        namespace, _, api_key = connection_string.partition("@")
+        return PineconeVector(
+            embedding=embeddings,
+            api_key=api_key,
+            index_name=collection_name,
+            namespace=namespace,
+        )
     else:
         raise ValueError("Invalid mode specified. Choose 'sync' or 'async'.")
 