Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ uploads/
myenv/
venv/
*.pyc
.history
10 changes: 9 additions & 1 deletion config.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ def get_env_variable(
MONGO_VECTOR_COLLECTION = get_env_variable(
"MONGO_VECTOR_COLLECTION", "vector_collection"
)

PINECONE_API_KEY = get_env_variable("PINECONE_API_KEY", "")
PINECONE_NAMESPACE = get_env_variable("PINECONE_NAMESPACE", "rag_api")
CHUNK_SIZE = int(get_env_variable("CHUNK_SIZE", "1500"))
CHUNK_OVERLAP = int(get_env_variable("CHUNK_OVERLAP", "100"))

Expand Down Expand Up @@ -224,6 +225,13 @@ def init_embeddings(provider, model):
collection_name=COLLECTION_NAME,
mode="async",
)
elif VECTOR_DB_TYPE == "pinecone":
vector_store = get_vector_store(
connection_string=f'{PINECONE_NAMESPACE}@{PINECONE_API_KEY}',
embeddings=embeddings,
collection_name=COLLECTION_NAME,
mode="pinecone",
)
elif VECTOR_DB_TYPE == "atlas-mongo":
# atlas-mongo vector:
vector_store = get_vector_store(
Expand Down
1 change: 1 addition & 0 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,7 @@ async def store_data_in_vector_db(
return {"message": "Documents added successfully", "ids": ids}

except Exception as e:

logger.error(e)
return {"message": "An error occurred while adding documents.", "error": str(e)}

Expand Down
136 changes: 116 additions & 20 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,27 +1,123 @@
langchain==0.1.12
langchain_community==0.0.34
langchain_openai==0.0.8
langchain_core==0.1.45
sqlalchemy==2.0.28
python-dotenv==1.0.1
aiofiles==23.2.1
aiohttp==3.9.5
aiosignal==1.3.1
annotated-types==0.6.0
anyio==4.3.0
async-timeout==4.0.3
asyncpg==0.29.0
attrs==23.2.0
backoff==2.2.1
beautifulsoup4==4.12.3
certifi==2024.2.2
chardet==5.2.0
charset-normalizer==3.3.2
click==8.1.7
coloredlogs==15.0.1
dataclasses-json==0.6.4
dataclasses-json-speakeasy==0.5.11
distro==1.9.0
dnspython==2.6.1
docx2txt==0.8
ecdsa==0.19.0
emoji==2.10.1
et-xmlfile==1.1.0
exceptiongroup==1.2.1
fastapi==0.110.0
psycopg2-binary==2.9.9
pgvector==0.2.5
uvicorn==0.28.0
pypdf==4.1.0
unstructured==0.12.6
markdown==3.6
filelock==3.14.0
filetype==1.2.0
flatbuffers==24.3.25
frozenlist==1.4.1
fsspec==2024.5.0
h11==0.14.0
httpcore==1.0.5
httpx==0.27.0
huggingface-hub==0.23.0
humanfriendly==10.0
idna==3.6
importlib_metadata==7.1.0
Jinja2==3.1.4
joblib==1.3.2
jsonpatch==1.33
jsonpath-python==1.0.6
jsonpointer==2.4
langchain==0.1.12
langchain-community==0.0.34
langchain-core==0.1.52
langchain-mongodb==0.1.3
langchain-openai==0.0.8
langchain-pinecone==0.1.1
langchain-text-splitters==0.0.2
langdetect==1.0.9
langsmith==0.1.59
lxml==5.1.0
Markdown==3.6
MarkupSafe==2.1.5
marshmallow==3.20.2
mpmath==1.3.0
multidict==6.0.5
mypy-extensions==1.0.0
networkx==3.2.1
pandas==2.2.1
nltk==3.8.1
numpy==1.26.4
onnxruntime==1.17.3
openai==1.30.1
opencv-python==4.9.0.80
opencv-python-headless==4.9.0.80
openpyxl==3.1.2
docx2txt==0.8
orjson==3.10.3
packaging==23.2
pandas==2.2.1
pgvector==0.2.5
pillow==10.3.0
pinecone-client==3.2.2
protobuf==5.26.1
psycopg2-binary==2.9.9
pyasn1==0.6.0
pyclipper==1.3.0.post5
pydantic==2.7.1
pydantic_core==2.18.2
pymongo==4.6.3
pypandoc==1.13
pypdf==4.1.0
python-dateutil==2.8.2
python-dotenv==1.0.1
python-iso639==2024.2.7
python-jose==3.3.0
asyncpg==0.29.0
python-magic==0.4.27
python-multipart==0.0.9
sentence_transformers==2.5.1
aiofiles==23.2.1
pytz==2024.1
PyYAML==6.0.1
rapidfuzz==3.6.1
rapidocr-onnxruntime==1.3.17
opencv-python-headless==4.9.0.80
pymongo==4.6.3
langchain-mongodb==0.1.3
regex==2023.12.25
requests==2.31.0
rsa==4.9
safetensors==0.4.3
scikit-learn==1.4.2
scipy==1.13.0
sentence-transformers==2.5.1
shapely==2.0.4
six==1.16.0
sniffio==1.3.1
soupsieve==2.5
SQLAlchemy==2.0.28
starlette==0.36.3
sympy==1.12
tabulate==0.9.0
tenacity==8.3.0
threadpoolctl==3.5.0
tiktoken==0.7.0
tokenizers==0.19.1
torch==2.3.0
tqdm==4.66.2
transformers==4.40.2
typing-inspect==0.9.0
typing_extensions==4.9.0
tzdata==2024.1
unstructured==0.12.6
unstructured-client==0.18.0
urllib3==1.26.18
uvicorn==0.28.0
wrapt==1.16.0
yarl==1.9.4
zipp==3.18.2
60 changes: 59 additions & 1 deletion store.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@
from langchain_core.documents import Document
from langchain_core.runnables.config import run_in_executor
from sqlalchemy.orm import Session

import pinecone
from langchain_pinecone._utilities import DistanceStrategy
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_pinecone import PineconeVectorStore, Pinecone
from langchain_core.embeddings import Embeddings
from typing import (
List,
Expand Down Expand Up @@ -142,3 +144,59 @@ def delete(self, ids: Optional[list[str]] = None) -> None:
# implement the deletion of documents by file_id in self._collection
if ids is not None:
self._collection.delete_many({"file_id": {"$in": ids}})

class PineconeVector(PineconeVectorStore):
    """Pinecone-backed vector store adding ID listing, ID-based fetch, and
    ID-based deletion on top of ``PineconeVectorStore``.

    Vectors are stored with their source text under the ``"text"`` metadata
    key (``text_key="text"``) and compared with cosine distance.
    """

    @property
    def embedding_function(self) -> Embeddings:
        # Expose the embedding object under the attribute name the rest of
        # this project expects (mirrors the pgvector store classes above).
        return self.embeddings

    def __init__(self, embedding: Embeddings, api_key: str, index_name: str, namespace: Optional[str] = None):
        """Connect to the Pinecone index *index_name*.

        Args:
            embedding: Embedding model used for both indexing and querying.
            api_key: Pinecone API key.
            index_name: Name of an existing Pinecone index.
            namespace: Optional namespace scoping all reads/writes/deletes.
        """
        self.index_name = index_name
        self.namespace = namespace
        super().__init__(
            index_name=self.index_name,
            embedding=embedding,
            text_key="text",
            namespace=namespace,
            distance_strategy=DistanceStrategy.COSINE,
            pinecone_api_key=api_key,
        )

    def get_all_ids(self) -> List[str]:
        """Retrieve all vector IDs in the Pinecone index (current namespace).

        Note: ``Index.list()`` takes ``prefix`` as its first positional
        parameter, so the namespace must be passed by keyword; it also yields
        *pages* (lists) of IDs, which are flattened here so the return value
        really is a flat ``List[str]`` as annotated.
        """
        return [
            vector_id
            for page in self._index.list(namespace=self.namespace)
            for vector_id in page
        ]

    def get_documents_by_ids(self, ids: List[str]) -> List[Document]:
        """Retrieve documents by their IDs from the Pinecone index.

        The stored text is recovered from the ``"text"`` metadata key; the
        full metadata dict (including ``"text"``) is kept on the Document.
        """
        results = self._index.fetch(ids, namespace=self.namespace)
        documents = []
        for result in results['vectors'].values():
            metadata = result['metadata']
            doc = Document(page_content=metadata['text'], metadata=metadata)
            documents.append(doc)
        return documents

    def delete(self, ids: Optional[List[str]] = None) -> None:
        """Delete vectors by their IDs from the Pinecone index.

        A falsy/empty ``ids`` is a no-op (Pinecone rejects empty delete lists).
        """
        if ids:
            self._index.delete(ids, namespace=self.namespace)

    def similarity_search_with_score_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[dict] = None,
        **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Perform a similarity search with scores using an embedding vector.

        Args:
            embedding: Query embedding vector.
            k: Number of nearest neighbours to return.
            filter: Optional Pinecone metadata filter (previously accepted
                but silently ignored; now forwarded to the query).
            **kwargs: Extra keyword arguments passed through to
                ``Index.query``.

        Returns:
            List of ``(Document, score)`` tuples, highest-scoring first.
        """
        # pinecone-client v3 requires keyword arguments; a positional vector
        # raises an error, so pass ``vector=`` explicitly.
        query_results = self._index.query(
            vector=embedding,
            top_k=k,
            filter=filter,
            include_metadata=True,
            namespace=self.namespace,
            **kwargs,
        )
        docs = query_results['matches']
        processed_documents = []
        for match in docs:
            metadata = match['metadata']
            # Strip a Mongo-style "_id" if it leaked into nested metadata;
            # it is not meaningful to callers of this store.
            if 'metadata' in metadata and '_id' in metadata['metadata']:
                del metadata['metadata']['_id']
            doc = Document(page_content=metadata['text'], metadata=metadata)
            processed_documents.append((doc, match['score']))
        return processed_documents
6 changes: 4 additions & 2 deletions store_factory.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from langchain_community.embeddings import OpenAIEmbeddings

from store import AsyncPgVector, ExtendedPgVector
from store import AsyncPgVector, ExtendedPgVector, PineconeVector
from store import AtlasMongoVector
from pymongo import MongoClient

Expand All @@ -26,7 +26,9 @@ def get_vector_store(
mongo_db = MongoClient(connection_string).get_database()
mong_collection = mongo_db[collection_name]
return AtlasMongoVector(collection=mong_collection, embedding=embeddings)

elif mode == "pinecone":
namespace, api_key = connection_string.split("@")
return PineconeVector(embedding=embeddings, api_key=api_key, index_name=collection_name, namespace=namespace)
else:
raise ValueError("Invalid mode specified. Choose 'sync' or 'async'.")

Expand Down