Update the chatbotcore modules (#47)
* Add contextual chunk module; fix LLM prompts
* Update the configurations for the worker and embedding model
* Add a filter for history; update prompts
* Retrieve only unique documents
* Format chunks
* Add hybrid retrieval modules
1 parent 7f792fd · commit e066f2c · 12 changed files with 1,957 additions and 1,229 deletions.
@@ -0,0 +1,109 @@
import logging
import re
from dataclasses import dataclass, field
from typing import Any, List

from django.conf import settings
from langchain.schema import Document
from langchain_community.llms.ollama import Ollama
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

from chatbotcore.utils import LLMType

logger = logging.getLogger(__name__)


@dataclass
class OpenAIHandler:
    """LLM handler using OpenAI for RAG"""

    temperature: float = 0.1
    llm: ChatOpenAI = field(init=False)

    def __post_init__(self):
        try:
            self.llm = ChatOpenAI(model=settings.LLM_MODEL_NAME, temperature=self.temperature)
        except Exception as e:
            raise Exception(f"OpenAI LLM model could not be loaded: {e}")


@dataclass
class OllamaHandler:
    """LLM handler using Ollama"""

    temperature: float = 0.1
    llm: Ollama = field(init=False)

    def __post_init__(self):
        try:
            self.llm = Ollama(
                model=settings.LLM_MODEL_NAME, base_url=settings.LLM_OLLAMA_BASE_URL, temperature=self.temperature
            )
        except Exception as e:
            raise Exception(f"Ollama LLM model could not be loaded: {e}")


@dataclass
class ContextualChunking:
    """Context retrieval for the chunk documents"""

    model: Any = field(init=False)
    model_type: LLMType = LLMType.OLLAMA

    def __post_init__(self):
        if self.model_type == LLMType.OLLAMA:
            self.model = OllamaHandler()
        elif self.model_type == LLMType.OPENAI:
            self.model = OpenAIHandler()
        else:
            logger.error("Unsupported LLM type: %s", self.model_type)
            raise ValueError(f"Unsupported LLM type: {self.model_type}")

    def get_prompt(self):
        """Creates the contextualization prompt"""
        prompt = """
        You are an AI assistant who can generate a short context for a chunk of text from a document.
        Here is the document:
        <document>
        {document}
        </document>
        Here is the chunk we want to situate within the whole document:
        <chunk>
        {chunk}
        </chunk>
        Please give a short, succinct context (within 30 tokens) that situates this chunk within the overall document,
        for the purposes of improving search retrieval of the chunk. Answer only with the succinct context and nothing else.
        Make sure that the context does not omit the factual information in the chunk.
        """
        return prompt

    def _generate_context(self, document: str, chunk: str):
        """Generates a context string for a single document chunk"""
        prompt_template = ChatPromptTemplate.from_messages([("system", self.get_prompt())])
        messages = prompt_template.format_messages(document=document, chunk=chunk)
        response = self.model.llm.invoke(messages)
        return response

    def generate_contextualized_chunks(self, document: str, chunks: List[Document]):
        """Generates contextualized document chunks"""
        contextualized_chunks = []
        for chunk in chunks:
            context = self._generate_context(document, chunk.page_content)
            # ChatOpenAI returns a message object with a .content attribute;
            # Ollama returns a plain string, so fall back to the response itself.
            context = getattr(context, "content", context)

            # Strip both context and chunk content of leading/trailing spaces
            context = context.strip()
            chunk_content = chunk.page_content.strip()
            # Remove a leading quote and any trailing quotes/periods from the context
            context = re.sub(r'^"|[".]+$', "", context)

            # Concatenate context with chunk content, separated by a single period
            contextualized_content = f"{context}. {chunk_content}"
            # Add the cleaned-up content to the list of contextualized chunks
            contextualized_chunks.append(Document(page_content=contextualized_content, metadata=chunk.metadata))

        return contextualized_chunks
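
For reference, a minimal sketch of how the new module might be driven end to end. The module path chatbotcore.contextual_chunking, the sample file name, and the splitter settings are illustrative assumptions, not taken from the diff; it also assumes LLM_MODEL_NAME (and, for the Ollama path, LLM_OLLAMA_BASE_URL) are configured in Django settings and that the chosen backend is reachable.

    # Usage sketch; module path, input file, and chunk sizes are hypothetical.
    from langchain.schema import Document
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    from chatbotcore.contextual_chunking import ContextualChunking  # hypothetical path
    from chatbotcore.utils import LLMType

    document_text = open("report.txt").read()  # any long source document

    # Split the document into overlapping chunks before contextualizing them.
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = [Document(page_content=text) for text in splitter.split_text(document_text)]

    # Defaults to the Ollama backend; pass LLMType.OPENAI to use ChatOpenAI instead.
    contextualizer = ContextualChunking(model_type=LLMType.OLLAMA)
    contextualized = contextualizer.generate_contextualized_chunks(document_text, chunks)

    for doc in contextualized[:2]:
        print(doc.page_content[:120])

Because each chunk now carries a short statement of where it sits in the source document, embeddings or keyword indexes built from these chunks should retrieve better on queries that depend on document-level context.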