Commit: v2.0.22
ashpreetbedi committed Nov 1, 2023
1 parent 186ddf3 commit b86d7cb
Showing 10 changed files with 69 additions and 30 deletions.
8 changes: 4 additions & 4 deletions phi/api/conversation.py
@@ -20,7 +20,7 @@ def create_conversation_monitor(monitor: ConversationMonitorCreate) -> bool:
if not phi_cli_settings.api_enabled:
return True

logger.debug("--o-o-- Creating Conversation Monitor")
# logger.debug("--o-o-- Creating Conversation Monitor")
with api.AuthenticatedClient() as api_client:
try:
conversation_workspace = ConversationWorkspace(
@@ -42,7 +42,7 @@ def create_conversation_monitor(monitor: ConversationMonitorCreate) -> bool:
if response_json is None:
return False

logger.debug(f"Response: {response_json}")
# logger.debug(f"Response: {response_json}")
return True
except Exception as e:
logger.debug(f"Could not create conversation monitor: {e}")
@@ -53,7 +53,7 @@ def create_conversation_event(conversation: ConversationEventCreate) -> bool:
if not phi_cli_settings.api_enabled:
return True

logger.debug("--o-o-- Creating Conversation Event")
# logger.debug("--o-o-- Creating Conversation Event")
with api.AuthenticatedClient() as api_client:
try:
conversation_workspace = ConversationWorkspace(
@@ -75,7 +75,7 @@ def create_conversation_event(conversation: ConversationEventCreate) -> bool:
if response_json is None:
return False

logger.debug(f"Response: {response_json}")
# logger.debug(f"Response: {response_json}")
return True
except Exception as e:
logger.debug(f"Could not log conversation event: {e}")
1 change: 1 addition & 0 deletions phi/conversation/conversation.py
@@ -477,6 +477,7 @@ def get_user_prompt(
_user_prompt += "Respond to the following message"
if self.user_type:
_user_prompt += f" from a '{self.user_type}'"
_user_prompt += ":"
_user_prompt += f"\nUSER: {message}"
_user_prompt += "\nASSISTANT: "

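
For reference, with the added colon the tail of the assembled user prompt now reads as follows (a sketch; the user_type value and the message are hypothetical):

    Respond to the following message from a 'customer':
    USER: How do I load a knowledge base?
    ASSISTANT: 
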
25 changes: 15 additions & 10 deletions phi/document/reader/pdf.py
@@ -1,5 +1,5 @@
from pathlib import Path
- from typing import List
+ from typing import List, Union, IO, Any

from phi.document.base import Document
from phi.document.reader.base import Reader
@@ -9,21 +9,26 @@
class PDFReader(Reader):
"""Reader for PDF files"""

- def read(self, path: Path) -> List[Document]:
- if not path:
- raise ValueError("No path provided")
-
- if not path.exists():
- raise FileNotFoundError(f"Could not find file: {path}")
+ def read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
+ if not pdf:
+ raise ValueError("No pdf provided")

try:
from pypdf import PdfReader as DocumentReader # noqa: F401
except ImportError:
raise ImportError("`pypdf` not installed")

logger.info(f"Reading: {path}")
doc_name = path.name.split(".")[0]
doc_reader = DocumentReader(path)
doc_name = ""
try:
if isinstance(pdf, str):
doc_name = pdf.split("/")[-1].split(".")[0].replace(" ", "_")
else:
doc_name = pdf.name.split(".")[0]
except Exception:
doc_name = "pdf"

logger.info(f"Reading: {doc_name}")
doc_reader = DocumentReader(pdf)

documents = [
Document(
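
PDFReader.read() now accepts a plain string path, a pathlib.Path, or an already-open file object instead of only a Path. A minimal usage sketch (the file path is a placeholder and pypdf must be installed):

    from pathlib import Path
    from phi.document.reader.pdf import PDFReader

    reader = PDFReader()

    # Pass a Path or a plain string path
    documents = reader.read(pdf=Path("data/recipes.pdf"))

    # Or pass an open binary file object
    with open("data/recipes.pdf", "rb") as f:
        documents = reader.read(pdf=f)

    print(f"Read {len(documents)} documents")
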
11 changes: 7 additions & 4 deletions phi/embedder/openai.py
@@ -16,7 +16,7 @@ class OpenAIEmbedder(Embedder):

def _response(self, text: str):
if get_from_env("OPENAI_API_KEY") is None:
logger.debug("--o-o-- Using Phidata Servers")
logger.debug("--o-o-- Using Phidata Proxy")
try:
from phi.api.llm import openai_embedding

@@ -36,11 +36,14 @@ def _response(self, text: str):

def get_embedding(self, text: str) -> List[float]:
response = self._response(text=text)
if "data" not in response:
try:
if "data" not in response:
return []
return response["data"][0]["embedding"]
except Exception as e:
logger.warning(e)
return []

return response["data"][0]["embedding"]

def get_embedding_and_usage(self, text: str) -> Tuple[List[float], Optional[Dict]]:
response = self._response(text=text)
if "data" not in response:
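
With the new try/except, get_embedding() now returns an empty list instead of raising when the response is malformed. A short sketch (assumes OPENAI_API_KEY is set, or that the Phidata proxy is reachable when it is not):

    from phi.embedder.openai import OpenAIEmbedder

    embedder = OpenAIEmbedder()
    embedding = embedder.get_embedding("The quick brown fox jumps over the lazy dog")
    if embedding:
        print(f"Got a vector of length {len(embedding)}")
    else:
        print("Embedding failed, received an empty list")
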
36 changes: 32 additions & 4 deletions phi/knowledge/base.py
@@ -41,10 +41,7 @@ def search(self, query: str, num_documents: Optional[int] = None) -> List[Docume
return self.vector_db.search(query=query, limit=_num_documents)

def load(self, recreate: bool = False) -> None:
"""Load the knowledge base to the vector db
TODO: Use upsert instead of insert
"""
"""Load the knowledge base to the vector db"""

if self.vector_db is None:
logger.warning("No vector db provided")
@@ -73,6 +70,37 @@ def load(self, recreate: bool = False) -> None:
logger.debug("Optimizing Vector DB")
self.vector_db.optimize()

+ def load_documents(self, documents: List[List[Document]], recreate: bool = False) -> None:
+ """Load documents to the knowledge base
+ Args:
+ documents (List[List[Document]]): List of list of documents to load
+ recreate (bool, optional): Whether to recreate the documents. Defaults to False.
+ """
+
+ if self.vector_db is None:
+ logger.warning("No vector db provided")
+ return
+
+ logger.debug("Creating collection")
+ self.vector_db.create()
+
+ logger.info("Loading knowledge base")
+ num_documents = 0
+
+ for document_list in documents:
+ # Filter out documents which already exist in the vector db
+ if not recreate:
+ document_list = [document for document in document_list if not self.vector_db.doc_exists(document)]
+
+ self.vector_db.insert(documents=document_list)
+ num_documents += len(document_list)
+ logger.info(f"Loaded {num_documents} documents to knowledge base")
+
+ if self.optimize_on is not None and num_documents > self.optimize_on:
+ logger.debug("Optimizing Vector DB")
+ self.vector_db.optimize()

def exists(self) -> bool:
"""Returns True if the knowledge base exists"""
if self.vector_db is None:
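
The new load_documents() loads pre-built batches of documents instead of reading them from the knowledge base's own sources. A hedged sketch; the PgVector import, its parameters, and the connection string are placeholders assumed for illustration, not part of this commit:

    from phi.document import Document
    from phi.knowledge.base import KnowledgeBase
    from phi.vectordb.pgvector import PgVector  # assumed vector db implementation

    knowledge_base = KnowledgeBase(
        vector_db=PgVector(collection="notes", db_url="postgresql+psycopg://ai:ai@localhost:5532/ai"),
    )

    batches = [
        [Document(content="Phidata is an AI toolkit for engineers.")],
        [Document(content="A knowledge base stores documents in a vector db.")],
    ]
    knowledge_base.load_documents(batches, recreate=False)
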
5 changes: 3 additions & 2 deletions phi/knowledge/combined.py
@@ -2,6 +2,7 @@

from phi.document import Document
from phi.knowledge.base import KnowledgeBase
+ from phi.utils.log import logger


class CombinedKnowledgeBase(KnowledgeBase):
@@ -17,5 +18,5 @@ def document_lists(self) -> Iterator[List[Document]]:
"""

for kb in self.sources:
- for document_list in kb.document_lists:
- yield document_list
+ logger.debug(f"Loading knowledge base: {kb}")
+ yield from kb.document_lists
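
With yield from, each source's document batches are streamed through unchanged. A sketch of combining two sources; the module path for WebsiteKnowledgeBase and the field names path and urls are assumptions based on the surrounding classes, and the locations are placeholders:

    from phi.knowledge.combined import CombinedKnowledgeBase
    from phi.knowledge.pdf import PDFKnowledgeBase
    from phi.knowledge.website import WebsiteKnowledgeBase

    knowledge_base = CombinedKnowledgeBase(
        sources=[
            PDFKnowledgeBase(path="data/pdfs"),
            WebsiteKnowledgeBase(urls=["https://www.phidata.com"]),
        ],
    )

    # Iterate the combined batches that load() would insert into the vector db
    for batch in knowledge_base.document_lists:
        print(f"Batch of {len(batch)} documents")
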
4 changes: 2 additions & 2 deletions phi/knowledge/pdf.py
@@ -23,9 +23,9 @@ def document_lists(self) -> Iterator[List[Document]]:

if _pdf_path.exists() and _pdf_path.is_dir():
for _pdf in _pdf_path.glob("**/*.pdf"):
- yield self.reader.read(path=_pdf)
+ yield self.reader.read(pdf=_pdf)
elif _pdf_path.exists() and _pdf_path.is_file() and _pdf_path.suffix == ".pdf":
- yield self.reader.read(path=_pdf_path)
+ yield self.reader.read(pdf=_pdf_path)


class PDFUrlKnowledgeBase(KnowledgeBase):
2 changes: 1 addition & 1 deletion phi/llm/agent/website.py
@@ -19,7 +19,7 @@ def __init__(self, knowledge_base: Optional[WebsiteKnowledgeBase] = None):

def add_website_to_knowledge_base(self, url: str) -> str:
"""This function adds a websites content to the knowledge base.
- NOTE: The website must start wit http:// or https:// and should be a valid website.
+ NOTE: The website must start with https:// and should be a valid website.
USE THIS FUNCTION TO GET INFORMATION ABOUT PRODUCTS FROM THE INTERNET.
4 changes: 2 additions & 2 deletions phi/llm/openai.py
@@ -63,7 +63,7 @@ def api_kwargs(self) -> Dict[str, Any]:

def invoke_model(self, messages: List[Message]) -> OpenAIObject:
if get_from_env("OPENAI_API_KEY") is None:
logger.debug("--o-o-- Using Phidata Servers")
logger.debug("--o-o-- Using Phidata Proxy")
try:
from phi.api.llm import openai_chat

@@ -91,7 +91,7 @@ def invoke_model(self, messages: List[Message]) -> OpenAIObject:

def invoke_model_stream(self, messages: List[Message]) -> Iterator[OpenAIObject]:
if get_from_env("OPENAI_API_KEY") is None:
logger.debug("--o-o-- Using Phidata Servers")
logger.debug("--o-o-- Using Phidata Proxy")
try:
from phi.api.llm import openai_chat_stream
from openai import util as openai_util
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "phidata"
version = "2.0.21"
version = "2.0.22"
description = "AI Toolkit for Engineers"
requires-python = ">=3.7"
readme = "README.md"
@@ -91,6 +91,7 @@ module = [
"botocore.*",
"bs4.*",
"docker.*",
"duckdb.*",
"kubernetes.*",
"openai.*",
"pgvector.*",
