Commit

Merge branch 'main' of https://github.com/phidatahq/phidata

ashpreetbedi committed Dec 19, 2024
2 parents 7d5ceee + e4113a5 commit ff1bca0
Showing 30 changed files with 849 additions and 75 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -48,4 +48,6 @@ data.db
 
 .ipynb_checkpoints
 
+audio_generations
+
 *.db
7 changes: 3 additions & 4 deletions cookbook/agents/37_audio_input_output.py
@@ -2,6 +2,7 @@
 import requests
 from phi.agent import Agent
 from phi.model.openai import OpenAIChat
+from phi.utils.audio import write_audio_to_file
 
 # Fetch the audio file and convert it to a base64 encoded string
 url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
@@ -22,7 +23,5 @@
     audio={"data": encoded_string, "format": "wav"},
 )
 
-if agent.run_response.audio is not None and "data" in agent.run_response.audio:
-    wav_bytes = base64.b64decode(agent.run_response.audio["data"])
-    with open("dog.wav", "wb") as f:
-        f.write(wav_bytes)
+if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio:
+    write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/dog.wav")
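For reference, the removed lines show exactly what the new helper replaces. A minimal sketch of what `phi.utils.audio.write_audio_to_file` presumably does, reconstructed from that inlined code (the directory creation is an assumption, suggested by paths like `tmp/dog.wav`):

import base64
from pathlib import Path


def write_audio_to_file(audio: str, filename: str) -> None:
    """Decode a base64-encoded audio payload and write it to disk."""
    path = Path(filename)
    # Assumption: the helper creates missing parent directories such as tmp/.
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_bytes(base64.b64decode(audio))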
15 changes: 6 additions & 9 deletions cookbook/agents/38_audio_multi_turn.py
@@ -1,22 +1,19 @@
-import base64
 from phi.agent import Agent
 from phi.model.openai import OpenAIChat
+from phi.utils.audio import write_audio_to_file
 
 agent = Agent(
     model=OpenAIChat(
         id="gpt-4o-audio-preview", modalities=["text", "audio"], audio={"voice": "alloy", "format": "wav"}
     ),
     debug_mode=True,
     add_history_to_messages=True,
 )
 
 agent.run("Is a golden retriever a good family dog?")
-if agent.run_response.audio is not None and "data" in agent.run_response.audio:
-    wav_bytes = base64.b64decode(agent.run_response.audio["data"])
-    with open("tmp/answer_1.wav", "wb") as f:
-        f.write(wav_bytes)
+if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio:
+    write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/answer_1.wav")
 
 agent.run("Why do you say they are loyal?")
-if agent.run_response.audio is not None and "data" in agent.run_response.audio:
-    wav_bytes = base64.b64decode(agent.run_response.audio["data"])
-    with open("tmp/answer_2.wav", "wb") as f:
-        f.write(wav_bytes)
+if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio:
+    write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/answer_2.wav")
8 changes: 3 additions & 5 deletions cookbook/agents/42_image_to_audio.py
@@ -1,10 +1,10 @@
-import base64
 from pathlib import Path
 from rich import print
 from rich.text import Text
 
 from phi.agent import Agent, RunResponse
 from phi.model.openai import OpenAIChat
+from phi.utils.audio import write_audio_to_file
 
 cwd = Path(__file__).parent.resolve()
 
@@ -23,7 +23,5 @@
 )
 
 audio_story: RunResponse = audio_agent.run(f"Narrate the story with flair: {image_story.content}")
-if audio_story.audio is not None and "data" in audio_story.audio:
-    wav_bytes = base64.b64decode(audio_story.audio["data"])
-    with open(cwd.joinpath("tmp/multimodal-agents.wav"), "wb") as f:
-        f.write(wav_bytes)
+if audio_story.response_audio is not None and "data" in audio_story.response_audio:
+    write_audio_to_file(audio=audio_story.response_audio["data"], filename="tmp/multimodal-agents.wav")
16 changes: 16 additions & 0 deletions cookbook/playground/gemini_agents.py
@@ -0,0 +1,16 @@
from phi.agent import Agent
from phi.tools.yfinance import YFinanceTools
from phi.playground import Playground, serve_playground_app
from phi.model.google import Gemini

finance_agent = Agent(
name="Finance Agent",
model=Gemini(id="gemini-2.0-flash-exp"),
tools=[YFinanceTools(stock_price=True)],
debug_mode=True,
)

app = Playground(agents=[finance_agent]).get_app(use_async=False)

if __name__ == "__main__":
serve_playground_app("gemini_agents:app", reload=True)
30 changes: 29 additions & 1 deletion cookbook/playground/multimodal_agent.py
@@ -9,6 +9,7 @@
 from phi.agent import Agent
 from phi.model.openai import OpenAIChat
 from phi.tools.dalle import Dalle
+from phi.tools.eleven_labs_tools import ElevenLabsTools
 from phi.tools.giphy import GiphyTools
 from phi.tools.models_labs import ModelsLabs
 from phi.model.response import FileType
@@ -88,6 +89,7 @@
 
 gif_agent = Agent(
     name="Gif Generator Agent",
+    agent_id="gif_agent",
     model=OpenAIChat(id="gpt-4o"),
     tools=[GiphyTools()],
     description="You are an AI agent that can generate gifs using Giphy.",
@@ -102,8 +104,34 @@
     storage=SqlAgentStorage(table_name="gif_agent", db_file=image_agent_storage_file),
 )
 
+audio_agent = Agent(
+    name="Audio Generator Agent",
+    agent_id="audio_agent",
+    model=OpenAIChat(id="gpt-4o"),
+    tools=[
+        ElevenLabsTools(
+            voice_id="JBFqnCBsd6RMkjVDRZzb", model_id="eleven_multilingual_v2", target_directory="audio_generations"
+        )
+    ],
+    description="You are an AI agent that can generate audio using the ElevenLabs API.",
+    instructions=[
+        "When the user asks you to generate audio, use the `text_to_speech` tool to generate the audio.",
+        "You'll generate the appropriate prompt to send to the tool to generate audio.",
+        "You don't need to find the appropriate voice first, I already specified the voice to use.",
+        "Don't return the file name or file URL in your response or any markdown, just tell the user the audio was created successfully.",
+        "The audio should be long and detailed.",
+    ],
+    markdown=True,
+    debug_mode=True,
+    add_history_to_messages=True,
+    add_datetime_to_instructions=True,
+    storage=SqlAgentStorage(table_name="audio_agent", db_file=image_agent_storage_file),
+)
+
-app = Playground(agents=[image_agent, ml_gif_agent, ml_video_agent, fal_agent, gif_agent]).get_app(use_async=False)
 
+app = Playground(agents=[image_agent, ml_gif_agent, ml_video_agent, fal_agent, gif_agent, audio_agent]).get_app(
+    use_async=False
+)
 
 if __name__ == "__main__":
     serve_playground_app("multimodal_agent:app", reload=True)
8 changes: 8 additions & 0 deletions cookbook/providers/ollama/agent_stream.py
@@ -3,6 +3,7 @@
 from typing import Iterator  # noqa
 from phi.agent import Agent, RunResponse  # noqa
 from phi.model.ollama import Ollama
+from phi.tools.crawl4ai_tools import Crawl4aiTools
 from phi.tools.yfinance import YFinanceTools
 
 agent = Agent(
@@ -20,3 +21,10 @@
 
 # Print the response in the terminal
 agent.print_response("What are analyst recommendations for NVDA and TSLA", stream=True)
+
+
+agent = Agent(model=Ollama(id="llama3.1:8b"), tools=[Crawl4aiTools(max_length=1000)], show_tool_calls=True)
+agent.print_response(
+    "Summarize the key points of this page in bullet points: https://blog.google/products/gemini/google-gemini-deep-research/",
+    stream=True,
+)
13 changes: 13 additions & 0 deletions cookbook/storage/json_storage.py
@@ -0,0 +1,13 @@
"""Run `pip install duckduckgo-search openai` to install dependencies."""

from phi.agent import Agent
from phi.tools.duckduckgo import DuckDuckGo
from phi.storage.agent.json import JsonFileAgentStorage

agent = Agent(
storage=JsonFileAgentStorage(dir_path="tmp/agent_sessions_json"),
tools=[DuckDuckGo()],
add_history_to_messages=True,
)
agent.print_response("How many people live in Canada?")
agent.print_response("What is their national anthem called?")
13 changes: 13 additions & 0 deletions cookbook/storage/yaml_storage.py
@@ -0,0 +1,13 @@
"""Run `pip install duckduckgo-search openai` to install dependencies."""

from phi.agent import Agent
from phi.tools.duckduckgo import DuckDuckGo
from phi.storage.agent.yaml import YamlFileAgentStorage

agent = Agent(
storage=YamlFileAgentStorage(dir_path="tmp/agent_sessions_yaml"),
tools=[DuckDuckGo()],
add_history_to_messages=True,
)
agent.print_response("How many people live in Canada?")
agent.print_response("What is their national anthem called?")
22 changes: 22 additions & 0 deletions cookbook/tools/confluence_tools.py
@@ -0,0 +1,22 @@
from phi.agent import Agent
from phi.tools.confluence import ConfluenceTools


agent = Agent(
name="Confluence agent",
tools=[ConfluenceTools()],
show_tool_calls=True,
markdown=True,
)

## getting space details
agent.print_response("How many spaces are there and what are their names?")

## getting page content
agent.print_response("What is the content present in page 'Large language model in LLM space'")

## getting page details in a particular space
agent.print_response("Can you extract all the page names from 'LLM' space")

## creating a new page in a space
agent.print_response("Can you create a new page named 'TESTING' in 'LLM' space")
32 changes: 32 additions & 0 deletions cookbook/tools/elevenlabs_tools.py
@@ -0,0 +1,32 @@
"""
pip install elevenlabs
"""

from phi.agent import Agent
from phi.model.openai import OpenAIChat
from phi.tools.eleven_labs_tools import ElevenLabsTools

audio_agent = Agent(
model=OpenAIChat(id="gpt-4o"),
tools=[
ElevenLabsTools(
voice_id="21m00Tcm4TlvDq8ikWAM", model_id="eleven_multilingual_v2", target_directory="audio_generations"
)
],
description="You are an AI agent that can generate audio using the ElevenLabs API.",
instructions=[
"When the user asks you to generate audio, use the `generate_audio` tool to generate the audio.",
"You'll generate the appropriate prompt to send to the tool to generate audio.",
"You don't need to find the appropriate voice first, I already specified the voice to user."
"Return the audio file name in your response. Don't convert it to markdown.",
"The audio should be long and detailed.",
],
markdown=True,
debug_mode=True,
show_tool_calls=True,
)

audio_agent.print_response("Generate a very long audio of history of french revolution")


audio_agent.print_response("Generate a kick sound effect")
33 changes: 31 additions & 2 deletions phi/agent/agent.py
@@ -28,7 +28,7 @@
 
 from phi.document import Document
 from phi.agent.session import AgentSession
-from phi.model.content import Image, Video
+from phi.model.content import Image, Video, Audio
 from phi.reasoning.step import ReasoningStep, ReasoningSteps, NextAction
 from phi.run.response import RunEvent, RunResponse, RunResponseExtraData
 from phi.knowledge.agent import AgentKnowledge
@@ -61,6 +61,8 @@ class Agent(BaseModel):
     images: Optional[List[Image]] = None
     # Videos associated with this agent
     videos: Optional[List[Video]] = None
+    # Audio associated with this agent
+    audio: Optional[List[Audio]] = None
 
     # Data associated with this agent
     # name, model, images and videos are automatically added to the agent_data
@@ -577,6 +579,8 @@ def get_agent_data(self) -> Dict[str, Any]:
             agent_data["images"] = [img if isinstance(img, dict) else img.model_dump() for img in self.images]
         if self.videos is not None:
             agent_data["videos"] = [vid if isinstance(vid, dict) else vid.model_dump() for vid in self.videos]
+        if self.audio is not None:
+            agent_data["audio"] = [aud if isinstance(aud, dict) else aud.model_dump() for aud in self.audio]
         return agent_data
 
     def get_session_data(self) -> Dict[str, Any]:
@@ -641,6 +645,12 @@ def from_agent_session(self, session: AgentSession):
                self.videos.extend([Video.model_validate(vid) for vid in videos_from_db])
            else:
                self.videos = videos_from_db
+        if "audio" in session.agent_data:
+            audio_from_db = session.agent_data.get("audio")
+            if self.audio is not None and isinstance(self.audio, list):
+                self.audio.extend([Audio.model_validate(aud) for aud in audio_from_db])
+            else:
+                self.audio = audio_from_db
 
        # If agent_data is set in the agent, update the database agent_data with the agent's agent_data
        if self.agent_data is not None:
@@ -1706,8 +1716,10 @@ def generic_run_response(
             agent_id=self.agent_id,
             content=content,
             tools=self.run_response.tools,
+            audio=self.run_response.audio,
             images=self.run_response.images,
             videos=self.run_response.videos,
+            response_audio=self.run_response.response_audio,
             model=self.run_response.model,
             messages=self.run_response.messages,
             extra_data=self.run_response.extra_data,
@@ -1798,6 +1810,7 @@ def _run(
                     self.run_response.content = model_response_chunk.content
                     self.run_response.created_at = model_response_chunk.created_at
                     yield self.run_response
+
                 elif model_response_chunk.event == ModelResponseEvent.tool_call_started.value:
                     # Add tool call to the run_response
                     tool_call_dict = model_response_chunk.tool_call
@@ -1834,7 +1847,7 @@
         else:
             self.run_response.content = model_response.content
         if model_response.audio is not None:
-            self.run_response.audio = model_response.audio
+            self.run_response.response_audio = model_response.audio
         self.run_response.messages = messages_for_model
         self.run_response.created_at = model_response.created_at
 
@@ -1848,6 +1861,8 @@
         # Update the run_response content if streaming as run_response will only contain the last chunk
         if self.stream:
             self.run_response.content = model_response.content
+            if model_response.audio is not None:
+                self.run_response.response_audio = model_response.audio
 
         # 6. Update Memory
         if self.stream_intermediate_steps:
@@ -2186,6 +2201,8 @@ async def _arun(
         # Update the run_response content if streaming as run_response will only contain the last chunk
         if self.stream:
             self.run_response.content = model_response.content
+            if model_response.audio is not None:
+                self.run_response.response_audio = model_response.audio
 
         # 6. Update Memory
         if self.stream_intermediate_steps:
@@ -2469,12 +2486,24 @@ def add_video(self, video: Video) -> None:
             self.run_response.videos = []
         self.run_response.videos.append(video)
 
+    def add_audio(self, audio: Audio) -> None:
+        if self.audio is None:
+            self.audio = []
+        self.audio.append(audio)
+        if self.run_response is not None:
+            if self.run_response.audio is None:
+                self.run_response.audio = []
+            self.run_response.audio.append(audio)
+
     def get_images(self) -> Optional[List[Image]]:
         return self.images
 
     def get_videos(self) -> Optional[List[Video]]:
         return self.videos
 
+    def get_audio(self) -> Optional[List[Audio]]:
+        return self.audio
+
     ###########################################################################
     # Default Tools
     ###########################################################################
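A minimal usage sketch for the new audio accessors, mirroring the existing image/video pattern; the `Audio` constructor field below is hypothetical — check `phi.model.content.Audio` for its real fields:

from phi.agent import Agent
from phi.model.content import Audio

agent = Agent(name="audio-demo")
# "url" is a hypothetical field name borrowed from Image/Video; adjust to the
# actual Audio model definition in your phidata version.
agent.add_audio(Audio(url="https://example.com/clip.wav"))
print(agent.get_audio())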
28 changes: 18 additions & 10 deletions phi/agent/session.py
@@ -27,17 +27,25 @@ class AgentSession(BaseModel):
     model_config = ConfigDict(from_attributes=True)
 
     def monitoring_data(self) -> Dict[str, Any]:
-        monitoring_data = self.model_dump(exclude={"memory"})
         # Google Gemini adds a "parts" field to the messages, which is not serializable
-        # If there are runs in the memory, remove the "parts" from the messages
-        if self.memory is not None and "runs" in self.memory:
-            _runs = self.memory["runs"]
-            if len(_runs) > 0:
-                for _run in _runs:
-                    if "messages" in _run:
-                        for m in _run["messages"]:
-                            if isinstance(m, dict):
-                                m.pop("parts", None)
+        # If the provider is Google, remove the "parts" from the messages
+        if self.agent_data is not None:
+            if self.agent_data.get("model", {}).get("provider") == "Google" and self.memory is not None:
+                # Remove parts from runs' response messages
+                if "runs" in self.memory:
+                    for _run in self.memory["runs"]:
+                        if "response" in _run and "messages" in _run["response"]:
+                            for m in _run["response"]["messages"]:
+                                if isinstance(m, dict):
+                                    m.pop("parts", None)
+
+                # Remove parts from top-level memory messages
+                if "messages" in self.memory:
+                    for m in self.memory["messages"]:
+                        if isinstance(m, dict):
+                            m.pop("parts", None)
+
+        monitoring_data = self.model_dump()
         return monitoring_data
 
     def telemetry_data(self) -> Dict[str, Any]:
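To make the new sanitization concrete, a self-contained sketch of the two structures `monitoring_data()` now walks, with invented values; only the `parts` keys (Gemini's non-serializable payload) are removed:

# Invented example data shaped like AgentSession.memory.
memory = {
    "runs": [
        {"response": {"messages": [{"role": "model", "content": "hi", "parts": [{"text": "hi"}]}]}}
    ],
    "messages": [{"role": "model", "content": "hi", "parts": [{"text": "hi"}]}],
}

# Per-run response messages.
for run in memory.get("runs", []):
    for m in run.get("response", {}).get("messages", []):
        m.pop("parts", None)

# Top-level memory messages.
for m in memory.get("messages", []):
    m.pop("parts", None)

print(memory)  # no "parts" keys remain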
