Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Audio output from models (i.e. voice mode) #1598

Draft
wants to merge 39 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
0aec262
feat: eleven labs audio_generation tool
anuragts Dec 12, 2024
94c73ee
Update cookbook/agents/46_generate_audio.py
anuragts Dec 12, 2024
55bb0b1
Update phi/tools/eleven_labs_tools.py
anuragts Dec 12, 2024
4b6818d
Update phi/tools/eleven_labs_tools.py
anuragts Dec 12, 2024
964b0b1
fix: use target_directory
anuragts Dec 12, 2024
1fd8485
fix: move client initialization to constructor
anuragts Dec 12, 2024
ede0783
Merge branch 'main' into multimodal-add-audiomusic-gen-tools-and-cook…
manthanguptaa Dec 13, 2024
27a7c3b
Merge branch 'main' into multimodal-add-audiomusic-gen-tools-and-cook…
anuragts Dec 16, 2024
32acd9a
Merge branch 'main' into multimodal-add-audiomusic-gen-tools-and-cook…
anuragts Dec 17, 2024
4214ccb
fix: update eleven labs
anuragts Dec 17, 2024
fd58550
Merge
dirkbrnd Dec 17, 2024
531177f
Update audio handling
dirkbrnd Dec 17, 2024
9b5b80d
Merge
dirkbrnd Dec 17, 2024
8105464
Change to
dirkbrnd Dec 17, 2024
9f10309
Fix style
dirkbrnd Dec 17, 2024
348b344
fix: voice id and add a default voice id
anuragts Dec 17, 2024
a1d4bb9
fix: format
anuragts Dec 17, 2024
8872f21
fix: storage in audio agent
anuragts Dec 17, 2024
e702c72
fix: don't show tool calls
anuragts Dec 17, 2024
f784fd3
Update
dirkbrnd Dec 17, 2024
559c792
Merge
dirkbrnd Dec 17, 2024
7291024
fix: send a mime_type
anuragts Dec 17, 2024
16b0bab
Add audio convo agent
dirkbrnd Dec 17, 2024
fa54fd5
Merge branch 'multimodal-add-audiomusic-gen-tools-and-cookbooks-phi-2…
dirkbrnd Dec 17, 2024
a6a1d26
Update
dirkbrnd Dec 17, 2024
df5ffb7
Update
dirkbrnd Dec 17, 2024
396ff7c
Add also for sync responses
dirkbrnd Dec 17, 2024
c8d266b
Merge branch 'main' into multimodal-add-audiomusic-gen-tools-and-cook…
anuragts Dec 17, 2024
d7fe059
fix: mime type
anuragts Dec 17, 2024
b193d69
Remove voice mode code
dirkbrnd Dec 18, 2024
d436c35
Update
dirkbrnd Dec 18, 2024
0163d64
Merge
dirkbrnd Dec 20, 2024
178001f
Update run_response to be structured
dirkbrnd Dec 20, 2024
494cf00
Update
dirkbrnd Dec 20, 2024
8ad8712
Merge branch 'main' into voice-mode
dirkbrnd Dec 20, 2024
37340f9
Style fix
dirkbrnd Dec 20, 2024
c50eeb6
Merge branch 'main' into voice-mode
dirkbrnd Dec 21, 2024
b2915ed
Style fixes
dirkbrnd Dec 21, 2024
e51ed15
Merge branch 'main' of https://github.com/phidatahq/phidata into voic…
dirkbrnd Jan 1, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cookbook/agents/37_audio_input_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,5 @@
audio={"data": encoded_string, "format": "wav"},
)

if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio:
write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/dog.wav")
if agent.run_response.response_audio is not None:
write_audio_to_file(audio=agent.run_response.response_audio.base64_audio, filename="tmp/dog.wav")
8 changes: 4 additions & 4 deletions cookbook/agents/38_audio_multi_turn.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@
)

agent.run("Is a golden retriever a good family dog?")
if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio:
write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/answer_1.wav")
if agent.run_response.response_audio is not None:
write_audio_to_file(audio=agent.run_response.response_audio.base64_audio, filename="tmp/answer_1.wav")

agent.run("Why do you say they are loyal?")
if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio:
write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/answer_2.wav")
if agent.run_response.response_audio is not None:
write_audio_to_file(audio=agent.run_response.response_audio.base64_audio, filename="tmp/answer_2.wav")
4 changes: 2 additions & 2 deletions cookbook/agents/42_image_to_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,5 @@
)

audio_story: RunResponse = audio_agent.run(f"Narrate the story with flair: {image_story.content}")
if audio_story.response_audio is not None and "data" in audio_story.response_audio:
write_audio_to_file(audio=audio_story.response_audio["data"], filename="tmp/multimodal-agents.wav")
if audio_story.response_audio is not None:
write_audio_to_file(audio=audio_story.response_audio.base64_audio, filename="tmp/multimodal-agents.wav")
24 changes: 24 additions & 0 deletions cookbook/playground/audio_conversation_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""Playground app exposing a single audio-conversation agent.

Runs a phi Playground server hosting an agent backed by OpenAI's
gpt-4o-audio-preview model with both text and audio output enabled.
"""

from phi.agent import Agent
from phi.model.openai import OpenAIChat
from phi.playground import Playground, serve_playground_app
from phi.storage.agent.sqlite import SqlAgentStorage


# Agent configured for spoken conversations: the model returns both a text
# transcript and synthesized audio on every turn.
audio_agent = Agent(
    name="Audio Chat Agent",
    model=OpenAIChat(
        id="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        # pcm16 is required here: wav is not supported for streaming responses.
        audio={"voice": "alloy", "format": "pcm16"},  # Wav not supported for streaming
    ),
    debug_mode=True,
    # Keep prior turns in context so the conversation is multi-turn.
    add_history_to_messages=True,
    add_datetime_to_instructions=True,
    # Persist sessions locally so history survives restarts.
    storage=SqlAgentStorage(table_name="audio_agent", db_file="tmp/audio_agent.db"),
)


app = Playground(agents=[audio_agent]).get_app()

if __name__ == "__main__":
    # reload=True enables auto-restart on file changes (development mode).
    serve_playground_app("audio_conversation_agent:app", reload=True)
1 change: 0 additions & 1 deletion cookbook/tools/elevenlabs_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,4 @@

audio_agent.print_response("Generate a very long audio of history of french revolution")


audio_agent.print_response("Generate a kick sound effect")
9 changes: 6 additions & 3 deletions cookbook/tools/zoom_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ def get_access_token(self) -> str:
"""
Obtain or refresh the access token for Zoom API.

to get the account_id ,client_id ,client_secret
To get the account_id, client_id, client_secret
https://developers.zoom.us/docs/internal-apps/create/

for oauth 2.0
For oauth 2.0
https://developers.zoom.us/docs/integrations/oauth/
Returns:
A string containing the access token or an empty string if token retrieval fails.
Expand All @@ -47,7 +47,10 @@ def get_access_token(self) -> str:

try:
response = requests.post(
self.token_url, headers=headers, data=data, auth=(self.client_id, self.client_secret)
self.token_url,
headers=headers,
data=data,
auth=(self.client_id, self.client_secret), # type: ignore
)
response.raise_for_status()

Expand Down
4 changes: 2 additions & 2 deletions cookbook/workflows/startup_idea_validator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""
1. Install dependencies using: `pip install openai exa_py sqlalchemy phidata`
2. Run the script using: `python cookbook/workflows/blog_post_generator.py`
1. Install dependencies using: `pip install openai googlesearch-python pycountry phidata`
2. Run the script using: `python cookbook/workflows/startup_idea_validator.py`
"""

import json
Expand Down
42 changes: 38 additions & 4 deletions phi/agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

from phi.document import Document
from phi.agent.session import AgentSession
from phi.model.content import Image, Video, Audio
from phi.model.content import Image, Video, Audio, ModelResponseAudio
from phi.reasoning.step import ReasoningStep, ReasoningSteps, NextAction
from phi.run.response import RunEvent, RunResponse, RunResponseExtraData
from phi.knowledge.agent import AgentKnowledge
Expand Down Expand Up @@ -1811,6 +1811,14 @@ def _run(
self.run_response.created_at = model_response_chunk.created_at
yield self.run_response

if model_response_chunk.audio is not None:
if model_response.audio is None:
model_response.audio = {"data": "", "transcript": ""}

model_response.audio["data"] += model_response_chunk.audio.get("data", "")
model_response.audio["transcript"] += model_response_chunk.audio.get("transcript", "")
yield self.run_response

elif model_response_chunk.event == ModelResponseEvent.tool_call_started.value:
# Add tool call to the run_response
tool_call_dict = model_response_chunk.tool_call
Expand Down Expand Up @@ -1847,7 +1855,10 @@ def _run(
else:
self.run_response.content = model_response.content
if model_response.audio is not None:
self.run_response.response_audio = model_response.audio
self.run_response.response_audio = ModelResponseAudio(
base64_audio=model_response.audio.get("data"),
transcript=model_response.audio.get("transcript"),
)
self.run_response.messages = messages_for_model
self.run_response.created_at = model_response.created_at

Expand All @@ -1862,7 +1873,10 @@ def _run(
if self.stream:
self.run_response.content = model_response.content
if model_response.audio is not None:
self.run_response.response_audio = model_response.audio
self.run_response.response_audio = ModelResponseAudio(
base64_audio=model_response.audio.get("data"),
transcript=model_response.audio.get("transcript"),
)

# 6. Update Memory
if self.stream_intermediate_steps:
Expand Down Expand Up @@ -2153,6 +2167,16 @@ async def _arun(
self.run_response.content = model_response_chunk.content
self.run_response.created_at = model_response_chunk.created_at
yield self.run_response

if model_response_chunk.audio is not None:
if model_response.audio is None:
model_response.audio = {"data": "", "transcript": ""}

model_response.audio["data"] += model_response_chunk.audio.get("data", "")
model_response.audio["transcript"] += model_response_chunk.audio.get("transcript", "")
# self.run_response.response_audio = model_response_chunk.audio
# self.run_response.created_at = model_response_chunk.created_at
yield self.run_response
elif model_response_chunk.event == ModelResponseEvent.tool_call_started.value:
# Add tool call to the run_response
tool_call_dict = model_response_chunk.tool_call
Expand Down Expand Up @@ -2188,21 +2212,31 @@ async def _arun(
self.run_response.content_type = self.response_model.__name__
else:
self.run_response.content = model_response.content
if model_response.audio is not None:
self.run_response.response_audio = ModelResponseAudio(
base64_audio=model_response.audio.get("data"),
transcript=model_response.audio.get("transcript"),
)
self.run_response.messages = messages_for_model
self.run_response.created_at = model_response.created_at

# Build a list of messages that belong to this particular run
run_messages = user_messages + messages_for_model[num_input_messages:]
if system_message is not None:
run_messages.insert(0, system_message)

# Update the run_response
self.run_response.messages = run_messages
self.run_response.metrics = self._aggregate_metrics_from_run_messages(run_messages)

# Update the run_response content if streaming as run_response will only contain the last chunk
if self.stream:
self.run_response.content = model_response.content
if model_response.audio is not None:
self.run_response.response_audio = model_response.audio
self.run_response.response_audio = ModelResponseAudio(
base64_audio=model_response.audio.get("data"),
transcript=model_response.audio.get("transcript"),
)

# 6. Update Memory
if self.stream_intermediate_steps:
Expand Down
8 changes: 4 additions & 4 deletions phi/model/cohere/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from phi.utils.tools import get_function_call_for_tool_call

try:
from cohere import Client as CohereClient
from cohere import Client as CohereClient, ToolCall
from cohere.types.tool import Tool as CohereTool
from cohere.types.non_streamed_chat_response import NonStreamedChatResponse
from cohere.types.streamed_chat_response import (
Expand Down Expand Up @@ -286,7 +286,7 @@ def _handle_tool_calls(
self,
assistant_message: Message,
messages: List[Message],
response_tool_calls: List[Any],
response_tool_calls: List[ToolCall],
model_response: ModelResponse,
) -> Optional[Any]:
"""
Expand All @@ -295,7 +295,7 @@ def _handle_tool_calls(
Args:
assistant_message (Message): The assistant message.
messages (List[Message]): The list of messages.
response_tool_calls (List[Any]): The list of response tool calls.
response_tool_calls (List[ToolCall]): The list of response tool calls.
model_response (ModelResponse): The model response.

Returns:
Expand Down Expand Up @@ -420,7 +420,7 @@ def response(self, messages: List[Message], tool_results: Optional[List[ToolResu
tool_results = self._handle_tool_calls(
assistant_message=assistant_message,
messages=messages,
response_tool_calls=response_tool_calls,
response_tool_calls=response_tool_calls, # type: ignore
model_response=model_response,
)

Expand Down
6 changes: 6 additions & 0 deletions phi/model/content.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class Audio(Media):
url: Optional[str] = None # Remote location for file
base64_audio: Optional[str] = None # Base64-encoded audio data
length: Optional[str] = None
mime_type: Optional[str] = None

@model_validator(mode="before")
def validate_exclusive_audio(cls, data: Any):
Expand All @@ -35,3 +36,8 @@ def validate_exclusive_audio(cls, data: Any):
if not data.get("url") and not data.get("base64_audio"):
raise ValueError("Either `url` or `base64_audio` must be provided.")
return data


class ModelResponseAudio(BaseModel):
    """Audio produced by a model as part of its response (e.g. voice mode).

    Distinct from `Audio` above: this is model *output*, so the audio data is
    always present (base64-encoded) rather than being one of url/base64.
    """

    # Base64-encoded audio bytes returned by the model.
    base64_audio: str
    # Text transcript of the audio, when the model provides one.
    transcript: Optional[str] = None
7 changes: 7 additions & 0 deletions phi/playground/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,13 @@ async def chat_response_streamer(
) -> AsyncGenerator:
run_response = await agent.arun(message, images=images, stream=True, stream_intermediate_steps=True)
async for run_response_chunk in run_response:
print(
run_response_chunk.event,
"|",
run_response_chunk.content,
"|",
run_response_chunk.response_audio.base64_audio[:10] if run_response_chunk.response_audio else "-",
)
run_response_chunk = cast(RunResponse, run_response_chunk)
yield run_response_chunk.to_json()

Expand Down
4 changes: 2 additions & 2 deletions phi/run/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from pydantic import BaseModel, ConfigDict, Field

from phi.model.content import Video, Image, Audio
from phi.model.content import Video, Image, Audio, ModelResponseAudio
from phi.reasoning.step import ReasoningStep
from phi.model.message import Message, MessageReferences

Expand Down Expand Up @@ -53,7 +53,7 @@ class RunResponse(BaseModel):
images: Optional[List[Image]] = None # Images attached to the response
videos: Optional[List[Video]] = None # Videos attached to the response
audio: Optional[List[Audio]] = None # Audio attached to the response
response_audio: Optional[Dict] = None # Model audio response
response_audio: Optional[ModelResponseAudio] = None # Model audio response
extra_data: Optional[RunResponseExtraData] = None
created_at: int = Field(default_factory=lambda: int(time()))

Expand Down
2 changes: 1 addition & 1 deletion phi/tools/calcom.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def get_available_slots(
"eventTypeId": self.event_type_id,
}

response = requests.get(url, headers=self._get_headers(), params=querystring)
response = requests.get(url, headers=self._get_headers(), params=querystring) # type: ignore
if response.status_code == 200:
slots = response.json()["data"]["slots"]
available_slots = []
Expand Down
30 changes: 19 additions & 11 deletions phi/tools/eleven_labs_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,14 @@ def __init__(

def get_voices(self) -> str:
"""
Use this function to get all the voices available.

Returns:
result (list): A list of voices that have an ID, name and description.
Use this function to get all the voices available.

Returns:
result (list): A list of voices that have an ID, name and description.
"""
try:
voices = self.eleven_labs_client.voices.get_all()
Expand Down Expand Up @@ -152,13 +156,17 @@ def generate_sound_effect(self, agent: Agent, prompt: str, duration_seconds: Opt

def text_to_speech(self, agent: Agent, prompt: str, voice_id: Optional[str] = None) -> str:
"""
Use this function to convert text to speech audio.

Args:
prompt (str): Text to generate audio from.
voice_id (Optional[str]): The ID of the voice to use for audio generation. Uses default if none is specified.
Returns:
str: Return the path to the generated audio file.
Use this function to convert text to speech audio.

Args:
prompt (str): Text to generate audio from.
voice_id (Optional[str]): The ID of the voice to use for audio generation. Uses default if none is specified.
Returns:
str: Return the path to the generated audio file.
"""
try:
audio_generator = self.eleven_labs_client.text_to_speech.convert(
Expand Down
2 changes: 1 addition & 1 deletion phi/tools/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def __init__(

def authenticate(self):
"""Authenticate with GitHub using the provided access token."""
auth = Auth.Token(self.access_token)
auth = Auth.Token(self.access_token) # type: ignore
if self.base_url:
logger.debug(f"Authenticating with GitHub Enterprise at {self.base_url}")
return Github(base_url=self.base_url, auth=auth)
Expand Down
2 changes: 1 addition & 1 deletion phi/tools/zoom.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def get_upcoming_meetings(self, user_id: str = "me") -> str:
params = {"type": "upcoming", "page_size": 30}

try:
response = requests.get(url, headers=headers, params=params)
response = requests.get(url, headers=headers, params=params) # type: ignore
response.raise_for_status()
meetings = response.json()

Expand Down
2 changes: 1 addition & 1 deletion phi/vectordb/qdrant/qdrant.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def client(self) -> QdrantClient:
https=self.https,
api_key=self.api_key,
prefix=self.prefix,
timeout=self.timeout,
timeout=self.timeout, # type: ignore
host=self.host,
path=self.path,
**self.kwargs,
Expand Down
Loading