From 178001f2706cd8541f0154657d11980cc9165d7d Mon Sep 17 00:00:00 2001
From: Dirk Brand <dirkbrnd@gmail.com>
Date: Fri, 20 Dec 2024 11:51:14 +0200
Subject: [PATCH] Update run_response to be structured

---
 cookbook/agents/37_audio_input_output.py     |  2 +-
 cookbook/agents/38_audio_multi_turn.py       |  4 +-
 cookbook/agents/42_image_to_audio.py         |  2 +-
 cookbook/workflows/startup_idea_validator.py |  4 +-
 phi/agent/agent.py                           | 40 +++++++++++++-------
 phi/model/content.py                         |  5 +++
 phi/playground/router.py                     |  1 +
 phi/run/response.py                          |  4 +-
 8 files changed, 41 insertions(+), 21 deletions(-)

diff --git a/cookbook/agents/37_audio_input_output.py b/cookbook/agents/37_audio_input_output.py
index f7e91f65e..a1038b6bf 100644
--- a/cookbook/agents/37_audio_input_output.py
+++ b/cookbook/agents/37_audio_input_output.py
@@ -24,4 +24,4 @@
 )
 
-if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio:
-    write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/dog.wav")
+if agent.run_response.response_audio is not None:
+    write_audio_to_file(audio=agent.run_response.response_audio.base64_audio, filename="tmp/dog.wav")
diff --git a/cookbook/agents/38_audio_multi_turn.py b/cookbook/agents/38_audio_multi_turn.py
index 92e0fbfd1..b8ad72053 100644
--- a/cookbook/agents/38_audio_multi_turn.py
+++ b/cookbook/agents/38_audio_multi_turn.py
@@ -12,8 +12,8 @@
 
 agent.run("Is a golden retriever a good family dog?")
-if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio:
-    write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/answer_1.wav")
+if agent.run_response.response_audio is not None:
+    write_audio_to_file(audio=agent.run_response.response_audio.base64_audio, filename="tmp/answer_1.wav")
 
 agent.run("Why do you say they are loyal?")
-if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio:
-    write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/answer_2.wav")
+if agent.run_response.response_audio is not None:
+    write_audio_to_file(audio=agent.run_response.response_audio.base64_audio, filename="tmp/answer_2.wav")
diff --git a/cookbook/agents/42_image_to_audio.py b/cookbook/agents/42_image_to_audio.py
index 8cbe4f11b..ba0b3400e 100644
--- a/cookbook/agents/42_image_to_audio.py
+++ b/cookbook/agents/42_image_to_audio.py
@@ -24,4 +24,4 @@
 
 audio_story: RunResponse = audio_agent.run(f"Narrate the story with flair: {image_story.content}")
-if audio_story.response_audio is not None and "data" in audio_story.response_audio:
-    write_audio_to_file(audio=audio_story.response_audio["data"], filename="tmp/multimodal-agents.wav")
+if audio_story.response_audio is not None:
+    write_audio_to_file(audio=audio_story.response_audio.base64_audio, filename="tmp/multimodal-agents.wav")
diff --git a/cookbook/workflows/startup_idea_validator.py b/cookbook/workflows/startup_idea_validator.py
index c4070f622..52bbb4cc6 100644
--- a/cookbook/workflows/startup_idea_validator.py
+++ b/cookbook/workflows/startup_idea_validator.py
@@ -1,6 +1,6 @@
 """
-1. Install dependencies using: `pip install openai exa_py sqlalchemy phidata`
-2. Run the script using: `python cookbook/workflows/blog_post_generator.py`
+1. Install dependencies using: `pip install openai googlesearch-python pycountry phidata`
+2. Run the script using: `python cookbook/workflows/startup_idea_validator.py`
 """
 
 import json
diff --git a/phi/agent/agent.py b/phi/agent/agent.py
index bbac753ba..5a928020d 100644
--- a/phi/agent/agent.py
+++ b/phi/agent/agent.py
@@ -28,7 +28,7 @@
 
 from phi.document import Document
 from phi.agent.session import AgentSession
-from phi.model.content import Image, Video, Audio
+from phi.model.content import Image, Video, Audio, ModelResponseAudio
 from phi.reasoning.step import ReasoningStep, ReasoningSteps, NextAction
 from phi.run.response import RunEvent, RunResponse, RunResponseExtraData
 from phi.knowledge.agent import AgentKnowledge
@@ -1815,11 +1815,8 @@ def _run(
                         if model_response.audio is None:
                             model_response.audio = {"data": "", "transcript": ""}
 
-                        model_response.audio["data"] += model_response.audio.get("data", "")
-                        model_response.audio["transcript"] += model_response.audio.get("transcript", "")
-                        self.run_response.response_audio = model_response_chunk.audio
-                        self.run_response.created_at = model_response_chunk.created_at
-                        # TODO add all to final event
+                        model_response.audio["data"] += model_response_chunk.audio.get("data", "")
+                        model_response.audio["transcript"] += model_response_chunk.audio.get("transcript", "")
                         yield self.run_response
 
                 elif model_response_chunk.event == ModelResponseEvent.tool_call_started.value:
@@ -1858,7 +1855,10 @@
             else:
                 self.run_response.content = model_response.content
             if model_response.audio is not None:
-                self.run_response.response_audio = model_response.audio
+                self.run_response.response_audio = ModelResponseAudio(
+                    base64_audio=model_response.audio.get("data"),
+                    transcript=model_response.audio.get("transcript"),
+                )
             self.run_response.messages = messages_for_model
             self.run_response.created_at = model_response.created_at
 
@@ -1873,7 +1873,10 @@
         if self.stream:
             self.run_response.content = model_response.content
             if model_response.audio is not None:
-                self.run_response.response_audio = model_response.audio
+                self.run_response.response_audio = ModelResponseAudio(
+                    base64_audio=model_response.audio.get("data"),
+                    transcript=model_response.audio.get("transcript"),
+                )
 
         # 6. Update Memory
         if self.stream_intermediate_steps:
@@ -2164,14 +2167,13 @@
                         self.run_response.content = model_response_chunk.content
                         self.run_response.created_at = model_response_chunk.created_at
                         yield self.run_response
+
                     if model_response_chunk.audio is not None:
                         if model_response.audio is None:
                             model_response.audio = {"data": "", "transcript": ""}
 
-                        model_response.audio["data"] += model_response.audio.get("data", "")
-                        model_response.audio["transcript"] += model_response.audio.get("transcript", "")
-                        self.run_response.response_audio = model_response_chunk.audio
-                        self.run_response.created_at = model_response_chunk.created_at
+                        model_response.audio["data"] += model_response_chunk.audio.get("data", "")
+                        model_response.audio["transcript"] += model_response_chunk.audio.get("transcript", "")
                         yield self.run_response
                 elif model_response_chunk.event == ModelResponseEvent.tool_call_started.value:
                     # Add tool call to the run_response
@@ -2208,6 +2212,11 @@
                 self.run_response.content_type = self.response_model.__name__
             else:
                 self.run_response.content = model_response.content
+            if model_response.audio is not None:
+                self.run_response.response_audio = ModelResponseAudio(
+                    base64_audio=model_response.audio.get("data"),
+                    transcript=model_response.audio.get("transcript"),
+                )
             self.run_response.messages = messages_for_model
             self.run_response.created_at = model_response.created_at
 
@@ -2215,14 +2224,19 @@
         run_messages = user_messages + messages_for_model[num_input_messages:]
         if system_message is not None:
             run_messages.insert(0, system_message)
+
         # Update the run_response
         self.run_response.messages = run_messages
         self.run_response.metrics = self._aggregate_metrics_from_run_messages(run_messages)
+
         # Update the run_response content if streaming as run_response will only contain the last chunk
         if self.stream:
             self.run_response.content = model_response.content
             if model_response.audio is not None:
-                self.run_response.response_audio = model_response.audio
+                self.run_response.response_audio = ModelResponseAudio(
+                    base64_audio=model_response.audio.get("data"),
+                    transcript=model_response.audio.get("transcript"),
+                )
 
         # 6. Update Memory
         if self.stream_intermediate_steps:
diff --git a/phi/model/content.py b/phi/model/content.py
index 77a1bcdf0..92f3a22e0 100644
--- a/phi/model/content.py
+++ b/phi/model/content.py
@@ -36,3 +36,8 @@ def validate_exclusive_audio(cls, data: Any):
         if not data.get("url") and not data.get("base64_audio"):
             raise ValueError("Either `url` or `base64_audio` must be provided.")
         return data
+
+
+class ModelResponseAudio(BaseModel):
+    base64_audio: str
+    transcript: Optional[str] = None
diff --git a/phi/playground/router.py b/phi/playground/router.py
index 0cf81bdba..91f0be1e0 100644
--- a/phi/playground/router.py
+++ b/phi/playground/router.py
@@ -399,6 +399,7 @@ async def chat_response_streamer(
         run_response = await agent.arun(message, images=images, stream=True, stream_intermediate_steps=True)
         async for run_response_chunk in run_response:
             run_response_chunk = cast(RunResponse, run_response_chunk)
+            # print(run_response_chunk.event, "|", run_response_chunk.content, "|", run_response_chunk.response_audio)
             yield run_response_chunk.to_json()
 
     async def process_image(file: UploadFile) -> List[Union[str, Dict]]:
diff --git a/phi/run/response.py b/phi/run/response.py
index 13e0fee5f..6ae020d38 100644
--- a/phi/run/response.py
+++ b/phi/run/response.py
@@ -5,7 +5,7 @@
 
 from pydantic import BaseModel, ConfigDict, Field
 
-from phi.model.content import Video, Image, Audio
+from phi.model.content import Video, Image, Audio, ModelResponseAudio
 from phi.reasoning.step import ReasoningStep
 from phi.model.message import Message, MessageReferences
 
@@ -53,7 +53,7 @@ class RunResponse(BaseModel):
     images: Optional[List[Image]] = None  # Images attached to the response
     videos: Optional[List[Video]] = None  # Videos attached to the response
     audio: Optional[List[Audio]] = None  # Audio attached to the response
-    response_audio: Optional[Dict] = None  # Model audio response
+    response_audio: Optional[ModelResponseAudio] = None  # Model audio response
     extra_data: Optional[RunResponseExtraData] = None
     created_at: int = Field(default_factory=lambda: int(time()))