From 178001f2706cd8541f0154657d11980cc9165d7d Mon Sep 17 00:00:00 2001 From: Dirk Brand <dirkbrnd@gmail.com> Date: Fri, 20 Dec 2024 11:51:14 +0200 Subject: [PATCH] Update run_response to be structured --- cookbook/agents/37_audio_input_output.py | 2 +- cookbook/agents/38_audio_multi_turn.py | 4 +- cookbook/agents/42_image_to_audio.py | 2 +- cookbook/workflows/startup_idea_validator.py | 4 +- phi/agent/agent.py | 40 +++++++++++++------- phi/model/content.py | 5 +++ phi/playground/router.py | 1 + phi/run/response.py | 4 +- 8 files changed, 41 insertions(+), 21 deletions(-) diff --git a/cookbook/agents/37_audio_input_output.py b/cookbook/agents/37_audio_input_output.py index f7e91f65e..a1038b6bf 100644 --- a/cookbook/agents/37_audio_input_output.py +++ b/cookbook/agents/37_audio_input_output.py @@ -24,4 +24,4 @@ ) if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio: - write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/dog.wav") + write_audio_to_file(audio=agent.run_response.response_audio.base64_audio, filename="tmp/dog.wav") diff --git a/cookbook/agents/38_audio_multi_turn.py b/cookbook/agents/38_audio_multi_turn.py index 92e0fbfd1..b8ad72053 100644 --- a/cookbook/agents/38_audio_multi_turn.py +++ b/cookbook/agents/38_audio_multi_turn.py @@ -12,8 +12,8 @@ agent.run("Is a golden retriever a good family dog?") if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio: - write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/answer_1.wav") + write_audio_to_file(audio=agent.run_response.response_audio.base64_audio, filename="tmp/answer_1.wav") agent.run("Why do you say they are loyal?") if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio: - write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/answer_2.wav") + write_audio_to_file(audio=agent.run_response.response_audio.base64_audio, filename="tmp/answer_2.wav") diff --git a/cookbook/agents/42_image_to_audio.py b/cookbook/agents/42_image_to_audio.py index 8cbe4f11b..ba0b3400e 100644 --- a/cookbook/agents/42_image_to_audio.py +++ b/cookbook/agents/42_image_to_audio.py @@ -24,4 +24,4 @@ audio_story: RunResponse = audio_agent.run(f"Narrate the story with flair: {image_story.content}") if audio_story.response_audio is not None and "data" in audio_story.response_audio: - write_audio_to_file(audio=audio_story.response_audio["data"], filename="tmp/multimodal-agents.wav") + write_audio_to_file(audio=audio_story.response_audio.base64_audio, filename="tmp/multimodal-agents.wav") diff --git a/cookbook/workflows/startup_idea_validator.py b/cookbook/workflows/startup_idea_validator.py index c4070f622..52bbb4cc6 100644 --- a/cookbook/workflows/startup_idea_validator.py +++ b/cookbook/workflows/startup_idea_validator.py @@ -1,6 +1,6 @@ """ -1. Install dependencies using: `pip install openai exa_py sqlalchemy phidata` -2. Run the script using: `python cookbook/workflows/blog_post_generator.py` +1. Install dependencies using: `pip install openai googlesearch-python pycountry phidata` +2. Run the script using: `python cookbook/workflows/startup_idea_validator.py` """ import json diff --git a/phi/agent/agent.py b/phi/agent/agent.py index bbac753ba..5a928020d 100644 --- a/phi/agent/agent.py +++ b/phi/agent/agent.py @@ -28,7 +28,7 @@ from phi.document import Document from phi.agent.session import AgentSession -from phi.model.content import Image, Video, Audio +from phi.model.content import Image, Video, Audio, ModelResponseAudio from phi.reasoning.step import ReasoningStep, ReasoningSteps, NextAction from phi.run.response import RunEvent, RunResponse, RunResponseExtraData from phi.knowledge.agent import AgentKnowledge @@ -1815,11 +1815,8 @@ def _run( if model_response.audio is None: model_response.audio = {"data": "", "transcript": ""} - model_response.audio["data"] += model_response.audio.get("data", "") - model_response.audio["transcript"] += model_response.audio.get("transcript", "") - self.run_response.response_audio = model_response_chunk.audio - self.run_response.created_at = model_response_chunk.created_at - # TODO add all to final event + model_response.audio["data"] += model_response_chunk.audio.get("data", "") + model_response.audio["transcript"] += model_response_chunk.audio.get("transcript", "") yield self.run_response elif model_response_chunk.event == ModelResponseEvent.tool_call_started.value: @@ -1858,7 +1855,10 @@ def _run( else: self.run_response.content = model_response.content if model_response.audio is not None: - self.run_response.response_audio = model_response.audio + self.run_response.response_audio = ModelResponseAudio( + base64_audio=model_response.audio.get("data"), + transcript=model_response.audio.get("transcript"), + ) self.run_response.messages = messages_for_model self.run_response.created_at = model_response.created_at @@ -1873,7 +1873,10 @@ def _run( if self.stream: self.run_response.content = model_response.content if model_response.audio is not None: - self.run_response.response_audio = model_response.audio + self.run_response.response_audio = ModelResponseAudio( + base64_audio=model_response.audio.get("data"), + transcript=model_response.audio.get("transcript"), + ) # 6. Update Memory if self.stream_intermediate_steps: @@ -2164,14 +2167,15 @@ async def _arun( self.run_response.content = model_response_chunk.content self.run_response.created_at = model_response_chunk.created_at yield self.run_response + if model_response_chunk.audio is not None: if model_response.audio is None: model_response.audio = {"data": "", "transcript": ""} - model_response.audio["data"] += model_response.audio.get("data", "") - model_response.audio["transcript"] += model_response.audio.get("transcript", "") - self.run_response.response_audio = model_response_chunk.audio - self.run_response.created_at = model_response_chunk.created_at + model_response.audio["data"] += model_response_chunk.audio.get("data", "") + model_response.audio["transcript"] += model_response_chunk.audio.get("transcript", "") + # self.run_response.response_audio = model_response_chunk.audio + # self.run_response.created_at = model_response_chunk.created_at yield self.run_response elif model_response_chunk.event == ModelResponseEvent.tool_call_started.value: # Add tool call to the run_response @@ -2208,6 +2212,11 @@ async def _arun( self.run_response.content_type = self.response_model.__name__ else: self.run_response.content = model_response.content + if model_response.audio is not None: + self.run_response.response_audio = ModelResponseAudio( + base64_audio=model_response.audio.get("data"), + transcript=model_response.audio.get("transcript"), + ) self.run_response.messages = messages_for_model self.run_response.created_at = model_response.created_at @@ -2215,14 +2224,19 @@ async def _arun( run_messages = user_messages + messages_for_model[num_input_messages:] if system_message is not None: run_messages.insert(0, system_message) + # Update the run_response self.run_response.messages = run_messages self.run_response.metrics = self._aggregate_metrics_from_run_messages(run_messages) + # Update the run_response content if streaming as run_response will only contain the last chunk if self.stream: self.run_response.content = model_response.content if model_response.audio is not None: - self.run_response.response_audio = model_response.audio + self.run_response.response_audio = ModelResponseAudio( + base64_audio=model_response.audio.get("data"), + transcript=model_response.audio.get("transcript"), + ) # 6. Update Memory if self.stream_intermediate_steps: diff --git a/phi/model/content.py b/phi/model/content.py index 77a1bcdf0..92f3a22e0 100644 --- a/phi/model/content.py +++ b/phi/model/content.py @@ -36,3 +36,8 @@ def validate_exclusive_audio(cls, data: Any): if not data.get("url") and not data.get("base64_audio"): raise ValueError("Either `url` or `base64_audio` must be provided.") return data + + +class ModelResponseAudio(BaseModel): + base64_audio: str + transcript: Optional[str] = None diff --git a/phi/playground/router.py b/phi/playground/router.py index 0cf81bdba..91f0be1e0 100644 --- a/phi/playground/router.py +++ b/phi/playground/router.py @@ -399,6 +399,7 @@ async def chat_response_streamer( run_response = await agent.arun(message, images=images, stream=True, stream_intermediate_steps=True) async for run_response_chunk in run_response: run_response_chunk = cast(RunResponse, run_response_chunk) + # print(run_response_chunk.event, "|", run_response_chunk.content, "|", run_response_chunk.response_audio) yield run_response_chunk.to_json() async def process_image(file: UploadFile) -> List[Union[str, Dict]]: diff --git a/phi/run/response.py b/phi/run/response.py index 13e0fee5f..6ae020d38 100644 --- a/phi/run/response.py +++ b/phi/run/response.py @@ -5,7 +5,7 @@ from pydantic import BaseModel, ConfigDict, Field -from phi.model.content import Video, Image, Audio +from phi.model.content import Video, Image, Audio, ModelResponseAudio from phi.reasoning.step import ReasoningStep from phi.model.message import Message, MessageReferences @@ -53,7 +53,7 @@ class RunResponse(BaseModel): images: Optional[List[Image]] = None # Images attached to the response videos: Optional[List[Video]] = None # Videos attached to the response audio: Optional[List[Audio]] = None # Audio attached to the response - response_audio: Optional[Dict] = None # Model audio response + response_audio: Optional[ModelResponseAudio] = None # Model audio response extra_data: Optional[RunResponseExtraData] = None created_at: int = Field(default_factory=lambda: int(time()))