diff --git a/cookbook/playground/audio_conversation_agent.py b/cookbook/playground/audio_conversation_agent.py new file mode 100644 index 000000000..ab2959db3 --- /dev/null +++ b/cookbook/playground/audio_conversation_agent.py @@ -0,0 +1,22 @@ +from phi.agent import Agent +from phi.model.openai import OpenAIChat +from phi.playground import Playground, serve_playground_app +from phi.storage.agent.sqlite import SqlAgentStorage + + +audio_agent = Agent( + name="Audio Chat Agent", + model=OpenAIChat( + id="gpt-4o-audio-preview", modalities=["text", "audio"], audio={"voice": "alloy", "format": "pcm16"} # Wav not supported for streaming + ), + debug_mode=True, + add_history_to_messages=True, + add_datetime_to_instructions=True, + storage=SqlAgentStorage(table_name="audio_agent", db_file="tmp/audio_agent.db"), +) + + +app = Playground(agents=[audio_agent]).get_app() + +if __name__ == "__main__": + serve_playground_app("audio_conversation_agent:app", reload=True) diff --git a/phi/agent/agent.py b/phi/agent/agent.py index eaed62116..bbac753ba 100644 --- a/phi/agent/agent.py +++ b/phi/agent/agent.py @@ -1811,6 +1811,17 @@ def _run( self.run_response.created_at = model_response_chunk.created_at yield self.run_response + if model_response_chunk.audio is not None: + if model_response.audio is None: + model_response.audio = {"data": "", "transcript": ""} + # Append this chunk's audio/transcript (not the accumulator itself) to the full response + model_response.audio["data"] += model_response_chunk.audio.get("data", "") + model_response.audio["transcript"] += model_response_chunk.audio.get("transcript", "") + self.run_response.response_audio = model_response_chunk.audio + self.run_response.created_at = model_response_chunk.created_at + # TODO add all to final event + yield self.run_response + + elif model_response_chunk.event == ModelResponseEvent.tool_call_started.value: # Add tool call to the run_response tool_call_dict = model_response_chunk.tool_call @@ -2153,6 +2164,15 @@ async def _arun( self.run_response.content = model_response_chunk.content self.run_response.created_at = 
model_response_chunk.created_at yield self.run_response + if model_response_chunk.audio is not None: + if model_response.audio is None: + model_response.audio = {"data": "", "transcript": ""} + # Append this chunk's audio/transcript (not the accumulator itself) to the full response + model_response.audio["data"] += model_response_chunk.audio.get("data", "") + model_response.audio["transcript"] += model_response_chunk.audio.get("transcript", "") + self.run_response.response_audio = model_response_chunk.audio + self.run_response.created_at = model_response_chunk.created_at + yield self.run_response elif model_response_chunk.event == ModelResponseEvent.tool_call_started.value: # Add tool call to the run_response tool_call_dict = model_response_chunk.tool_call