From a4cf57f48eae54946a3b1a63b52e471603774f54 Mon Sep 17 00:00:00 2001
From: Yash Pratap Solanky <101447028+ysolanky@users.noreply.github.com>
Date: Thu, 26 Dec 2024 10:27:00 -0500
Subject: [PATCH] OpenAI audio agent (#1631)

## Description

Added OpenAI audio agent cookbook example

## Type of change

Please check the options that are relevant:

- [ ] Bug fix (non-breaking change which fixes an issue)
- [ ] New feature (non-breaking change which adds functionality)
- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
- [ ] Model update
- [ ] Infrastructure change

## Checklist

- [ ] My code follows Phidata's style guidelines and best practices
- [ ] I have performed a self-review of my code
- [ ] I have added docstrings and comments for complex logic
- [ ] My changes generate no new warnings or errors
- [ ] I have added cookbook examples for my new addition (if needed)
- [ ] I have updated requirements.txt/pyproject.toml (if needed)
- [ ] I have verified my changes in a clean environment
---
 cookbook/providers/openai/.gitignore          |  6 ++++++
 .../providers/openai/audio_input_agent.py     | 18 ++++++++++++++++++
 .../providers/openai/audio_output_agent.py    | 25 +++++++++++++++++++++++++
 cookbook/workflows/startup_idea_validator.py  |  2 +-
 phi/document/reader/csv_reader.py             |  2 +-
 phi/llm/ollama/hermes.py                      |  2 +-
 phi/llm/ollama/tools.py                       |  2 +-
 7 files changed, 53 insertions(+), 4 deletions(-)
 create mode 100644 cookbook/providers/openai/.gitignore
 create mode 100644 cookbook/providers/openai/audio_input_agent.py
 create mode 100644 cookbook/providers/openai/audio_output_agent.py

diff --git a/cookbook/providers/openai/.gitignore b/cookbook/providers/openai/.gitignore
new file mode 100644
index 000000000..525cad7b9
--- /dev/null
+++ b/cookbook/providers/openai/.gitignore
@@ -0,0 +1,6 @@
+*.jpg
+*.png
+*.mp3
+*.wav
+*.mp4
+*.mp3
diff --git a/cookbook/providers/openai/audio_input_agent.py b/cookbook/providers/openai/audio_input_agent.py
new file mode 100644
index 000000000..7c43863f8
--- /dev/null
+++ b/cookbook/providers/openai/audio_input_agent.py
@@ -0,0 +1,18 @@
+import base64
+import requests
+from phi.agent import Agent, RunResponse  # noqa
+from phi.model.openai import OpenAIChat
+
+# Fetch the audio file and convert it to a base64 encoded string
+url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
+response = requests.get(url)
+response.raise_for_status()
+wav_data = response.content
+encoded_string = base64.b64encode(wav_data).decode("utf-8")
+
+# Provide the agent with the audio file and get result as text
+agent = Agent(
+    model=OpenAIChat(id="gpt-4o-audio-preview", modalities=["text"]),
+    markdown=True,
+)
+agent.print_response("What is in this audio?", audio={"data": encoded_string, "format": "wav"})
diff --git a/cookbook/providers/openai/audio_output_agent.py b/cookbook/providers/openai/audio_output_agent.py
new file mode 100644
index 000000000..f08dae193
--- /dev/null
+++ b/cookbook/providers/openai/audio_output_agent.py
@@ -0,0 +1,25 @@
+import base64
+import requests
+from phi.agent import Agent, RunResponse  # noqa
+from phi.model.openai import OpenAIChat
+from phi.utils.audio import write_audio_to_file
+
+# Fetch the audio file and convert it to a base64 encoded string
+url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
+response = requests.get(url)
+response.raise_for_status()
+wav_data = response.content
+encoded_string = base64.b64encode(wav_data).decode("utf-8")
+
+# Provide the agent with the audio file and audio configuration and get result as text + audio
+agent = Agent(
+    model=OpenAIChat(
+        id="gpt-4o-audio-preview", modalities=["text", "audio"], audio={"voice": "alloy", "format": "wav"}
+    ),
+    markdown=True,
+)
+agent.print_response("What is in this audio?", audio={"data": encoded_string, "format": "wav"})
+
+# Save the response audio to a file
+if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio:
+    write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/dog.wav")
diff --git a/cookbook/workflows/startup_idea_validator.py b/cookbook/workflows/startup_idea_validator.py
index c4070f622..99bc89dfc 100644
--- a/cookbook/workflows/startup_idea_validator.py
+++ b/cookbook/workflows/startup_idea_validator.py
@@ -205,7 +205,7 @@ def run(self, startup_idea: str) -> Iterator[RunResponse]:
             table_name="validate_startup_ideas_workflow",
             db_file="tmp/workflows.db",
         ),
-        debug_mode=True
+        debug_mode=True,
     )

     final_report: Iterator[RunResponse] = startup_idea_validator.run(startup_idea=idea)
diff --git a/phi/document/reader/csv_reader.py b/phi/document/reader/csv_reader.py
index f9ed770d3..5274007b9 100644
--- a/phi/document/reader/csv_reader.py
+++ b/phi/document/reader/csv_reader.py
@@ -26,7 +26,7 @@ def read(self, file: Union[Path, IO[Any]], delimiter: str = ",", quotechar: str
             else:
                 logger.info(f"Reading uploaded file: {file.name}")
                 file.seek(0)
-                file_content = io.StringIO(file.read().decode("utf-8"))
+                file_content = io.StringIO(file.read().decode("utf-8"))  # type: ignore

             csv_name = Path(file.name).stem if isinstance(file, Path) else file.name.split(".")[0]
             csv_content = ""
diff --git a/phi/llm/ollama/hermes.py b/phi/llm/ollama/hermes.py
index aa5b78647..3dd6cf17d 100644
--- a/phi/llm/ollama/hermes.py
+++ b/phi/llm/ollama/hermes.py
@@ -258,7 +258,7 @@ def response_stream(self, messages: List[Message]) -> Iterator[str]:
             # logger.info(f"Ollama partial response: {response}")
             # logger.info(f"Ollama partial response type: {type(response)}")
             response_message: Optional[dict] = response.get("message")
-            response_content = response_message.get("content") if response_message else None
+            response_content: str = response_message.get("content", "") if response_message else ""
             # logger.info(f"Ollama partial response content: {response_content}")

             # Add response content to assistant message
diff --git a/phi/llm/ollama/tools.py b/phi/llm/ollama/tools.py
index 6c7ef683e..9d0072bd1 100644
--- a/phi/llm/ollama/tools.py
+++ b/phi/llm/ollama/tools.py
@@ -259,7 +259,7 @@ def response_stream(self, messages: List[Message]) -> Iterator[str]:
             # logger.info(f"Ollama partial response: {response}")
             # logger.info(f"Ollama partial response type: {type(response)}")
             response_message: Optional[dict] = response.get("message")
-            response_content = response_message.get("content") if response_message else None
+            response_content: str = response_message.get("content", "") if response_message else ""
             # logger.info(f"Ollama partial response content: {response_content}")

             # Add response content to assistant message
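
Usage note: both cookbook scripts download OpenAI's sample `alloy.wav` over HTTP before base64-encoding it. The sketch below is a minimal variation that feeds the same `gpt-4o-audio-preview` agent a local recording instead; the `tmp/sample.wav` path is a placeholder, and only the `Agent`/`OpenAIChat` calls introduced in this patch are used.

```python
import base64
from pathlib import Path

from phi.agent import Agent
from phi.model.openai import OpenAIChat

# Read a local recording and base64-encode it, mirroring the cookbook examples above.
# "tmp/sample.wav" is a placeholder; point it at any wav file on disk.
local_audio = Path("tmp/sample.wav")
encoded_string = base64.b64encode(local_audio.read_bytes()).decode("utf-8")

# Same agent setup as cookbook/providers/openai/audio_input_agent.py: audio in, text out.
agent = Agent(
    model=OpenAIChat(id="gpt-4o-audio-preview", modalities=["text"]),
    markdown=True,
)
agent.print_response("What is in this audio?", audio={"data": encoded_string, "format": "wav"})
```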