From a4cf57f48eae54946a3b1a63b52e471603774f54 Mon Sep 17 00:00:00 2001
From: Yash Pratap Solanky <101447028+ysolanky@users.noreply.github.com>
Date: Thu, 26 Dec 2024 10:27:00 -0500
Subject: [PATCH] OpenAI audio agent (#1631)

## Description

Added OpenAI audio agent cookbook example

## Type of change

Please check the options that are relevant:

- [ ] Bug fix (non-breaking change which fixes an issue)
- [ ] New feature (non-breaking change which adds functionality)
- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
- [ ] Model update
- [ ] Infrastructure change

## Checklist

- [ ] My code follows Phidata's style guidelines and best practices
- [ ] I have performed a self-review of my code
- [ ] I have added docstrings and comments for complex logic
- [ ] My changes generate no new warnings or errors
- [ ] I have added cookbook examples for my new addition (if needed)
- [ ] I have updated requirements.txt/pyproject.toml (if needed)
- [ ] I have verified my changes in a clean environment
---
 cookbook/providers/openai/.gitignore          |  6 ++++++
 .../providers/openai/audio_input_agent.py     | 18 ++++++++++++++++++
 .../providers/openai/audio_output_agent.py    | 25 +++++++++++++++++++++++++
 cookbook/workflows/startup_idea_validator.py  |  2 +-
 phi/document/reader/csv_reader.py             |  2 +-
 phi/llm/ollama/hermes.py                      |  2 +-
 phi/llm/ollama/tools.py                       |  2 +-
 7 files changed, 53 insertions(+), 4 deletions(-)
 create mode 100644 cookbook/providers/openai/.gitignore
 create mode 100644 cookbook/providers/openai/audio_input_agent.py
 create mode 100644 cookbook/providers/openai/audio_output_agent.py

diff --git a/cookbook/providers/openai/.gitignore b/cookbook/providers/openai/.gitignore
new file mode 100644
index 000000000..525cad7b9
--- /dev/null
+++ b/cookbook/providers/openai/.gitignore
@@ -0,0 +1,6 @@
+*.jpg
+*.png
+*.mp3
+*.wav
+*.mp4
+*.mp3
diff --git a/cookbook/providers/openai/audio_input_agent.py b/cookbook/providers/openai/audio_input_agent.py
new file mode 100644
index 000000000..7c43863f8
--- /dev/null
+++ b/cookbook/providers/openai/audio_input_agent.py
@@ -0,0 +1,18 @@
+import base64
+import requests
+from phi.agent import Agent, RunResponse  # noqa
+from phi.model.openai import OpenAIChat
+
+# Fetch the audio file and convert it to a base64 encoded string
+url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
+response = requests.get(url)
+response.raise_for_status()
+wav_data = response.content
+encoded_string = base64.b64encode(wav_data).decode("utf-8")
+
+# Provide the agent with the audio file and get result as text
+agent = Agent(
+    model=OpenAIChat(id="gpt-4o-audio-preview", modalities=["text"]),
+    markdown=True,
+)
+agent.print_response("What is in this audio?", audio={"data": encoded_string, "format": "wav"})
diff --git a/cookbook/providers/openai/audio_output_agent.py b/cookbook/providers/openai/audio_output_agent.py
new file mode 100644
index 000000000..f08dae193
--- /dev/null
+++ b/cookbook/providers/openai/audio_output_agent.py
@@ -0,0 +1,25 @@
+import base64
+import requests
+from phi.agent import Agent, RunResponse  # noqa
+from phi.model.openai import OpenAIChat
+from phi.utils.audio import write_audio_to_file
+
+# Fetch the audio file and convert it to a base64 encoded string
+url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
+response = requests.get(url)
+response.raise_for_status()
+wav_data = response.content
+encoded_string = base64.b64encode(wav_data).decode("utf-8")
+
+# Provide the agent with the audio file and audio configuration and get result as text + audio
+agent = Agent(
+    model=OpenAIChat(
+        id="gpt-4o-audio-preview", modalities=["text", "audio"], audio={"voice": "alloy", "format": "wav"}
+    ),
+    markdown=True,
+)
+agent.print_response("What is in this audio?", audio={"data": encoded_string, "format": "wav"})
+
+# Save the response audio to a file
+if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio:
+    write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/dog.wav")
diff --git a/cookbook/workflows/startup_idea_validator.py b/cookbook/workflows/startup_idea_validator.py
index c4070f622..99bc89dfc 100644
--- a/cookbook/workflows/startup_idea_validator.py
+++ b/cookbook/workflows/startup_idea_validator.py
@@ -205,7 +205,7 @@ def run(self, startup_idea: str) -> Iterator[RunResponse]:
             table_name="validate_startup_ideas_workflow",
             db_file="tmp/workflows.db",
         ),
-        debug_mode=True
+        debug_mode=True,
     )

     final_report: Iterator[RunResponse] = startup_idea_validator.run(startup_idea=idea)
diff --git a/phi/document/reader/csv_reader.py b/phi/document/reader/csv_reader.py
index f9ed770d3..5274007b9 100644
--- a/phi/document/reader/csv_reader.py
+++ b/phi/document/reader/csv_reader.py
@@ -26,7 +26,7 @@ def read(self, file: Union[Path, IO[Any]], delimiter: str = ",", quotechar: str
             else:
                 logger.info(f"Reading uploaded file: {file.name}")
                 file.seek(0)
-                file_content = io.StringIO(file.read().decode("utf-8"))
+                file_content = io.StringIO(file.read().decode("utf-8"))  # type: ignore

             csv_name = Path(file.name).stem if isinstance(file, Path) else file.name.split(".")[0]
             csv_content = ""
diff --git a/phi/llm/ollama/hermes.py b/phi/llm/ollama/hermes.py
index aa5b78647..3dd6cf17d 100644
--- a/phi/llm/ollama/hermes.py
+++ b/phi/llm/ollama/hermes.py
@@ -258,7 +258,7 @@ def response_stream(self, messages: List[Message]) -> Iterator[str]:
             # logger.info(f"Ollama partial response: {response}")
             # logger.info(f"Ollama partial response type: {type(response)}")
             response_message: Optional[dict] = response.get("message")
-            response_content = response_message.get("content") if response_message else None
+            response_content: str = response_message.get("content", "") if response_message else ""
             # logger.info(f"Ollama partial response content: {response_content}")

             # Add response content to assistant message
diff --git a/phi/llm/ollama/tools.py b/phi/llm/ollama/tools.py
index 6c7ef683e..9d0072bd1 100644
--- a/phi/llm/ollama/tools.py
+++ b/phi/llm/ollama/tools.py
@@ -259,7 +259,7 @@ def response_stream(self, messages: List[Message]) -> Iterator[str]:
             # logger.info(f"Ollama partial response: {response}")
             # logger.info(f"Ollama partial response type: {type(response)}")
             response_message: Optional[dict] = response.get("message")
-            response_content = response_message.get("content") if response_message else None
+            response_content: str = response_message.get("content", "") if response_message else ""
             # logger.info(f"Ollama partial response content: {response_content}")

             # Add response content to assistant message
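
Usage note: both cookbook scripts download OpenAI's sample `alloy.wav` over HTTP before base64-encoding it. The sketch below is a minimal variation that feeds the same `gpt-4o-audio-preview` agent a local recording instead; the `tmp/sample.wav` path is a placeholder, and only the `Agent`/`OpenAIChat` calls introduced in this patch are used.

```python
import base64
from pathlib import Path

from phi.agent import Agent
from phi.model.openai import OpenAIChat

# Read a local recording and base64-encode it, mirroring the cookbook examples above.
# "tmp/sample.wav" is a placeholder; point it at any wav file on disk.
local_audio = Path("tmp/sample.wav")
encoded_string = base64.b64encode(local_audio.read_bytes()).decode("utf-8")

# Same agent setup as cookbook/providers/openai/audio_input_agent.py: audio in, text out.
agent = Agent(
    model=OpenAIChat(id="gpt-4o-audio-preview", modalities=["text"]),
    markdown=True,
)
agent.print_response("What is in this audio?", audio={"data": encoded_string, "format": "wav"})
```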