OpenAI audio agent (#1631)

## Description

Added OpenAI audio agent cookbook example

## Type of change

Please check the options that are relevant:

- [ ] Bug fix (non-breaking change which fixes an issue)
- [ ] New feature (non-breaking change which adds functionality)
- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
- [ ] Model update
- [ ] Infrastructure change

## Checklist

- [ ] My code follows Phidata's style guidelines and best practices
- [ ] I have performed a self-review of my code
- [ ] I have added docstrings and comments for complex logic
- [ ] My changes generate no new warnings or errors
- [ ] I have added cookbook examples for my new addition (if needed)
- [ ] I have updated requirements.txt/pyproject.toml (if needed)
- [ ] I have verified my changes in a clean environment
phidatahq · Dec 26, 2024 · commit a4cf57f · 1 parent b65c7a7
Show file tree

Hide file tree

Showing 7 changed files with 53 additions and 4 deletions.
diff --git a/cookbook/providers/openai/.gitignore b/cookbook/providers/openai/.gitignore
@@ -0,0 +1,6 @@
+*.jpg
+*.png
+*.mp3
+*.wav
+*.mp4
+*.mp3
diff --git a/cookbook/providers/openai/audio_input_agent.py b/cookbook/providers/openai/audio_input_agent.py
@@ -0,0 +1,18 @@
+import base64
+import requests
+from phi.agent import Agent, RunResponse  # noqa
+from phi.model.openai import OpenAIChat
+
+# Fetch the audio file and convert it to a base64 encoded string
+url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
+response = requests.get(url)
+response.raise_for_status()
+wav_data = response.content
+encoded_string = base64.b64encode(wav_data).decode("utf-8")
+
+# Provide the agent with the audio file and get result as text
+agent = Agent(
+    model=OpenAIChat(id="gpt-4o-audio-preview", modalities=["text"]),
+    markdown=True,
+)
+agent.print_response("What is in this audio?", audio={"data": encoded_string, "format": "wav"})
diff --git a/cookbook/providers/openai/audio_output_agent.py b/cookbook/providers/openai/audio_output_agent.py
@@ -0,0 +1,25 @@
+import base64
+import requests
+from phi.agent import Agent, RunResponse  # noqa
+from phi.model.openai import OpenAIChat
+from phi.utils.audio import write_audio_to_file
+
+# Fetch the audio file and convert it to a base64 encoded string
+url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
+response = requests.get(url)
+response.raise_for_status()
+wav_data = response.content
+encoded_string = base64.b64encode(wav_data).decode("utf-8")
+
+# Provide the agent with the audio file and audio configuration and get result as text + audio
+agent = Agent(
+    model=OpenAIChat(
+        id="gpt-4o-audio-preview", modalities=["text", "audio"], audio={"voice": "alloy", "format": "wav"}
+    ),
+    markdown=True,
+)
+agent.print_response("What is in this audio?", audio={"data": encoded_string, "format": "wav"})
+
+# Save the response audio to a file
+if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio:
+    write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/dog.wav")
diff --git a/cookbook/workflows/startup_idea_validator.py b/cookbook/workflows/startup_idea_validator.py
@@ -205,7 +205,7 @@ def run(self, startup_idea: str) -> Iterator[RunResponse]:
             table_name="validate_startup_ideas_workflow",
             db_file="tmp/workflows.db",
         ),
-        debug_mode=True
+        debug_mode=True,
     )
 
     final_report: Iterator[RunResponse] = startup_idea_validator.run(startup_idea=idea)

diff --git a/phi/document/reader/csv_reader.py b/phi/document/reader/csv_reader.py
@@ -26,7 +26,7 @@ def read(self, file: Union[Path, IO[Any]], delimiter: str = ",", quotechar: str
             else:
                 logger.info(f"Reading uploaded file: {file.name}")
                 file.seek(0)
-                file_content = io.StringIO(file.read().decode("utf-8"))
+                file_content = io.StringIO(file.read().decode("utf-8"))  # type: ignore
 
             csv_name = Path(file.name).stem if isinstance(file, Path) else file.name.split(".")[0]
             csv_content = ""

diff --git a/phi/llm/ollama/hermes.py b/phi/llm/ollama/hermes.py
@@ -258,7 +258,7 @@ def response_stream(self, messages: List[Message]) -> Iterator[str]:
             # logger.info(f"Ollama partial response: {response}")
             # logger.info(f"Ollama partial response type: {type(response)}")
             response_message: Optional[dict] = response.get("message")
-            response_content = response_message.get("content") if response_message else None
+            response_content: str = response_message.get("content", "") if response_message else ""
             # logger.info(f"Ollama partial response content: {response_content}")
 
             # Add response content to assistant message

diff --git a/phi/llm/ollama/tools.py b/phi/llm/ollama/tools.py
@@ -259,7 +259,7 @@ def response_stream(self, messages: List[Message]) -> Iterator[str]:
             # logger.info(f"Ollama partial response: {response}")
             # logger.info(f"Ollama partial response type: {type(response)}")
             response_message: Optional[dict] = response.get("message")
-            response_content = response_message.get("content") if response_message else None
+            response_content: str = response_message.get("content", "") if response_message else ""
             # logger.info(f"Ollama partial response content: {response_content}")
 
             # Add response content to assistant message
-Original file line number
+Diff line change
@@ -0,0 +1,6 @@
+    *.jpg
+    *.png
+    *.mp3
+    *.wav
+    *.mp4
+    *.mp3