diff --git a/cookbook/providers/openai/.gitignore b/cookbook/providers/openai/.gitignore
new file mode 100644
index 000000000..525cad7b9
--- /dev/null
+++ b/cookbook/providers/openai/.gitignore
@@ -0,0 +1,5 @@
+*.jpg
+*.png
+*.mp3
+*.wav
+*.mp4
diff --git a/cookbook/providers/openai/audio_input_agent.py b/cookbook/providers/openai/audio_input_agent.py
new file mode 100644
index 000000000..7c43863f8
--- /dev/null
+++ b/cookbook/providers/openai/audio_input_agent.py
@@ -0,0 +1,18 @@
+import base64
+import requests
+from phi.agent import Agent, RunResponse  # noqa
+from phi.model.openai import OpenAIChat
+
+# Fetch the audio file and convert it to a base64 encoded string
+url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
+response = requests.get(url)
+response.raise_for_status()
+wav_data = response.content
+encoded_string = base64.b64encode(wav_data).decode("utf-8")
+
+# Provide the agent with the audio file and get result as text
+agent = Agent(
+    model=OpenAIChat(id="gpt-4o-audio-preview", modalities=["text"]),
+    markdown=True,
+)
+agent.print_response("What is in this audio?", audio={"data": encoded_string, "format": "wav"})
diff --git a/cookbook/providers/openai/audio_output_agent.py b/cookbook/providers/openai/audio_output_agent.py
new file mode 100644
index 000000000..f08dae193
--- /dev/null
+++ b/cookbook/providers/openai/audio_output_agent.py
@@ -0,0 +1,25 @@
+import base64
+import requests
+from phi.agent import Agent, RunResponse  # noqa
+from phi.model.openai import OpenAIChat
+from phi.utils.audio import write_audio_to_file
+
+# Fetch the audio file and convert it to a base64 encoded string
+url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
+response = requests.get(url)
+response.raise_for_status()
+wav_data = response.content
+encoded_string = base64.b64encode(wav_data).decode("utf-8")
+
+# Provide the agent with the audio file and audio configuration and get result as text + audio
+agent = Agent(
+    model=OpenAIChat(
+        id="gpt-4o-audio-preview", modalities=["text", "audio"], audio={"voice": "alloy", "format": "wav"}
+    ),
+    markdown=True,
+)
+agent.print_response("What is in this audio?", audio={"data": encoded_string, "format": "wav"})
+
+# Save the response audio to a file
+if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio:
+    write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/dog.wav")
diff --git a/phi/document/reader/csv_reader.py b/phi/document/reader/csv_reader.py
index f9ed770d3..5274007b9 100644
--- a/phi/document/reader/csv_reader.py
+++ b/phi/document/reader/csv_reader.py
@@ -26,7 +26,7 @@ def read(self, file: Union[Path, IO[Any]], delimiter: str = ",", quotechar: str
         else:
             logger.info(f"Reading uploaded file: {file.name}")
             file.seek(0)
-            file_content = io.StringIO(file.read().decode("utf-8"))
+            file_content = io.StringIO(file.read().decode("utf-8"))  # type: ignore

         csv_name = Path(file.name).stem if isinstance(file, Path) else file.name.split(".")[0]
         csv_content = ""
diff --git a/phi/llm/ollama/hermes.py b/phi/llm/ollama/hermes.py
index aa5b78647..3dd6cf17d 100644
--- a/phi/llm/ollama/hermes.py
+++ b/phi/llm/ollama/hermes.py
@@ -258,7 +258,7 @@ def response_stream(self, messages: List[Message]) -> Iterator[str]:
             # logger.info(f"Ollama partial response: {response}")
             # logger.info(f"Ollama partial response type: {type(response)}")
             response_message: Optional[dict] = response.get("message")
-            response_content = response_message.get("content") if response_message else None
+            response_content: str = response_message.get("content", "") if response_message else ""
             # logger.info(f"Ollama partial response content: {response_content}")

             # Add response content to assistant message
diff --git a/phi/llm/ollama/tools.py b/phi/llm/ollama/tools.py
index 6c7ef683e..9d0072bd1 100644
--- a/phi/llm/ollama/tools.py
+++ b/phi/llm/ollama/tools.py
@@ -259,7 +259,7 @@ def response_stream(self, messages: List[Message]) -> Iterator[str]:
             # logger.info(f"Ollama partial response: {response}")
             # logger.info(f"Ollama partial response type: {type(response)}")
             response_message: Optional[dict] = response.get("message")
-            response_content = response_message.get("content") if response_message else None
+            response_content: str = response_message.get("content", "") if response_message else ""
             # logger.info(f"Ollama partial response content: {response_content}")

             # Add response content to assistant message
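The audio_output_agent.py cookbook above hands the model's audio to phi.utils.audio.write_audio_to_file to persist it. As a minimal sketch of what that saving step amounts to — assuming response_audio["data"] arrives as a base64-encoded string (as the gpt-4o-audio-preview responses provide), and using a hypothetical helper name save_base64_wav rather than phi's actual implementation:

import base64
from pathlib import Path


def save_base64_wav(b64_audio: str, filename: str) -> None:
    # Assumption: the payload is base64 text (e.g. response_audio["data"]), not raw bytes
    audio_bytes = base64.b64decode(b64_audio)
    # Ensure the target directory (e.g. tmp/) exists before writing the wav bytes
    path = Path(filename)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_bytes(audio_bytes)
    # Usage (hypothetical): save_base64_wav(agent.run_response.response_audio["data"], "tmp/dog.wav")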