Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Audio output from models (i.e. voice mode) #1598

Draft
wants to merge 39 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
0aec262
feat: eleven labs audio_generation tool
anuragts Dec 12, 2024
94c73ee
Update cookbook/agents/46_generate_audio.py
anuragts Dec 12, 2024
55bb0b1
Update phi/tools/eleven_labs_tools.py
anuragts Dec 12, 2024
4b6818d
Update phi/tools/eleven_labs_tools.py
anuragts Dec 12, 2024
964b0b1
fix: use target_directory
anuragts Dec 12, 2024
1fd8485
fix: move client initialization to constructor
anuragts Dec 12, 2024
ede0783
Merge branch 'main' into multimodal-add-audiomusic-gen-tools-and-cook…
manthanguptaa Dec 13, 2024
27a7c3b
Merge branch 'main' into multimodal-add-audiomusic-gen-tools-and-cook…
anuragts Dec 16, 2024
32acd9a
Merge branch 'main' into multimodal-add-audiomusic-gen-tools-and-cook…
anuragts Dec 17, 2024
4214ccb
fix: update eleven labs
anuragts Dec 17, 2024
fd58550
Merge
dirkbrnd Dec 17, 2024
531177f
Update audio handling
dirkbrnd Dec 17, 2024
9b5b80d
Merge
dirkbrnd Dec 17, 2024
8105464
Change to
dirkbrnd Dec 17, 2024
9f10309
Fix style
dirkbrnd Dec 17, 2024
348b344
fix: voice id and add a default voice id
anuragts Dec 17, 2024
a1d4bb9
fix: format
anuragts Dec 17, 2024
8872f21
fix: storage in audio agent
anuragts Dec 17, 2024
e702c72
fix: don't show tool calls
anuragts Dec 17, 2024
f784fd3
Update
dirkbrnd Dec 17, 2024
559c792
Merge
dirkbrnd Dec 17, 2024
7291024
fix: send a mime_type
anuragts Dec 17, 2024
16b0bab
Add audio convo agent
dirkbrnd Dec 17, 2024
fa54fd5
Merge branch 'multimodal-add-audiomusic-gen-tools-and-cookbooks-phi-2…
dirkbrnd Dec 17, 2024
a6a1d26
Update
dirkbrnd Dec 17, 2024
df5ffb7
Update
dirkbrnd Dec 17, 2024
396ff7c
Add also for sync responses
dirkbrnd Dec 17, 2024
c8d266b
Merge branch 'main' into multimodal-add-audiomusic-gen-tools-and-cook…
anuragts Dec 17, 2024
d7fe059
fix: mime type
anuragts Dec 17, 2024
b193d69
Remove voice mode code
dirkbrnd Dec 18, 2024
d436c35
Update
dirkbrnd Dec 18, 2024
0163d64
Merge
dirkbrnd Dec 20, 2024
178001f
Update run_response to be structured
dirkbrnd Dec 20, 2024
494cf00
Update
dirkbrnd Dec 20, 2024
8ad8712
Merge branch 'main' into voice-mode
dirkbrnd Dec 20, 2024
37340f9
Style fix
dirkbrnd Dec 20, 2024
c50eeb6
Merge branch 'main' into voice-mode
dirkbrnd Dec 21, 2024
b2915ed
Style fixes
dirkbrnd Dec 21, 2024
e51ed15
Merge branch 'main' of https://github.com/phidatahq/phidata into voic…
dirkbrnd Jan 1, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cookbook/agents/37_audio_input_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,5 @@
audio={"data": encoded_string, "format": "wav"},
)

if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio:
write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/dog.wav")
if agent.run_response.response_audio is not None:
write_audio_to_file(audio=agent.run_response.response_audio.base64_audio, filename="tmp/dog.wav")
8 changes: 4 additions & 4 deletions cookbook/agents/38_audio_multi_turn.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@
)

agent.run("Is a golden retriever a good family dog?")
if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio:
write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/answer_1.wav")
if agent.run_response.response_audio is not None:
write_audio_to_file(audio=agent.run_response.response_audio.base64_audio, filename="tmp/answer_1.wav")

agent.run("Why do you say they are loyal?")
if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio:
write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/answer_2.wav")
if agent.run_response.response_audio is not None:
write_audio_to_file(audio=agent.run_response.response_audio.base64_audio, filename="tmp/answer_2.wav")
4 changes: 2 additions & 2 deletions cookbook/agents/42_image_to_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,5 @@
)

audio_story: RunResponse = audio_agent.run(f"Narrate the story with flair: {image_story.content}")
if audio_story.response_audio is not None and "data" in audio_story.response_audio:
write_audio_to_file(audio=audio_story.response_audio["data"], filename="tmp/multimodal-agents.wav")
if audio_story.response_audio is not None:
write_audio_to_file(audio=audio_story.response_audio.base64_audio, filename="tmp/multimodal-agents.wav")
24 changes: 24 additions & 0 deletions cookbook/playground/audio_conversation_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""Playground app exposing a single audio-conversation agent.

Runs a phi Playground server hosting an agent backed by OpenAI's
gpt-4o-audio-preview model with both text and audio output enabled.
"""

from phi.agent import Agent
from phi.model.openai import OpenAIChat
from phi.playground import Playground, serve_playground_app
from phi.storage.agent.sqlite import SqlAgentStorage


# Agent configured for spoken conversations: the model returns both a text
# transcript and synthesized audio on every turn.
audio_agent = Agent(
    name="Audio Chat Agent",
    model=OpenAIChat(
        id="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        # pcm16 is required here: wav is not supported for streaming responses.
        audio={"voice": "alloy", "format": "pcm16"},  # Wav not supported for streaming
    ),
    debug_mode=True,
    # Keep prior turns in context so the conversation is multi-turn.
    add_history_to_messages=True,
    add_datetime_to_instructions=True,
    # Persist sessions locally so history survives restarts.
    storage=SqlAgentStorage(table_name="audio_agent", db_file="tmp/audio_agent.db"),
)


app = Playground(agents=[audio_agent]).get_app()

if __name__ == "__main__":
    # reload=True enables auto-restart on file changes (development mode).
    serve_playground_app("audio_conversation_agent:app", reload=True)
1 change: 0 additions & 1 deletion cookbook/tools/elevenlabs_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,4 @@

audio_agent.print_response("Generate a very long audio of history of french revolution")


audio_agent.print_response("Generate a kick sound effect")
9 changes: 6 additions & 3 deletions cookbook/tools/zoom_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ def get_access_token(self) -> str:
"""
Obtain or refresh the access token for Zoom API.

to get the account_id ,client_id ,client_secret
To get the account_id, client_id, client_secret
https://developers.zoom.us/docs/internal-apps/create/

for oauth 2.0
For oauth 2.0
https://developers.zoom.us/docs/integrations/oauth/
Returns:
A string containing the access token or an empty string if token retrieval fails.
Expand All @@ -47,7 +47,10 @@ def get_access_token(self) -> str:

try:
response = requests.post(
self.token_url, headers=headers, data=data, auth=(self.client_id, self.client_secret)
self.token_url,
headers=headers,
data=data,
auth=(self.client_id, self.client_secret), # type: ignore
)
response.raise_for_status()

Expand Down
4 changes: 2 additions & 2 deletions cookbook/workflows/startup_idea_validator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""
1. Install dependencies using: `pip install openai exa_py sqlalchemy phidata`
2. Run the script using: `python cookbook/workflows/blog_post_generator.py`
1. Install dependencies using: `pip install openai googlesearch-python pycountry phidata`
2. Run the script using: `python cookbook/workflows/startup_idea_validator.py`
"""

import json
Expand Down
42 changes: 38 additions & 4 deletions phi/agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

from phi.document import Document
from phi.agent.session import AgentSession
from phi.model.content import Image, Video, Audio
from phi.model.content import Image, Video, Audio, ModelResponseAudio
from phi.reasoning.step import ReasoningStep, ReasoningSteps, NextAction
from phi.run.response import RunEvent, RunResponse, RunResponseExtraData
from phi.knowledge.agent import AgentKnowledge
Expand Down Expand Up @@ -1811,6 +1811,14 @@ def _run(
self.run_response.created_at = model_response_chunk.created_at
yield self.run_response

if model_response_chunk.audio is not None:
if model_response.audio is None:
model_response.audio = {"data": "", "transcript": ""}

model_response.audio["data"] += model_response_chunk.audio.get("data", "")
model_response.audio["transcript"] += model_response_chunk.audio.get("transcript", "")
yield self.run_response

elif model_response_chunk.event == ModelResponseEvent.tool_call_started.value:
# Add tool call to the run_response
tool_call_dict = model_response_chunk.tool_call
Expand Down Expand Up @@ -1847,7 +1855,10 @@ def _run(
else:
self.run_response.content = model_response.content
if model_response.audio is not None:
self.run_response.response_audio = model_response.audio
self.run_response.response_audio = ModelResponseAudio(
base64_audio=model_response.audio.get("data"),
transcript=model_response.audio.get("transcript"),
)
self.run_response.messages = messages_for_model
self.run_response.created_at = model_response.created_at

Expand All @@ -1862,7 +1873,10 @@ def _run(
if self.stream:
self.run_response.content = model_response.content
if model_response.audio is not None:
self.run_response.response_audio = model_response.audio
self.run_response.response_audio = ModelResponseAudio(
base64_audio=model_response.audio.get("data"),
transcript=model_response.audio.get("transcript"),
)

# 6. Update Memory
if self.stream_intermediate_steps:
Expand Down Expand Up @@ -2153,6 +2167,16 @@ async def _arun(
self.run_response.content = model_response_chunk.content
self.run_response.created_at = model_response_chunk.created_at
yield self.run_response

if model_response_chunk.audio is not None:
if model_response.audio is None:
model_response.audio = {"data": "", "transcript": ""}

model_response.audio["data"] += model_response_chunk.audio.get("data", "")
model_response.audio["transcript"] += model_response_chunk.audio.get("transcript", "")
# self.run_response.response_audio = model_response_chunk.audio
# self.run_response.created_at = model_response_chunk.created_at
yield self.run_response
elif model_response_chunk.event == ModelResponseEvent.tool_call_started.value:
# Add tool call to the run_response
tool_call_dict = model_response_chunk.tool_call
Expand Down Expand Up @@ -2188,21 +2212,31 @@ async def _arun(
self.run_response.content_type = self.response_model.__name__
else:
self.run_response.content = model_response.content
if model_response.audio is not None:
self.run_response.response_audio = ModelResponseAudio(
base64_audio=model_response.audio.get("data"),
transcript=model_response.audio.get("transcript"),
)
self.run_response.messages = messages_for_model
self.run_response.created_at = model_response.created_at

# Build a list of messages that belong to this particular run
run_messages = user_messages + messages_for_model[num_input_messages:]
if system_message is not None:
run_messages.insert(0, system_message)

# Update the run_response
self.run_response.messages = run_messages
self.run_response.metrics = self._aggregate_metrics_from_run_messages(run_messages)

# Update the run_response content if streaming as run_response will only contain the last chunk
if self.stream:
self.run_response.content = model_response.content
if model_response.audio is not None:
self.run_response.response_audio = model_response.audio
self.run_response.response_audio = ModelResponseAudio(
base64_audio=model_response.audio.get("data"),
transcript=model_response.audio.get("transcript"),
)

# 6. Update Memory
if self.stream_intermediate_steps:
Expand Down
8 changes: 4 additions & 4 deletions phi/model/cohere/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from phi.utils.tools import get_function_call_for_tool_call

try:
from cohere import Client as CohereClient
from cohere import Client as CohereClient, ToolCall
from cohere.types.tool import Tool as CohereTool
from cohere.types.non_streamed_chat_response import NonStreamedChatResponse
from cohere.types.streamed_chat_response import (
Expand Down Expand Up @@ -286,7 +286,7 @@ def _handle_tool_calls(
self,
assistant_message: Message,
messages: List[Message],
response_tool_calls: List[Any],
response_tool_calls: List[ToolCall],
model_response: ModelResponse,
) -> Optional[Any]:
"""
Expand All @@ -295,7 +295,7 @@ def _handle_tool_calls(
Args:
assistant_message (Message): The assistant message.
messages (List[Message]): The list of messages.
response_tool_calls (List[Any]): The list of response tool calls.
response_tool_calls (List[ToolCall]): The list of response tool calls.
model_response (ModelResponse): The model response.

Returns:
Expand Down Expand Up @@ -420,7 +420,7 @@ def response(self, messages: List[Message], tool_results: Optional[List[ToolResu
tool_results = self._handle_tool_calls(
assistant_message=assistant_message,
messages=messages,
response_tool_calls=response_tool_calls,
response_tool_calls=response_tool_calls, # type: ignore
model_response=model_response,
)

Expand Down
6 changes: 6 additions & 0 deletions phi/model/content.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class Audio(Media):
url: Optional[str] = None # Remote location for file
base64_audio: Optional[str] = None # Base64-encoded audio data
length: Optional[str] = None
mime_type: Optional[str] = None

@model_validator(mode="before")
def validate_exclusive_audio(cls, data: Any):
Expand All @@ -35,3 +36,8 @@ def validate_exclusive_audio(cls, data: Any):
if not data.get("url") and not data.get("base64_audio"):
raise ValueError("Either `url` or `base64_audio` must be provided.")
return data


class ModelResponseAudio(BaseModel):
    """Audio produced by a model as part of its response (e.g. voice mode).

    Distinct from `Audio` above: this is model *output*, so the audio data is
    always present (base64-encoded) rather than being one of url/base64.
    """

    # Base64-encoded audio bytes returned by the model.
    base64_audio: str
    # Text transcript of the audio, when the model provides one.
    transcript: Optional[str] = None
7 changes: 7 additions & 0 deletions phi/playground/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,13 @@ async def chat_response_streamer(
) -> AsyncGenerator:
run_response = await agent.arun(message, images=images, stream=True, stream_intermediate_steps=True)
async for run_response_chunk in run_response:
print(
run_response_chunk.event,
"|",
run_response_chunk.content,
"|",
run_response_chunk.response_audio.base64_audio[:10] if run_response_chunk.response_audio else "-",
)
run_response_chunk = cast(RunResponse, run_response_chunk)
yield run_response_chunk.to_json()

Expand Down
4 changes: 2 additions & 2 deletions phi/run/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from pydantic import BaseModel, ConfigDict, Field

from phi.model.content import Video, Image, Audio
from phi.model.content import Video, Image, Audio, ModelResponseAudio
from phi.reasoning.step import ReasoningStep
from phi.model.message import Message, MessageReferences

Expand Down Expand Up @@ -53,7 +53,7 @@ class RunResponse(BaseModel):
images: Optional[List[Image]] = None # Images attached to the response
videos: Optional[List[Video]] = None # Videos attached to the response
audio: Optional[List[Audio]] = None # Audio attached to the response
response_audio: Optional[Dict] = None # Model audio response
response_audio: Optional[ModelResponseAudio] = None # Model audio response
extra_data: Optional[RunResponseExtraData] = None
created_at: int = Field(default_factory=lambda: int(time()))

Expand Down
2 changes: 1 addition & 1 deletion phi/tools/calcom.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def get_available_slots(
"eventTypeId": self.event_type_id,
}

response = requests.get(url, headers=self._get_headers(), params=querystring)
response = requests.get(url, headers=self._get_headers(), params=querystring) # type: ignore
if response.status_code == 200:
slots = response.json()["data"]["slots"]
available_slots = []
Expand Down
30 changes: 19 additions & 11 deletions phi/tools/eleven_labs_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,14 @@ def __init__(

def get_voices(self) -> str:
"""
Use this function to get all the voices available.

Returns:
result (list): A list of voices that have an ID, name and description.
Use this function to get all the voices available.

Returns:
result (list): A list of voices that have an ID, name and description.
"""
try:
voices = self.eleven_labs_client.voices.get_all()
Expand Down Expand Up @@ -152,13 +156,17 @@ def generate_sound_effect(self, agent: Agent, prompt: str, duration_seconds: Opt

def text_to_speech(self, agent: Agent, prompt: str, voice_id: Optional[str] = None) -> str:
"""
Use this function to convert text to speech audio.

Args:
prompt (str): Text to generate audio from.
voice_id (Optional[str]): The ID of the voice to use for audio generation. Uses default if none is specified.
Returns:
str: Return the path to the generated audio file.
Use this function to convert text to speech audio.

Args:
prompt (str): Text to generate audio from.
voice_id (Optional[str]): The ID of the voice to use for audio generation. Uses default if none is specified.
Returns:
str: Return the path to the generated audio file.
"""
try:
audio_generator = self.eleven_labs_client.text_to_speech.convert(
Expand Down
2 changes: 1 addition & 1 deletion phi/tools/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def __init__(

def authenticate(self):
"""Authenticate with GitHub using the provided access token."""
auth = Auth.Token(self.access_token)
auth = Auth.Token(self.access_token) # type: ignore
if self.base_url:
logger.debug(f"Authenticating with GitHub Enterprise at {self.base_url}")
return Github(base_url=self.base_url, auth=auth)
Expand Down
2 changes: 1 addition & 1 deletion phi/tools/zoom.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def get_upcoming_meetings(self, user_id: str = "me") -> str:
params = {"type": "upcoming", "page_size": 30}

try:
response = requests.get(url, headers=headers, params=params)
response = requests.get(url, headers=headers, params=params) # type: ignore
response.raise_for_status()
meetings = response.json()

Expand Down
2 changes: 1 addition & 1 deletion phi/vectordb/qdrant/qdrant.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def client(self) -> QdrantClient:
https=self.https,
api_key=self.api_key,
prefix=self.prefix,
timeout=self.timeout,
timeout=self.timeout, # type: ignore
host=self.host,
path=self.path,
**self.kwargs,
Expand Down
Loading