Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Desi Vocal audio tool #1608

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 25 additions & 3 deletions cookbook/playground/multimodal_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from phi.playground import Playground, serve_playground_app
from phi.storage.agent.sqlite import SqlAgentStorage
from phi.tools.fal_tools import FalTools
from phi.tools.desi_vocal_tools import DesiVocalTools

image_agent_storage_file: str = "tmp/image_agent.db"

Expand Down Expand Up @@ -128,10 +129,31 @@
storage=SqlAgentStorage(table_name="audio_agent", db_file=image_agent_storage_file),
)


app = Playground(agents=[image_agent, ml_gif_agent, ml_video_agent, fal_agent, gif_agent, audio_agent]).get_app(
use_async=False
# Playground agent that turns user requests into Hindi speech via DesiVocalTools.
hindi_audio_agent = Agent(
    name="Hindi Audio Generator Agent",
    agent_id="hindi_audio_agent",
    model=OpenAIChat(id="gpt-4o"),
    tools=[DesiVocalTools()],
    description="You are an AI agent that can generate audio using the DesiVocal API.",
    # Each instruction is its own list item. Every entry needs a trailing comma:
    # without one, Python concatenates adjacent string literals and two
    # instructions are merged into a single run-on sentence.
    instructions=[
        "When the user asks you to generate audio, use the `text_to_speech` tool to generate the audio.",
        "Send the prompt in hindi language.",
        "You'll generate the appropriate prompt to send to the tool to generate audio.",
        "You don't need to find the appropriate voice first, I already specified the voice to user.",
        "Don't return file name or file url in your response or markdown just tell the audio was created successfully.",
        "The audio should be short.",
    ],
    markdown=True,
    debug_mode=True,
    add_history_to_messages=True,
    add_datetime_to_instructions=True,
    # Shares the SQLite file used by the other agents, under its own table.
    storage=SqlAgentStorage(table_name="hindi_audio_agent", db_file=image_agent_storage_file),
)


# Build the playground from every agent defined above, then derive the
# application object from it with async handling disabled.
playground = Playground(
    agents=[
        image_agent,
        ml_gif_agent,
        ml_video_agent,
        fal_agent,
        gif_agent,
        audio_agent,
        hindi_audio_agent,
    ]
)
app = playground.get_app(use_async=False)

if __name__ == "__main__":
    # The app is passed by name ("multimodal_agent:app") rather than as an object.
    serve_playground_app("multimodal_agent:app", reload=True)
25 changes: 25 additions & 0 deletions cookbook/tools/desi_vocal_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""
pip install requests
"""

from phi.agent import Agent
from phi.model.openai import OpenAIChat
from phi.tools.desi_vocal_tools import DesiVocalTools

# Guidance given to the model on how to drive the DesiVocal toolkit.
tts_instructions = [
    "When the user asks you to generate audio, use the `text_to_speech` tool to generate the audio.",
    "You'll generate the appropriate prompt to send to the tool to generate audio.",
    "You don't need to find the appropriate voice first, I already specified the voice to user.",
    "Return the audio file name in your response. Don't convert it to markdown.",
    "Generate the text prompt we send in hindi language",
]

# Example agent wired to the DesiVocal text-to-speech toolkit.
audio_agent = Agent(
    model=OpenAIChat(id="gpt-4o"),
    tools=[DesiVocalTools()],
    description="You are an AI agent that can generate audio using the DesiVocal API.",
    instructions=tts_instructions,
    markdown=True,
    debug_mode=True,
    show_tool_calls=True,
)

# Run one sample request and print the agent's response.
audio_agent.print_response("Generate a very small audio of history of french revolution")
2 changes: 1 addition & 1 deletion cookbook/workflows/startup_idea_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def run(self, startup_idea: str) -> Iterator[RunResponse]:
table_name="validate_startup_ideas_workflow",
db_file="tmp/workflows.db",
),
debug_mode=True
debug_mode=True,
)

final_report: Iterator[RunResponse] = startup_idea_validator.run(startup_idea=idea)
Expand Down
92 changes: 92 additions & 0 deletions phi/tools/desi_vocal_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
from phi.tools import Toolkit
from os import getenv
from typing import Optional
from phi.utils.log import logger
from phi.agent import Agent
from phi.model.content import Audio
from uuid import uuid4

import requests


class DesiVocalTools(Toolkit):
    """Toolkit exposing the DesiVocal text-to-speech HTTP API as agent tools.

    Two tools are registered: `get_voices`, which lists the available voices,
    and `text_to_speech`, which generates audio for a piece of text and
    attaches the resulting audio URL to the calling agent.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        voice_id: Optional[str] = "f27d74e5-ea71-4697-be3e-f04bbd80c1a8",
        timeout: Optional[float] = 60,
    ):
        """
        Args:
            api_key: DesiVocal API key. Falls back to the DESI_VOCAL_API_KEY
                environment variable when not provided.
            voice_id: Default voice used by `text_to_speech` when the caller
                does not pass one explicitly.
            timeout: Seconds to wait for each HTTP request before giving up.
                Pass None to wait indefinitely (the previous behaviour).
        """
        super().__init__(name="desi_vocal_tools")

        self.api_key = api_key or getenv("DESI_VOCAL_API_KEY")
        if not self.api_key:
            logger.error("DESI_VOCAL_API_KEY not set. Please set the DESI_VOCAL_API_KEY environment variable.")

        self.voice_id = voice_id
        self.timeout = timeout

        self.register(self.get_voices)
        self.register(self.text_to_speech)

    def get_voices(self) -> str:
        """
        Use this function to get all the voices available.

        Returns:
            result (str): String representation of a list of voices, each with an
                id, name, gender, type, language and preview_url. On failure, a
                string starting with "Error:".
        """
        try:
            url = "https://prod-api2.desivocal.com/dv/api/v0/tts_api/voices"
            response = requests.get(url, timeout=self.timeout)
            # Surface HTTP errors directly instead of failing later while parsing the body.
            response.raise_for_status()
            voices_data = response.json()

            voices = []
            for voice_id, voice_info in voices_data.items():
                preview_paths = voice_info["preview_path"]
                voices.append(
                    {
                        "id": voice_id,
                        "name": voice_info["name"],
                        "gender": voice_info["audio_gender"],
                        "type": voice_info["voice_type"],
                        "language": ", ".join(voice_info["languages"]),
                        # Use the first preview when any exist; otherwise there is none to show.
                        "preview_url": next(iter(preview_paths.values())) if preview_paths else None,
                    }
                )

            return str(voices)
        except Exception as e:
            logger.error(f"Failed to get voices: {e}")
            return f"Error: {e}"

    def text_to_speech(self, agent: Agent, prompt: str, voice_id: Optional[str] = None) -> str:
        """
        Use this function to generate audio from text.

        Args:
            prompt (str): The text to generate audio from.
            voice_id (Optional[str]): The ID of the voice to use. Defaults to the voice configured on the toolkit.
        Returns:
            result (str): The URL of the generated audio, or a string starting with "Error:" on failure.
        """
        # Fail with a clear message instead of sending a request with an empty key header.
        if not self.api_key:
            logger.error("Failed to generate audio: DESI_VOCAL_API_KEY not set.")
            return "Error: DESI_VOCAL_API_KEY not set. Please set the DESI_VOCAL_API_KEY environment variable."

        try:
            url = "https://prod-api2.desivocal.com/dv/api/v0/tts_api/generate"

            payload = {
                "text": prompt,
                "voice_id": voice_id or self.voice_id,
            }

            headers = {
                "X_API_KEY": self.api_key,
                "Content-Type": "application/json",
            }

            response = requests.post(url, headers=headers, json=payload, timeout=self.timeout)
            # Without this, an HTTP error would surface as an opaque KeyError on "s3_path".
            response.raise_for_status()

            audio_url = response.json()["s3_path"]

            # Attach the generated audio to the agent under a fresh unique id.
            agent.add_audio(Audio(id=str(uuid4()), url=audio_url))

            return audio_url
        except Exception as e:
            logger.error(f"Failed to generate audio: {e}")
            return f"Error: {e}"
Loading