diff --git a/cookbook/agents/47_desi_vocal_audio_gen.py b/cookbook/agents/47_desi_vocal_audio_gen.py
index 4aaace832..b5db826a4 100644
--- a/cookbook/agents/47_desi_vocal_audio_gen.py
+++ b/cookbook/agents/47_desi_vocal_audio_gen.py
@@ -22,4 +22,4 @@
     show_tool_calls=True,
 )
 
-audio_agent.print_response("Generate a very small audio of history of french revolution")
\ No newline at end of file
+audio_agent.print_response("Generate a very small audio of history of french revolution")
diff --git a/cookbook/playground/multimodal_agent.py b/cookbook/playground/multimodal_agent.py
index 412168ec5..ca87d9075 100644
--- a/cookbook/playground/multimodal_agent.py
+++ b/cookbook/playground/multimodal_agent.py
@@ -16,6 +16,7 @@
 from phi.playground import Playground, serve_playground_app
 from phi.storage.agent.sqlite import SqlAgentStorage
 from phi.tools.fal_tools import FalTools
+from phi.tools.desi_vocal_tools import DesiVocalTools
 
 image_agent_storage_file: str = "tmp/image_agent.db"
 
@@ -128,10 +129,31 @@
     storage=SqlAgentStorage(table_name="audio_agent", db_file=image_agent_storage_file),
 )
 
-
-app = Playground(agents=[image_agent, ml_gif_agent, ml_video_agent, fal_agent, gif_agent, audio_agent]).get_app(
-    use_async=False
+hindi_audio_agent = Agent(
+    name="Hindi Audio Generator Agent",
+    agent_id="hindi_audio_agent",
+    model=OpenAIChat(id="gpt-4o"),
+    tools=[DesiVocalTools()],
+    description="You are an AI agent that can generate audio using the DesiVocal API.",
+    instructions=[
+        "When the user asks you to generate audio, use the `text_to_speech` tool to generate the audio.",
+        "Send the prompt in hindi language.",
+        "You'll generate the appropriate prompt to send to the tool to generate audio.",
+        "You don't need to find the appropriate voice first, I already specified the voice to user.",
+        "Don't return file name or file url in your response or markdown just tell the audio was created successfully.",
+        "The audio should be short.",
+    ],
+    markdown=True,
+    debug_mode=True,
+    add_history_to_messages=True,
+    add_datetime_to_instructions=True,
+    storage=SqlAgentStorage(table_name="hindi_audio_agent", db_file=image_agent_storage_file),
 )
+
+app = Playground(
+    agents=[image_agent, ml_gif_agent, ml_video_agent, fal_agent, gif_agent, audio_agent, hindi_audio_agent]
+).get_app(use_async=False)
+
 
 if __name__ == "__main__":
     serve_playground_app("multimodal_agent:app", reload=True)
diff --git a/phi/tools/desi_vocal_tools.py b/phi/tools/desi_vocal_tools.py
index 145c81221..9c89e5788 100644
--- a/phi/tools/desi_vocal_tools.py
+++ b/phi/tools/desi_vocal_tools.py
@@ -26,24 +26,51 @@ def __init__(
         self.register(self.get_voices)
         self.register(self.text_to_speech)
 
-    # def get_voices(self) -> str:
-    #     try:
-    #         url = "https://prod-api2.desivocal.com/dv/api/v0/tts_api/voices"
-    #         response = requests.get(url)
-    #         for voice in response.json():
-
-    #         return str(response.text)
-    #     except Exception as e:
-    #         logger.error(f"Failed to get voices: {e}")
-    #         return f"Error: {e}"
-
-    def text_to_speech(self, agent: Agent, text: str) -> str:
+    def get_voices(self) -> str:
+        """
+        Use this function to get all the voices available.
+
+        Returns:
+            result (str): String form of a list of voices, each with an ID, name, description and preview URL.
+        """
+        try:
+            url = "https://prod-api2.desivocal.com/dv/api/v0/tts_api/voices"
+            response = requests.get(url)
+            voices_data = response.json()
+
+            voices = []
+            for voice_id, voice_info in voices_data.items():
+                voices.append(
+                    {
+                        "id": voice_id,
+                        "name": voice_info["name"],
+                        "description": f"Gender: {voice_info['audio_gender']}, Type: {voice_info['voice_type']}, Languages: {', '.join(voice_info['languages'])}",
+                        "preview_url": next(iter(voice_info["preview_path"].values()))
+                        if voice_info["preview_path"]
+                        else None,
+                    }
+                )
+
+            return str(voices)
+        except Exception as e:
+            logger.error(f"Failed to get voices: {e}")
+            return f"Error: {e}"
+
+    def text_to_speech(self, agent: Agent, prompt: str, voice_id: Optional[str] = None) -> str:
+        """
+        Use this function to generate audio from text.
+
+        Args:
+            prompt (str): The text to generate audio from.
+        Returns:
+            result (str): The URL of the generated audio.
+        """
         try:
             url = "https://prod-api2.desivocal.com/dv/api/v0/tts_api/generate"
 
             payload = {
-                "text": text,
-                "voice_id": self.voice_id,
+                "text": prompt,
+                "voice_id": voice_id or self.voice_id,
             }
 
             headers = {