fix: get voices

phidatahq · Dec 19, 2024 · 829394a · 829394a
1 parent c88f629
commit 829394a
Show file tree

Hide file tree

Showing 3 changed files with 67 additions and 18 deletions.
diff --git a/cookbook/agents/47_desi_vocal_audio_gen.py b/cookbook/agents/47_desi_vocal_audio_gen.py
@@ -22,4 +22,4 @@
     show_tool_calls=True,
 )
 
-audio_agent.print_response("Generate a very small audio of history of french revolution")
+audio_agent.print_response("Generate a very small audio of history of french revolution")
diff --git a/cookbook/playground/multimodal_agent.py b/cookbook/playground/multimodal_agent.py
@@ -16,6 +16,7 @@
 from phi.playground import Playground, serve_playground_app
 from phi.storage.agent.sqlite import SqlAgentStorage
 from phi.tools.fal_tools import FalTools
+from phi.tools.desi_vocal_tools import DesiVocalTools
 
 image_agent_storage_file: str = "tmp/image_agent.db"
 
@@ -128,10 +129,31 @@
     storage=SqlAgentStorage(table_name="audio_agent", db_file=image_agent_storage_file),
 )
 
-
-app = Playground(agents=[image_agent, ml_gif_agent, ml_video_agent, fal_agent, gif_agent, audio_agent]).get_app(
-    use_async=False
+# Playground agent that generates short audio from Hindi prompts through DesiVocalTools.
+hindi_audio_agent = Agent(
+    name="Hindi Audio Generator Agent",
+    agent_id="hindi_audio_agent",
+    model=OpenAIChat(id="gpt-4o"),
+    tools=[DesiVocalTools()],
+    description="You are an AI agent that can generate audio using the DesiVocal API.",
+    # Every instruction must end with a comma: adjacent string literals are implicitly
+    # concatenated, which would silently merge two instructions into one item.
+    instructions=[
+        "When the user asks you to generate audio, use the `text_to_speech` tool to generate the audio.",
+        "Send the prompt in hindi language.",
+        "You'll generate the appropriate prompt to send to the tool to generate audio.",
+        "You don't need to find the appropriate voice first, I already specified the voice to user.",
+        "Don't return file name or file url in your response or markdown just tell the audio was created successfully.",
+        "The audio should be short.",
+    ],
+    markdown=True,
+    debug_mode=True,
+    add_history_to_messages=True,
+    add_datetime_to_instructions=True,
+    # Shares the SQLite file used by the other agents, under its own table name.
+    storage=SqlAgentStorage(table_name="hindi_audio_agent", db_file=image_agent_storage_file),
+)
 
+
+# Build the playground app from the listed agents, now including hindi_audio_agent.
+app = Playground(
+    agents=[image_agent, ml_gif_agent, ml_video_agent, fal_agent, gif_agent, audio_agent, hindi_audio_agent]
+).get_app(use_async=False)
+
 if __name__ == "__main__":
     serve_playground_app("multimodal_agent:app", reload=True)
diff --git a/phi/tools/desi_vocal_tools.py b/phi/tools/desi_vocal_tools.py
@@ -26,24 +26,51 @@ def __init__(
         self.register(self.get_voices)
         self.register(self.text_to_speech)
 
-    # def get_voices(self) -> str:
-    #     try:
-    #         url = "https://prod-api2.desivocal.com/dv/api/v0/tts_api/voices"
-    #         response = requests.get(url)
-    #         for voice in response.json():
-
-    #         return str(response.text)
-    #     except Exception as e:
-    #         logger.error(f"Failed to get voices: {e}")
-    #         return f"Error: {e}"
-
-    def text_to_speech(self, agent: Agent, text: str) -> str:
+    def get_voices(self) -> str:
+        """
+        Use this function to get all the voices available.
+
+        Returns:
+            result (str): The string form of a list of voices, each with an ID, name,
+                description and preview URL, or an "Error: ..." message if the lookup fails.
+        """
+        try:
+            url = "https://prod-api2.desivocal.com/dv/api/v0/tts_api/voices"
+            # Bound the request so a stalled connection cannot block the caller indefinitely.
+            response = requests.get(url, timeout=30)
+            # Report HTTP failures directly instead of failing later while reading an error body.
+            response.raise_for_status()
+            voices_data = response.json()
+
+            voices = []
+            for voice_id, voice_info in voices_data.items():
+                # A missing or empty preview_path yields no preview instead of aborting the listing.
+                preview_path = voice_info.get("preview_path")
+                voices.append(
+                    {
+                        "id": voice_id,
+                        "name": voice_info["name"],
+                        "description": f"Gender: {voice_info['audio_gender']}, Type: {voice_info['voice_type']}, Languages: {', '.join(voice_info['languages'])}",
+                        # Only the first value of the preview_path mapping is reported.
+                        "preview_url": next(iter(preview_path.values())) if preview_path else None,
+                    }
+                )
+
+            return str(voices)
+        except Exception as e:
+            logger.error(f"Failed to get voices: {e}")
+            return f"Error: {e}"
+
+    def text_to_speech(self, agent: Agent, prompt: str, voice_id: Optional[str] = None) -> str:
+        """
+        Use this function to generate audio from text.
+
+        Args:
+            prompt (str): The text to generate audio from.
+        Returns:
+            result (str): The URL of the generated audio.
+        """
         try:
             url = "https://prod-api2.desivocal.com/dv/api/v0/tts_api/generate"
 
             payload = {
-                "text": text,
-                "voice_id": self.voice_id,
+                "text": prompt,
+                "voice_id": voice_id or self.voice_id,
             }
 
             headers = {