phidatahq · anuragts · Dec 19, 2024 · Dec 19, 2024 · Dec 20, 2024 · Dec 20, 2024
diff --git a/cookbook/playground/multimodal_agent.py b/cookbook/playground/multimodal_agent.py
@@ -16,6 +16,7 @@
 from phi.playground import Playground, serve_playground_app
 from phi.storage.agent.sqlite import SqlAgentStorage
 from phi.tools.fal_tools import FalTools
+from phi.tools.desi_vocal_tools import DesiVocalTools
 
 image_agent_storage_file: str = "tmp/image_agent.db"
 
@@ -128,10 +129,31 @@
     storage=SqlAgentStorage(table_name="audio_agent", db_file=image_agent_storage_file),
 )
 
-
-app = Playground(agents=[image_agent, ml_gif_agent, ml_video_agent, fal_agent, gif_agent, audio_agent]).get_app(
-    use_async=False
+hindi_audio_agent = Agent(
+    name="Hindi Audio Generator Agent",
+    agent_id="hindi_audio_agent",
+    model=OpenAIChat(id="gpt-4o"),
+    tools=[DesiVocalTools()],
+    description="You are an AI agent that can generate audio using the DesiVocal API.",
+    instructions=[  # one rule per list item; a missing comma silently fuses two adjacent literals into one
+        "When the user asks you to generate audio, use the `text_to_speech` tool to generate the audio.",
+        "Send the prompt in hindi language.",
+        "You'll generate the appropriate prompt to send to the tool to generate audio.",
+        "You don't need to find the appropriate voice first, I already specified the voice to user.",
+        "Don't return file name or file url in your response or markdown just tell the audio was created successfully.",
+        "The audio should be short.",
+    ],
+    markdown=True,
+    debug_mode=True,
+    add_history_to_messages=True,
+    add_datetime_to_instructions=True,
+    storage=SqlAgentStorage(table_name="hindi_audio_agent", db_file=image_agent_storage_file),
 )
 
+
+app = Playground(  # module-level app object; served below under the import string "multimodal_agent:app"
+    agents=[image_agent, ml_gif_agent, ml_video_agent, fal_agent, gif_agent, audio_agent, hindi_audio_agent]
+).get_app(use_async=False)
+
 if __name__ == "__main__":
     serve_playground_app("multimodal_agent:app", reload=True)
diff --git a/cookbook/tools/desi_vocal_tools.py b/cookbook/tools/desi_vocal_tools.py
@@ -0,0 +1,25 @@
+"""
+pip install requests
+"""
+
+from phi.agent import Agent
+from phi.model.openai import OpenAIChat
+from phi.tools.desi_vocal_tools import DesiVocalTools
+
+audio_agent = Agent(  # demo agent: OpenAIChat "gpt-4o" model with the DesiVocal toolkit attached
+    model=OpenAIChat(id="gpt-4o"),
+    tools=[DesiVocalTools()],
+    description="You are an AI agent that can generate audio using the DesiVocal API.",
+    instructions=[  # each list item is a separate instruction string
+        "When the user asks you to generate audio, use the `text_to_speech` tool to generate the audio.",
+        "You'll generate the appropriate prompt to send to the tool to generate audio.",
+        "You don't need to find the appropriate voice first, I already specified the voice to user.",
+        "Return the audio file name in your response. Don't convert it to markdown.",
+        "Generate the text prompt we send in hindi language",
+    ],
+    markdown=True,
+    debug_mode=True,
+    show_tool_calls=True,
+)
+# Module-level call: this sample prompt is sent through the agent as soon as the script is executed.
+audio_agent.print_response("Generate a very small audio of history of french revolution")
diff --git a/cookbook/workflows/startup_idea_validator.py b/cookbook/workflows/startup_idea_validator.py
@@ -205,7 +205,7 @@ def run(self, startup_idea: str) -> Iterator[RunResponse]:
             table_name="validate_startup_ideas_workflow",
             db_file="tmp/workflows.db",
         ),
-        debug_mode=True
+        debug_mode=True,
     )
 
     final_report: Iterator[RunResponse] = startup_idea_validator.run(startup_idea=idea)

diff --git a/phi/tools/desi_vocal_tools.py b/phi/tools/desi_vocal_tools.py
@@ -0,0 +1,92 @@
+from phi.tools import Toolkit
+from os import getenv
+from typing import Optional
+from phi.utils.log import logger
+from phi.agent import Agent
+from phi.model.content import Audio
+from uuid import uuid4
+
+import requests
+
+
+class DesiVocalTools(Toolkit):
+    """Toolkit with two tools backed by the DesiVocal HTTP API: `get_voices` and `text_to_speech`."""
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        voice_id: Optional[str] = "f27d74e5-ea71-4697-be3e-f04bbd80c1a8",
+        timeout: Optional[float] = 60,
+    ):
+        """
+        Store credentials and defaults, then register the two tool functions.
+
+        Args:
+            api_key: DesiVocal API key; falls back to the DESI_VOCAL_API_KEY environment variable.
+            voice_id: Voice used by `text_to_speech` when a call does not pass its own voice_id.
+            timeout: Seconds to wait on each HTTP request; None disables the limit.
+        """
+        super().__init__(name="desi_vocal_tools")
+
+        self.api_key = api_key or getenv("DESI_VOCAL_API_KEY")
+        if not self.api_key:
+            logger.error("DESI_VOCAL_API_KEY not set. Please set the DESI_VOCAL_API_KEY environment variable.")
+        self.voice_id = voice_id
+        self.timeout = timeout
+
+        self.register(self.get_voices)
+        self.register(self.text_to_speech)
+
+    def get_voices(self) -> str:
+        """
+        Use this function to get all the voices available.
+
+        Returns:
+            result (str): String form of a list of voices (id, name, gender, type, language, preview_url).
+        """
+        try:
+            url = "https://prod-api2.desivocal.com/dv/api/v0/tts_api/voices"
+            response = requests.get(url, timeout=self.timeout)
+            response.raise_for_status()  # report HTTP failures as such, not as a parsing error below
+            voices_data = response.json()
+            voices = []
+            for voice_id, voice_info in voices_data.items():
+                preview_paths = voice_info["preview_path"]
+                voices.append(
+                    {
+                        "id": voice_id,
+                        "name": voice_info["name"],
+                        "gender": voice_info["audio_gender"],
+                        "type": voice_info["voice_type"],
+                        "language": ", ".join(voice_info["languages"]),
+                        "preview_url": next(iter(preview_paths.values())) if preview_paths else None,
+                    }
+                )
+            return str(voices)
+        except Exception as e:
+            logger.error(f"Failed to get voices: {e}")
+            return f"Error: {e}"
+
+    def text_to_speech(self, agent: Agent, prompt: str, voice_id: Optional[str] = None) -> str:
+        """
+        Use this function to generate audio from text.
+
+        Args:
+            prompt (str): The text to generate audio from.
+            voice_id (Optional[str]): The voice to use. Defaults to the voice configured on the toolkit.
+        Returns:
+            result (str): The URL of the generated audio, or an "Error: ..." message on failure.
+        """
+        try:
+            url = "https://prod-api2.desivocal.com/dv/api/v0/tts_api/generate"
+            payload = {"text": prompt, "voice_id": voice_id or self.voice_id}
+            headers = {"X_API_KEY": self.api_key, "Content-Type": "application/json"}
+            response = requests.post(url, headers=headers, json=payload, timeout=self.timeout)
+            response.raise_for_status()  # surface HTTP errors instead of a KeyError on "s3_path"
+            audio_url = response.json()["s3_path"]
+            # Add the generated audio to the agent via add_audio, then return its URL as the tool result.
+            agent.add_audio(Audio(id=str(uuid4()), url=audio_url))
+            return audio_url
+        except Exception as e:
+            logger.error(f"Failed to generate audio: {e}")
+            return f"Error: {e}"