diff --git a/cookbook/agents/48_video_caption_agent.py b/cookbook/agents/48_video_caption_agent.py
index 31ede7d4c..bab39bca3 100644
--- a/cookbook/agents/48_video_caption_agent.py
+++ b/cookbook/agents/48_video_caption_agent.py
@@ -1,3 +1,7 @@
+"""Please install dependencies using:
+pip install openai moviepy ffmpeg
+"""
+
 from phi.agent import Agent
 from phi.model.openai import OpenAIChat
 from phi.tools.moviepy_video_tools import MoviePyVideoTools
diff --git a/phi/tools/moviepy_video_tools.py b/phi/tools/moviepy_video_tools.py
index 1c420036c..7e86a7bf6 100644
--- a/phi/tools/moviepy_video_tools.py
+++ b/phi/tools/moviepy_video_tools.py
@@ -1,6 +1,6 @@
+from typing import List, Dict, Optional
 from phi.tools import Toolkit
 from phi.utils.log import logger
-from typing import List, Dict, Optional
 
 try:
     from moviepy import VideoFileClip, TextClip, CompositeVideoClip, ColorClip  # type: ignore
@@ -27,7 +27,14 @@ def __init__(
         self.register(self.embed_captions)
 
     def split_text_into_lines(self, words: List[Dict]) -> List[Dict]:
-        """Split words into lines based on duration and length constraints"""
+        """Split transcribed words into lines based on duration and length constraints
+
+        Args:
+            words: List of dictionaries containing word data with 'word', 'start', and 'end' keys
+
+        Returns:
+            List[Dict]: List of subtitle lines, each containing word, start time, end time, and text contents
+        """
         MAX_CHARS = 30
         MAX_DURATION = 2.5
         MAX_GAP = 1.5
@@ -79,7 +86,20 @@ def create_caption_clips(
         stroke_color="black",
         stroke_width=1.5,
     ) -> List[TextClip]:
-        """Create word-level caption clips with highlighting"""
+        """Create word-level caption clips with highlighting effects
+
+        Args:
+            text_json: Dictionary containing text and timing information
+            frame_size: Tuple of (width, height) for the video frame
+            font: Font family to use for captions
+            color: Base text color
+            highlight_color: Color for highlighted words
+            stroke_color: Color for text outline
+            stroke_width: Width of text outline
+
+        Returns:
+            List[TextClip]: List of MoviePy TextClip objects for each word and highlight
+        """
         word_clips = []
         x_pos = 0
         y_pos = 0
@@ -157,7 +177,14 @@ def create_caption_clips(
         return word_clips
 
     def parse_srt(self, srt_content: str) -> List[Dict]:
-        """Parse SRT format and extract word timing"""
+        """Convert SRT formatted content into word-level timing data
+
+        Args:
+            srt_content: String containing SRT formatted subtitles
+
+        Returns:
+            List[Dict]: List of words with their timing information
+        """
         words = []
         lines = srt_content.strip().split("\n\n")
 
@@ -216,14 +243,14 @@ def extract_audio(self, video_path: str, output_path: str) -> str:
             return f"Failed to extract audio: {str(e)}"
 
     def create_srt(self, transcription: str, output_path: str) -> str:
-        """Convert transcription to SRT format
+        """Save transcription text to SRT formatted file
 
         Args:
-            transcription: Text transcription
+            transcription: Text transcription in SRT format
             output_path: Path where the SRT file will be saved
 
         Returns:
-            str: Path to the created SRT file
+            str: Path to the created SRT file, or error message if failed
         """
         try:
             # Since we're getting SRT format from Whisper API now,
@@ -245,7 +272,20 @@ def embed_captions(
         stroke_color: str = "black",
         stroke_width: int = 1,
     ) -> str:
-        """Embed scrolling captions with word-level highlighting into video"""
+        """Create a new video with embedded scrolling captions and word-level highlighting
+
+        Args:
+            video_path: Path to the input video file
+            srt_path: Path to the SRT caption file
+            output_path: Path for the output video (optional)
+            font_size: Size of caption text
+            font_color: Color of caption text
+            stroke_color: Color of text outline
+            stroke_width: Width of text outline
+
+        Returns:
+            str: Path to the captioned video file, or error message if failed
+        """
         try:
             # If no output path provided, create one based on input video
             if output_path is None:
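For reviewers who want to exercise the new docstrings against real calls, the following is a minimal usage sketch assembled only from the signatures visible in this diff. The file paths and the sample SRT string are placeholders, the no-argument MoviePyVideoTools() constructor is assumed, and the Whisper transcription step used by the cookbook agent is stubbed out rather than shown.

from phi.tools.moviepy_video_tools import MoviePyVideoTools

tools = MoviePyVideoTools()

# 1. Pull the audio track out of the source video (requires ffmpeg on PATH).
audio_path = tools.extract_audio("video.mp4", "audio.wav")

# 2. Transcribe the audio elsewhere (e.g. Whisper with response_format="srt");
#    create_srt expects the transcription to already be SRT-formatted, per its
#    updated docstring. The snippet below is a placeholder transcription.
srt_transcription = """1
00:00:00,000 --> 00:00:01,000
Hello world
"""
srt_path = tools.create_srt(srt_transcription, "captions.srt")

# 3. Burn word-highlighted captions into a new video; with no output_path,
#    embed_captions derives one from the input video per the docstring.
result = tools.embed_captions("video.mp4", "captions.srt")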