diff --git a/.gitignore b/.gitignore index f10a692..d0a8e81 100644 --- a/.gitignore +++ b/.gitignore @@ -23,8 +23,12 @@ node_modules # 模型目录 /models/ ./models/* -resource/scripts/* -resource/videos/* -resource/songs/* -resource/fonts/* +resource/scripts/*.json +resource/videos/*.mp4 +resource/songs/*.mp3 +resource/songs/*.flac +resource/fonts/*.ttc +resource/fonts/*.ttf +resource/fonts/*.otf +resource/srt/*.srt app/models/faster-whisper-large-v2/* \ No newline at end of file diff --git a/README.md b/README.md index 7f44876..5a1812f 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,9 @@ NarratoAI 是一个自动化影视解说工具,基于LLM实现文案撰写、 - [x] 发布 0.3.5 整合包 - [ ] 支持阿里 Qwen2-VL 大模型理解视频 - [ ] 支持短剧解说 + - [x] 合并素材 + - [ ] 一键转录 + - [ ] 一键清理缓存 - [ ] ... ## 配置要求 📦 diff --git a/app/controllers/v1/video.py b/app/controllers/v1/video.py index 0430707..336084f 100644 --- a/app/controllers/v1/video.py +++ b/app/controllers/v1/video.py @@ -163,109 +163,109 @@ def delete_video(request: Request, task_id: str = Path(..., description="Task ID ) -@router.get( - "/musics", response_model=BgmRetrieveResponse, summary="Retrieve local BGM files" -) -def get_bgm_list(request: Request): - suffix = "*.mp3" - song_dir = utils.song_dir() - files = glob.glob(os.path.join(song_dir, suffix)) - bgm_list = [] - for file in files: - bgm_list.append( - { - "name": os.path.basename(file), - "size": os.path.getsize(file), - "file": file, - } - ) - response = {"files": bgm_list} - return utils.get_response(200, response) - - -@router.post( - "/musics", - response_model=BgmUploadResponse, - summary="Upload the BGM file to the songs directory", -) -def upload_bgm_file(request: Request, file: UploadFile = File(...)): - request_id = base.get_task_id(request) - # check file ext - if file.filename.endswith("mp3"): - song_dir = utils.song_dir() - save_path = os.path.join(song_dir, file.filename) - # save file - with open(save_path, "wb+") as buffer: - # If the file already exists, it will be overwritten - file.file.seek(0) - buffer.write(file.file.read()) - response = {"file": save_path} - return utils.get_response(200, response) - - raise HttpException( - "", status_code=400, message=f"{request_id}: Only *.mp3 files can be uploaded" - ) - - -@router.get("/stream/{file_path:path}") -async def stream_video(request: Request, file_path: str): - tasks_dir = utils.task_dir() - video_path = os.path.join(tasks_dir, file_path) - range_header = request.headers.get("Range") - video_size = os.path.getsize(video_path) - start, end = 0, video_size - 1 - - length = video_size - if range_header: - range_ = range_header.split("bytes=")[1] - start, end = [int(part) if part else None for part in range_.split("-")] - if start is None: - start = video_size - end - end = video_size - 1 - if end is None: - end = video_size - 1 - length = end - start + 1 - - def file_iterator(file_path, offset=0, bytes_to_read=None): - with open(file_path, "rb") as f: - f.seek(offset, os.SEEK_SET) - remaining = bytes_to_read or video_size - while remaining > 0: - bytes_to_read = min(4096, remaining) - data = f.read(bytes_to_read) - if not data: - break - remaining -= len(data) - yield data - - response = StreamingResponse( - file_iterator(video_path, start, length), media_type="video/mp4" - ) - response.headers["Content-Range"] = f"bytes {start}-{end}/{video_size}" - response.headers["Accept-Ranges"] = "bytes" - response.headers["Content-Length"] = str(length) - response.status_code = 206 # Partial Content - - return response - - -@router.get("/download/{file_path:path}") -async def download_video(_: Request, file_path: str): - """ - download video - :param _: Request request - :param file_path: video file path, eg: /cd1727ed-3473-42a2-a7da-4faafafec72b/final-1.mp4 - :return: video file - """ - tasks_dir = utils.task_dir() - video_path = os.path.join(tasks_dir, file_path) - file_path = pathlib.Path(video_path) - filename = file_path.stem - extension = file_path.suffix - headers = {"Content-Disposition": f"attachment; filename={filename}{extension}"} - return FileResponse( - path=video_path, - headers=headers, - filename=f"{filename}{extension}", - media_type=f"video/{extension[1:]}", - ) +# @router.get( +# "/musics", response_model=BgmRetrieveResponse, summary="Retrieve local BGM files" +# ) +# def get_bgm_list(request: Request): +# suffix = "*.mp3" +# song_dir = utils.song_dir() +# files = glob.glob(os.path.join(song_dir, suffix)) +# bgm_list = [] +# for file in files: +# bgm_list.append( +# { +# "name": os.path.basename(file), +# "size": os.path.getsize(file), +# "file": file, +# } +# ) +# response = {"files": bgm_list} +# return utils.get_response(200, response) +# + +# @router.post( +# "/musics", +# response_model=BgmUploadResponse, +# summary="Upload the BGM file to the songs directory", +# ) +# def upload_bgm_file(request: Request, file: UploadFile = File(...)): +# request_id = base.get_task_id(request) +# # check file ext +# if file.filename.endswith("mp3"): +# song_dir = utils.song_dir() +# save_path = os.path.join(song_dir, file.filename) +# # save file +# with open(save_path, "wb+") as buffer: +# # If the file already exists, it will be overwritten +# file.file.seek(0) +# buffer.write(file.file.read()) +# response = {"file": save_path} +# return utils.get_response(200, response) +# +# raise HttpException( +# "", status_code=400, message=f"{request_id}: Only *.mp3 files can be uploaded" +# ) +# +# +# @router.get("/stream/{file_path:path}") +# async def stream_video(request: Request, file_path: str): +# tasks_dir = utils.task_dir() +# video_path = os.path.join(tasks_dir, file_path) +# range_header = request.headers.get("Range") +# video_size = os.path.getsize(video_path) +# start, end = 0, video_size - 1 +# +# length = video_size +# if range_header: +# range_ = range_header.split("bytes=")[1] +# start, end = [int(part) if part else None for part in range_.split("-")] +# if start is None: +# start = video_size - end +# end = video_size - 1 +# if end is None: +# end = video_size - 1 +# length = end - start + 1 +# +# def file_iterator(file_path, offset=0, bytes_to_read=None): +# with open(file_path, "rb") as f: +# f.seek(offset, os.SEEK_SET) +# remaining = bytes_to_read or video_size +# while remaining > 0: +# bytes_to_read = min(4096, remaining) +# data = f.read(bytes_to_read) +# if not data: +# break +# remaining -= len(data) +# yield data +# +# response = StreamingResponse( +# file_iterator(video_path, start, length), media_type="video/mp4" +# ) +# response.headers["Content-Range"] = f"bytes {start}-{end}/{video_size}" +# response.headers["Accept-Ranges"] = "bytes" +# response.headers["Content-Length"] = str(length) +# response.status_code = 206 # Partial Content +# +# return response +# +# +# @router.get("/download/{file_path:path}") +# async def download_video(_: Request, file_path: str): +# """ +# download video +# :param _: Request request +# :param file_path: video file path, eg: /cd1727ed-3473-42a2-a7da-4faafafec72b/final-1.mp4 +# :return: video file +# """ +# tasks_dir = utils.task_dir() +# video_path = os.path.join(tasks_dir, file_path) +# file_path = pathlib.Path(video_path) +# filename = file_path.stem +# extension = file_path.suffix +# headers = {"Content-Disposition": f"attachment; filename={filename}{extension}"} +# return FileResponse( +# path=video_path, +# headers=headers, +# filename=f"{filename}{extension}", +# media_type=f"video/{extension[1:]}", +# ) diff --git a/app/controllers/v2/base.py b/app/controllers/v2/base.py new file mode 100644 index 0000000..4612983 --- /dev/null +++ b/app/controllers/v2/base.py @@ -0,0 +1,11 @@ +from fastapi import APIRouter, Depends + + +def v2_router(dependencies=None): + router = APIRouter() + router.tags = ["V2"] + router.prefix = "/api/v2" + # 将认证依赖项应用于所有路由 + if dependencies: + router.dependencies = dependencies + return router diff --git a/app/controllers/v2/script.py b/app/controllers/v2/script.py new file mode 100644 index 0000000..c3501eb --- /dev/null +++ b/app/controllers/v2/script.py @@ -0,0 +1,170 @@ +from fastapi import APIRouter, BackgroundTasks +from loguru import logger +import os + +from app.models.schema_v2 import ( + GenerateScriptRequest, + GenerateScriptResponse, + CropVideoRequest, + CropVideoResponse, + DownloadVideoRequest, + DownloadVideoResponse, + StartSubclipRequest, + StartSubclipResponse +) +from app.models.schema import VideoClipParams +from app.services.script_service import ScriptGenerator +from app.services.video_service import VideoService +from app.utils import utils +from app.controllers.v2.base import v2_router +from app.models.schema import VideoClipParams +from app.services.youtube_service import YoutubeService +from app.services import task as task_service + +router = v2_router() + + +@router.post( + "/scripts/generate", + response_model=GenerateScriptResponse, + summary="同步请求;生成视频脚本 (V2)" +) +async def generate_script( + request: GenerateScriptRequest, + background_tasks: BackgroundTasks +): + """ + 生成视频脚本的V2版本API + """ + task_id = utils.get_uuid() + + try: + generator = ScriptGenerator() + script = await generator.generate_script( + video_path=request.video_path, + video_theme=request.video_theme, + custom_prompt=request.custom_prompt, + skip_seconds=request.skip_seconds, + threshold=request.threshold, + vision_batch_size=request.vision_batch_size, + vision_llm_provider=request.vision_llm_provider + ) + + return { + "task_id": task_id, + "script": script + } + + except Exception as e: + logger.exception(f"Generate script failed: {str(e)}") + raise + + +@router.post( + "/scripts/crop", + response_model=CropVideoResponse, + summary="同步请求;裁剪视频 (V2)" +) +async def crop_video( + request: CropVideoRequest, + background_tasks: BackgroundTasks +): + """ + 根据脚本裁剪视频的V2版本API + """ + try: + # 调用视频裁剪服务 + video_service = VideoService() + task_id, subclip_videos = await video_service.crop_video( + video_path=request.video_origin_path, + video_script=request.video_script + ) + logger.debug(f"裁剪视频成功,视频片段路径: {subclip_videos}") + logger.debug(type(subclip_videos)) + return { + "task_id": task_id, + "subclip_videos": subclip_videos + } + + except Exception as e: + logger.exception(f"Crop video failed: {str(e)}") + raise + + +@router.post( + "/youtube/download", + response_model=DownloadVideoResponse, + summary="同步请求;下载YouTube视频 (V2)" +) +async def download_youtube_video( + request: DownloadVideoRequest, + background_tasks: BackgroundTasks +): + """ + 下载指定分辨率的YouTube视频 + """ + try: + youtube_service = YoutubeService() + task_id, output_path, filename = await youtube_service.download_video( + url=request.url, + resolution=request.resolution, + output_format=request.output_format, + rename=request.rename + ) + + return { + "task_id": task_id, + "output_path": output_path, + "resolution": request.resolution, + "format": request.output_format, + "filename": filename + } + + except Exception as e: + logger.exception(f"Download YouTube video failed: {str(e)}") + raise + + +@router.post( + "/scripts/start-subclip", + response_model=StartSubclipResponse, + summary="异步请求;开始视频剪辑任务 (V2)" +) +async def start_subclip( + request: VideoClipParams, + task_id: str, + subclip_videos: dict, + background_tasks: BackgroundTasks +): + """ + 开始视频剪辑任务的V2版本API + """ + try: + # 构建参数对象 + params = VideoClipParams( + video_origin_path=request.video_origin_path, + video_clip_json_path=request.video_clip_json_path, + voice_name=request.voice_name, + voice_rate=request.voice_rate, + voice_pitch=request.voice_pitch, + subtitle_enabled=request.subtitle_enabled, + video_aspect=request.video_aspect, + n_threads=request.n_threads + ) + + # 在后台任务中执行视频剪辑 + background_tasks.add_task( + task_service.start_subclip, + task_id=task_id, + params=params, + subclip_path_videos=subclip_videos + ) + + return { + "task_id": task_id, + "state": "PROCESSING" # 初始状态 + } + + except Exception as e: + logger.exception(f"Start subclip task failed: {str(e)}") + raise diff --git a/app/models/schema.py b/app/models/schema.py index 9d0c5d4..6621772 100644 --- a/app/models/schema.py +++ b/app/models/schema.py @@ -366,6 +366,8 @@ class VideoClipParams(BaseModel): custom_position: float = Field(default=70.0, description="自定义位置") n_threads: Optional[int] = 8 # 线程数,有助于提升视频处理速度 + tts_volume: float = 1.0 # TTS音频音量 + video_volume: float = 0.1 # 视频原声音量 class VideoTranscriptionRequest(BaseModel): video_name: str diff --git a/app/models/schema_v2.py b/app/models/schema_v2.py new file mode 100644 index 0000000..1611a3b --- /dev/null +++ b/app/models/schema_v2.py @@ -0,0 +1,62 @@ +from typing import Optional, List +from pydantic import BaseModel + + +class GenerateScriptRequest(BaseModel): + video_path: str + video_theme: Optional[str] = "" + custom_prompt: Optional[str] = "" + skip_seconds: Optional[int] = 0 + threshold: Optional[int] = 30 + vision_batch_size: Optional[int] = 5 + vision_llm_provider: Optional[str] = "gemini" + + +class GenerateScriptResponse(BaseModel): + task_id: str + script: List[dict] + + +class CropVideoRequest(BaseModel): + video_origin_path: str + video_script: List[dict] + + +class CropVideoResponse(BaseModel): + task_id: str + subclip_videos: dict + + +class DownloadVideoRequest(BaseModel): + url: str + resolution: str + output_format: Optional[str] = "mp4" + rename: Optional[str] = None + + +class DownloadVideoResponse(BaseModel): + task_id: str + output_path: str + resolution: str + format: str + filename: str + + +class StartSubclipRequest(BaseModel): + task_id: str + video_origin_path: str + video_clip_json_path: str + voice_name: Optional[str] = None + voice_rate: Optional[int] = 0 + voice_pitch: Optional[int] = 0 + subtitle_enabled: Optional[bool] = True + video_aspect: Optional[str] = "16:9" + n_threads: Optional[int] = 4 + subclip_videos: list # 从裁剪视频接口获取的视频片段字典 + + +class StartSubclipResponse(BaseModel): + task_id: str + state: str + videos: Optional[List[str]] = None + combined_videos: Optional[List[str]] = None diff --git a/app/router.py b/app/router.py index cf84037..df60500 100644 --- a/app/router.py +++ b/app/router.py @@ -10,8 +10,12 @@ from fastapi import APIRouter from app.controllers.v1 import llm, video +from app.controllers.v2 import script root_api_router = APIRouter() # v1 root_api_router.include_router(video.router) root_api_router.include_router(llm.router) + +# v2 +root_api_router.include_router(script.router) diff --git a/app/services/audio_merger.py b/app/services/audio_merger.py index f0face0..c7edc77 100644 --- a/app/services/audio_merger.py +++ b/app/services/audio_merger.py @@ -18,95 +18,119 @@ def check_ffmpeg(): return False -def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration: int, video_script: list): +def merge_audio_files(task_id: str, audio_files: list, total_duration: float, list_script: list): """ - 合并多个音频文件到一个指定总时长的音频文件中,并生成相应的字幕 - :param task_id: 任务ID - :param audio_file_paths: 音频文件路径列表 - :param total_duration: 最终音频文件的总时长(秒) - :param video_script: JSON格式的视频脚本 + 合并音频文件,根据OST设置处理不同的音频轨道 + + Args: + task_id: 任务ID + audio_files: TTS生成的音频文件列表 + total_duration: 总时长 + list_script: 完整脚本信息,包含OST设置 + + Returns: + str: 合并后的音频文件路径 """ - output_dir = utils.task_dir(task_id) - + # 检查FFmpeg是否安装 if not check_ffmpeg(): - logger.error("错误:FFmpeg未安装。请安装FFmpeg后再运行此脚本。") - return None, None - - # 创建一个总时长为total_duration的空白音频 - blank_audio = AudioSegment.silent(duration=total_duration * 1000) # pydub使用毫秒 + logger.error("FFmpeg未安装,无法合并音频文件") + return None - for audio_path in audio_file_paths: - if not os.path.exists(audio_path): - logger.info(f"警告:文件 {audio_path} 不存在,已跳过。") - continue - - # 从文件名中提取时间戳 - filename = os.path.basename(audio_path) - start_time, end_time = extract_timestamp(filename) + # 创建一个空的音频片段 + final_audio = AudioSegment.silent(duration=total_duration * 1000) # 总时长以毫秒为单位 - # 读取音频文件 + # 遍历脚本中的每个片段 + for segment, audio_file in zip(list_script, audio_files): try: - audio = AudioSegment.from_mp3(audio_path) - except Exception as e: - logger.error(f"错误:无法读取文件 {audio_path}。错误信息:{str(e)}") - continue - - # 将音频插入到空白音频的指定位置 - blank_audio = blank_audio.overlay(audio, position=start_time * 1000) + # 加载TTS音频文件 + tts_audio = AudioSegment.from_file(audio_file) + + # 获取片段的开始和结束时间 + start_time, end_time = segment['new_timestamp'].split('-') + start_seconds = utils.time_to_seconds(start_time) + end_seconds = utils.time_to_seconds(end_time) + + # 根据OST设置处理音频 + if segment['OST'] == 0: + # 只使用TTS音频 + final_audio = final_audio.overlay(tts_audio, position=start_seconds * 1000) + elif segment['OST'] == 1: + # 只使用原声(假设原声已经在视频中) + continue + elif segment['OST'] == 2: + # 混合TTS音频和原声 + original_audio = AudioSegment.silent(duration=(end_seconds - start_seconds) * 1000) + mixed_audio = original_audio.overlay(tts_audio) + final_audio = final_audio.overlay(mixed_audio, position=start_seconds * 1000) - # 尝试导出为WAV格式 - try: - output_file = os.path.join(output_dir, "audio.wav") - blank_audio.export(output_file, format="wav") - logger.info(f"音频合并完成,已保存为 {output_file}") - except Exception as e: - logger.info(f"导出为WAV格式失败,尝试使用MP3格式:{str(e)}") - try: - output_file = os.path.join(output_dir, "audio.mp3") - blank_audio.export(output_file, format="mp3", codec="libmp3lame") - logger.info(f"音频合并完成,已保存为 {output_file}") except Exception as e: - logger.error(f"导出音频失败:{str(e)}") - return None, None + logger.error(f"处理音频文件 {audio_file} 时出错: {str(e)}") + continue - return output_file + # 保存合并后的音频文件 + output_audio_path = os.path.join(utils.task_dir(task_id), "final_audio.mp3") + final_audio.export(output_audio_path, format="mp3") + logger.info(f"合并后的音频文件已保存: {output_audio_path}") -def parse_timestamp(timestamp: str): - """解析时间戳字符串为秒数""" - # 确保使用冒号作为分隔符 - timestamp = timestamp.replace('_', ':') - return time_to_seconds(timestamp) - -def extract_timestamp(filename): - """从文件名中提取开始和结束时间戳""" - # 从 "audio_00_06-00_24.mp3" 这样的格式中提取时间 - time_part = filename.split('_', 1)[1].split('.')[0] # 获取 "00_06-00_24" 部分 - start_time, end_time = time_part.split('-') # 分割成 "00_06" 和 "00_24" - - # 将下划线格式转换回冒号格式 - start_time = start_time.replace('_', ':') - end_time = end_time.replace('_', ':') - - # 将时间戳转换为秒 - start_seconds = time_to_seconds(start_time) - end_seconds = time_to_seconds(end_time) - - return start_seconds, end_seconds + return output_audio_path def time_to_seconds(time_str): - """将 "00:06" 或 "00_06" 格式转换为总秒数""" - # 确保使用冒号作为分隔符 - time_str = time_str.replace('_', ':') + """ + 将时间字符串转换为秒数,支持多种格式: + 1. 'HH:MM:SS,mmm' (时:分:秒,毫秒) + 2. 'MM:SS,mmm' (分:秒,毫秒) + 3. 'SS,mmm' (秒,毫秒) + """ try: - parts = time_str.split(':') - if len(parts) != 2: - logger.error(f"Invalid time format: {time_str}") - return 0 - return int(parts[0]) * 60 + int(parts[1]) + # 处理毫秒部分 + if ',' in time_str: + time_part, ms_part = time_str.split(',') + ms = float(ms_part) / 1000 + else: + time_part = time_str + ms = 0 + + # 分割时间部分 + parts = time_part.split(':') + + if len(parts) == 3: # HH:MM:SS + h, m, s = map(int, parts) + seconds = h * 3600 + m * 60 + s + elif len(parts) == 2: # MM:SS + m, s = map(int, parts) + seconds = m * 60 + s + else: # SS + seconds = int(parts[0]) + + return seconds + ms except (ValueError, IndexError) as e: logger.error(f"Error parsing time {time_str}: {str(e)}") - return 0 + return 0.0 + + +def extract_timestamp(filename): + """ + 从文件名中提取开始和结束时间戳 + 例如: "audio_00_06,500-00_24,800.mp3" -> (6.5, 24.8) + """ + try: + # 从文件名中提取时间部分 + time_part = filename.split('_', 1)[1].split('.')[0] # 获取 "00_06,500-00_24,800" 部分 + start_time, end_time = time_part.split('-') # 分割成开始和结束时间 + + # 将下划线格式转换回冒号格式 + start_time = start_time.replace('_', ':') + end_time = end_time.replace('_', ':') + + # 将时间戳转换为秒 + start_seconds = time_to_seconds(start_time) + end_seconds = time_to_seconds(end_time) + + return start_seconds, end_seconds + except Exception as e: + logger.error(f"Error extracting timestamp from {filename}: {str(e)}") + return 0.0, 0.0 if __name__ == "__main__": diff --git a/app/services/material.py b/app/services/material.py index bab1aba..2a84f85 100644 --- a/app/services/material.py +++ b/app/services/material.py @@ -3,6 +3,7 @@ import random import traceback from urllib.parse import urlencode +from datetime import datetime import requests from typing import List @@ -254,70 +255,105 @@ def download_videos( def time_to_seconds(time_str: str) -> float: """ 将时间字符串转换为秒数 - 支持格式: - 1. "MM:SS" (分:秒) - 2. "SS" (纯秒数) + 支持格式: 'HH:MM:SS,mmm' (时:分:秒,毫秒) + + Args: + time_str: 时间字符串,如 "00:00:20,100" + + Returns: + float: 转换后的秒数(包含毫秒) """ - parts = time_str.split(':') - if len(parts) == 2: - minutes, seconds = map(float, parts) - return minutes * 60 + seconds - return float(time_str) + try: + # 处理毫秒部分 + if ',' in time_str: + time_part, ms_part = time_str.split(',') + ms = int(ms_part) / 1000 + else: + time_part = time_str + ms = 0 + + # 处理时分秒 + parts = time_part.split(':') + if len(parts) == 3: # HH:MM:SS + h, m, s = map(int, parts) + seconds = h * 3600 + m * 60 + s + else: + raise ValueError("时间格式必须为 HH:MM:SS,mmm") + + return seconds + ms + + except ValueError as e: + logger.error(f"时间格式错误: {time_str}") + raise ValueError(f"时间格式错误: 必须为 HH:MM:SS,mmm 格式") from e def format_timestamp(seconds: float) -> str: """ - 将秒数转换为 "MM:SS" 格式的时间字符串 + 将秒数转换为可读的时间格式 (HH:MM:SS,mmm) + + Args: + seconds: 秒数(可包含毫秒) + + Returns: + str: 格式化的时间字符串,如 "00:00:20,100" """ - minutes = int(seconds) // 60 - secs = int(seconds) % 60 - return f"{minutes:02d}:{secs:02d}" + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + seconds_remain = seconds % 60 + whole_seconds = int(seconds_remain) + milliseconds = int((seconds_remain - whole_seconds) * 1000) + + return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}" def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> dict: """ 保存剪辑后的视频 + Args: - timestamp: 需要裁剪的单个时间戳,支持两种格式: - 1. '00:36-00:40' (分:秒-分:秒) - 2. 'SS-SS' (秒-秒) + timestamp: 需要裁剪的时间戳,格式为 'HH:MM:SS,mmm-HH:MM:SS,mmm' + 例如: '00:00:00,000-00:00:20,100' origin_video: 原视频路径 save_dir: 存储目录 Returns: - 裁剪后的视频路径,格式为 {timestamp: video_path} + dict: 裁剪后的视频路径,格式为 {timestamp: video_path} """ + # 使用新的路径结构 if not save_dir: - save_dir = utils.storage_dir("cache_videos") + base_dir = os.path.join(utils.temp_dir(), "clip_video") + video_hash = utils.md5(origin_video) + save_dir = os.path.join(base_dir, video_hash) if not os.path.exists(save_dir): os.makedirs(save_dir) - video_id = f"vid-{timestamp.replace(':', '_')}" - video_path = f"{save_dir}/{video_id}.mp4" + # 生成更规范的视频文件名 + video_id = f"vid-{timestamp.replace(':', '-').replace(',', '_')}" + video_path = os.path.join(save_dir, f"{video_id}.mp4") if os.path.exists(video_path) and os.path.getsize(video_path) > 0: logger.info(f"video already exists: {video_path}") return {timestamp: video_path} try: - # 先加载视频获取总时长 + # 加载视频获取总时长 video = VideoFileClip(origin_video) total_duration = video.duration - # 获取目标时间段 + # 解析时间戳 start_str, end_str = timestamp.split('-') start = time_to_seconds(start_str) end = time_to_seconds(end_str) - # 验证时间段是否有效 + # 验证时间段 if start >= total_duration: - logger.warning(f"起始时间 {format_timestamp(start)} ({start:.2f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.2f}秒)") + logger.warning(f"起始时间 {format_timestamp(start)} ({start:.3f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.3f}秒)") video.close() return {} if end > total_duration: - logger.warning(f"结束时间 {format_timestamp(end)} ({end:.2f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.2f}秒),将自动调整为视频结尾") + logger.warning(f"结束时间 {format_timestamp(end)} ({end:.3f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.3f}秒),将自动调整为视频结尾") end = total_duration if end <= start: @@ -328,11 +364,21 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di # 剪辑视频 duration = end - start logger.info(f"开始剪辑视频: {format_timestamp(start)} - {format_timestamp(end)},时长 {format_timestamp(duration)}") + + # 剪辑视频 subclip = video.subclip(start, end) try: # 检查视频是否有音频轨道并写入文件 - subclip.write_videofile(video_path, audio=(subclip.audio is not None), logger=None) + subclip.write_videofile( + video_path, + codec='libx264', + audio_codec='aac', + temp_audiofile='temp-audio.m4a', + remove_temp=True, + audio=(subclip.audio is not None), + logger=None + ) # 验证生成的视频文件 if os.path.exists(video_path) and os.path.getsize(video_path) > 0: @@ -363,12 +409,12 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di return {} -def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, progress_callback=None): +def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, progress_callback=None) -> dict: """ 剪辑视频 Args: task_id: 任务id - timestamp_terms: 需要剪辑的时间戳列表,如:['00:00-00:20', '00:36-00:40', '07:07-07:22'] + timestamp_terms: 需要剪辑的时间戳列表,如:['00:00:00,000-00:00:20,100', '00:00:43,039-00:00:46,959'] origin_video: 原视频路径 progress_callback: 进度回调函数 @@ -379,11 +425,6 @@ def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, pro total_items = len(timestamp_terms) for index, item in enumerate(timestamp_terms): material_directory = config.app.get("material_directory", "").strip() - if material_directory == "task": - material_directory = utils.task_dir(task_id) - elif material_directory and not os.path.isdir(material_directory): - material_directory = "" - try: saved_video_path = save_clip_video(timestamp=item, origin_video=origin_video, save_dir=material_directory) if saved_video_path: @@ -396,6 +437,7 @@ def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, pro except Exception as e: logger.error(f"视频裁剪失败: {utils.to_json(item)} =>\n{str(traceback.format_exc())}") return {} + logger.success(f"裁剪 {len(video_paths)} videos") return video_paths @@ -455,29 +497,3 @@ def merge_videos(video_paths, ost_list): os.remove(silent_video) return output_file - - -# 使用示例 -# if __name__ == "__main__": -# video_paths = ['/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-01_17-01_37.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-00_00-00_06.mp4', -# '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-00_06-00_09.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-01_03-01_10.mp4', -# '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-01_10-01_17.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-00_24-00_27.mp4', -# '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-01_28-01_36.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-00_32-00_41.mp4', -# '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-01_36-01_58.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-00_12-00_15.mp4', -# '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-00_09-00_12.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-02_12-02_25.mp4', -# '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-02_03-02_12.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-01_58-02_03.mp4', -# '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-03_14-03_18.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-03_18-03_20.mp4'] -# -# ost_list = [True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, -# False] -# -# result = merge_videos(video_paths, ost_list) -# if result: -# print(f"合并后的视频文件:{result}") -# else: -# print("视频合并失败") -# - - -if __name__ == "__main__": - save_clip_video('00:50-01:41', 'E:\\projects\\NarratoAI\\resource\\videos\\WeChat_20241110144511.mp4') diff --git a/app/services/script_service.py b/app/services/script_service.py new file mode 100644 index 0000000..37644a7 --- /dev/null +++ b/app/services/script_service.py @@ -0,0 +1,405 @@ +import os +import json +import time +import asyncio +import requests +from loguru import logger +from typing import List, Dict, Any, Callable + +from app.utils import utils, gemini_analyzer, video_processor, video_processor_v2 +from app.utils.script_generator import ScriptProcessor +from app.config import config + + +class ScriptGenerator: + def __init__(self): + self.temp_dir = utils.temp_dir() + self.keyframes_dir = os.path.join(self.temp_dir, "keyframes") + + async def generate_script( + self, + video_path: str, + video_theme: str = "", + custom_prompt: str = "", + skip_seconds: int = 0, + threshold: int = 30, + vision_batch_size: int = 5, + vision_llm_provider: str = "gemini", + progress_callback: Callable[[float, str], None] = None + ) -> List[Dict[Any, Any]]: + """ + 生成视频脚本的核心逻辑 + + Args: + video_path: 视频文件路径 + video_theme: 视频主题 + custom_prompt: 自定义提示词 + skip_seconds: 跳过开始的秒数 + threshold: 差异���值 + vision_batch_size: 视觉处理批次大小 + vision_llm_provider: 视觉模型提供商 + progress_callback: 进度回调函数 + + Returns: + List[Dict]: 生成的视频脚本 + """ + if progress_callback is None: + progress_callback = lambda p, m: None + + try: + # 提取关键帧 + progress_callback(10, "正在提取关键帧...") + keyframe_files = await self._extract_keyframes( + video_path, + skip_seconds, + threshold + ) + + if vision_llm_provider == "gemini": + script = await self._process_with_gemini( + keyframe_files, + video_theme, + custom_prompt, + vision_batch_size, + progress_callback + ) + elif vision_llm_provider == "narratoapi": + script = await self._process_with_narrato( + keyframe_files, + video_theme, + custom_prompt, + vision_batch_size, + progress_callback + ) + else: + raise ValueError(f"Unsupported vision provider: {vision_llm_provider}") + + return json.loads(script) if isinstance(script, str) else script + + except Exception as e: + logger.exception("Generate script failed") + raise + + async def _extract_keyframes( + self, + video_path: str, + skip_seconds: int, + threshold: int + ) -> List[str]: + """提取视频关键帧""" + video_hash = utils.md5(video_path + str(os.path.getmtime(video_path))) + video_keyframes_dir = os.path.join(self.keyframes_dir, video_hash) + + # 检查缓存 + keyframe_files = [] + if os.path.exists(video_keyframes_dir): + for filename in sorted(os.listdir(video_keyframes_dir)): + if filename.endswith('.jpg'): + keyframe_files.append(os.path.join(video_keyframes_dir, filename)) + + if keyframe_files: + logger.info(f"Using cached keyframes: {video_keyframes_dir}") + return keyframe_files + + # 提取新的关键帧 + os.makedirs(video_keyframes_dir, exist_ok=True) + + try: + if config.frames.get("version") == "v2": + processor = video_processor_v2.VideoProcessor(video_path) + processor.process_video_pipeline( + output_dir=video_keyframes_dir, + skip_seconds=skip_seconds, + threshold=threshold + ) + else: + processor = video_processor.VideoProcessor(video_path) + processor.process_video( + output_dir=video_keyframes_dir, + skip_seconds=skip_seconds + ) + + for filename in sorted(os.listdir(video_keyframes_dir)): + if filename.endswith('.jpg'): + keyframe_files.append(os.path.join(video_keyframes_dir, filename)) + + return keyframe_files + + except Exception as e: + if os.path.exists(video_keyframes_dir): + import shutil + shutil.rmtree(video_keyframes_dir) + raise + + async def _process_with_gemini( + self, + keyframe_files: List[str], + video_theme: str, + custom_prompt: str, + vision_batch_size: int, + progress_callback: Callable[[float, str], None] + ) -> str: + """使用Gemini处理视频帧""" + progress_callback(30, "正在初始化视觉分析器...") + + # 获取Gemini配置 + vision_api_key = config.app.get("vision_gemini_api_key") + vision_model = config.app.get("vision_gemini_model_name") + + if not vision_api_key or not vision_model: + raise ValueError("未配置 Gemini API Key 或者模型") + + analyzer = gemini_analyzer.VisionAnalyzer( + model_name=vision_model, + api_key=vision_api_key, + ) + + progress_callback(40, "正在分析关键帧...") + + # 执行异步分析 + results = await analyzer.analyze_images( + images=keyframe_files, + prompt=config.app.get('vision_analysis_prompt'), + batch_size=vision_batch_size + ) + + progress_callback(60, "正在整理分析结果...") + + # 合并所有批次的分析结果 + frame_analysis = "" + prev_batch_files = None + + for result in results: + if 'error' in result: + logger.warning(f"批次 {result['batch_index']} 处理出现警告: {result['error']}") + continue + + batch_files = self._get_batch_files(keyframe_files, result, vision_batch_size) + first_timestamp, last_timestamp, _ = self._get_batch_timestamps(batch_files, prev_batch_files) + + # 添加带时间戳的分��结果 + frame_analysis += f"\n=== {first_timestamp}-{last_timestamp} ===\n" + frame_analysis += result['response'] + frame_analysis += "\n" + + prev_batch_files = batch_files + + if not frame_analysis.strip(): + raise Exception("未能生成有效的帧分析结果") + + progress_callback(70, "正在生成脚本...") + + # 构建帧内容列表 + frame_content_list = [] + prev_batch_files = None + + for result in results: + if 'error' in result: + continue + + batch_files = self._get_batch_files(keyframe_files, result, vision_batch_size) + _, _, timestamp_range = self._get_batch_timestamps(batch_files, prev_batch_files) + + frame_content = { + "timestamp": timestamp_range, + "picture": result['response'], + "narration": "", + "OST": 2 + } + frame_content_list.append(frame_content) + prev_batch_files = batch_files + + if not frame_content_list: + raise Exception("没有有效的帧内容可以处理") + + progress_callback(90, "正在生成文案...") + + # 获取文本生��配置 + text_provider = config.app.get('text_llm_provider', 'gemini').lower() + text_api_key = config.app.get(f'text_{text_provider}_api_key') + text_model = config.app.get(f'text_{text_provider}_model_name') + + processor = ScriptProcessor( + model_name=text_model, + api_key=text_api_key, + prompt=custom_prompt, + video_theme=video_theme + ) + + return processor.process_frames(frame_content_list) + + async def _process_with_narrato( + self, + keyframe_files: List[str], + video_theme: str, + custom_prompt: str, + vision_batch_size: int, + progress_callback: Callable[[float, str], None] + ) -> str: + """使用NarratoAPI处理视频帧""" + # 创建临时目录 + temp_dir = utils.temp_dir("narrato") + + # 打包关键帧 + progress_callback(30, "正在打包关键帧...") + zip_path = os.path.join(temp_dir, f"keyframes_{int(time.time())}.zip") + + try: + if not utils.create_zip(keyframe_files, zip_path): + raise Exception("打包关键帧失败") + + # 获取API配置 + api_url = config.app.get("narrato_api_url") + api_key = config.app.get("narrato_api_key") + + if not api_key: + raise ValueError("未配置 Narrato API Key") + + headers = { + 'X-API-Key': api_key, + 'accept': 'application/json' + } + + api_params = { + 'batch_size': vision_batch_size, + 'use_ai': False, + 'start_offset': 0, + 'vision_model': config.app.get('narrato_vision_model', 'gemini-1.5-flash'), + 'vision_api_key': config.app.get('narrato_vision_key'), + 'llm_model': config.app.get('narrato_llm_model', 'qwen-plus'), + 'llm_api_key': config.app.get('narrato_llm_key'), + 'custom_prompt': custom_prompt + } + + progress_callback(40, "正在上传文件...") + with open(zip_path, 'rb') as f: + files = {'file': (os.path.basename(zip_path), f, 'application/x-zip-compressed')} + response = requests.post( + f"{api_url}/video/analyze", + headers=headers, + params=api_params, + files=files, + timeout=30 + ) + response.raise_for_status() + + task_data = response.json() + task_id = task_data["data"].get('task_id') + if not task_id: + raise Exception(f"无效的API��应: {response.text}") + + progress_callback(50, "正在等待分析结果...") + retry_count = 0 + max_retries = 60 + + while retry_count < max_retries: + try: + status_response = requests.get( + f"{api_url}/video/tasks/{task_id}", + headers=headers, + timeout=10 + ) + status_response.raise_for_status() + task_status = status_response.json()['data'] + + if task_status['status'] == 'SUCCESS': + return task_status['result']['data'] + elif task_status['status'] in ['FAILURE', 'RETRY']: + raise Exception(f"任务失败: {task_status.get('error')}") + + retry_count += 1 + time.sleep(2) + + except requests.RequestException as e: + logger.warning(f"获取任务状态失败,重试中: {str(e)}") + retry_count += 1 + time.sleep(2) + continue + + raise Exception("任务执行超时") + + finally: + # 清理临时文件 + try: + if os.path.exists(zip_path): + os.remove(zip_path) + except Exception as e: + logger.warning(f"清理临时文件失败: {str(e)}") + + def _get_batch_files( + self, + keyframe_files: List[str], + result: Dict[str, Any], + batch_size: int + ) -> List[str]: + """获取当前批次的图片文件""" + batch_start = result['batch_index'] * batch_size + batch_end = min(batch_start + batch_size, len(keyframe_files)) + return keyframe_files[batch_start:batch_end] + + def _get_batch_timestamps( + self, + batch_files: List[str], + prev_batch_files: List[str] = None + ) -> tuple[str, str, str]: + """获取一批文件的时间戳范围,支持毫秒级精度""" + if not batch_files: + logger.warning("Empty batch files") + return "00:00:00,000", "00:00:00,000", "00:00:00,000-00:00:00,000" + + if len(batch_files) == 1 and prev_batch_files and len(prev_batch_files) > 0: + first_frame = os.path.basename(prev_batch_files[-1]) + last_frame = os.path.basename(batch_files[0]) + else: + first_frame = os.path.basename(batch_files[0]) + last_frame = os.path.basename(batch_files[-1]) + + first_time = first_frame.split('_')[2].replace('.jpg', '') + last_time = last_frame.split('_')[2].replace('.jpg', '') + + def format_timestamp(time_str: str) -> str: + """将时间字符串转换为 HH:MM:SS,mmm 格式""" + try: + if len(time_str) < 4: + logger.warning(f"Invalid timestamp format: {time_str}") + return "00:00:00,000" + + # 处理毫秒部分 + if ',' in time_str: + time_part, ms_part = time_str.split(',') + ms = int(ms_part) + else: + time_part = time_str + ms = 0 + + # 处理时分秒 + parts = time_part.split(':') + if len(parts) == 3: # HH:MM:SS + h, m, s = map(int, parts) + elif len(parts) == 2: # MM:SS + h = 0 + m, s = map(int, parts) + else: # SS + h = 0 + m = 0 + s = int(parts[0]) + + # 处理进位 + if s >= 60: + m += s // 60 + s = s % 60 + if m >= 60: + h += m // 60 + m = m % 60 + + return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}" + + except Exception as e: + logger.error(f"时间戳格式转换错误 {time_str}: {str(e)}") + return "00:00:00,000" + + first_timestamp = format_timestamp(first_time) + last_timestamp = format_timestamp(last_time) + timestamp_range = f"{first_timestamp}-{last_timestamp}" + + return first_timestamp, last_timestamp, timestamp_range \ No newline at end of file diff --git a/app/services/subtitle.py b/app/services/subtitle.py index f37eb65..e7f037d 100644 --- a/app/services/subtitle.py +++ b/app/services/subtitle.py @@ -8,6 +8,8 @@ from timeit import default_timer as timer from loguru import logger import google.generativeai as genai +from moviepy.editor import VideoFileClip +import os from app.config import config from app.utils import utils @@ -362,29 +364,86 @@ def create_with_gemini(audio_file: str, subtitle_file: str = "", api_key: Option return None +def extract_audio_and_create_subtitle(video_file: str, subtitle_file: str = "") -> Optional[str]: + """ + 从视频文件中提取音频并生成字幕文件。 + + 参数: + - video_file: MP4视频文件的路径 + - subtitle_file: 输出字幕文件的路径(可选)。如果未提供,将根据视频文件名自动生成。 + + 返回: + - str: 生成的字幕文件路径 + - None: 如果处理过程中出现错误 + """ + try: + # 获取视频文件所在目录 + video_dir = os.path.dirname(video_file) + video_name = os.path.splitext(os.path.basename(video_file))[0] + + # 设置音频文件路径 + audio_file = os.path.join(video_dir, f"{video_name}_audio.wav") + + # 如果未指定字幕文件路径,则自动生成 + if not subtitle_file: + subtitle_file = os.path.join(video_dir, f"{video_name}.srt") + + logger.info(f"开始从视频提取音频: {video_file}") + + # 加载视频文件 + video = VideoFileClip(video_file) + + # 提取音频并保存为WAV格式 + logger.info(f"正在提取音频到: {audio_file}") + video.audio.write_audiofile(audio_file, codec='pcm_s16le') + + # 关闭视频文件 + video.close() + + logger.info("音频提取完成,开始生成字幕") + + # 使用create函数生成字幕 + create(audio_file, subtitle_file) + + # 删除临时音频文件 + if os.path.exists(audio_file): + os.remove(audio_file) + logger.info("已清理临时音频文件") + + return subtitle_file + + except Exception as e: + logger.error(f"处理视频文件时出错: {str(e)}") + logger.error(traceback.format_exc()) + return None + + if __name__ == "__main__": - task_id = "test456" + task_id = "123456" task_dir = utils.task_dir(task_id) - subtitle_file = f"{task_dir}/subtitle.srt" + subtitle_file = f"{task_dir}/subtitle_123456.srt" audio_file = f"{task_dir}/audio.wav" - - subtitles = file_to_subtitles(subtitle_file) - print(subtitles) - - # script_file = f"{task_dir}/script.json" - # with open(script_file, "r") as f: - # script_content = f.read() - # s = json.loads(script_content) - # script = s.get("script") - # - # correct(subtitle_file, script) - - subtitle_file = f"{task_dir}/subtitle111.srt" - create(audio_file, subtitle_file) - - # # 使用Gemini模型处理音频 - # gemini_api_key = config.app.get("gemini_api_key") # 请替换为实际的API密钥 - # gemini_subtitle_file = create_with_gemini(audio_file, api_key=gemini_api_key) - # - # if gemini_subtitle_file: - # print(f"Gemini生成的字幕文件: {gemini_subtitle_file}") + video_file = "/Users/apple/Desktop/home/NarratoAI/resource/videos/merged_video_1702.mp4" + + extract_audio_and_create_subtitle(video_file, subtitle_file) + + # subtitles = file_to_subtitles(subtitle_file) + # print(subtitles) + + # # script_file = f"{task_dir}/script.json" + # # with open(script_file, "r") as f: + # # script_content = f.read() + # # s = json.loads(script_content) + # # script = s.get("script") + # # + # # correct(subtitle_file, script) + + # subtitle_file = f"{task_dir}/subtitle111.srt" + # create(audio_file, subtitle_file) + + # # # 使用Gemini模型处理音频 + # # gemini_api_key = config.app.get("gemini_api_key") # 请替换为实际的API密钥 + # # gemini_subtitle_file = create_with_gemini(audio_file, api_key=gemini_api_key) + # # + # # if gemini_subtitle_file: + # # print(f"Gemini生成的字幕文件: {gemini_subtitle_file}") diff --git a/app/services/task.py b/app/services/task.py index c903047..77f2cf5 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -206,134 +206,14 @@ def generate_final_videos( return final_video_paths, combined_video_paths -def start(task_id, params: VideoParams, stop_at: str = "video"): - logger.info(f"start task: {task_id}, stop_at: {stop_at}") - sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=5) - - if type(params.video_concat_mode) is str: - params.video_concat_mode = VideoConcatMode(params.video_concat_mode) - - # 1. Generate script - video_script = generate_script(task_id, params) - if not video_script: - sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) - return - - sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=10) - - if stop_at == "script": - sm.state.update_task( - task_id, state=const.TASK_STATE_COMPLETE, progress=100, script=video_script - ) - return {"script": video_script} - - # 2. Generate terms - video_terms = "" - if params.video_source != "local": - video_terms = generate_terms(task_id, params, video_script) - if not video_terms: - sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) - return - - save_script_data(task_id, video_script, video_terms, params) - - if stop_at == "terms": - sm.state.update_task( - task_id, state=const.TASK_STATE_COMPLETE, progress=100, terms=video_terms - ) - return {"script": video_script, "terms": video_terms} - - sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=20) - - # 3. Generate audio - audio_file, audio_duration, sub_maker = generate_audio(task_id, params, video_script) - if not audio_file: - sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) - return - - sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=30) - - if stop_at == "audio": - sm.state.update_task( - task_id, - state=const.TASK_STATE_COMPLETE, - progress=100, - audio_file=audio_file, - ) - return {"audio_file": audio_file, "audio_duration": audio_duration} - - # 4. Generate subtitle - subtitle_path = generate_subtitle(task_id, params, video_script, sub_maker, audio_file) - - if stop_at == "subtitle": - sm.state.update_task( - task_id, - state=const.TASK_STATE_COMPLETE, - progress=100, - subtitle_path=subtitle_path, - ) - return {"subtitle_path": subtitle_path} - - sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=40) - - # 5. Get video materials - downloaded_videos = get_video_materials( - task_id, params, video_terms, audio_duration - ) - if not downloaded_videos: - sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) - return - - if stop_at == "materials": - sm.state.update_task( - task_id, - state=const.TASK_STATE_COMPLETE, - progress=100, - materials=downloaded_videos, - ) - return {"materials": downloaded_videos} - - sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=50) - - # 6. Generate final videos - final_video_paths, combined_video_paths = generate_final_videos( - task_id, params, downloaded_videos, audio_file, subtitle_path - ) - - if not final_video_paths: - sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) - return - - logger.success( - f"task {task_id} finished, generated {len(final_video_paths)} videos." - ) - - kwargs = { - "videos": final_video_paths, - "combined_videos": combined_video_paths, - "script": video_script, - "terms": video_terms, - "audio_file": audio_file, - "audio_duration": audio_duration, - "subtitle_path": subtitle_path, - "materials": downloaded_videos, - } - sm.state.update_task( - task_id, state=const.TASK_STATE_COMPLETE, progress=100, **kwargs - ) - return kwargs - - -def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: list): - """ - 后台任务(自动剪辑视频进行剪辑) - - task_id: 任务ID - params: 剪辑参数 - subclip_path_videos: 视频文件路径 - - """ +def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: dict): + """后台任务(自动剪辑视频进行剪辑)""" logger.info(f"\n\n## 开始任务: {task_id}") + + # 初始化 ImageMagick + if not utils.init_imagemagick(): + logger.warning("ImageMagick 初始化失败,字幕可能无法正常显示") + sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=5) # tts 角色名称 @@ -341,8 +221,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: li logger.info("\n\n## 1. 加载视频脚本") video_script_path = path.join(params.video_clip_json_path) - # video_script_path = video_clip_json_path - # 判断json文件是否存在 + if path.exists(video_script_path): try: with open(video_script_path, "r", encoding="utf-8") as f: @@ -355,10 +234,12 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: li logger.debug(f"解说完整脚本: \n{video_script}") logger.debug(f"解说 OST 列表: \n{video_ost}") logger.debug(f"解说时间戳列表: \n{time_list}") + # 获取视频总时长(单位 s) - total_duration = list_script[-1]['new_timestamp'] - total_duration = int(total_duration.split("-")[1].split(":")[0]) * 60 + int( - total_duration.split("-")[1].split(":")[1]) + last_timestamp = list_script[-1]['new_timestamp'] + end_time = last_timestamp.split("-")[1] + total_duration = utils.time_to_seconds(end_time) + except Exception as e: logger.error(f"无法读取视频json脚本,请检查配置是否正确。{e}") raise ValueError("无法读取视频json脚本,请检查配置是否正确") @@ -366,32 +247,51 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: li logger.error(f"video_script_path: {video_script_path} \n\n", traceback.format_exc()) raise ValueError("解说脚本不存在!请检查配置是否正确。") - logger.info("\n\n## 2. 生成音频列表") - audio_files, sub_maker_list = voice.tts_multiple( - task_id=task_id, - list_script=list_script, - voice_name=voice_name, - voice_rate=params.voice_rate, - voice_pitch=params.voice_pitch, - force_regenerate=True + logger.info("\n\n## 2. 根据OST设置生成音频列表") + # 只为OST=0或2的片段生成TTS音频 + tts_segments = [ + segment for segment in list_script + if segment['OST'] in [0, 2] + ] + # logger.debug(f"tts_segments: {tts_segments}") + if tts_segments: + audio_files, sub_maker_list = voice.tts_multiple( + task_id=task_id, + list_script=tts_segments, # 只传入需要TTS的片段 + voice_name=voice_name, + voice_rate=params.voice_rate, + voice_pitch=params.voice_pitch, + force_regenerate=True + ) + if audio_files is None: + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) + logger.error("TTS转换音频失败, 可能是网络不可用! 如果您在中国, 请使用VPN.") + return + else: + audio_files = [] + + logger.info(f"合并音频文件:\n{audio_files}") + # 传入OST信息以便正确处理音频 + final_audio = audio_merger.merge_audio_files( + task_id=task_id, + audio_files=audio_files, + total_duration=total_duration, + list_script=list_script # 传入完整脚本以便处理OST ) - if audio_files is None: - sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) - logger.error( - "TTS转换音频失败, 可能是网络不可用! 如果您在中国, 请使用VPN.") - return - logger.info(f"合并音频:\n\n {audio_files}") - audio_file = audio_merger.merge_audio_files(task_id, audio_files, total_duration, list_script) sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=30) + # 只为OST=0或2的片段生成字幕 subtitle_path = "" if params.subtitle_enabled: subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt") subtitle_provider = config.app.get("subtitle_provider", "").strip().lower() logger.info(f"\n\n## 3. 生成字幕、提供程序是: {subtitle_provider}") - # 使用 faster-whisper-large-v2 模型生成字幕 - subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path) + + subtitle.create( + audio_file=final_audio, + subtitle_file=subtitle_path, + ) subtitle_lines = subtitle.file_to_subtitles(subtitle_path) if not subtitle_lines: @@ -402,7 +302,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: li logger.info("\n\n## 4. 裁剪视频") subclip_videos = [x for x in subclip_path_videos.values()] - logger.debug(f"\n\n## 裁剪后的视频文件列表: \n{subclip_videos}") + # logger.debug(f"\n\n## 裁剪后的视频文件列表: \n{subclip_videos}") if not subclip_videos: sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) @@ -434,14 +334,15 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: li final_video_path = path.join(utils.task_dir(task_id), f"final-{index}.mp4") - logger.info(f"\n\n## 6. 最后一步: {index} => {final_video_path}") - # 把所有东西合到在一起 + logger.info(f"\n\n## 6. 最后合成: {index} => {final_video_path}") + # 传入OST信息以便正确处理音频和视频 video.generate_video_v2( video_path=combined_video_path, - audio_path=audio_file, + audio_path=final_audio, subtitle_path=subtitle_path, output_file=final_video_path, params=params, + list_script=list_script # 传入完整脚本以便处理OST ) _progress += 50 / 2 diff --git a/app/services/video.py b/app/services/video.py index 1d270fa..eadfce0 100644 --- a/app/services/video.py +++ b/app/services/video.py @@ -18,6 +18,15 @@ def get_bgm_file(bgm_type: str = "random", bgm_file: str = ""): + """ + 获取背景音乐文件路径 + Args: + bgm_type: 背景音乐类型,可选值: random(随机), ""(无背景音乐) + bgm_file: 指定的背景音乐文件路径 + + Returns: + str: 背景音乐文件路径 + """ if not bgm_type: return "" @@ -48,21 +57,35 @@ def get_bgm_file(bgm_type: str = "random", bgm_file: str = ""): def combine_videos( - combined_video_path: str, - video_paths: List[str], - audio_file: str, - video_aspect: VideoAspect = VideoAspect.portrait, - video_concat_mode: VideoConcatMode = VideoConcatMode.random, - max_clip_duration: int = 5, - threads: int = 2, + combined_video_path: str, + video_paths: List[str], + audio_file: str, + video_aspect: VideoAspect = VideoAspect.portrait, + video_concat_mode: VideoConcatMode = VideoConcatMode.random, + max_clip_duration: int = 5, + threads: int = 2, ) -> str: + """ + 合并多个视频片段 + Args: + combined_video_path: 合并后的视频保存路径 + video_paths: 待合并的视频路径列表 + audio_file: 音频文件路径 + video_aspect: 视频宽高比 + video_concat_mode: 视频拼接模式(随机/顺序) + max_clip_duration: 每个片段的最大时长(秒) + threads: 处理线程数 + + Returns: + str: 合并后的视频路径 + """ audio_clip = AudioFileClip(audio_file) audio_duration = audio_clip.duration - logger.info(f"max duration of audio: {audio_duration} seconds") - # Required duration of each clip + logger.info(f"音频时长: {audio_duration} 秒") + # 每个片段的所需时长 req_dur = audio_duration / len(video_paths) req_dur = max_clip_duration - logger.info(f"each clip will be maximum {req_dur} seconds long") + logger.info(f"每个片段最大时长: {req_dur} 秒") output_dir = os.path.dirname(combined_video_path) aspect = VideoAspect(video_aspect) @@ -81,22 +104,22 @@ def combine_videos( end_time = min(start_time + max_clip_duration, clip_duration) split_clip = clip.subclip(start_time, end_time) raw_clips.append(split_clip) - # logger.info(f"splitting from {start_time:.2f} to {end_time:.2f}, clip duration {clip_duration:.2f}, split_clip duration {split_clip.duration:.2f}") + # logger.info(f"从 {start_time:.2f} 到 {end_time:.2f}, 片段时长 {clip_duration:.2f}, 分割片段时长 {split_clip.duration:.2f}") start_time = end_time if video_concat_mode.value == VideoConcatMode.sequential.value: break - # random video_paths order + # 随机视频片段顺序 if video_concat_mode.value == VideoConcatMode.random.value: random.shuffle(raw_clips) - # Add downloaded clips over and over until the duration of the audio (max_duration) has been reached + # 添加下载的片段,直到音频时长(max_duration)达到 while video_duration < audio_duration: for clip in raw_clips: - # Check if clip is longer than the remaining audio + # 检查片段是否比剩余音频时长长 if (audio_duration - video_duration) < clip.duration: clip = clip.subclip(0, (audio_duration - video_duration)) - # Only shorten clips if the calculated clip length (req_dur) is shorter than the actual clip to prevent still image + # 仅当计算的片段时长(req_dur)小于实际片段时长时,缩短片段 elif req_dur < clip.duration: clip = clip.subclip(0, req_dur) clip = clip.set_fps(30) @@ -134,7 +157,7 @@ def combine_videos( ) logger.info( - f"resizing video to {video_width} x {video_height}, clip size: {clip_w} x {clip_h}" + f"调整视频尺寸为 {video_width} x {video_height}, 片段尺寸: {clip_w} x {clip_h}" ) if clip.duration > max_clip_duration: @@ -146,7 +169,7 @@ def combine_videos( video_clip = concatenate_videoclips(clips) video_clip = video_clip.set_fps(30) logger.info("writing") - # https://github.com/harry0703/NarratoAI/issues/111#issuecomment-2032354030 + video_clip.write_videofile( filename=combined_video_path, threads=threads, @@ -161,6 +184,17 @@ def combine_videos( def wrap_text(text, max_width, font, fontsize=60): + """ + 文本自动换行处理 + Args: + text: 待处理的文本 + max_width: 最大宽度 + font: 字体文件路径 + fontsize: 字体大小 + + Returns: + tuple: (换行后的文本, 文本高度) + """ # 创建字体对象 font = ImageFont.truetype(font, fontsize) @@ -220,6 +254,14 @@ def get_text_size(inner_text): @contextmanager def manage_clip(clip): + """ + 视频片段资源管理器 + Args: + clip: 视频片段对象 + + Yields: + VideoFileClip: 视频片段对象 + """ try: yield clip finally: @@ -232,6 +274,7 @@ def generate_video_v2( audio_path: str, subtitle_path: str, output_file: str, + list_script: list, params: Union[VideoParams, VideoClipParams], progress_callback=None, ): @@ -250,7 +293,7 @@ def generate_video_v2( """ total_steps = 4 current_step = 0 - + def update_progress(step_name): nonlocal current_step current_step += 1 @@ -260,7 +303,7 @@ def update_progress(step_name): try: validate_params(video_path, audio_path, output_file, params) - + with manage_clip(VideoFileClip(video_path)) as video_clip: aspect = VideoAspect(params.video_aspect) video_width, video_height = aspect.to_resolution() @@ -304,7 +347,7 @@ def create_text_clip(subtitle_item): _clip = _clip.set_start(subtitle_item[0][0]) _clip = _clip.set_end(subtitle_item[0][1]) _clip = _clip.set_duration(duration) - + if params.subtitle_position == "bottom": _clip = _clip.set_position(("center", video_height * 0.95 - _clip.h)) elif params.subtitle_position == "top": @@ -335,6 +378,7 @@ def create_text_clip(subtitle_item): update_progress("字幕处理完成") # 合并音频和导出 + logger.info("开始导出视频 (此步骤耗时较长请耐心等待)") video_clip = video_clip.set_audio(final_audio) video_clip.write_videofile( output_file, @@ -344,7 +388,7 @@ def create_text_clip(subtitle_item): logger=None, fps=30, ) - + except FileNotFoundError as e: logger.error(f"文件不存在: {str(e)}") raise @@ -356,15 +400,25 @@ def create_text_clip(subtitle_item): def process_audio_tracks(original_audio, new_audio, params, video_duration): - """处理所有音轨""" + """ + 处理所有音轨(原声、配音、背景音乐) + Args: + original_audio: 原始音频 + new_audio: 新音频 + params: 视频参数 + video_duration: 视频时长 + + Returns: + CompositeAudioClip: 合成后的音频 + """ audio_tracks = [] - + if original_audio is not None: audio_tracks.append(original_audio) - + new_audio = new_audio.volumex(params.voice_volume) audio_tracks.append(new_audio) - + # 处理背景音乐 bgm_file = get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file) if bgm_file: @@ -374,35 +428,54 @@ def process_audio_tracks(original_audio, new_audio, params, video_duration): audio_tracks.append(bgm_clip) except Exception as e: logger.error(f"添加背景音乐失败: {str(e)}") - + return CompositeAudioClip(audio_tracks) if audio_tracks else new_audio def process_subtitles(subtitle_path, video_clip, video_duration, create_text_clip): - """处理字幕""" + """ + 处理字幕 + Args: + subtitle_path: 字幕文件路径 + video_clip: 视频片段 + video_duration: 视频时长 + create_text_clip: 创建文本片段的回调函数 + + Returns: + CompositeVideoClip: 添加字幕后的视频 + """ if not (subtitle_path and os.path.exists(subtitle_path)): return video_clip - + sub = SubtitlesClip(subtitles=subtitle_path, encoding="utf-8") text_clips = [] - + for item in sub.subtitles: clip = create_text_clip(subtitle_item=item) - + # 时间范围调整 start_time = max(clip.start, 0) if start_time >= video_duration: continue - + end_time = min(clip.end, video_duration) clip = clip.set_start(start_time).set_end(end_time) text_clips.append(clip) - + logger.info(f"处理了 {len(text_clips)} 段字幕") return CompositeVideoClip([video_clip, *text_clips]) def preprocess_video(materials: List[MaterialInfo], clip_duration=4): + """ + 预处理视频素材 + Args: + materials: 素材信息列表 + clip_duration: 片段时长(秒) + + Returns: + List[MaterialInfo]: 处理后的素材信息列表 + """ for material in materials: if not material.url: continue @@ -430,12 +503,12 @@ def preprocess_video(materials: List[MaterialInfo], clip_duration=4): # 使用resize方法来添加缩放效果。这里使用了lambda函数来使得缩放效果随时间变化。 # 假设我们想要从原始大小逐渐放大到120%的大小。 # t代表当前时间,clip.duration为视频总时长,这里是3秒。 - # 注意:1 表示100%的大小,所以1.2表示120%的大小 + # 注意:1 表示100%的大小所以1.2表示120%的大小 zoom_clip = clip.resize( lambda t: 1 + (clip_duration * 0.03) * (t / clip.duration) ) - # 如果需要,可以创建一个包含缩放剪辑的复合视频剪辑 + # 如果需要,可以创建一个包含缩放剪辑的复合频剪辑 # (这在您想要在视频中添加其他元素时非常有用) final_clip = CompositeVideoClip([zoom_clip]) @@ -472,7 +545,7 @@ def combine_clip_videos(combined_video_path: str, from app.utils.utils import calculate_total_duration audio_duration = calculate_total_duration(list_script) logger.info(f"音频的最大持续时间: {audio_duration} s") - + output_dir = os.path.dirname(combined_video_path) aspect = VideoAspect(video_aspect) video_width, video_height = aspect.to_resolution() @@ -481,25 +554,25 @@ def combine_clip_videos(combined_video_path: str, for video_path, video_ost in zip(video_paths, video_ost_list): try: clip = VideoFileClip(video_path) - + if video_ost == 0: # 不保留原声 clip = clip.without_audio() # video_ost 为 1 或 2 时都保留原声,不需要特殊处理 - + clip = clip.set_fps(30) # 处理视频尺寸 clip_w, clip_h = clip.size if clip_w != video_width or clip_h != video_height: clip = resize_video_with_padding( - clip, - target_width=video_width, + clip, + target_width=video_width, target_height=video_height ) logger.info(f"视频 {video_path} 已调整尺寸为 {video_width} x {video_height}") clips.append(clip) - + except Exception as e: logger.error(f"处理视频 {video_path} 时出错: {str(e)}") continue @@ -510,8 +583,8 @@ def combine_clip_videos(combined_video_path: str, try: video_clip = concatenate_videoclips(clips) video_clip = video_clip.set_fps(30) - - logger.info("开始合并视频...") + + logger.info("开始合并视频... (过程中出现 UserWarning: 不必理会)") video_clip.write_videofile( filename=combined_video_path, threads=threads, @@ -521,7 +594,7 @@ def combine_clip_videos(combined_video_path: str, temp_audiofile=os.path.join(output_dir, "temp-audio.m4a") ) finally: - # 确保资源被正确���放 + # 确保资源被正确放 video_clip.close() for clip in clips: clip.close() @@ -531,13 +604,22 @@ def combine_clip_videos(combined_video_path: str, def resize_video_with_padding(clip, target_width: int, target_height: int): - """辅助函数:调整视频尺寸并添加黑边""" + """ + 调整视频尺寸并添加黑边 + Args: + clip: 视频片段 + target_width: 目标宽度 + target_height: 目标高度 + + Returns: + CompositeVideoClip: 调整尺寸后的视频 + """ clip_ratio = clip.w / clip.h target_ratio = target_width / target_height if clip_ratio == target_ratio: return clip.resize((target_width, target_height)) - + if clip_ratio > target_ratio: scale_factor = target_width / clip.w else: @@ -548,10 +630,10 @@ def resize_video_with_padding(clip, target_width: int, target_height: int): clip_resized = clip.resize(newsize=(new_width, new_height)) background = ColorClip( - size=(target_width, target_height), + size=(target_width, target_height), color=(0, 0, 0) ).set_duration(clip.duration) - + return CompositeVideoClip([ background, clip_resized.set_position("center") @@ -559,106 +641,100 @@ def resize_video_with_padding(clip, target_width: int, target_height: int): def validate_params(video_path, audio_path, output_file, params): - """验证输入参数""" + """ + 验证输入参数 + Args: + video_path: 视频文件路径 + audio_path: 音频文件路径 + output_file: 输出文件路径 + params: 视频参数 + + Raises: + FileNotFoundError: 文件不存在时抛出 + ValueError: 参数无效时抛出 + """ if not os.path.exists(video_path): raise FileNotFoundError(f"视频文件不存在: {video_path}") - + if not os.path.exists(audio_path): raise FileNotFoundError(f"音频文件不存在: {audio_path}") - + output_dir = os.path.dirname(output_file) if not os.path.exists(output_dir): raise FileNotFoundError(f"输出目录不存在: {output_dir}") - + if not hasattr(params, 'video_aspect'): raise ValueError("params 缺少必要参数 video_aspect") if __name__ == "__main__": - # combined_video_path = "../../storage/tasks/12312312/com123.mp4" - # - # video_paths = ['../../storage/cache_videos/vid-00_00-00_03.mp4', - # '../../storage/cache_videos/vid-00_03-00_07.mp4', - # '../../storage/cache_videos/vid-00_12-00_17.mp4', - # '../../storage/cache_videos/vid-00_26-00_31.mp4'] - # video_ost_list = [False, True, False, True] - # list_script = [ - # { - # "picture": "夜晚,一个小孩在树林里奔跑,后面有人拿着火把在追赶", - # "timestamp": "00:00-00:03", - # "narration": "夜黑风高的树林,一个小孩在拼命奔跑,后面的人穷追不舍!", - # "OST": False, - # "new_timestamp": "00:00-00:03" - # }, - # { - # "picture": "追赶的人命令抓住小孩", - # "timestamp": "00:03-00:07", - # "narration": "原声播放1", - # "OST": True, - # "new_timestamp": "00:03-00:07" - # }, - # { - # "picture": "小孩躲在草丛里,黑衣人用脚踢了踢他", - # "timestamp": "00:12-00:17", - # "narration": "小孩脱下外套,跑进树林, 一路奔跑,直到第二天清晨", - # "OST": False, - # "new_timestamp": "00:07-00:12" - # }, - # { - # "picture": "小孩跑到车前,慌慌张张地对女人说有人要杀他", - # "timestamp": "00:26-00:31", - # "narration": "原声播放2", - # "OST": True, - # "new_timestamp": "00:12-00:17" - # } - # ] - # combine_clip_videos(combined_video_path=combined_video_path, video_paths=video_paths, video_ost_list=video_ost_list, list_script=list_script) - - # cfg = VideoClipParams() - # cfg.video_aspect = VideoAspect.portrait - # cfg.font_name = "STHeitiMedium.ttc" - # cfg.font_size = 60 - # cfg.stroke_color = "#000000" - # cfg.stroke_width = 1.5 - # cfg.text_fore_color = "#FFFFFF" - # cfg.text_background_color = "transparent" - # cfg.bgm_type = "random" - # cfg.bgm_file = "" - # cfg.bgm_volume = 1.0 - # cfg.subtitle_enabled = True - # cfg.subtitle_position = "bottom" - # cfg.n_threads = 2 - # cfg.paragraph_number = 1 - # - # cfg.voice_volume = 1.0 - - # generate_video(video_path=video_file, - # audio_path=audio_file, - # subtitle_path=subtitle_file, - # output_file=output_file, - # params=cfg - # ) - # - # video_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/combined-1.mp4" - # - # audio_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-00-00-07.mp3" - # - # subtitle_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa\subtitle.srt" - # - # output_file = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/final-123.mp4" - # - # generate_video_v2(video_path=video_path, - # audio_path=audio_path, - # subtitle_path=subtitle_path, - # output_file=output_file, - # params=cfg - # ) - - # 合并视频 - video_list = [ - './storage/cache_videos/vid-01_03-01_50.mp4', - './storage/cache_videos/vid-01_55-02_29.mp4', - './storage/cache_videos/vid-03_24-04_04.mp4', - './storage/cache_videos/vid-04_50-05_28.mp4' + combined_video_path = "../../storage/tasks/123/combined.mp4" + + video_paths = ['../../storage/temp/clip_video/0b545e689a182a91af2163c7c0ca7ca3/vid-00-00-10_000-00-00-43_039.mp4', + '../../storage/temp/clip_video/0b545e689a182a91af2163c7c0ca7ca3/vid-00-00-45_439-00-01-01_600.mp4', + '../../storage/temp/clip_video/0b545e689a182a91af2163c7c0ca7ca3/vid-00-01-07_920-00-01-25_719.mp4', + '../../storage/temp/clip_video/0b545e689a182a91af2163c7c0ca7ca3/vid-00-01-36_959-00-01-53_719.mp4'] + video_ost_list = [2, 2, 2, 2] + list_script = [ + { + "timestamp": "00:10-00:43", + "picture": "好的,以下是视频画面的客观描述:\n\n视频显示一个男人在一个树木繁茂的地区,靠近一个泥土斜坡他穿着一件深色T恤、卡其色长裤和登山靴。他背着一个军绿色背包,里面似乎装有头和其他工具。\n\n第一个镜头显示该男子从远处走近斜坡,背对着镜头。下一个镜头特写显示了的背包,一个镐头从背包中伸出来。下一个镜头显示该男子用镐头敲打斜坡。下一个镜头是该男子脚上的特写镜头,他穿着登山靴,正站在泥土斜坡上。最后一个镜显示该男子在斜坡上,仔细地拨开树根和泥土。周围的环境是树木繁茂的,阳光透过树叶照射下来。土壤是浅棕色的,斜坡上有许多树根和植被。", + "narration": "(接上文)好吧,今天我们的男主角,背着一个看似随时要发射军绿色背包,竟然化身“泥土探险家”,在斜坡上挥舞着镐头!他这是准备挖宝还是给树根做个“美容”?阳光洒下来,简直是自然界的聚光灯,仿佛在说:“快来看看,这位勇士要挑战泥土极限!”我只能默默想,如果树根能说话,它们一定会喊:“别打我,我还有家人!”这就是生活,总有些搞笑的瞬间等着我们去发现!", + "OST": 2, + "new_timestamp": "00:00:00,000-00:00:33,000" + }, + { + "timestamp": "00:45-01:01", + "picture": "好的以下是视频画面的客观描述:\n\n视频显示了一个人在森林里挖掘。\n\n第一个镜头是地面特写,显示出松��的泥土、碎石和落叶。光线照在部分区域。\n\n第二个镜头中,一模糊不清的蹲一个树根旁挖掘,一个橄榄绿色的背包放在地上。树根缠绕着常春藤。\n\n第三个镜头显示该人在一个更开阔的区域挖掘,那里有一些树根,以及部分倒的树干。他起来像是在挖掘一个较大的坑。\n\n第四个镜头是特写镜头,显示该人用工具清理土坑的墙壁。\n\n第五个镜头是土坑内部的特写镜头,可以看到土质的纹理,有一些小树根和它植被的残留物。", + "narration": "现在,这位勇敢的挖掘者就像个“现代版的土豆农夫”,在林里开辟新天地。的目标是什么?挖一个宝藏还块“树根披萨”?小心哦,别让树根追着你喊:“不要挖我,我也是有故事的!”", + "OST": 2, + "new_timestamp": "00:00:33,000-00:00:49,000" + }, + { + "timestamp": "01:07-01:25", + "picture": "好,以下是视频画面的客观描述:\n\n画面1:特写镜头,显示出一丛带有水珠的深绿色灌木叶片。叶片呈椭圆形,边缘光滑。背景是树根和泥土。\n\n画面2:一个留着胡子的男人正在一个森林中土坑里挖掘。他穿着黑色T恤和卡其色裤子,跪在地,用具挖掘泥土。周围环绕着树木、树根和灌木。一个倒下的树干横跨土坑上方。\n\n画面3:同一个男人坐在他刚才挖的坑的边缘,看着前方。他的表情似乎略带沉思。背景与画面2相同。\n\n画面4:一个广角镜头显示出他挖出的坑。这是一个不规则形状的土坑,在树木繁茂的斜坡上。土壤呈深棕色,可见树根。\n\n画面5:同一个男人跪在地上,用一把小斧头砍一根木头。他穿着与前几个画面相同的衣服。地面上覆盖着落叶。周围是树木和灌木。", + "narration": "“哎呀,这片灌木叶子滴水如雨,感觉像是大自然的洗发水广告!但我这位‘挖宝达人’似乎更适合拍个‘森林里的单身狗’真人秀。等会儿,我要给树根唱首歌,听说它们爱音乐!”", + "OST": 2, + "new_timestamp": "00:00:49,000-00:01:07,000" + }, + { + "timestamp": "01:36-01:53", + "picture": "好的,以下是视频画面内容的客观描述:\n\n视频包含三个镜头:\n\n**镜头一:**个小型、浅水池塘,位于树林中。池塘的水看起来浑浊,呈绿褐色。池塘周围遍布泥土和落叶。多根树枝和树干横跨池塘,部分浸没在水中。周围的植被茂密主要是深色树木和灌木。\n\n**镜头二:**距拍摄树深处,阳光透过树叶洒落在植被上。镜头中可见粗大的树干、树枝和各种绿叶植物。部分树枝似乎被砍断,切口可见。\n\n**镜头三:**近距离特写镜头,聚焦在树枝和绿叶上。叶片呈圆形,颜色为鲜绿色,有些叶片上有缺损。树枝颜色较深,呈现深褐色。背景是模糊的树林。\n", + "narration": "“好吧,看来我们的‘挖宝达人’终于找到了一‘宝藏’——一个色泽如同绿豆汤的池塘!我敢打赌,这里不仅是小鱼儿的游乐场更是树枝们的‘水疗中心’!下次来这里,我得带上浮潜装备!”", + "OST": 2, + "new_timestamp": "00:01:07,000-00:01:24,000" + } ] + # 合并子视频 + # combine_clip_videos(combined_video_path=combined_video_path, video_paths=video_paths, video_ost_list=video_ost_list, list_script=list_script) + cfg = VideoClipParams() + cfg.video_aspect = VideoAspect.portrait + cfg.font_name = "STHeitiMedium.ttc" + cfg.font_size = 60 + cfg.stroke_color = "#000000" + cfg.stroke_width = 1.5 + cfg.text_fore_color = "#FFFFFF" + cfg.text_background_color = "transparent" + cfg.bgm_type = "random" + cfg.bgm_file = "" + cfg.bgm_volume = 1.0 + cfg.subtitle_enabled = True + cfg.subtitle_position = "bottom" + cfg.n_threads = 2 + cfg.video_volume = 1 + + cfg.voice_volume = 1.0 + + video_path = "../../storage/tasks/123/combined.mp4" + audio_path = "../../storage/tasks/123/final_audio.mp3" + subtitle_path = "../../storage/tasks/123/subtitle.srt" + output_file = "../../storage/tasks/123/final-123.mp4" + + generate_video_v2(video_path=video_path, + audio_path=audio_path, + subtitle_path=subtitle_path, + output_file=output_file, + params=cfg, + list_script=list_script, + ) diff --git a/app/services/video_service.py b/app/services/video_service.py new file mode 100644 index 0000000..2a0a9a6 --- /dev/null +++ b/app/services/video_service.py @@ -0,0 +1,58 @@ +import os +from uuid import uuid4 +from loguru import logger +from typing import Dict, List, Optional, Tuple + +from app.services import material +from app.models.schema import VideoClipParams +from app.utils import utils + + +class VideoService: + @staticmethod + async def crop_video( + video_path: str, + video_script: List[dict] + ) -> Tuple[str, Dict[str, str]]: + """ + 裁剪视频服务 + + Args: + video_path: 视频文件路径 + video_script: 视频脚本列表 + + Returns: + Tuple[str, Dict[str, str]]: (task_id, 裁剪后的视频片段字典) + 视频片段字典格式: {timestamp: video_path} + """ + try: + task_id = str(uuid4()) + + # 从脚本中提取时间戳列表 + time_list = [scene['timestamp'] for scene in video_script] + + # 调用裁剪服务 + subclip_videos = material.clip_videos( + task_id=task_id, + timestamp_terms=time_list, + origin_video=video_path + ) + + if subclip_videos is None: + raise ValueError("裁剪视频失败") + + # 更新脚本中的视频路径 + for scene in video_script: + try: + scene['path'] = subclip_videos[scene['timestamp']] + except KeyError as err: + logger.error(f"更新视频路径失败: {err}") + + logger.debug(f"裁剪视频成功,共生成 {len(time_list)} 个视频片段") + logger.debug(f"视频片段路径: {subclip_videos}") + + return task_id, subclip_videos + + except Exception as e: + logger.exception("裁剪视频失败") + raise \ No newline at end of file diff --git a/app/services/voice.py b/app/services/voice.py index 02245f6..5d6aa99 100644 --- a/app/services/voice.py +++ b/app/services/voice.py @@ -11,6 +11,7 @@ from xml.sax.saxutils import unescape from edge_tts import submaker, SubMaker from moviepy.video.tools import subtitles +import time from app.config import config from app.utils import utils @@ -989,6 +990,9 @@ def get_all_azure_voices(filter_locals=None) -> list[str]: Name: zh-CN-XiaoxiaoMultilingualNeural-V2 Gender: Female + +Name: zh-CN-YunxiNeural-V2 +Gender: Male """.strip() voices = [] name = "" @@ -1034,8 +1038,8 @@ def is_azure_v2_voice(voice_name: str): def tts( text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str ) -> [SubMaker, None]: - # if is_azure_v2_voice(voice_name): - # return azure_tts_v2(text, voice_name, voice_file) + if is_azure_v2_voice(voice_name): + return azure_tts_v2(text, voice_name, voice_file) return azure_tts_v1(text, voice_name, voice_rate, voice_pitch, voice_file) @@ -1068,33 +1072,47 @@ def azure_tts_v1( pitch_str = convert_pitch_to_percent(voice_pitch) for i in range(3): try: - logger.info(f"start, voice name: {voice_name}, try: {i + 1}") + logger.info(f"第 {i+1} 次使用 edge_tts 生成音频") - async def _do() -> SubMaker: + async def _do() -> tuple[SubMaker, bytes]: communicate = edge_tts.Communicate(text, voice_name, rate=rate_str, pitch=pitch_str, proxy=config.proxy.get("http")) sub_maker = edge_tts.SubMaker() - with open(voice_file, "wb") as file: - async for chunk in communicate.stream(): - if chunk["type"] == "audio": - file.write(chunk["data"]) - elif chunk["type"] == "WordBoundary": - sub_maker.create_sub( - (chunk["offset"], chunk["duration"]), chunk["text"] - ) - return sub_maker - # 判断音频文件是否一件存在 + audio_data = bytes() # 用于存储音频数据 + + async for chunk in communicate.stream(): + if chunk["type"] == "audio": + audio_data += chunk["data"] + elif chunk["type"] == "WordBoundary": + sub_maker.create_sub( + (chunk["offset"], chunk["duration"]), chunk["text"] + ) + return sub_maker, audio_data + + # 判断音频文件是否已存在 if os.path.exists(voice_file): logger.info(f"voice file exists, skip tts: {voice_file}") continue - sub_maker = asyncio.run(_do()) - if not sub_maker or not sub_maker.subs: - logger.warning(f"failed, sub_maker is None or sub_maker.subs is None") + + # 获取音频数据和字幕信息 + sub_maker, audio_data = asyncio.run(_do()) + + # 验证数据是否有效 + if not sub_maker or not sub_maker.subs or not audio_data: + logger.warning(f"failed, invalid data generated") + if i < 2: + time.sleep(1) continue + # 数据有效,写入文件 + with open(voice_file, "wb") as file: + file.write(audio_data) + logger.info(f"completed, output file: {voice_file}") return sub_maker except Exception as e: - logger.error(f"failed, error: {str(e)}") + logger.error(f"生成音频文件时出错: {str(e)}") + if i < 2: + time.sleep(1) return None @@ -1130,14 +1148,6 @@ def _format_duration_to_offset(duration) -> int: sub_maker = SubMaker() def speech_synthesizer_word_boundary_cb(evt: speechsdk.SessionEventArgs): - # print('WordBoundary event:') - # print('\tBoundaryType: {}'.format(evt.boundary_type)) - # print('\tAudioOffset: {}ms'.format((evt.audio_offset + 5000))) - # print('\tDuration: {}'.format(evt.duration)) - # print('\tText: {}'.format(evt.text)) - # print('\tTextOffset: {}'.format(evt.text_offset)) - # print('\tWordLength: {}'.format(evt.word_length)) - duration = _format_duration_to_offset(str(evt.duration)) offset = _format_duration_to_offset(evt.audio_offset) sub_maker.subs.append(evt.text) @@ -1183,9 +1193,13 @@ def speech_synthesizer_word_boundary_cb(evt: speechsdk.SessionEventArgs): logger.error( f"azure v2 speech synthesis error: {cancellation_details.error_details}" ) + if i < 2: # 如果不是最后一次重试,则等待1秒 + time.sleep(1) logger.info(f"completed, output file: {voice_file}") except Exception as e: logger.error(f"failed, error: {str(e)}") + if i < 2: # 如果不是最后一次重试,则等待1秒 + time.sleep(1) return None @@ -1443,7 +1457,7 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f if sub_maker is None: logger.error(f"无法为时间戳 {timestamp} 生成音频; " - f"如果您在中国,请使用VPN。或者手动选择 zh-CN-YunyangNeural 等角色;" + f"如果您在中国,请使用VPN; " f"或者使用其他 tts 引擎") continue @@ -1460,17 +1474,12 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f voice_name = parse_voice_name(voice_name) print(voice_name) - with open("../../resource/scripts/test.json", 'r', encoding='utf-8') as f: + with open("../../resource/scripts/2024-1203-205442.json", 'r', encoding='utf-8') as f: data = json.load(f) - audio_files, sub_maker_list = tts_multiple(task_id="12312312", list_script=data, voice_name=voice_name, voice_rate=1) + audio_files, sub_maker_list = tts_multiple(task_id="12312312", list_script=data, voice_name=voice_name, voice_rate=1, voice_pitch=1) full_text = " ".join([item['narration'] for item in data if not item['OST']]) subtitle_file = os.path.join(utils.task_dir("12312312"), "subtitle_multiple.srt") create_subtitle_from_multiple(full_text, sub_maker_list, data, subtitle_file) print(f"生成的音频文件列表: {audio_files}") - print(f"生成的字幕文件: {subtitle_file}") - - # text = " ".join([item['narration'] for item in data]) - # sub_marks = tts(text=text, voice_name=voice_name, voice_rate=1, voice_file="../../storage/tasks/12312312/aaa.mp3") - # create_subtitle(text=text, sub_maker=sub_marks, subtitle_file="../../storage/tasks/12312312/subtitle_123.srt") diff --git a/app/services/youtube_service.py b/app/services/youtube_service.py new file mode 100644 index 0000000..e4a7a79 --- /dev/null +++ b/app/services/youtube_service.py @@ -0,0 +1,146 @@ +import yt_dlp +import os +from typing import List, Dict, Optional, Tuple +from loguru import logger +from uuid import uuid4 + +from app.utils import utils +from app.services import video as VideoService + + +class YoutubeService: + def __init__(self): + self.supported_formats = ['mp4', 'mkv', 'webm', 'flv', 'avi'] + + def _get_video_formats(self, url: str) -> List[Dict]: + """获取视频可用的格式列表""" + ydl_opts = { + 'quiet': True, + 'no_warnings': True + } + + try: + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + info = ydl.extract_info(url, download=False) + formats = info.get('formats', []) + + format_list = [] + for f in formats: + format_info = { + 'format_id': f.get('format_id', 'N/A'), + 'ext': f.get('ext', 'N/A'), + 'resolution': f.get('format_note', 'N/A'), + 'filesize': f.get('filesize', 'N/A'), + 'vcodec': f.get('vcodec', 'N/A'), + 'acodec': f.get('acodec', 'N/A') + } + format_list.append(format_info) + + return format_list + except Exception as e: + logger.error(f"获取视频格式失败: {str(e)}") + raise + + def _validate_format(self, output_format: str) -> None: + """验证输出格式是否支持""" + if output_format.lower() not in self.supported_formats: + raise ValueError( + f"不支持的视频格式: {output_format}。" + f"支持的格式: {', '.join(self.supported_formats)}" + ) + + async def download_video( + self, + url: str, + resolution: str, + output_format: str = 'mp4', + rename: Optional[str] = None + ) -> Tuple[str, str, str]: + """ + 下载指定分辨率的视频 + + Args: + url: YouTube视频URL + resolution: 目标分辨率 ('2160p', '1440p', '1080p', '720p' etc.) + 注意:对于类似'1080p60'的输入会被处理为'1080p' + output_format: 输出视频格式 + rename: 可选的重命名 + + Returns: + Tuple[str, str, str]: (task_id, output_path, filename) + """ + try: + task_id = str(uuid4()) + self._validate_format(output_format) + + # 标准化分辨率格式 + base_resolution = resolution.split('p')[0] + 'p' + + # 获取所有可用格式 + formats = self._get_video_formats(url) + + # 查找指定分辨率的最佳视频格式 + target_format = None + for fmt in formats: + fmt_resolution = fmt['resolution'] + # 将格式的分辨率也标准化后进行比较 + if fmt_resolution != 'N/A': + fmt_base_resolution = fmt_resolution.split('p')[0] + 'p' + if fmt_base_resolution == base_resolution and fmt['vcodec'] != 'none': + target_format = fmt + break + + if target_format is None: + # 收集可用分辨率时也进行标准化 + available_resolutions = set( + fmt['resolution'].split('p')[0] + 'p' + for fmt in formats + if fmt['resolution'] != 'N/A' and fmt['vcodec'] != 'none' + ) + raise ValueError( + f"未找到 {base_resolution} 分辨率的视频。" + f"可用分辨率: {', '.join(sorted(available_resolutions))}" + ) + + # 创建输出目录 + output_dir = utils.video_dir() + os.makedirs(output_dir, exist_ok=True) + + # 设置下载选项 + if rename: + # 如果指定了重命名,直接使用新名字 + filename = f"{rename}.{output_format}" + output_template = os.path.join(output_dir, filename) + else: + # 否则使用任务ID和原标题 + output_template = os.path.join(output_dir, f'{task_id}_%(title)s.%(ext)s') + + ydl_opts = { + 'format': f"{target_format['format_id']}+bestaudio[ext=m4a]/best", + 'outtmpl': output_template, + 'merge_output_format': output_format.lower(), + 'postprocessors': [{ + 'key': 'FFmpegVideoConvertor', + 'preferedformat': output_format.lower(), + }] + } + + # 执行下载 + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + info = ydl.extract_info(url, download=True) + if rename: + # 如果指定了重命名,使用新文件名 + output_path = output_template + filename = os.path.basename(output_path) + else: + # 否则使用原始标题 + video_title = info.get('title', task_id) + filename = f"{task_id}_{video_title}.{output_format}" + output_path = os.path.join(output_dir, filename) + + logger.info(f"视频下载成功: {output_path}") + return task_id, output_path, filename + + except Exception as e: + logger.exception("下载视频失败") + raise diff --git a/app/test/test_moviepy.py b/app/test/test_moviepy.py index d37d518..79d93c2 100644 --- a/app/test/test_moviepy.py +++ b/app/test/test_moviepy.py @@ -1,21 +1,32 @@ """ -使用 moviepy 库剪辑指定时间戳视频 +使用 moviepy 库剪辑指定时间戳视频,支持时分秒毫秒精度 """ from moviepy.editor import VideoFileClip from datetime import datetime +import os def time_str_to_seconds(time_str: str) -> float: """ 将时间字符串转换为秒数 参数: - time_str: 格式为"MM:SS"的时间字符串 + time_str: 格式为"HH:MM:SS,mmm"的时间字符串,例如"00:01:23,456" 返回: - 转换后的秒数 + 转换后的秒数(float) """ - time_obj = datetime.strptime(time_str, "%M:%S") - return time_obj.minute * 60 + time_obj.second + try: + # 分离时间和毫秒 + time_part, ms_part = time_str.split(',') + # 转换时分秒 + time_obj = datetime.strptime(time_part, "%H:%M:%S") + # 计算总秒数 + total_seconds = time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second + # 添加毫秒部分 + total_seconds += int(ms_part) / 1000 + return total_seconds + except ValueError as e: + raise ValueError("时间格式错误,请使用 HH:MM:SS,mmm 格式,例如 00:01:23,456") from e def format_duration(seconds: float) -> str: @@ -24,40 +35,88 @@ def format_duration(seconds: float) -> str: 参数: seconds: 秒数 返回: - 格式化的时间字符串 (MM:SS) + 格式化的时间字符串 (HH:MM:SS,mmm) """ - minutes = int(seconds // 60) - remaining_seconds = int(seconds % 60) - return f"{minutes:02d}:{remaining_seconds:02d}" + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + seconds_remain = seconds % 60 + whole_seconds = int(seconds_remain) + milliseconds = int((seconds_remain - whole_seconds) * 1000) + + return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}" -def cut_video(video_path: str, start_time: str, end_time: str) -> None: +def cut_video(video_path: str, start_time: str, end_time: str, output_path: str) -> None: """ 剪辑视频 参数: video_path: 视频文件路径 - start_time: 开始时间 (格式: "MM:SS") - end_time: 结束时间 (格式: "MM:SS") + start_time: 开始时间 (格式: "HH:MM:SS,mmm") + end_time: 结束时间 (格式: "HH:MM:SS,mmm") + output_path: 输出文件路径 """ - # 转换时间字符串为秒数 - start_seconds = time_str_to_seconds(start_time) - end_seconds = time_str_to_seconds(end_time) - - # 加载视频文件 - video = VideoFileClip(video_path) - - # 计算剪辑时长 - clip_duration = end_seconds - start_seconds - print(f"原视频总长度: {format_duration(video.duration)}") - print(f"剪辑时长: {format_duration(clip_duration)}") - - # 剪辑视频 - video = video.subclip(start_seconds, end_seconds) - video.write_videofile("../../resource/videos/cut_video2.mp4") - - # 释放资源 - video.close() + try: + # 确保输出目录存在 + output_dir = os.path.dirname(output_path) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + # 如果输出文件已存在,先尝试删除 + if os.path.exists(output_path): + try: + os.remove(output_path) + except PermissionError: + print(f"无法删除已存在的文件:{output_path},请确保文件未被其他程序占用") + return + + # 转换时间字符串为秒数 + start_seconds = time_str_to_seconds(start_time) + end_seconds = time_str_to_seconds(end_time) + + # 加载视频文件 + video = VideoFileClip(video_path) + + # 验证时间范围 + if start_seconds >= video.duration or end_seconds > video.duration: + raise ValueError(f"剪辑时间超出视频长度!视频总长度为: {format_duration(video.duration)}") + + if start_seconds >= end_seconds: + raise ValueError("结束时间必须大于开始时间!") + + # 计算剪辑时长 + clip_duration = end_seconds - start_seconds + print(f"原视频总长度: {format_duration(video.duration)}") + print(f"剪辑时长: {format_duration(clip_duration)}") + print(f"剪辑区间: {start_time} -> {end_time}") + + # 剪辑视频 + video = video.subclip(start_seconds, end_seconds) + + # 添加错误处理的写入过程 + try: + video.write_videofile( + output_path, + codec='libx264', + audio_codec='aac', + temp_audiofile='temp-audio.m4a', + remove_temp=True + ) + except IOError as e: + print(f"写入视频文件时发生错误:{str(e)}") + raise + finally: + # 确保资源被释放 + video.close() + + except Exception as e: + print(f"视频剪辑过程中发生错误:{str(e)}") + raise if __name__ == "__main__": - cut_video("../../resource/videos/best.mp4", "00:40", "02:40") + cut_video( + video_path="/Users/apple/Desktop/NarratoAI/resource/videos/duanju_yuansp.mp4", + start_time="00:00:00,789", + end_time="00:02:00,123", + output_path="/Users/apple/Desktop/NarratoAI/resource/videos/duanju_yuansp_cut3.mp4" + ) diff --git a/app/test/test_qwen.py b/app/test/test_qwen.py new file mode 100644 index 0000000..2a69225 --- /dev/null +++ b/app/test/test_qwen.py @@ -0,0 +1,105 @@ +import os +import traceback +import json +from openai import OpenAI +from pydantic import BaseModel +from typing import List +from app.utils import utils +from app.services.subtitle import extract_audio_and_create_subtitle + + +class Step(BaseModel): + timestamp: str + picture: str + narration: str + OST: int + new_timestamp: str + +class MathReasoning(BaseModel): + result: List[Step] + + +def chat_with_qwen(prompt: str, system_message: str, subtitle_path: str) -> str: + """ + 与通义千问AI模型进行对话 + + Args: + prompt (str): 用户输入的问题或提示 + system_message (str): 系统提示信息,用于设定AI助手的行为。默认为"You are a helpful assistant." + subtitle_path (str): 字幕文件路径 + Returns: + str: AI助手的回复内容 + + Raises: + Exception: 当API调用失败时抛出异常 + """ + try: + client = OpenAI( + api_key="sk-a1acd853d88d41d3ae92777d7bfa2612", + base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", + ) + + # 读取字幕文件 + with open(subtitle_path, "r", encoding="utf-8") as file: + subtitle_content = file.read() + + completion = client.chat.completions.create( + model="qwen-turbo-2024-11-01", + messages=[ + {'role': 'system', 'content': system_message}, + {'role': 'user', 'content': prompt + subtitle_content} + ] + ) + return completion.choices[0].message.content + + except Exception as e: + error_message = f"调用千问API时发生错误:{str(e)}" + print(error_message) + print("请参考文档:https://help.aliyun.com/zh/model-studio/developer-reference/error-code") + raise Exception(error_message) + + +# 使用示例 +if __name__ == "__main__": + try: + video_path = utils.video_dir("duanju_yuansp.mp4") + # # 判断视频是否存在 + # if not os.path.exists(video_path): + # print(f"视频文件不存在:{video_path}") + # exit(1) + # 提取字幕 + subtitle_path = os.path.join(utils.video_dir(""), f"duanju_yuan.srt") + extract_audio_and_create_subtitle(video_file=video_path, subtitle_file=subtitle_path) + # 分析字幕 + system_message = """ + 你是一个视频srt字幕分析剪辑器, 输入视频的srt字幕, 分析其中的精彩且尽可能连续的片段并裁剪出来, 注意确保文字与时间戳的正确匹配。 + 输出需严格按照如下 json 格式: + [ + { + "timestamp": "00:00:50,020-00,01:44,000", + "picture": "画面1", + "narration": "播放原声", + "OST": 0, + "new_timestamp": "00:00:00,000-00:00:54,020" + }, + { + "timestamp": "01:49-02:30", + "picture": "画面2", + "narration": "播放原声", + "OST": 2, + "new_timestamp": "00:54-01:35" + }, + ] + """ + prompt = "字幕如下:\n" + response = chat_with_qwen(prompt, system_message, subtitle_path) + print(response) + # 保存json,注意json中是时间戳需要转换为 分:秒(现在的时间是 "timestamp": "00:00:00,020-00:00:01,660", 需要转换为 "timestamp": "00:00-01:66") + # response = json.loads(response) + # for item in response: + # item["timestamp"] = item["timestamp"].replace(":", "-") + # with open(os.path.join(utils.video_dir(""), "duanju_yuan.json"), "w", encoding="utf-8") as file: + # json.dump(response, file, ensure_ascii=False) + + except Exception as e: + print(traceback.format_exc()) diff --git a/app/utils/vision_analyzer.py b/app/utils/gemini_analyzer.py similarity index 84% rename from app/utils/vision_analyzer.py rename to app/utils/gemini_analyzer.py index 06342d7..07306c5 100644 --- a/app/utils/vision_analyzer.py +++ b/app/utils/gemini_analyzer.py @@ -10,6 +10,7 @@ import google.generativeai as genai import PIL.Image import traceback +from app.utils import utils class VisionAnalyzer: @@ -146,14 +147,34 @@ def save_results_to_txt(self, results: List[Dict], output_dir: str): response_text = result['response'] image_paths = result['image_paths'] - img_name_start = Path(image_paths[0]).stem.split('_')[-1] - img_name_end = Path(image_paths[-1]).stem.split('_')[-1] - txt_path = os.path.join(output_dir, f"frame_{img_name_start}_{img_name_end}.txt") + # 从文件名中提取时间戳并转换为标准格式 + def format_timestamp(img_path): + # 从文件名中提取时间部分 + timestamp = Path(img_path).stem.split('_')[-1] + try: + # 将时间转换为秒 + seconds = utils.time_to_seconds(timestamp.replace('_', ':')) + # 转换为 HH:MM:SS,mmm 格式 + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + seconds_remainder = seconds % 60 + whole_seconds = int(seconds_remainder) + milliseconds = int((seconds_remainder - whole_seconds) * 1000) + + return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}" + except Exception as e: + logger.error(f"时间戳格式转换错误: {timestamp}, {str(e)}") + return timestamp + + start_timestamp = format_timestamp(image_paths[0]) + end_timestamp = format_timestamp(image_paths[-1]) + + txt_path = os.path.join(output_dir, f"frame_{start_timestamp}_{end_timestamp}.txt") # 保存结果到txt文件 with open(txt_path, 'w', encoding='utf-8') as f: f.write(response_text.strip()) - print(f"已保存分析结果到: {txt_path}") + logger.info(f"已保存分析结果到: {txt_path}") def load_images(self, image_paths: List[str]) -> List[PIL.Image.Image]: """ diff --git a/app/utils/qwenvl_analyzer.py b/app/utils/qwenvl_analyzer.py new file mode 100644 index 0000000..54e6e36 --- /dev/null +++ b/app/utils/qwenvl_analyzer.py @@ -0,0 +1,265 @@ +import json +from typing import List, Union, Dict +import os +from pathlib import Path +from loguru import logger +from tqdm import tqdm +import asyncio +from tenacity import retry, stop_after_attempt, RetryError, wait_exponential +from openai import OpenAI +import PIL.Image +import base64 +import io +import traceback + + +class QwenAnalyzer: + """千问视觉分析器类""" + + def __init__(self, model_name: str = "qwen-vl-max-latest", api_key: str = None, base_url: str = None): + """ + 初始化千问视觉分析器 + + Args: + model_name: 模型名称,默认使用 qwen-vl-max-latest + api_key: 阿里云API密钥 + base_url: API基础URL,如果为None则使用默认值 + """ + if not api_key: + raise ValueError("必须提供API密钥") + + self.model_name = model_name + self.api_key = api_key + self.base_url = base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1" + + # 配置API客户端 + self._configure_client() + + def _configure_client(self): + """ + 配置API客户端 + 使用最简化的参数配置,避免不必要的参数 + """ + try: + self.client = OpenAI( + api_key=self.api_key, + base_url=self.base_url + ) + except Exception as e: + logger.error(f"初始化OpenAI客户端失败: {str(e)}") + raise + + def _image_to_base64(self, image: PIL.Image.Image) -> str: + """ + 将PIL图片对象转换为base64字符串 + """ + buffered = io.BytesIO() + image.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode("utf-8") + + @retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=4, max=10) + ) + async def _generate_content_with_retry(self, prompt: str, batch: List[PIL.Image.Image]): + """使用重试机制的内部方法来调用千问API""" + try: + # 构建消息内容 + content = [] + + # 添加图片 + for img in batch: + base64_image = self._image_to_base64(img) + content.append({ + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_image}" + } + }) + + # 添加文本提示 + content.append({ + "type": "text", + "text": prompt + }) + + # 调用API + response = await asyncio.to_thread( + self.client.chat.completions.create, + model=self.model_name, + messages=[{ + "role": "user", + "content": content + }] + ) + + return response.choices[0].message.content + + except Exception as e: + logger.error(f"API调用错误: {str(e)}") + raise RetryError("API调用失败") + + async def analyze_images(self, + images: Union[List[str], List[PIL.Image.Image]], + prompt: str, + batch_size: int = 5) -> List[Dict]: + """ + 批量分析多张图片 + Args: + images: 图片路径列表或PIL图片对象列表 + prompt: 分析提示词 + batch_size: 批处理大小 + Returns: + 分析结果列表 + """ + try: + # 保存原始图片路径(如果是路径列表的话) + original_paths = images if isinstance(images[0], str) else None + + # 加载图片 + if isinstance(images[0], str): + logger.info("正在加载图片...") + images = self.load_images(images) + + # 验证图片列表 + if not images: + raise ValueError("图片列表为空") + + # 验证每个图片对象 + valid_images = [] + valid_paths = [] + for i, img in enumerate(images): + if not isinstance(img, PIL.Image.Image): + logger.error(f"无效的图片对象,索引 {i}: {type(img)}") + continue + valid_images.append(img) + if original_paths: + valid_paths.append(original_paths[i]) + + if not valid_images: + raise ValueError("没有有效的图片对象") + + images = valid_images + results = [] + total_batches = (len(images) + batch_size - 1) // batch_size + + with tqdm(total=total_batches, desc="分析进度") as pbar: + for i in range(0, len(images), batch_size): + batch = images[i:i + batch_size] + batch_paths = valid_paths[i:i + batch_size] if valid_paths else None + retry_count = 0 + + while retry_count < 3: + try: + # 在每个批次处理前��加小延迟 + if i > 0: + await asyncio.sleep(2) + + # 确保每个批次的图片都是有效的 + valid_batch = [img for img in batch if isinstance(img, PIL.Image.Image)] + if not valid_batch: + raise ValueError(f"批次 {i // batch_size} 中没有有效的图片") + + response = await self._generate_content_with_retry(prompt, valid_batch) + result_dict = { + 'batch_index': i // batch_size, + 'images_processed': len(valid_batch), + 'response': response, + 'model_used': self.model_name + } + + # 添加图片路径信息(如果有的话) + if batch_paths: + result_dict['image_paths'] = batch_paths + + results.append(result_dict) + break + + except Exception as e: + retry_count += 1 + error_msg = f"批次 {i // batch_size} 处理出错: {str(e)}" + logger.error(error_msg) + + if retry_count >= 3: + results.append({ + 'batch_index': i // batch_size, + 'images_processed': len(batch), + 'error': error_msg, + 'model_used': self.model_name, + 'image_paths': batch_paths if batch_paths else [] + }) + else: + logger.info(f"批次 {i // batch_size} 处理失败,等待60秒后重试当前批次...") + await asyncio.sleep(60) + + pbar.update(1) + + return results + + except Exception as e: + error_msg = f"图片分析过程中发生错误: {str(e)}\n{traceback.format_exc()}" + logger.error(error_msg) + raise Exception(error_msg) + + def save_results_to_txt(self, results: List[Dict], output_dir: str): + """将分析结果保存到txt文件""" + # 确保输出目录存在 + os.makedirs(output_dir, exist_ok=True) + + for i, result in enumerate(results): + response_text = result['response'] + + # 如果有图片路径信息,���用它来生成文件名 + if result.get('image_paths'): + image_paths = result['image_paths'] + img_name_start = Path(image_paths[0]).stem.split('_')[-1] + img_name_end = Path(image_paths[-1]).stem.split('_')[-1] + file_name = f"frame_{img_name_start}_{img_name_end}.txt" + else: + # 如果没有路径信息,使用批次索引 + file_name = f"batch_{result['batch_index']}.txt" + + txt_path = os.path.join(output_dir, file_name) + + # 保存结果到txt文件 + with open(txt_path, 'w', encoding='utf-8') as f: + f.write(response_text.strip()) + logger.info(f"已保存分析结果到: {txt_path}") + + def load_images(self, image_paths: List[str]) -> List[PIL.Image.Image]: + """ + 加载多张图片 + Args: + image_paths: 图片路径列表 + Returns: + 加载后的PIL Image对象列表 + """ + images = [] + failed_images = [] + + for img_path in image_paths: + try: + if not os.path.exists(img_path): + logger.error(f"图片文件不存在: {img_path}") + failed_images.append(img_path) + continue + + img = PIL.Image.open(img_path) + # 确保图片被完全加载 + img.load() + # 转换为RGB模式 + if img.mode != 'RGB': + img = img.convert('RGB') + images.append(img) + + except Exception as e: + logger.error(f"无法加载图片 {img_path}: {str(e)}") + failed_images.append(img_path) + + if failed_images: + logger.warning(f"以下图片加载失败:\n{json.dumps(failed_images, indent=2, ensure_ascii=False)}") + + if not images: + raise ValueError("没有成功加载任何图片") + + return images diff --git a/app/utils/script_generator.py b/app/utils/script_generator.py index e36064a..6493e82 100644 --- a/app/utils/script_generator.py +++ b/app/utils/script_generator.py @@ -374,22 +374,65 @@ def _get_default_prompt(self) -> str: 记住:要敢于用"温和的违反"制造笑点,但要把握好尺度,让观众在轻松愉快中感受到乐趣。""" def calculate_duration_and_word_count(self, time_range: str) -> int: + """ + 计算时间范围的持续时长并估算合适的字数 + + Args: + time_range: 时间范围字符串,格式为 "HH:MM:SS,mmm-HH:MM:SS,mmm" + 例如: "00:00:50,100-00:01:21,500" + + Returns: + int: 估算的合适字数 + 基于经验公式: 每0.35秒可以说一个字 + 例如: 10秒可以说约28个字 (10/0.35≈28.57) + """ try: start_str, end_str = time_range.split('-') - - def time_to_seconds(time_str): - minutes, seconds = map(int, time_str.split(':')) - return minutes * 60 + seconds - + + def time_to_seconds(time_str: str) -> float: + """ + 将时间字符串转换为秒数(带毫秒精度) + + Args: + time_str: 时间字符串,格式为 "HH:MM:SS,mmm" + 例如: "00:00:50,100" 表示50.1秒 + + Returns: + float: 转换后的秒数(带毫秒) + """ + try: + # 处理毫秒部分 + time_part, ms_part = time_str.split(',') + hours, minutes, seconds = map(int, time_part.split(':')) + milliseconds = int(ms_part) + + # 转换为秒 + total_seconds = (hours * 3600) + (minutes * 60) + seconds + (milliseconds / 1000) + return total_seconds + + except ValueError as e: + logger.warning(f"时间格式解析错误: {time_str}, error: {e}") + return 0.0 + + # 计算开始和结束时间的秒数 start_seconds = time_to_seconds(start_str) end_seconds = time_to_seconds(end_str) + + # 计算持续时间(秒) duration = end_seconds - start_seconds - word_count = int(duration / 0.35) - + + # 根据经验公式计算字数: 每0.5秒一个字 + word_count = int(duration / 0.4) + + # 确保字数在合理范围内 + word_count = max(10, min(word_count, 500)) # 限制在10-500字之间 + + logger.debug(f"时间范围 {time_range} 的持续时间为 {duration:.3f}秒, 估算字数: {word_count}") return word_count + except Exception as e: - logger.info(f"时间格式转换错误: {traceback.format_exc()}") - return 100 + logger.warning(f"字数计算错误: {traceback.format_exc()}") + return 100 # 发生错误时返回默认字数 def process_frames(self, frame_content_list: List[Dict]) -> List[Dict]: for frame_content in frame_content_list: @@ -406,22 +449,47 @@ def process_frames(self, frame_content_list: List[Dict]) -> List[Dict]: def _save_results(self, frame_content_list: List[Dict]): """保存处理结果,并添加新的时间戳""" try: - # 转换秒数为 MM:SS 格式 - def seconds_to_time(seconds): - minutes = seconds // 60 - remaining_seconds = seconds % 60 - return f"{minutes:02d}:{remaining_seconds:02d}" + def format_timestamp(seconds: float) -> str: + """将秒数转换为 HH:MM:SS,mmm 格式""" + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + seconds_remainder = seconds % 60 + whole_seconds = int(seconds_remainder) + milliseconds = int((seconds_remainder - whole_seconds) * 1000) + + return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}" # 计算新的时间戳 - current_time = 0 # 当前时间点(秒) + current_time = 0.0 # 当前时间点(秒,包含毫秒) for frame in frame_content_list: # 获取原始时间戳的持续时间 start_str, end_str = frame['timestamp'].split('-') - def time_to_seconds(time_str): - minutes, seconds = map(int, time_str.split(':')) - return minutes * 60 + seconds + def time_to_seconds(time_str: str) -> float: + """将时间字符串转换为秒数(包含毫秒)""" + try: + if ',' in time_str: + time_part, ms_part = time_str.split(',') + ms = float(ms_part) / 1000 + else: + time_part = time_str + ms = 0 + + parts = time_part.split(':') + if len(parts) == 3: # HH:MM:SS + h, m, s = map(float, parts) + seconds = h * 3600 + m * 60 + s + elif len(parts) == 2: # MM:SS + m, s = map(float, parts) + seconds = m * 60 + s + else: # SS + seconds = float(parts[0]) + + return seconds + ms + except Exception as e: + logger.error(f"时间格式转换错误 {time_str}: {str(e)}") + return 0.0 # 计算当前片段的持续时间 start_seconds = time_to_seconds(start_str) @@ -429,8 +497,8 @@ def time_to_seconds(time_str): duration = end_seconds - start_seconds # 设置新的时间戳 - new_start = seconds_to_time(current_time) - new_end = seconds_to_time(current_time + duration) + new_start = format_timestamp(current_time) + new_end = format_timestamp(current_time + duration) frame['new_timestamp'] = f"{new_start}-{new_end}" # 更新当前时间点 @@ -443,7 +511,7 @@ def time_to_seconds(time_str): with open(file_name, 'w', encoding='utf-8') as file: json.dump(frame_content_list, file, ensure_ascii=False, indent=4) - logger.info(f"保存脚本成功,总时长: {seconds_to_time(current_time)}") + logger.info(f"保存脚本成功,总时长: {format_timestamp(current_time)}") except Exception as e: logger.error(f"保存结果时发生错误: {str(e)}\n{traceback.format_exc()}") diff --git a/app/utils/utils.py b/app/utils/utils.py index 307823c..db0d248 100644 --- a/app/utils/utils.py +++ b/app/utils/utils.py @@ -40,7 +40,7 @@ def serialize(o): # 如果对象是二进制数据,转换为base64编码的字符串 elif isinstance(o, bytes): return "*** binary data ***" - # 如果对象是字典,递归处理每个键值对 + # 如果象是字典,递归处理每个键值对 elif isinstance(o, dict): return {k: serialize(v) for k, v in o.items()} # 如果对象是列表或元组,递归处理每个元素 @@ -56,7 +56,7 @@ def serialize(o): # 使用serialize函数处理输入对象 serialized_obj = serialize(obj) - # 序列化处理后的对象为JSON���符串 + # 序列化处理后的对象为JSON符串 return json.dumps(serialized_obj, ensure_ascii=False, indent=4) except Exception as e: return None @@ -126,6 +126,15 @@ def public_dir(sub_dir: str = ""): return d +def srt_dir(sub_dir: str = ""): + d = resource_dir(f"srt") + if sub_dir: + d = os.path.join(d, sub_dir) + if not os.path.exists(d): + os.makedirs(d) + return d + + def run_in_background(func, *args, **kwargs): def run(): try: @@ -302,15 +311,49 @@ def get_current_country(): def time_to_seconds(time_str: str) -> float: - parts = time_str.split(':') - if len(parts) == 2: - m, s = map(float, parts) - return m * 60 + s - elif len(parts) == 3: - h, m, s = map(float, parts) - return h * 3600 + m * 60 + s - else: - raise ValueError(f"Invalid time format: {time_str}") + """ + 将时间字符串转换为秒数,支持多种格式: + - "HH:MM:SS,mmm" -> 小时:分钟:秒,毫秒 + - "MM:SS,mmm" -> 分钟:秒,毫秒 + - "SS,mmm" -> 秒,毫秒 + - "SS-mmm" -> 秒-毫秒 + + Args: + time_str: 时间字符串 + + Returns: + float: 转换后的秒数(包含毫秒) + """ + try: + # 处理带有'-'的毫秒格式 + if '-' in time_str: + time_part, ms_part = time_str.split('-') + ms = float(ms_part) / 1000 + # 处理带有','的毫秒格式 + elif ',' in time_str: + time_part, ms_part = time_str.split(',') + ms = float(ms_part) / 1000 + else: + time_part = time_str + ms = 0 + + # 分割时间部分 + parts = time_part.split(':') + + if len(parts) == 3: # HH:MM:SS + h, m, s = map(float, parts) + seconds = h * 3600 + m * 60 + s + elif len(parts) == 2: # MM:SS + m, s = map(float, parts) + seconds = m * 60 + s + else: # SS + seconds = float(parts[0]) + + return seconds + ms + + except (ValueError, IndexError) as e: + logger.error(f"时间格式转换错误 {time_str}: {str(e)}") + return 0.0 def seconds_to_time(seconds: float) -> str: @@ -320,15 +363,25 @@ def seconds_to_time(seconds: float) -> str: def calculate_total_duration(scenes): + """ + 计算场景列表的总时长 + + Args: + scenes: 场景列表,每个场景包含 timestamp 字段,格式如 "00:00:28,350-00:00:41,000" + + Returns: + float: 总时长(秒) + """ total_seconds = 0 for scene in scenes: start, end = scene['timestamp'].split('-') - start_time = datetime.strptime(start, '%M:%S') - end_time = datetime.strptime(end, '%M:%S') + # 使用 time_to_seconds 函数处理更精确的时间格式 + start_seconds = time_to_seconds(start) + end_seconds = time_to_seconds(end) - duration = end_time - start_time - total_seconds += duration.total_seconds() + duration = end_seconds - start_seconds + total_seconds += duration return total_seconds @@ -451,7 +504,7 @@ def clear_keyframes_cache(video_path: str = None): return if video_path: - # ���理指定视频的缓存 + # 理指定视频的缓存 video_hash = md5(video_path + str(os.path.getmtime(video_path))) video_keyframes_dir = os.path.join(keyframes_dir, video_hash) if os.path.exists(video_keyframes_dir): @@ -520,3 +573,21 @@ def download_font(url: str, font_path: str): except Exception as e: logger.error(f"下载字体文件失败: {e}") raise + +def init_imagemagick(): + """初始化 ImageMagick 配置""" + try: + # 检查 ImageMagick 是否已安装 + import subprocess + result = subprocess.run(['magick', '-version'], capture_output=True, text=True) + if result.returncode != 0: + logger.error("ImageMagick 未安装或配置不正确") + return False + + # 设置 IMAGEMAGICK_BINARY 环境变量 + os.environ['IMAGEMAGICK_BINARY'] = 'magick' + + return True + except Exception as e: + logger.error(f"初始化 ImageMagick 失败: {str(e)}") + return False diff --git a/app/utils/video_processor_v2.py b/app/utils/video_processor_v2.py index 03bbb84..825306b 100644 --- a/app/utils/video_processor_v2.py +++ b/app/utils/video_processor_v2.py @@ -51,21 +51,34 @@ def preprocess_video(self) -> Generator[np.ndarray, None, None]: def detect_shot_boundaries(self, frames: List[np.ndarray], threshold: int = 30) -> List[int]: """ 使用帧差法检测镜头边界 - + Args: frames: 视频帧列表 - threshold: 差异阈值 - + threshold: 差异阈值,默认值调低为30 + Returns: List[int]: 镜头边界帧的索引列表 """ shot_boundaries = [] + if len(frames) < 2: # 添加帧数检查 + logger.warning("视频帧数过少,无法检测场景边界") + return [len(frames) - 1] # 返回最后一帧作为边界 + for i in range(1, len(frames)): prev_frame = cv2.cvtColor(frames[i - 1], cv2.COLOR_BGR2GRAY) curr_frame = cv2.cvtColor(frames[i], cv2.COLOR_BGR2GRAY) - diff = np.mean(np.abs(curr_frame.astype(int) - prev_frame.astype(int))) + + # 计算帧差 + diff = np.mean(np.abs(curr_frame.astype(float) - prev_frame.astype(float))) + if diff > threshold: shot_boundaries.append(i) + + # 如果没有检测到任何边界,至少返回最后一帧 + if not shot_boundaries: + logger.warning("未检测到场景边界,将视频作为单个场景处理") + shot_boundaries.append(len(frames) - 1) + return shot_boundaries def extract_keyframes(self, frames: List[np.ndarray], shot_boundaries: List[int]) -> Tuple[ @@ -113,12 +126,7 @@ def save_keyframes(self, keyframes: List[np.ndarray], keyframe_indices: List[int output_dir: str, desc: str = "保存关键帧") -> None: """ 保存关键帧到指定目录,文件名格式为:keyframe_帧序号_时间戳.jpg - - Args: - keyframes: 关键帧列表 - keyframe_indices: 关键帧索引列表 - output_dir: 输出目录 - desc: 进度条描述 + 时间戳精确到毫秒,格式为:HHMMSSmmm """ if not os.path.exists(output_dir): os.makedirs(output_dir) @@ -126,11 +134,13 @@ def save_keyframes(self, keyframes: List[np.ndarray], keyframe_indices: List[int for keyframe, frame_idx in tqdm(zip(keyframes, keyframe_indices), total=len(keyframes), desc=desc): + # 计算精确到毫秒的时间戳 timestamp = frame_idx / self.fps hours = int(timestamp // 3600) minutes = int((timestamp % 3600) // 60) seconds = int(timestamp % 60) - time_str = f"{hours:02d}{minutes:02d}{seconds:02d}" + milliseconds = int((timestamp % 1) * 1000) # 计算毫秒部分 + time_str = f"{hours:02d}{minutes:02d}{seconds:02d}{milliseconds:03d}" output_path = os.path.join(output_dir, f'keyframe_{frame_idx:06d}_{time_str}.jpg') @@ -138,11 +148,7 @@ def save_keyframes(self, keyframes: List[np.ndarray], keyframe_indices: List[int def extract_frames_by_numbers(self, frame_numbers: List[int], output_folder: str) -> None: """ - 根据指定的帧号提取帧,如果多个帧在同一秒内,只保留一个 - - Args: - frame_numbers: 要提取的帧号列表 - output_folder: 输出文件夹路径 + 根据指定的帧号提取帧,如果多个帧在同一毫秒内,只保留一个 """ if not frame_numbers: raise ValueError("未提供帧号列表") @@ -153,29 +159,31 @@ def extract_frames_by_numbers(self, frame_numbers: List[int], output_folder: str if not os.path.exists(output_folder): os.makedirs(output_folder) - # 用于记录已处理的时间戳(秒) - processed_seconds = set() + # 用于记录已处理的时间戳(毫秒) + processed_timestamps = set() for frame_number in tqdm(frame_numbers, desc="提取高清帧"): - # 计算时间戳(秒) - timestamp_seconds = int(frame_number / self.fps) + # 计算精确到毫秒的时间戳 + timestamp = frame_number / self.fps + timestamp_ms = int(timestamp * 1000) # 转换为毫秒 - # 如果这一秒已经处理过,跳过 - if timestamp_seconds in processed_seconds: + # 如果这一毫秒已经处理过,跳过 + if timestamp_ms in processed_timestamps: continue self.cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number) ret, frame = self.cap.read() if ret: - # 记录这一秒已经处理 - processed_seconds.add(timestamp_seconds) + # 记录这一毫秒已经处理 + processed_timestamps.add(timestamp_ms) # 计算时间戳字符串 - hours = int(timestamp_seconds // 3600) - minutes = int((timestamp_seconds % 3600) // 60) - seconds = int(timestamp_seconds % 60) - time_str = f"{hours:02d}{minutes:02d}{seconds:02d}" + hours = int(timestamp // 3600) + minutes = int((timestamp % 3600) // 60) + seconds = int(timestamp % 60) + milliseconds = int((timestamp % 1) * 1000) # 计算毫秒部分 + time_str = f"{hours:02d}{minutes:02d}{seconds:02d}{milliseconds:03d}" output_path = os.path.join(output_folder, f"keyframe_{frame_number:06d}_{time_str}.jpg") @@ -183,27 +191,34 @@ def extract_frames_by_numbers(self, frame_numbers: List[int], output_folder: str else: logger.info(f"无法读取帧 {frame_number}") - logger.info(f"共提取了 {len(processed_seconds)} 个不同时间戳的帧") + logger.info(f"共提取了 {len(processed_timestamps)} 个不同时间戳的帧") @staticmethod def extract_numbers_from_folder(folder_path: str) -> List[int]: """ 从文件夹中提取帧号 - + Args: folder_path: 关键帧文件夹路径 - + Returns: List[int]: 排序后的帧号列表 """ files = [f for f in os.listdir(folder_path) if f.endswith('.jpg')] - # 更新正则表达式以匹配新的文件名格式:keyframe_000123_010534.jpg - pattern = re.compile(r'keyframe_(\d+)_\d+\.jpg$') + # 更新正则表达式以匹配新的文件名格式:keyframe_000123_010534123.jpg + pattern = re.compile(r'keyframe_(\d+)_\d{9}\.jpg$') numbers = [] + for f in files: match = pattern.search(f) if match: numbers.append(int(match.group(1))) + else: + logger.warning(f"文件名格式不匹配: {f}") + + if not numbers: + logger.error(f"在目录 {folder_path} 中未找到有效的关键帧文件") + return sorted(numbers) def process_video(self, output_dir: str, skip_seconds: float = 0, threshold: int = 30) -> None: @@ -212,7 +227,7 @@ def process_video(self, output_dir: str, skip_seconds: float = 0, threshold: int Args: output_dir: 输出目录 - skip_seconds: 跳过视���开头的秒数 + skip_seconds: 跳过视频开头的秒数 """ skip_frames = int(skip_seconds * self.fps) @@ -240,11 +255,14 @@ def process_video(self, output_dir: str, skip_seconds: float = 0, threshold: int def process_video_pipeline(self, output_dir: str, skip_seconds: float = 0, - threshold: int = 30, + threshold: int = 20, # 降低默认阈值 compressed_width: int = 320, keep_temp: bool = False) -> None: """ - 执行完整的视频处理流程:压缩、提取关键帧、导出高清帧 + 执行完整的视频处理流程 + + Args: + threshold: 降低默认阈值为20,使场景检测更敏感 """ os.makedirs(output_dir, exist_ok=True) temp_dir = os.path.join(output_dir, 'temp') @@ -358,7 +376,7 @@ def process_video_pipeline(self, import time start_time = time.time() - processor = VideoProcessor("best.mp4") - processor.process_video_pipeline(output_dir="output4") + processor = VideoProcessor("E:\\projects\\NarratoAI\\resource\\videos\\test.mp4") + processor.process_video_pipeline(output_dir="output") end_time = time.time() print(f"处理完成!总耗时: {end_time - start_time:.2f} 秒") diff --git a/config.example.toml b/config.example.toml index e6b3919..c9702f4 100644 --- a/config.example.toml +++ b/config.example.toml @@ -1,5 +1,5 @@ [app] - project_version="0.3.5" + project_version="0.3.9" # 支持视频理解的大模型提供商 # gemini # NarratoAPI diff --git a/main.py b/main.py index e84f32b..bfec175 100644 --- a/main.py +++ b/main.py @@ -1,3 +1,4 @@ +import os import uvicorn from loguru import logger @@ -7,6 +8,8 @@ logger.info( "start server, docs: http://127.0.0.1:" + str(config.listen_port) + "/docs" ) + os.environ["HTTP_PROXY"] = config.proxy.get("http") + os.environ["HTTPS_PROXY"] = config.proxy.get("https") uvicorn.run( app="app.asgi:app", host=config.listen_host, diff --git a/requirements.txt b/requirements.txt index 2ae1f29..f98c399 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ requests~=2.31.0 -moviepy~=2.0.0.dev2 +moviepy==2.0.0.dev2 faster-whisper~=1.0.1 edge_tts~=6.1.15 uvicorn~=0.27.1 @@ -26,9 +26,12 @@ psutil>=5.9.0 opencv-python~=4.10.0.84 scikit-learn~=1.5.2 google-generativeai~=0.8.3 -Pillow>=11.0.0 +pillow==10.3.0 python-dotenv~=1.0.1 openai~=1.53.0 tqdm>=4.66.6 tenacity>=9.0.0 -tiktoken==0.8.0 \ No newline at end of file +tiktoken==0.8.0 +yt-dlp==2024.11.18 +pysrt==1.1.2 +httpx==0.27.2 diff --git a/resource/fonts/fonts_in_here.txt b/resource/fonts/fonts_in_here.txt new file mode 100644 index 0000000..8858c69 --- /dev/null +++ b/resource/fonts/fonts_in_here.txt @@ -0,0 +1 @@ +此处放字体文件 \ No newline at end of file diff --git a/resource/scripts/script_in_here.txt b/resource/scripts/script_in_here.txt new file mode 100644 index 0000000..e69de29 diff --git a/resource/songs/song_in_here.txt b/resource/songs/song_in_here.txt new file mode 100644 index 0000000..e69de29 diff --git a/resource/srt/srt_in_here.txt b/resource/srt/srt_in_here.txt new file mode 100644 index 0000000..e69de29 diff --git a/resource/videos/video_in_here.txt b/resource/videos/video_in_here.txt new file mode 100644 index 0000000..e69de29 diff --git a/video_pipeline.py b/video_pipeline.py new file mode 100644 index 0000000..dc7fa26 --- /dev/null +++ b/video_pipeline.py @@ -0,0 +1,178 @@ +import requests +import json +import os +import time +from typing import Dict, Any + +class VideoPipeline: + def __init__(self, base_url: str = "http://127.0.0.1:8080"): + self.base_url = base_url + + def download_video(self, url: str, resolution: str = "1080p", + output_format: str = "mp4", rename: str = None) -> Dict[str, Any]: + """下载视频的第一步""" + endpoint = f"{self.base_url}/api/v2/youtube/download" + payload = { + "url": url, + "resolution": resolution, + "output_format": output_format, + "rename": rename or time.strftime("%Y-%m-%d") + } + + response = requests.post(endpoint, json=payload) + response.raise_for_status() + return response.json() + + def generate_script(self, video_path: str, skip_seconds: int = 0, + threshold: int = 30, vision_batch_size: int = 10, + vision_llm_provider: str = "gemini") -> Dict[str, Any]: + """生成脚本的第二步""" + endpoint = f"{self.base_url}/api/v2/scripts/generate" + payload = { + "video_path": video_path, + "skip_seconds": skip_seconds, + "threshold": threshold, + "vision_batch_size": vision_batch_size, + "vision_llm_provider": vision_llm_provider + } + + response = requests.post(endpoint, json=payload) + response.raise_for_status() + return response.json() + + def crop_video(self, video_path: str, script: list) -> Dict[str, Any]: + """剪辑视频的第三步""" + endpoint = f"{self.base_url}/api/v2/scripts/crop" + payload = { + "video_origin_path": video_path, + "video_script": script + } + + response = requests.post(endpoint, json=payload) + response.raise_for_status() + return response.json() + + def generate_final_video(self, task_id: str, video_path: str, + script_path: str, script: list, subclip_videos: Dict[str, str], voice_name: str) -> Dict[str, Any]: + """生成最终视频的第四步""" + endpoint = f"{self.base_url}/api/v2/scripts/start-subclip" + + request_data = { + "video_clip_json": script, + "video_clip_json_path": script_path, + "video_origin_path": video_path, + "video_aspect": "16:9", + "video_language": "zh-CN", + "voice_name": voice_name, + "voice_volume": 1, + "voice_rate": 1.2, + "voice_pitch": 1, + "bgm_name": "random", + "bgm_type": "random", + "bgm_file": "", + "bgm_volume": 0.3, + "subtitle_enabled": True, + "subtitle_position": "bottom", + "font_name": "STHeitiMedium.ttc", + "text_fore_color": "#FFFFFF", + "text_background_color": "transparent", + "font_size": 75, + "stroke_color": "#000000", + "stroke_width": 1.5, + "custom_position": 70, + "n_threads": 8 + } + + payload = { + "request": request_data, + "subclip_videos": subclip_videos + } + + params = {"task_id": task_id} + response = requests.post(endpoint, params=params, json=payload) + response.raise_for_status() + return response.json() + + def save_script_to_json(self, script: list, script_path: str) -> str: + """保存脚本到json文件""" + try: + with open(script_path, 'w', encoding='utf-8') as f: + json.dump(script, f, ensure_ascii=False, indent=2) + print(f"脚本已保存到: {script_path}") + return script_path + except Exception as e: + print(f"保存脚本失败: {str(e)}") + raise + + def run_pipeline(self, task_id: str, script_name: str, youtube_url: str, video_name: str="null", skip_seconds: int = 0, threshold: int = 30, vision_batch_size: int = 10, vision_llm_provider: str = "gemini", voice_name: str = "zh-CN-YunjianNeural") -> Dict[str, Any]: + """运行完整的pipeline""" + try: + current_path = os.path.dirname(os.path.abspath(__file__)) + video_path = os.path.join(current_path, "resource", "videos", f"{video_name}.mp4") + # 判断视频是否存在 + if not os.path.exists(video_path): + # 1. 下载视频 + print(f"视频不存在, 开始下载视频: {video_path}") + download_result = self.download_video(url=youtube_url, resolution="1080p", output_format="mp4", rename=video_name) + video_path = download_result["output_path"] + else: + print(f"视频已存在: {video_path}") + + # 2. 判断script_name是否存在 + # 2.1.1 拼接脚本路径 NarratoAI/resource/scripts + script_path = os.path.join(current_path, "resource", "scripts", script_name) + if os.path.exists(script_path): + script = json.load(open(script_path, "r", encoding="utf-8")) + else: + # 2.1.2 生成脚本 + print("开始生成脚本...") + script_result = self.generate_script(video_path=video_path, skip_seconds=skip_seconds, threshold=threshold, vision_batch_size=vision_batch_size, vision_llm_provider=vision_llm_provider) + script = script_result["script"] + + # 2.2 保存脚本到json文件 + print("保存脚本到json文件...") + self.save_script_to_json(script=script, script_path=script_path) + + # 3. 剪辑视频 + print("开始剪辑视频...") + crop_result = self.crop_video(video_path=video_path, script=script) + subclip_videos = crop_result["subclip_videos"] + + # 4. 生成最终视频 + print("开始生成最终视频...") + self.generate_final_video( + task_id=task_id, + video_path=video_path, + script_path=script_path, + script=script, + subclip_videos=subclip_videos, + voice_name=voice_name + ) + + return { + "status": "等待异步生成视频", + "path": os.path.join(current_path, "storage", "tasks", task_id) + } + + except Exception as e: + return { + "status": "error", + "error": str(e) + } + + +# 使用示例 +if __name__ == "__main__": + pipeline = VideoPipeline() + result = pipeline.run_pipeline( + task_id="test_111901", + script_name="test.json", + youtube_url="https://www.youtube.com/watch?v=vLJ7Yed6FQ4", + video_name="2024-11-19-01", + skip_seconds=50, + threshold=35, + vision_batch_size=10, + vision_llm_provider="gemini", + voice_name="zh-CN-YunjianNeural", + ) + print(result) diff --git a/webui.py b/webui.py index 1f4cb97..434cbb9 100644 --- a/webui.py +++ b/webui.py @@ -3,7 +3,7 @@ import sys from uuid import uuid4 from app.config import config -from webui.components import basic_settings, video_settings, audio_settings, subtitle_settings, script_settings, review_settings +from webui.components import basic_settings, video_settings, audio_settings, subtitle_settings, script_settings, review_settings, merge_settings, system_settings from webui.utils import cache, file_utils from app.utils import utils from app.models.schema import VideoClipParams, VideoAspect @@ -178,7 +178,9 @@ def main(): # 渲染基础设置面板 basic_settings.render_basic_settings(tr) - + # 渲染合并设置 + merge_settings.render_merge_settings(tr) + # 渲染主面板 panel = st.columns(3) with panel[0]: @@ -188,6 +190,8 @@ def main(): audio_settings.render_audio_panel(tr) with panel[2]: subtitle_settings.render_subtitle_panel(tr) + # 渲染系统设置面板 + system_settings.render_system_panel(tr) # 渲染视频审查面板 review_settings.render_review_panel(tr) diff --git a/webui.txt b/webui.txt index e835524..c8d66c9 100644 --- a/webui.txt +++ b/webui.txt @@ -47,3 +47,328 @@ pause rem set HF_ENDPOINT=https://hf-mirror.com streamlit run webui.py --browser.serverAddress="127.0.0.1" --server.enableCORS=True --server.maxUploadSize=2048 --browser.gatherUsageStats=False + +请求0: +curl -X 'POST' \ + 'http://127.0.0.1:8080/api/v2/youtube/download' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "url": "https://www.youtube.com/watch?v=Kenm35gdqtk", + "resolution": "1080p", + "output_format": "mp4", + "rename": "2024-11-19" +}' +{ + "url": "https://www.youtube.com/watch?v=Kenm35gdqtk", + "resolution": "1080p", + "output_format": "mp4", + "rename": "2024-11-19" +} + +请求1: +curl -X 'POST' \ + 'http://127.0.0.1:8080/api/v2/scripts/generate' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "video_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4", + "skip_seconds": 0, + "threshold": 30, + "vision_batch_size": 10, + "vision_llm_provider": "gemini" +}' +{ + "video_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4", + "skip_seconds": 0, + "threshold": 30, + "vision_batch_size": 10, + "vision_llm_provider": "gemini" +} + +请求2: +curl -X 'POST' \ + 'http://127.0.0.1:8080/api/v2/scripts/crop' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "video_origin_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4", + "video_script": [ + { + "timestamp": "00:10-01:01", + "picture": "好的,以下是视频画面的客观描述:\n\n视频展现一名留着胡须的男子在森林里挖掘。\n\n画面首先展现男子从后方视角,背着军绿色背包,穿着卡其色长裤和深色T恤,走向一个泥土斜坡。背包上似乎有一个镐头。\n\n下一个镜头特写展现了该背包,一个镐头从背包里伸出来,包里还有一些其他工具。\n\n然后,视频显示该男子用镐头挖掘泥土斜坡。\n\n接下来是一些近景镜头,展现男子的靴子在泥土中行走,以及男子用手清理泥土。\n\n其他镜头从不同角度展现该男子在挖掘,包括从侧面和上方。\n\n可以看到他用工具挖掘,清理泥土,并检查挖出的土壤。\n\n最后,一个镜头展现了挖出的土壤的质地和颜色。", + "narration": "好的,接下来就是我们这位“胡须大侠”的精彩冒险了!只见他背着军绿色的背包,迈着比我上班还不情愿的步伐走向那泥土斜坡。哎呀,这个背包可真是个宝贝,里面藏着一把镐头和一些工具,简直像是个随身携带的“建筑工具箱”! \n\n看他挥舞着镐头,挖掘泥土的姿势,仿佛在进行一场“挖土大赛”,结果却比我做饭还要糟糕。泥土飞扬中,他的靴子也成了“泥巴艺术家”。最后,那堆色泽各异的土壤就像他心情的写照——五彩斑斓又略显混乱!真是一次让人捧腹的建造之旅!", + "OST": 2, + "new_timestamp": "00:00-00:51" + }, + { + "timestamp": "01:07-01:53", + "picture": "好的,以下是视频画面的客观描述:\n\n视频以一系列森林环境的镜头开头。\n\n第一个镜头是一个特写镜头,镜头中显示的是一些带有水滴的绿色叶子。\n\n第二个镜头显示一个留着胡须的男子在森林中挖掘一个洞。 他跪在地上,用工具挖土。\n\n第三个镜头是一个中等镜头,显示同一个人坐在他挖好的洞边休息。\n\n第四个镜头显示该洞的内部结构,该洞在树根和地面之间。\n\n第五个镜头显示该男子用斧头砍树枝。\n\n第六个镜头显示一堆树枝横跨一个泥泞的小水坑。\n\n第七个镜头显示更多茂盛的树叶和树枝在阳光下。\n\n第八个镜头显示更多茂盛的树叶和树枝。\n\n\n", + "narration": "接下来,我们的“挖土大师”又开始了他的森林探险。看这镜头,水滴在叶子上闪烁,仿佛在说:“快来,快来,这里有故事!”他一边挖洞,一边像个新手厨师试图切洋葱——每一下都小心翼翼,生怕自己不小心挖出个“历史遗址”。坐下休息的时候,脸上的表情就像发现新大陆一样!然后,他拿起斧头砍树枝,简直是现代版的“神雕侠侣”,只不过对象是树木。最后,那堆树枝架过泥泞的小水坑,仿佛在说:“我就是不怕湿脚的勇士!”这就是我们的建造之旅!", + "OST": 2, + "new_timestamp": "00:51-01:37" + } + ] +}' +{ + "video_origin_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4", + "video_script": [ + { + "timestamp": "00:10-01:01", + "picture": "好的,以下是视频画面的客观描述:\n\n视频展现一名留着胡须的男子在森林里挖掘。\n\n画面首先展现男子从后方视角,背着军绿色背包,穿着卡其色长裤和深色T恤,走向一个泥土斜坡。背包上似乎有一个镐头。\n\n下一个镜头特写展现了该背包,一个镐头从背包里伸出来,包里还有一些其他工具。\n\n然后,视频显示该男子用镐头挖掘泥土斜坡。\n\n接下来是一些近景镜头,展现男子的靴子在泥土中行走,以及男子用手清理泥土。\n\n其他镜头从不同角度展现该男子在挖掘,包括从侧面和上方。\n\n可以看到他用工具挖掘,清理泥土,并检查挖出的土壤。\n\n最后,一个镜头展现了挖出的土壤的质地和颜色。", + "narration": "好的,接下来就是我们这位“胡须大侠”的精彩冒险了!只见他背着军绿色的背包,迈着比我上班还不情愿的步伐走向那泥土斜坡。哎呀,这个背包可真是个宝贝,里面藏着一把镐头和一些工具,简直像是个随身携带的“建筑工具箱”! \n\n看他挥舞着镐头,挖掘泥土的姿势,仿佛在进行一场“挖土大赛”,结果却比我做饭还要糟糕。泥土飞扬中,他的靴子也成了“泥巴艺术家”。最后,那堆色泽各异的土壤就像他心情的写照——五彩斑斓又略显混乱!真是一次让人捧腹的建造之旅!", + "OST": 2, + "new_timestamp": "00:00-00:51" + }, + { + "timestamp": "01:07-01:53", + "picture": "好的,以下是视频画面的客观描述:\n\n视频以一系列森林环境的镜头开头。\n\n第一个镜头是一个特写镜头,镜头中显示的是一些带有水滴的绿色叶子。\n\n第二个镜头显示一个留着胡须的男子在森林中挖掘一个洞。 他跪在地上,用工具挖土。\n\n第三个镜头是一个中等镜头,显示同一个人坐在他挖好的洞边休息。\n\n第四个镜头显示该洞的内部结构,该洞在树根和地面之间。\n\n第五个镜头显示该男子用斧头砍树枝。\n\n第六个镜头显示一堆树枝横跨一个泥泞的小水坑。\n\n第七个镜头显示更多茂盛的树叶和树枝在阳光下。\n\n第八个镜头显示更多茂盛的树叶和树枝。\n\n\n", + "narration": "接下来,我们的“挖土大师”又开始了他的森林探险。看这镜头,水滴在叶子上闪烁,仿佛在说:“快来,快来,这里有故事!”他一边挖洞,一边像个新手厨师试图切洋葱——每一下都小心翼翼,生怕自己不小心挖出个“历史遗址”。坐下休息的时候,脸上的表情就像发现新大陆一样!然后,他拿起斧头砍树枝,简直是现代版的“神雕侠侣”,只不过对象是树木。最后,那堆树枝架过泥泞的小水坑,仿佛在说:“我就是不怕湿脚的勇士!”这就是我们的建造之旅!", + "OST": 2, + "new_timestamp": "00:51-01:37" + } + ] +} + +请求3: +curl -X 'POST' \ + 'http://127.0.0.1:8080/api/v2/scripts/start-subclip?task_id=12121' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "request": { + "video_clip_json": [ + { + "timestamp": "00:10-01:01", + "picture": "好的,以下是视频画面的客观描述:\n\n视频展现一名留着胡须的男子在森林里挖掘。\n\n画面首先展现男子从后方视角,背着军绿色背包,穿着卡其色长裤和深色T恤,走向一个泥土斜坡。背包上似乎有一个镐头。\n\n下一个镜头特写展现了该背包,一个镐头从背包里伸出来,包里还有一些其他工具。\n\n然后,视频显示该男子用镐头挖掘泥土斜坡。\n\n接下来是一些近景镜头,展现男子的靴子在泥土中行走,以及男子用手清理泥土。\n\n其他镜头从不同角度展现该男子在挖掘,包括从侧面和上方。\n\n可以看到他用工具挖掘,清理泥土,并检查挖出的土壤。\n\n最后,一个镜头展现了挖出的土壤的质地和颜色。", + "narration": "好的,接下来就是我们这位“胡须大侠”的精彩冒险了!只见他背着军绿色的背包,迈着比我上班还不情愿的步伐走向那泥土斜坡。哎呀,这个背包可真是个宝贝,里面藏着一把镐头和一些工具,简直像是个随身携带的“建筑工具箱”! \n\n看他挥舞着镐头,挖掘泥土的姿势,仿佛在进行一场“挖土大赛”,结果却比我做饭还要糟糕。泥土飞扬中,他的靴子也成了“泥巴艺术家”。最后,那堆色泽各异的土壤就像他心情的写照——五彩斑斓又略显混乱!真是一次让人捧腹的建造之旅!", + "OST": 2, + "new_timestamp": "00:00-00:51" + }, + { + "timestamp": "01:07-01:53", + "picture": "好的,以下是视频画面的客观描述:\n\n视频以一系列森林环境的镜头开头。\n\n第一个镜头是一个特写镜头,镜头中显示的是一些带有水滴的绿色叶子。\n\n第二个镜头显示一个留着胡须的男子在森林中挖掘一个洞。 他跪在地上,用工具挖土。\n\n第三个镜头是一个中等镜头,显示同一个人坐在他挖好的洞边休息。\n\n第四个镜头显示该洞的内部结构,该洞在树根和地面之间。\n\n第五个镜头显示该男子用斧头砍树枝。\n\n第六个镜头显示一堆树枝横跨一个泥泞的小水坑。\n\n第七个镜头显示更多茂盛的树叶和树枝在阳光下。\n\n第八个镜头显示更多茂盛的树叶和树枝。\n\n\n", + "narration": "接下来,我们的“挖土大师”又开始了他的森林探险。看这镜头,水滴在叶子上闪烁,仿佛在说:“快来,快来,这里有故事!”他一边挖洞,一边像个新手厨师试图切洋葱——每一下都小心翼翼,生怕自己不小心挖出个“历史遗址”。坐下休息的时候,脸上的表情就像发现新大陆一样!然后,他拿起斧头砍树枝,简直是现代版的“神雕侠侣”,只不过对象是树木。最后,那堆树枝架过泥泞的小水坑,仿佛在说:“我就是不怕湿脚的勇士!”这就是我们的建造之旅!", + "OST": 2, + "new_timestamp": "00:51-01:37" + } + ], + "video_clip_json_path": "E:\\projects\\NarratoAI\\resource\\scripts\\2024-1118-230421.json", + "video_origin_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4", + "video_aspect": "16:9", + "video_language": "zh-CN", + "voice_name": "zh-CN-YunjianNeural", + "voice_volume": 1, + "voice_rate": 1.2, + "voice_pitch": 1, + "bgm_name": "random", + "bgm_type": "random", + "bgm_file": "", + "bgm_volume": 0.3, + "subtitle_enabled": true, + "subtitle_position": "bottom", + "font_name": "STHeitiMedium.ttc", + "text_fore_color": "#FFFFFF", + "text_background_color": "transparent", + "font_size": 75, + "stroke_color": "#000000", + "stroke_width": 1.5, + "custom_position": 70, + "n_threads": 8 + }, + "subclip_videos": { + "00:10-01:01": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_10-01_01.mp4", + "01:07-01:53": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_07-01_53.mp4" + } +}' +{ + "request": { + "video_clip_json": [ + { + "timestamp": "00:10-01:01", + "picture": "好的,以下是视频画面的客观描述:\n\n视频展现一名留着胡须的男子在森林里挖掘。\n\n画面首先展现男子从后方视角,背着军绿色背包,穿着卡其色长裤和深色T恤,走向一个泥土斜坡。背包上似乎有一个镐头。\n\n下一个镜头特写展现了该背包,一个镐头从背包里伸出来,包里还有一些其他工具。\n\n然后,视频显示该男子用镐头挖掘泥土斜坡。\n\n接下来是一些近景镜头,展现男子的靴子在泥土中行走,以及男子用手清理泥土。\n\n其他镜头从不同角度展现该男子在挖掘,包括从侧面和上方。\n\n可以看到他用工具挖掘,清理泥土,并检查挖出的土壤。\n\n最后,一个镜头展现了挖出的土壤的质地和颜色。", + "narration": "好的,接下来就是我们这位“胡须大侠”的精彩冒险了!只见他背着军绿色的背包,迈着比我上班还不情愿的步伐走向那泥土斜坡。哎呀,这个背包可真是个宝贝,里面藏着一把镐头和一些工具,简直像是个随身携带的“建筑工具箱”! \n\n看他挥舞着镐头,挖掘泥土的姿势,仿佛在进行一场“挖土大赛”,结果却比我做饭还要糟糕。泥土飞扬中,他的靴子也成了“泥巴艺术家”。最后,那堆色泽各异的土壤就像他心情的写照——五彩斑斓又略显混乱!真是一次让人捧腹的建造之旅!", + "OST": 2, + "new_timestamp": "00:00-00:51" + }, + { + "timestamp": "01:07-01:53", + "picture": "好的,以下是视频画面的客观描述:\n\n视频以一系列森林环境的镜头开头。\n\n第一个镜头是一个特写镜头,镜头中显示的是一些带有水滴的绿色叶子。\n\n第二个镜头显示一个留着胡须的男子在森林中挖掘一个洞。 他跪在地上,用工具挖土。\n\n第三个镜头是一个中等镜头,显示同一个人坐在他挖好的洞边休息。\n\n第四个镜头显示该洞的内部结构,该洞在树根和地面之间。\n\n第五个镜头显示该男子用斧头砍树枝。\n\n第六个镜头显示一堆树枝横跨一个泥泞的小水坑。\n\n第七个镜头显示更多茂盛的树叶和树枝在阳光下。\n\n第八个镜头显示更多茂盛的树叶和树枝。\n\n\n", + "narration": "接下来,我们的“挖土大师”又开始了他的森林探险。看这镜头,水滴在叶子上闪烁,仿佛在说:“快来,快来,这里有故事!”他一边挖洞,一边像个新手厨师试图切洋葱——每一下都小心翼翼,生怕自己不小心挖出个“历史遗址”。坐下休息的时候,脸上的表情就像发现新大陆一样!然后,他拿起斧头砍树枝,简直是现代版的“神雕侠侣”,只不过对象是树木。最后,那堆树枝架过泥泞的小水坑,仿佛在说:“我就是不怕湿脚的勇士!”这就是我们的建造之旅!", + "OST": 2, + "new_timestamp": "00:51-01:37" + } + ], + "video_clip_json_path": "E:\\projects\\NarratoAI\\resource\\scripts\\2024-1118-230421.json", + "video_origin_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4", + "video_aspect": "16:9", + "video_language": "zh-CN", + "voice_name": "zh-CN-YunjianNeural", + "voice_volume": 1, + "voice_rate": 1.2, + "voice_pitch": 1, + "bgm_name": "random", + "bgm_type": "random", + "bgm_file": "", + "bgm_volume": 0.3, + "subtitle_enabled": true, + "subtitle_position": "bottom", + "font_name": "STHeitiMedium.ttc", + "text_fore_color": "#FFFFFF", + "text_background_color": "transparent", + "font_size": 75, + "stroke_color": "#000000", + "stroke_width": 1.5, + "custom_position": 70, + "n_threads": 8 + }, + "subclip_videos": { + "00:10-01:01": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_10-01_01.mp4", + "01:07-01:53": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_07-01_53.mp4" + } +} + + +请在最外层新建一个pipeline 工作流执行逻辑的代码; +他会按照下面的顺序请求接口 +1.下载视频 +curl -X 'POST' \ + 'http://127.0.0.1:8080/api/v2/youtube/download' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "url": "https://www.youtube.com/watch?v=Kenm35gdqtk", + "resolution": "1080p", + "output_format": "mp4", + "rename": "2024-11-19" +}' +2.生成脚本 +curl -X 'POST' \ + 'http://127.0.0.1:8080/api/v2/scripts/generate' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "video_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4", + "skip_seconds": 0, + "threshold": 30, + "vision_batch_size": 10, + "vision_llm_provider": "gemini" +}' +3. 剪辑视频 +curl -X 'POST' \ + 'http://127.0.0.1:8080/api/v2/scripts/crop' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "video_origin_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4", + "video_script": [ + { + "timestamp": "00:10-01:01", + "picture": "好的,以下是视频画面的客观描述:\n\n视频展现一名留着胡须的男子在森林里挖掘。\n\n画面首先展现男子从后方视角,背着军绿色背包,穿着卡其色长裤和深色T恤,走向一个泥土斜坡。背包上似乎有一个镐头。\n\n下一个镜头特写展现了该背包,一个镐头从背包里伸出来,包里还有一些其他工具。\n\n然后,视频显示该男子用镐头挖掘泥土斜坡。\n\n接下来是一些近景镜头,展现男子的靴子在泥土中行走,以及男子用手清理泥土。\n\n其他镜头从不同角度展现该男子在挖掘,包括从侧面和上方。\n\n可以看到他用工具挖掘,清理泥土,并检查挖出的土壤。\n\n最后,一个镜头展现了挖出的土壤的质地和颜色。", + "narration": "好的,接下来就是我们这位“胡须大侠”的精彩冒险了!只见他背着军绿色的背包,迈着比我上班还不情愿的步伐走向那泥土斜坡。哎呀,这个背包可真是个宝贝,里面藏着一把镐头和一些工具,简直像是个随身携带的“建筑工具箱”! \n\n看他挥舞着镐头,挖掘泥土的姿势,仿佛在进行一场“挖土大赛”,结果却比我做饭还要糟糕。泥土飞扬中,他的靴子也成了“泥巴艺术家”。最后,那堆色泽各异的土壤就像他心情的写照——五彩斑斓又略显混乱!真是一次让人捧腹的建造之旅!", + "OST": 2, + "new_timestamp": "00:00-00:51" + }, + { + "timestamp": "01:07-01:53", + "picture": "好的,以下是视频画面的客观描述:\n\n视频以一系列森林环境的镜头开头。\n\n第一个镜头是一个特写镜头,镜头中显示的是一些带有水滴的绿色叶子。\n\n第二个镜头显示一个留着胡须的男子在森林中挖掘一个洞。 他跪在地上,用工具挖土。\n\n第三个镜头是一个中等镜头,显示同一个人坐在他挖好的洞边休息。\n\n第四个镜头显示该洞的内部结构,该洞在树根和地面之间。\n\n第五个镜头显示该男子用斧头砍树枝。\n\n第六个镜头显示一堆树枝横跨一个泥泞的小水坑。\n\n第七个镜头显示更多茂盛的树叶和树枝在阳光下。\n\n第八个镜头显示更多茂盛的树叶和树枝。\n\n\n", + "narration": "接下来,我们的“挖土大师”又开始了他的森林探险。看这镜头,水滴在叶子上闪烁,仿佛在说:“快来,快来,这里有故事!”他一边挖洞,一边像个新手厨师试图切洋葱——每一下都小心翼翼,生怕自己不小心挖出个“历史遗址”。坐下休息的时候,脸上的表情就像发现新大陆一样!然后,他拿起斧头砍树枝,简直是现代版的“神雕侠侣”,只不过对象是树木。最后,那堆树枝架过泥泞的小水坑,仿佛在说:“我就是不怕湿脚的勇士!”这就是我们的建造之旅!", + "OST": 2, + "new_timestamp": "00:51-01:37" + } + ] +}' +4.生成视频 +curl -X 'POST' \ + 'http://127.0.0.1:8080/api/v2/scripts/start-subclip?task_id=12121' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "request": { + "video_clip_json": [ + { + "timestamp": "00:10-01:01", + "picture": "好的,以下是视频画面的客观描述:\n\n视频展现一名留着胡须的男子在森林里挖掘。\n\n画面首先展现男子从后方视角,背着军绿色背包,穿着卡其色长裤和深色T恤,走向一个泥土斜坡。背包上似乎有一个镐头。\n\n下一个镜头特写展现了该背包,一个镐头从背包里伸出来,包里还有一些其他工具。\n\n然后,视频显示该男子用镐头挖掘泥土斜坡。\n\n接下来是一些近景镜头,展现男子的靴子在泥土中行走,以及男子用手清理泥土。\n\n其他镜头从不同角度展现该男子在挖掘,包括从侧面和上方。\n\n可以看到他用工具挖掘,清理泥土,并检查挖出的土壤。\n\n最后,一个镜头展现了挖出的土壤的质地和颜色。", + "narration": "好的,接下来就是我们这位“胡须大侠”的精彩冒险了!只见他背着军绿色的背包,迈着比我上班还不情愿的步伐走向那泥土斜坡。哎呀,这个背包可真是个宝贝,里面藏着一把镐头和一些工具,简直像是个随身携带的“建筑工具箱”! \n\n看他挥舞着镐头,挖掘泥土的姿势,仿佛在进行一场“挖土大赛”,结果却比我做饭还要糟糕。泥土飞扬中,他的靴子也成了“泥巴艺术家”。最后,那堆色泽各异的土壤就像他心情的写照——五彩斑斓又略显混乱!真是一次让人捧腹的建造之旅!", + "OST": 2, + "new_timestamp": "00:00-00:51" + }, + { + "timestamp": "01:07-01:53", + "picture": "好的,以下是视频画面的客观描述:\n\n视频以一系列森林环境的镜头开头。\n\n第一个镜头是一个特写镜头,镜头中显示的是一些带有水滴的绿色叶子。\n\n第二个镜头显示一个留着胡须的男子在森林中挖掘一个洞。 他跪在地上,用工具挖土。\n\n第三个镜头是一个中等镜头,显示同一个人坐在他挖好的洞边休息。\n\n第四个镜头显示该洞的内部结构,该洞在树根和地面之间。\n\n第五个镜头显示该男子用斧头砍树枝。\n\n第六个镜头显示一堆树枝横跨一个泥泞的小水坑。\n\n第七个镜头显示更多茂盛的树叶和树枝在阳光下。\n\n第八个镜头显示更多茂盛的树叶和树枝。\n\n\n", + "narration": "接下来,我们的“挖土大师”又开始了他的森林探险。看这镜头,水滴在叶子上闪烁,仿佛在说:“快来,快来,这里有故事!”他一边挖洞,一边像个新手厨师试图切洋葱——每一下都小心翼翼,生怕自己不小心挖出个“历史遗址”。坐下休息的时候,脸上的表情就像发现新大陆一样!然后,他拿起斧头砍树枝,简直是现代版的“神雕侠侣”,只不过对象是树木。最后,那堆树枝架过泥泞的小水坑,仿佛在说:“我就是不怕湿脚的勇士!”这就是我们的建造之旅!", + "OST": 2, + "new_timestamp": "00:51-01:37" + } + ], + "video_clip_json_path": "E:\\projects\\NarratoAI\\resource\\scripts\\2024-1118-230421.json", + "video_origin_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4", + "video_aspect": "16:9", + "video_language": "zh-CN", + "voice_name": "zh-CN-YunjianNeural", + "voice_volume": 1, + "voice_rate": 1.2, + "voice_pitch": 1, + "bgm_name": "random", + "bgm_type": "random", + "bgm_file": "", + "bgm_volume": 0.3, + "subtitle_enabled": true, + "subtitle_position": "bottom", + "font_name": "STHeitiMedium.ttc", + "text_fore_color": "#FFFFFF", + "text_background_color": "transparent", + "font_size": 75, + "stroke_color": "#000000", + "stroke_width": 1.5, + "custom_position": 70, + "n_threads": 8 + }, + "subclip_videos": { + "00:10-01:01": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_10-01_01.mp4", + "01:07-01:53": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_07-01_53.mp4" + } +}' + +请求1,返回的参数是: +{ + "task_id": "4e9b575f-68c0-4ae1-b218-db42b67993d0", + "output_path": "E:\\projects\\NarratoAI\\resource\\videos\\2024-11-19.mp4", + "resolution": "1080p", + "format": "mp4", + "filename": "2024-11-19.mp4" +} +output_path需要传递给请求2 +请求2,返回数据为: +{ + "task_id": "04497017-953c-44b4-bf1d-9d8ed3ebbbce", + "script": [ + { + "timestamp": "00:10-01:01", + "picture": "好的,以下是對影片畫面的客觀描述:\n\n影片顯示一名留著鬍鬚的男子在一處樹林茂密的斜坡上挖掘。\n\n畫面一:男子從後方出現,背著一個軍綠色的背包,背包裡似乎裝有工具。他穿著卡其色的長褲和深色的登山鞋。\n\n畫面二:特寫鏡頭顯示男子的背包,一個舊的鎬頭從包裡露出來,包裡還有其他工具,包括一個鏟子。\n\n畫面三:男子用鎬頭在斜坡上挖土,背包放在他旁邊。\n\n畫面四:特寫鏡頭顯示男子的登山鞋在泥土中。\n\n畫面五:男子坐在斜坡上,用手清理樹根和泥土。\n\n畫面六:地上有一些鬆動的泥土和落葉。\n\n畫面七:男子的背包近景鏡頭,他正在挖掘。\n\n畫面八:男子在斜坡上挖掘,揚起一陣塵土。\n\n畫面九:特寫鏡頭顯示男子用手清理泥土。\n\n畫面十:特寫鏡頭顯示挖出的泥土剖面,可以看到土壤的層次。", + "narration": "上一个画面是我在绝美的自然中,准备开启我的“土豪”挖掘之旅。现在,你们看到这位留着胡子的“大哥”,他背着个军绿色的包,里面装的可不仅仅是工具,还有我对生活的无限热爱(以及一丝不安)。看!这把旧镐头就像我的前任——用起来费劲,但又舍不得扔掉。\n\n他在斜坡上挖土,泥土飞扬,仿佛在跟大地进行一场“泥巴大战”。每一铲下去,都能听到大地微微的呻吟:哎呀,我这颗小树根可比我当年的情感纠葛还难处理呢!别担心,这些泥土层次分明,简直可以开个“泥土博物馆”。所以,朋友们,跟着我一起享受这场泥泞中的乐趣吧!", + "OST": 2, + "new_timestamp": "00:00-00:51" + }, + { + "timestamp": "01:07-01:53", + "picture": "好的,以下是對影片畫面內容的客觀描述:\n\n影片以一系列森林環境的鏡頭開始。第一個鏡頭展示了綠葉植物的特寫鏡頭,葉子上有一些水珠。接下來的鏡頭是一個男人在森林裡挖掘一個小坑,他跪在地上,用鏟子挖土。\n\n接下來的鏡頭是同一個男人坐在他挖的坑旁邊,望著前方。然後,鏡頭顯示該坑的廣角鏡頭,顯示其結構和大小。\n\n之後的鏡頭,同一個男人在樹林裡劈柴。鏡頭最後呈現出一潭渾濁的水,周圍環繞著樹枝。然後鏡頭又回到了森林裡生長茂盛的植物特寫鏡頭。", + "narration": "好嘞,朋友们,我们已经在泥土博物馆里捣鼓了一阵子,现在是时候跟大自然亲密接触了!看看这片森林,绿叶上水珠闪闪发光,就像我曾经的爱情,虽然短暂,却美得让人心碎。\n\n现在,我在这里挖个小坑,感觉自己就像是一位新晋“挖土大王”,不过说实话,这手艺真不敢恭维,连铲子都快对我崩溃了。再说劈柴,这动作简直比我前任的情绪波动还要激烈!最后这一潭浑浊的水,别担心,它只是告诉我:生活就像这水,总有些杂质,但也别忘了,要勇敢面对哦!", + "OST": 2, + "new_timestamp": "00:51-01:37" + } + ] +} +output_path和script参数需要传递给请求3 +请求3返回参数是 +{ + "task_id": "b6f5a98a-b2e0-4e3d-89c5-64fb90db2ec1", + "subclip_videos": { + "00:10-01:01": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_10-01_01.mp4", + "01:07-01:53": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_07-01_53.mp4" + } +} +subclip_videos和 output_path和script参数需要传递给请求4 +最后完成工作流 + +0代表只播放文案音频,禁用视频原声;1代表只播放视频原声,不需要播放文案音频和字幕;2代表即播放文案音频也要播放视频原声; \ No newline at end of file diff --git a/webui/components/audio_settings.py b/webui/components/audio_settings.py index a189f65..f81effe 100644 --- a/webui/components/audio_settings.py +++ b/webui/components/audio_settings.py @@ -20,7 +20,7 @@ def render_audio_panel(tr): def render_tts_settings(tr): """渲染TTS(文本转语音)设置""" # 获取支持的语音列表 - support_locales = ["zh-CN", "zh-HK", "zh-TW", "en-US"] + support_locales = ["zh-CN"] voices = voice.get_all_azure_voices(filter_locals=support_locales) # 创建友好的显示名称 diff --git a/webui/components/basic_settings.py b/webui/components/basic_settings.py index adeca9e..d7b5144 100644 --- a/webui/components/basic_settings.py +++ b/webui/components/basic_settings.py @@ -52,18 +52,34 @@ def render_language_settings(tr): def render_proxy_settings(tr): """渲染代理设置""" - proxy_url_http = config.proxy.get("http", "") or os.getenv("VPN_PROXY_URL", "") - proxy_url_https = config.proxy.get("https", "") or os.getenv("VPN_PROXY_URL", "") + # 获取当前代理状态 + proxy_enabled = config.proxy.get("enabled", True) + proxy_url_http = config.proxy.get("http") + proxy_url_https = config.proxy.get("https") - HTTP_PROXY = st.text_input(tr("HTTP_PROXY"), value=proxy_url_http) - HTTPS_PROXY = st.text_input(tr("HTTPs_PROXY"), value=proxy_url_https) - - if HTTP_PROXY: - config.proxy["http"] = HTTP_PROXY - os.environ["HTTP_PROXY"] = HTTP_PROXY - if HTTPS_PROXY: - config.proxy["https"] = HTTPS_PROXY - os.environ["HTTPS_PROXY"] = HTTPS_PROXY + # 添加代理开关 + proxy_enabled = st.checkbox(tr("Enable Proxy"), value=proxy_enabled) + + # 保存代理开关状态 + config.proxy["enabled"] = proxy_enabled + + # 只有在代理启用时才显示代理设置输入框 + if proxy_enabled: + HTTP_PROXY = st.text_input(tr("HTTP_PROXY"), value=proxy_url_http) + HTTPS_PROXY = st.text_input(tr("HTTPs_PROXY"), value=proxy_url_https) + + if HTTP_PROXY: + config.proxy["http"] = HTTP_PROXY + os.environ["HTTP_PROXY"] = HTTP_PROXY + if HTTPS_PROXY: + config.proxy["https"] = HTTPS_PROXY + os.environ["HTTPS_PROXY"] = HTTPS_PROXY + else: + # 当代理被禁用时,清除环境变量和配置 + os.environ.pop("HTTP_PROXY", None) + os.environ.pop("HTTPS_PROXY", None) + config.proxy["http"] = "" + config.proxy["https"] = "" def test_vision_model_connection(api_key, base_url, model_name, provider, tr): @@ -90,6 +106,28 @@ def test_vision_model_connection(api_key, base_url, model_name, provider, tr): except Exception as e: return False, f"{tr('gemini model is not available')}: {str(e)}" + elif provider.lower() == 'qwenvl': + from openai import OpenAI + try: + client = OpenAI( + api_key=api_key, + base_url=base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1" + ) + + # 发送一个简单的测试请求 + response = client.chat.completions.create( + model=model_name or "qwen-vl-max-latest", + messages=[{"role": "user", "content": "直接回复我文本'当前网络可用'"}] + ) + + if response and response.choices: + return True, tr("QwenVL model is available") + else: + return False, tr("QwenVL model returned invalid response") + + except Exception as e: + return False, f"{tr('QwenVL model is not available')}: {str(e)}" + elif provider.lower() == 'narratoapi': import requests try: @@ -116,7 +154,7 @@ def render_vision_llm_settings(tr): st.subheader(tr("Vision Model Settings")) # 视频分析模型提供商选择 - vision_providers = ['Gemini', 'NarratoAPI(待发布)', 'QwenVL(待发布)'] + vision_providers = ['Gemini', 'QwenVL', 'NarratoAPI(待发布)'] saved_vision_provider = config.app.get("vision_llm_provider", "Gemini").lower() saved_provider_index = 0 @@ -142,18 +180,33 @@ def render_vision_llm_settings(tr): # 渲染视觉模型配置输入框 st_vision_api_key = st.text_input(tr("Vision API Key"), value=vision_api_key, type="password") - # 当选择 Gemini 时禁用 base_url 输入 - if vision_provider.lower() == 'gemini': + # 根据不同提供商设置默认值和帮助信息 + if vision_provider == 'gemini': st_vision_base_url = st.text_input( tr("Vision Base URL"), value=vision_base_url, disabled=True, help=tr("Gemini API does not require a base URL") ) + st_vision_model_name = st.text_input( + tr("Vision Model Name"), + value=vision_model_name or "gemini-1.5-flash", + help=tr("Default: gemini-1.5-flash") + ) + elif vision_provider == 'qwenvl': + st_vision_base_url = st.text_input( + tr("Vision Base URL"), + value=vision_base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1", + help=tr("Default: https://dashscope.aliyuncs.com/compatible-mode/v1") + ) + st_vision_model_name = st.text_input( + tr("Vision Model Name"), + value=vision_model_name or "qwen-vl-max-latest", + help=tr("Default: qwen-vl-max-latest") + ) else: st_vision_base_url = st.text_input(tr("Vision Base URL"), value=vision_base_url) - - st_vision_model_name = st.text_input(tr("Vision Model Name"), value=vision_model_name) + st_vision_model_name = st.text_input(tr("Vision Model Name"), value=vision_model_name) # 在配置输入框后添加测试按钮 if st.button(tr("Test Connection"), key="test_vision_connection"): @@ -174,7 +227,7 @@ def render_vision_llm_settings(tr): # 保存视觉模型配置 if st_vision_api_key: config.app[f"vision_{vision_provider}_api_key"] = st_vision_api_key - st.session_state[f"vision_{vision_provider}_api_key"] = st_vision_api_key # 用于script_settings.py + st.session_state[f"vision_{vision_provider}_api_key"] = st_vision_api_key if st_vision_base_url: config.app[f"vision_{vision_provider}_base_url"] = st_vision_base_url st.session_state[f"vision_{vision_provider}_base_url"] = st_vision_base_url @@ -182,81 +235,6 @@ def render_vision_llm_settings(tr): config.app[f"vision_{vision_provider}_model_name"] = st_vision_model_name st.session_state[f"vision_{vision_provider}_model_name"] = st_vision_model_name - # # NarratoAPI 特殊配置 - # if vision_provider == 'narratoapi': - # st.subheader(tr("Narrato Additional Settings")) - # - # # Narrato API 基础配置 - # narrato_api_key = st.text_input( - # tr("Narrato API Key"), - # value=config.app.get("narrato_api_key", ""), - # type="password", - # help="用于访问 Narrato API 的密钥" - # ) - # if narrato_api_key: - # config.app["narrato_api_key"] = narrato_api_key - # st.session_state['narrato_api_key'] = narrato_api_key - # - # narrato_api_url = st.text_input( - # tr("Narrato API URL"), - # value=config.app.get("narrato_api_url", "http://127.0.0.1:8000/api/v1/video/analyze") - # ) - # if narrato_api_url: - # config.app["narrato_api_url"] = narrato_api_url - # st.session_state['narrato_api_url'] = narrato_api_url - # - # # 视频分析模型配置 - # st.markdown("##### " + tr("Vision Model Settings")) - # narrato_vision_model = st.text_input( - # tr("Vision Model Name"), - # value=config.app.get("narrato_vision_model", "gemini-1.5-flash") - # ) - # narrato_vision_key = st.text_input( - # tr("Vision Model API Key"), - # value=config.app.get("narrato_vision_key", ""), - # type="password", - # help="用于视频分析的模 API Key" - # ) - # - # if narrato_vision_model: - # config.app["narrato_vision_model"] = narrato_vision_model - # st.session_state['narrato_vision_model'] = narrato_vision_model - # if narrato_vision_key: - # config.app["narrato_vision_key"] = narrato_vision_key - # st.session_state['narrato_vision_key'] = narrato_vision_key - # - # # 文案生成模型配置 - # st.markdown("##### " + tr("Text Generation Model Settings")) - # narrato_llm_model = st.text_input( - # tr("LLM Model Name"), - # value=config.app.get("narrato_llm_model", "qwen-plus") - # ) - # narrato_llm_key = st.text_input( - # tr("LLM Model API Key"), - # value=config.app.get("narrato_llm_key", ""), - # type="password", - # help="用于文案生成的模型 API Key" - # ) - # - # if narrato_llm_model: - # config.app["narrato_llm_model"] = narrato_llm_model - # st.session_state['narrato_llm_model'] = narrato_llm_model - # if narrato_llm_key: - # config.app["narrato_llm_key"] = narrato_llm_key - # st.session_state['narrato_llm_key'] = narrato_llm_key - # - # # 批处理配置 - # narrato_batch_size = st.number_input( - # tr("Batch Size"), - # min_value=1, - # max_value=50, - # value=config.app.get("narrato_batch_size", 10), - # help="每批处理的图片数量" - # ) - # if narrato_batch_size: - # config.app["narrato_batch_size"] = narrato_batch_size - # st.session_state['narrato_batch_size'] = narrato_batch_size - def test_text_model_connection(api_key, base_url, model_name, provider, tr): """测试文本模型连接 @@ -328,6 +306,7 @@ def test_text_model_connection(api_key, base_url, model_name, provider, tr): except Exception as e: return False, f"{tr('Connection failed')}: {str(e)}" + def render_text_llm_settings(tr): """渲染文案生成模型设置""" st.subheader(tr("Text Generation Model Settings")) diff --git a/webui/components/merge_settings.py b/webui/components/merge_settings.py new file mode 100644 index 0000000..99b8b43 --- /dev/null +++ b/webui/components/merge_settings.py @@ -0,0 +1,303 @@ +import os +import time +import math +import sys +import tempfile +import traceback +import shutil + +import streamlit as st +from loguru import logger +from typing import List, Dict, Tuple +from dataclasses import dataclass +from streamlit.runtime.uploaded_file_manager import UploadedFile + +from webui.utils.merge_video import merge_videos_and_subtitles +from app.utils.utils import video_dir, srt_dir +from app.services.subtitle import extract_audio_and_create_subtitle + +# 定义临时目录路径 +TEMP_MERGE_DIR = os.path.join("storage", "temp", "merge") + +# 确保临时目录存在 +os.makedirs(TEMP_MERGE_DIR, exist_ok=True) + + +@dataclass +class VideoSubtitlePair: + video_file: UploadedFile | None + subtitle_file: str | None + base_name: str + order: int = 0 + + +def save_uploaded_file(uploaded_file: UploadedFile, target_dir: str) -> str: + """Save uploaded file to target directory and return the file path""" + file_path = os.path.join(target_dir, uploaded_file.name) + # 如果文件已存在,先删除它 + if os.path.exists(file_path): + os.remove(file_path) + with open(file_path, "wb") as f: + f.write(uploaded_file.getvalue()) + return file_path + + +def clean_temp_dir(): + """清空临时目录""" + if os.path.exists(TEMP_MERGE_DIR): + for file in os.listdir(TEMP_MERGE_DIR): + file_path = os.path.join(TEMP_MERGE_DIR, file) + try: + if os.path.isfile(file_path): + os.unlink(file_path) + except Exception as e: + logger.error(f"清理临时文件失败: {str(e)}") + + +def group_files(files: List[UploadedFile]) -> Dict[str, VideoSubtitlePair]: + """Group uploaded files by their base names""" + pairs = {} + order_counter = 0 + + # 首先处理所有视频文件 + for file in files: + base_name = os.path.splitext(file.name)[0] + ext = os.path.splitext(file.name)[1].lower() + + if ext == ".mp4": + if base_name not in pairs: + pairs[base_name] = VideoSubtitlePair(None, None, base_name, order_counter) + order_counter += 1 + pairs[base_name].video_file = file + # 保存视频文件到临时目录 + video_path = save_uploaded_file(file, TEMP_MERGE_DIR) + + # 然后处理所有字幕文件 + for file in files: + base_name = os.path.splitext(file.name)[0] + ext = os.path.splitext(file.name)[1].lower() + + if ext == ".srt": + # 即使没有对应视频也保存字幕文件 + subtitle_path = os.path.join(TEMP_MERGE_DIR, f"{base_name}.srt") + save_uploaded_file(file, TEMP_MERGE_DIR) + + if base_name in pairs: # 如果有对应的视频 + pairs[base_name].subtitle_file = subtitle_path + + return pairs + + +def render_merge_settings(tr): + """Render the merge settings section""" + with st.expander(tr("Video Subtitle Merge"), expanded=False): + # 上传文件区域 + uploaded_files = st.file_uploader( + tr("Upload Video and Subtitle Files"), + type=["mp4", "srt"], + accept_multiple_files=True, + key="merge_files" + ) + + if uploaded_files: + all_pairs = group_files(uploaded_files) + + if all_pairs: + st.write(tr("All Uploaded Files")) + + # 初始化或更新session state中的排序信息 + if 'file_orders' not in st.session_state: + st.session_state.file_orders = { + name: pair.order for name, pair in all_pairs.items() + } + st.session_state.needs_reorder = False + + # 确保所有新文件都有排序值 + for name, pair in all_pairs.items(): + if name not in st.session_state.file_orders: + st.session_state.file_orders[name] = pair.order + + # 移除不存在的文件的排序值 + st.session_state.file_orders = { + k: v for k, v in st.session_state.file_orders.items() + if k in all_pairs + } + + # 按照排序值对文件对进行排序 + sorted_pairs = sorted( + all_pairs.items(), + key=lambda x: st.session_state.file_orders[x[0]] + ) + + # 计算需要多少行来显示所有视频(每行5个) + num_pairs = len(sorted_pairs) + num_rows = (num_pairs + 4) // 5 # 向上取整,每行5个 + + # 遍历每一行 + for row in range(num_rows): + # 创建5列 + cols = st.columns(5) + + # 在这一行中填充视频(最多5个) + for col_idx in range(5): + pair_idx = row * 5 + col_idx + if pair_idx < num_pairs: + base_name, pair = sorted_pairs[pair_idx] + with cols[col_idx]: + st.caption(base_name) + + # 显示视频预览(如果存在) + video_path = os.path.join(TEMP_MERGE_DIR, f"{base_name}.mp4") + if os.path.exists(video_path): + st.video(video_path) + else: + st.warning(tr("Missing Video")) + + # 显示字幕预览(如果存在) + subtitle_path = os.path.join(TEMP_MERGE_DIR, f"{base_name}.srt") + if os.path.exists(subtitle_path): + with open(subtitle_path, 'r', encoding='utf-8') as f: + subtitle_content = f.read() + st.markdown(tr("Subtitle Preview")) + st.text_area( + "Subtitle Content", + value=subtitle_content, + height=100, # 减高度以适应5列布局 + label_visibility="collapsed", + key=f"subtitle_preview_{base_name}" + ) + else: + st.warning(tr("Missing Subtitle")) + # 如果有视频但没有字幕,显示一键转录按钮 + if os.path.exists(video_path): + if st.button(tr("One-Click Transcribe"), key=f"transcribe_{base_name}"): + with st.spinner(tr("Transcribing...")): + try: + # 生成字幕文件 + result = extract_audio_and_create_subtitle(video_path, subtitle_path) + if result: + # 读取生成的字幕文件内容并显示预览 + with open(subtitle_path, 'r', encoding='utf-8') as f: + subtitle_content = f.read() + st.markdown(tr("Subtitle Preview")) + st.text_area( + "Subtitle Content", + value=subtitle_content, + height=150, + label_visibility="collapsed", + key=f"subtitle_preview_transcribed_{base_name}" + ) + st.success(tr("Transcription Complete!")) + # 更新pair的字幕文件路径 + pair.subtitle_file = subtitle_path + else: + st.error(tr("Transcription Failed. Please try again.")) + except Exception as e: + error_message = str(e) + logger.error(traceback.format_exc()) + if "rate limit exceeded" in error_message.lower(): + st.error(tr("API rate limit exceeded. Please wait about an hour and try again.")) + elif "resource_exhausted" in error_message.lower(): + st.error(tr("Resources exhausted. Please try again later.")) + else: + st.error(f"{tr('Transcription Failed')}: {str(e)}") + + # 排序输入框 + order = st.number_input( + tr("Order"), + min_value=0, + value=st.session_state.file_orders[base_name], + key=f"order_{base_name}", + on_change=lambda: setattr(st.session_state, 'needs_reorder', True) + ) + if order != st.session_state.file_orders[base_name]: + st.session_state.file_orders[base_name] = order + st.session_state.needs_reorder = True + + # 如果需要重新排序,重新加载页面 + if st.session_state.needs_reorder: + st.session_state.needs_reorder = False + st.rerun() + + # 找出有完整视频和字幕的文件对 + complete_pairs = { + k: v for k, v in all_pairs.items() + if os.path.exists(os.path.join(TEMP_MERGE_DIR, f"{k}.mp4")) and + os.path.exists(os.path.join(TEMP_MERGE_DIR, f"{k}.srt")) + } + + # 合并按钮和结果显示 + cols = st.columns([1, 2, 1]) + with cols[0]: + st.write(f"{tr('Mergeable Files')}: {len(complete_pairs)}") + + merge_videos_result = None + + with cols[1]: + if st.button(tr("Merge All Files"), type="primary", use_container_width=True): + try: + # 获取排序后的完整文件对 + sorted_complete_pairs = sorted( + [(k, v) for k, v in complete_pairs.items()], + key=lambda x: st.session_state.file_orders[x[0]] + ) + + video_paths = [] + subtitle_paths = [] + for base_name, _ in sorted_complete_pairs: + video_paths.append(os.path.join(TEMP_MERGE_DIR, f"{base_name}.mp4")) + subtitle_paths.append(os.path.join(TEMP_MERGE_DIR, f"{base_name}.srt")) + + # 获取输出文件路径 + output_video = os.path.join(video_dir(), f"merged_video_{time.strftime('%M%S')}.mp4") + output_subtitle = os.path.join(srt_dir(), f"merged_subtitle_{time.strftime('%M%S')}.srt") + + with st.spinner(tr("Merging files...")): + # 合并文件 + merge_videos_and_subtitles( + video_paths, + subtitle_paths, + output_video, + output_subtitle + ) + + success = True + error_msg = "" + + # 检查输出文件是否成功生成 + if not os.path.exists(output_video): + success = False + error_msg += tr("Failed to generate merged video. ") + if not os.path.exists(output_subtitle): + success = False + error_msg += tr("Failed to generate merged subtitle. ") + + if success: + # 显示成功消息 + st.success(tr("Merge completed!")) + merge_videos_result = (output_video, output_subtitle) + # 清理临时目录 + clean_temp_dir() + else: + st.error(error_msg) + + except Exception as e: + error_message = str(e) + if "moviepy" in error_message.lower(): + st.error(tr("Error processing video files. Please check if the videos are valid MP4 files.")) + elif "pysrt" in error_message.lower(): + st.error(tr("Error processing subtitle files. Please check if the subtitles are valid SRT files.")) + else: + st.error(f"{tr('Error during merge')}: {error_message}") + + # 合并结果预览放在合并按钮下方 + if merge_videos_result: + st.markdown(f"