Skip to content

Commit

Permalink
Merge pull request #437 from rmusser01/dev
Browse files Browse the repository at this point in the history
Multi-file audio upload
  • Loading branch information
rmusser01 authored Nov 23, 2024
2 parents a8ff2b5 + a623e0f commit d50376d
Show file tree
Hide file tree
Showing 38 changed files with 986 additions and 135 deletions.
152 changes: 80 additions & 72 deletions App_Function_Libraries/Audio/Audio_Files.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def download_audio_file(url, current_whisper_model="", use_cookies=False, cookie
logging.error(f"Unexpected error downloading audio file: {str(e)}")
raise

def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key, use_cookies, cookies, keep_original,
def process_audio_files(audio_urls, audio_files, whisper_model, api_name, api_key, use_cookies, cookies, keep_original,
custom_keywords, custom_prompt_input, chunk_method, max_chunk_size, chunk_overlap,
use_adaptive_chunking, use_multi_level_chunking, chunk_language, diarize,
keep_timestamps, custom_title):
Expand Down Expand Up @@ -193,9 +193,9 @@ def convert_mp3_to_wav(mp3_file_path):
# Process URLs if provided
if audio_urls:
urls = [url.strip() for url in audio_urls.split('\n') if url.strip()]
for i, url in enumerate(urls):
for i, url in enumerate(urls, 1):
try:
update_progress(f"Processing URL {i + 1}/{len(urls)}: {url}")
update_progress(f"Processing URL {i}/{len(urls)}: {url}")

# Download and process audio file
audio_file_path = download_audio_file(url, use_cookies, cookies)
Expand Down Expand Up @@ -260,83 +260,91 @@ def convert_mp3_to_wav(mp3_file_path):
)

processed_count += 1
update_progress(f"Successfully processed URL {i + 1}")
update_progress(f"Successfully processed URL {i}")
log_counter("audio_files_processed_total", 1, {"whisper_model": whisper_model, "api_name": api_name})

except Exception as e:
failed_count += 1
update_progress(f"Failed to process URL {i + 1}: {str(e)}")
update_progress(f"Failed to process URL {i}: {str(e)}")
log_counter("audio_files_failed_total", 1, {"whisper_model": whisper_model, "api_name": api_name})
continue

# Process uploaded file if provided
if audio_file:
try:
update_progress("Processing uploaded file...")
if os.path.getsize(audio_file.name) > MAX_FILE_SIZE:
raise ValueError(f"File size exceeds maximum limit of {MAX_FILE_SIZE / (1024 * 1024):.2f}MB")

reencoded_mp3_path = reencode_mp3(audio_file.name)
temp_files.append(reencoded_mp3_path)

wav_file_path = convert_mp3_to_wav(reencoded_mp3_path)
temp_files.append(wav_file_path)

# Transcribe audio
segments = speech_to_text(wav_file_path, whisper_model=whisper_model, diarize=diarize)

if isinstance(segments, dict) and 'segments' in segments:
segments = segments['segments']

if not isinstance(segments, list):
raise ValueError("Unexpected segments format received from speech_to_text")

transcription = format_transcription_with_timestamps(segments)
if not transcription.strip():
raise ValueError("Empty transcription generated")

# Initialize summary with default value
summary = "No summary available"

# Attempt summarization if API is provided
if api_name and api_name.lower() != "none":
try:
chunked_text = improved_chunking_process(transcription, chunk_options)
summary_result = perform_summarization(api_name, chunked_text, custom_prompt_input, api_key)
if summary_result:
summary = summary_result
update_progress("Audio summarized successfully.")
except Exception as e:
logging.error(f"Summarization failed: {str(e)}")
summary = "Summary generation failed"

# Add to results
all_transcriptions.append(transcription)
all_summaries.append(summary)

# Add to database
title = custom_title if custom_title else os.path.basename(wav_file_path)
add_media_with_keywords(
url="Uploaded File",
title=title,
media_type='audio',
content=transcription,
keywords=custom_keywords,
prompt=custom_prompt_input,
summary=summary,
transcription_model=whisper_model,
author="Unknown",
ingestion_date=datetime.now().strftime('%Y-%m-%d')
)

processed_count += 1
update_progress("Successfully processed uploaded file")
log_counter("audio_files_processed_total", 1, {"whisper_model": whisper_model, "api_name": api_name})
# Process uploaded files if provided
if audio_files:
# Convert to list if single file
if not isinstance(audio_files, list):
audio_files = [audio_files]

except Exception as e:
failed_count += 1
update_progress(f"Failed to process uploaded file: {str(e)}")
log_counter("audio_files_failed_total", 1, {"whisper_model": whisper_model, "api_name": api_name})
for i, audio_file in enumerate(audio_files, 1):
try:
file_title = f"{custom_title}_{i}" if custom_title else os.path.basename(audio_file.name)
update_progress(f"Processing file {i}/{len(audio_files)}: {file_title}")

if os.path.getsize(audio_file.name) > MAX_FILE_SIZE:
raise ValueError(f"File {file_title} size exceeds maximum limit of {MAX_FILE_SIZE / (1024 * 1024):.2f}MB")

# Process the audio file
reencoded_mp3_path = reencode_mp3(audio_file.name)
temp_files.append(reencoded_mp3_path)

wav_file_path = convert_mp3_to_wav(reencoded_mp3_path)
temp_files.append(wav_file_path)

# Transcribe audio
segments = speech_to_text(wav_file_path, whisper_model=whisper_model, diarize=diarize)

if isinstance(segments, dict) and 'segments' in segments:
segments = segments['segments']

if not isinstance(segments, list):
raise ValueError("Unexpected segments format received from speech_to_text")

transcription = format_transcription_with_timestamps(segments)
if not transcription.strip():
raise ValueError("Empty transcription generated")

# Initialize summary with default value
summary = "No summary available"

# Attempt summarization if API is provided
if api_name and api_name.lower() != "none":
try:
chunked_text = improved_chunking_process(transcription, chunk_options)
summary_result = perform_summarization(api_name, chunked_text, custom_prompt_input, api_key)
if summary_result:
summary = summary_result
update_progress(f"Audio file {i} summarized successfully.")
except Exception as e:
logging.error(f"Summarization failed for file {i}: {str(e)}")
summary = "Summary generation failed"

# Add to results with file identifier
all_transcriptions.append(f"=== {file_title} ===\n{transcription}")
all_summaries.append(f"=== {file_title} ===\n{summary}")

# Add to database
add_media_with_keywords(
url="Uploaded File",
title=file_title,
media_type='audio',
content=transcription,
keywords=custom_keywords,
prompt=custom_prompt_input,
summary=summary,
transcription_model=whisper_model,
author="Unknown",
ingestion_date=datetime.now().strftime('%Y-%m-%d')
)

processed_count += 1
update_progress(f"Successfully processed file {i}")
log_counter("audio_files_processed_total", 1, {"whisper_model": whisper_model, "api_name": api_name})

except Exception as e:
failed_count += 1
update_progress(f"Failed to process file {i}: {str(e)}")
log_counter("audio_files_failed_total", 1, {"whisper_model": whisper_model, "api_name": api_name})
continue

# Cleanup temporary files
if not keep_original:
Expand Down
53 changes: 45 additions & 8 deletions App_Function_Libraries/Audio/Audio_Transcription_Lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import gc
import json
import logging
from memory_profiler import profile
import multiprocessing
import os
import queue
Expand Down Expand Up @@ -43,6 +44,30 @@
# https://www.gyan.dev/ffmpeg/builds/
#

# FIXME
# 1. Implement chunking for large audio files
# def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, chunk_size=30):
# # ... existing code ...
# segments = []
# for segment_chunk in whisper_model_instance.transcribe(audio_file_path, beam_size=10, best_of=10, vad_filter=vad_filter, chunk_size=chunk_size):
# # Process each chunk
# # ... existing code ...
#
# 2. Use generators
# def generate_segments(segments_raw):
# for segment_chunk in segments_raw:
# yield {
# "Time_Start": segment_chunk.start,
# "Time_End": segment_chunk.end,
# "Text": segment_chunk.text
# }
# # Usage
# segments = list(generate_segments(segments_raw))
#
# 3. Use subprocess instead of os.system for ffmpeg
# 4. Adjust CPU threads properly
# 5. Use quantized models - compute_type="int8"


whisper_model_instance = None
config = load_comprehensive_config()
Expand Down Expand Up @@ -108,16 +133,26 @@ def __init__(
# **model_kwargs
)

def get_whisper_model(model_name, device):
# Addresses the FIXME above: allow explicitly releasing the cached Whisper model.
def unload_whisper_model():
    """Release the process-wide cached WhisperModel instance.

    Rebinds the module-level ``whisper_model_instance`` to ``None`` and runs
    a garbage-collection pass so the (potentially multi-GB) model weights are
    reclaimed promptly.  Safe to call when no model is currently loaded.
    """
    global whisper_model_instance
    if whisper_model_instance is not None:
        # Rebinding to None drops the last module-level reference; a separate
        # `del` before the rebind is redundant.  gc.collect() encourages
        # immediate reclamation of the model's memory.
        whisper_model_instance = None
        gc.collect()


def get_whisper_model(model_name, device):
    """Return the process-wide cached WhisperModel, creating it on first use.

    Args:
        model_name: Whisper model size/name (e.g. ``"medium.en"``).
        device: Device string forwarded to faster-whisper (e.g. ``"cpu"``).

    Returns:
        The cached ``WhisperModel`` instance.

    NOTE(review): after the first call the cache ignores ``model_name`` and
    ``device`` — requesting a different model later still returns the one
    loaded first.  Confirm callers expect this; otherwise key the cache on
    ``(model_name, device)``.
    """
    global whisper_model_instance
    if whisper_model_instance is None:
        logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
        # FIXME - add compute_type="int8" for quantized, lower-memory inference
        whisper_model_instance = WhisperModel(model_name, device=device)
    return whisper_model_instance

# os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
#DEBUG
#@profile
@profile
def convert_to_wav(video_file_path, offset=0, overwrite=False):
log_counter("convert_to_wav_attempt", labels={"file_path": video_file_path})
start_time = time.time()
Expand Down Expand Up @@ -186,7 +221,7 @@ def convert_to_wav(video_file_path, offset=0, overwrite=False):

# Transcribe .wav into .segments.json
#DEBUG
#@profile
@profile
# FIXME - consider enabling `vad_filter` by default.
def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
log_counter("speech_to_text_attempt", labels={"file_path": audio_file_path, "model": whisper_model})
Expand All @@ -204,7 +239,6 @@ def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='me
if os.path.exists(out_file):
logging.info("speech-to-text: Segments file already exists: %s", out_file)
with open(out_file) as f:
global segments
segments = json.load(f)
return segments

Expand Down Expand Up @@ -251,13 +285,16 @@ def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='me
if save_json:
logging.info("speech-to-text: Saving segments to JSON file")
output_data = {'segments': segments}
logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
with open(prettified_out_file, 'w') as f:
json.dump(output_data, f, indent=2)

logging.info("speech-to-text: Saving JSON to %s", out_file)
with open(out_file, 'w') as f:
json.dump(output_data, f)
del output_data
gc.collect()
with open(prettified_out_file, 'w') as f:
logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
json.dump({'segments': segments}, f, indent=2)



logging.debug(f"speech-to-text: returning {segments[:500]}")
gc.collect()
Expand Down
21 changes: 13 additions & 8 deletions App_Function_Libraries/Gradio_UI/Audio_ingestion_tab.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,13 @@ def create_audio_processing_tab():
with gr.Row():
with gr.Column():
audio_url_input = gr.Textbox(label="Audio File URL(s)", placeholder="Enter the URL(s) of the audio file(s), one per line")
audio_file_input = gr.File(label="Upload Audio File", file_types=["audio/*"])
custom_title_input = gr.Textbox(label="Custom Title/Name", placeholder="Enter a custom title or name for the audio file")
# Updated to support multiple files
audio_file_input = gr.Files(
label="Upload Audio Files (Supported formats: MP3, WAV, M4A, FLAC, AAC, OGG)",
#file_types=[".mp3", ".ogg", ".aac", ".flac", ".wav", ".m4a", ".flac", ".wma", ".aiff", ".alac"],
file_count="multiple"
)
custom_title_input = gr.Textbox(label="Custom Title Prefix", placeholder="Enter a prefix for the audio files (individual files will be numbered)")
use_cookies_input = gr.Checkbox(label="Use cookies for authenticated download", value=False)
cookies_input = gr.Textbox(
label="Audio Download Cookies",
Expand Down Expand Up @@ -207,7 +212,7 @@ def update_prompts(preset_name):
)
api_key_input = gr.Textbox(label="API Key (if required)", placeholder="Enter your API key here", type="password")
custom_keywords_input = gr.Textbox(label="Custom Keywords", placeholder="Enter custom keywords, comma-separated")
keep_original_input = gr.Checkbox(label="Keep original audio file", value=False)
keep_original_input = gr.Checkbox(label="Keep original audio files", value=False)

chunking_options_checkbox = gr.Checkbox(label="Show Chunking Options", value=False)
with gr.Row(visible=False) as chunking_options_box:
Expand All @@ -229,9 +234,9 @@ def update_prompts(preset_name):
process_audio_button = gr.Button("Process Audio File(s)")

with gr.Column():
audio_progress_output = gr.Textbox(label="Progress")
audio_transcription_output = gr.Textbox(label="Transcription")
audio_summary_output = gr.Textbox(label="Summary")
audio_progress_output = gr.Textbox(label="Progress", lines=10)
audio_transcription_output = gr.Textbox(label="Transcriptions", lines=10)
audio_summary_output = gr.Textbox(label="Summaries", lines=10)
download_transcription = gr.File(label="Download All Transcriptions as JSON")
download_summary = gr.File(label="Download All Summaries as Text")

Expand All @@ -244,8 +249,8 @@ def update_prompts(preset_name):
outputs=[audio_progress_output, audio_transcription_output, audio_summary_output]
)

def on_file_clear(file):
if file is None:
def on_file_clear(files):
    """Purge temporary audio artifacts once the upload field is emptied.

    Only triggers cleanup when ``files`` is empty/None (i.e. the user
    cleared the component); a non-empty selection is left untouched.
    """
    if files:
        return
    cleanup_temp_files()

audio_file_input.clear(
Expand Down
26 changes: 22 additions & 4 deletions Config_Files/Backup_Config.txt
Original file line number Diff line number Diff line change
Expand Up @@ -103,13 +103,29 @@ backup_count = 5

#[Comments]
#OpenAI Models:
# f
# gpt-4o
# gpt-4o-2024-08-06
# gpt-4o-mini
# o1-preview
# o1-mini
# text-embedding-3-large
# text-embedding-3-small
#
#Anthropic Models:
# f
# claude-3-5-sonnet-20241022
# claude-3-5-sonnet-20240620
# claude-3-5-haiku-20241022
# claude-3-opus-20240229
#
#Cohere Models:
# f
# command-r-plus-08-2024
# command-r-plus-04-2024
# command-r-08-2024
# command-r-03-2024
#
#DeepSeek Models:
# f
# deepseek-chat
#
#Groq Models:
# f
#Mistral Models:
Expand All @@ -123,6 +139,8 @@ backup_count = 5
# open-codestral-mamba
# Google's Models (11/15/2024): https://ai.google.dev/gemini-api/docs/models/gemini
# gemini-1.5-pro
# gemini-1.5-pro-2
# LearnLM
# gemini-1.5-flash
# gemini-1.5-flash-8b
# aqa
Expand Down
Loading

0 comments on commit d50376d

Please sign in to comment.