Skip to content

Commit

Permalink
Merge pull request #437 from rmusser01/dev
Browse files Browse the repository at this point in the history
Multi-file audio upload
  • Loading branch information
rmusser01 authored Nov 23, 2024
2 parents a8ff2b5 + a623e0f commit d50376d
Show file tree
Hide file tree
Showing 38 changed files with 986 additions and 135 deletions.
152 changes: 80 additions & 72 deletions App_Function_Libraries/Audio/Audio_Files.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def download_audio_file(url, current_whisper_model="", use_cookies=False, cookie
logging.error(f"Unexpected error downloading audio file: {str(e)}")
raise

def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key, use_cookies, cookies, keep_original,
def process_audio_files(audio_urls, audio_files, whisper_model, api_name, api_key, use_cookies, cookies, keep_original,
custom_keywords, custom_prompt_input, chunk_method, max_chunk_size, chunk_overlap,
use_adaptive_chunking, use_multi_level_chunking, chunk_language, diarize,
keep_timestamps, custom_title):
Expand Down Expand Up @@ -193,9 +193,9 @@ def convert_mp3_to_wav(mp3_file_path):
# Process URLs if provided
if audio_urls:
urls = [url.strip() for url in audio_urls.split('\n') if url.strip()]
for i, url in enumerate(urls):
for i, url in enumerate(urls, 1):
try:
update_progress(f"Processing URL {i + 1}/{len(urls)}: {url}")
update_progress(f"Processing URL {i}/{len(urls)}: {url}")

# Download and process audio file
audio_file_path = download_audio_file(url, use_cookies, cookies)
Expand Down Expand Up @@ -260,83 +260,91 @@ def convert_mp3_to_wav(mp3_file_path):
)

processed_count += 1
update_progress(f"Successfully processed URL {i + 1}")
update_progress(f"Successfully processed URL {i}")
log_counter("audio_files_processed_total", 1, {"whisper_model": whisper_model, "api_name": api_name})

except Exception as e:
failed_count += 1
update_progress(f"Failed to process URL {i + 1}: {str(e)}")
update_progress(f"Failed to process URL {i}: {str(e)}")
log_counter("audio_files_failed_total", 1, {"whisper_model": whisper_model, "api_name": api_name})
continue

# Process uploaded file if provided
if audio_file:
try:
update_progress("Processing uploaded file...")
if os.path.getsize(audio_file.name) > MAX_FILE_SIZE:
raise ValueError(f"File size exceeds maximum limit of {MAX_FILE_SIZE / (1024 * 1024):.2f}MB")

reencoded_mp3_path = reencode_mp3(audio_file.name)
temp_files.append(reencoded_mp3_path)

wav_file_path = convert_mp3_to_wav(reencoded_mp3_path)
temp_files.append(wav_file_path)

# Transcribe audio
segments = speech_to_text(wav_file_path, whisper_model=whisper_model, diarize=diarize)

if isinstance(segments, dict) and 'segments' in segments:
segments = segments['segments']

if not isinstance(segments, list):
raise ValueError("Unexpected segments format received from speech_to_text")

transcription = format_transcription_with_timestamps(segments)
if not transcription.strip():
raise ValueError("Empty transcription generated")

# Initialize summary with default value
summary = "No summary available"

# Attempt summarization if API is provided
if api_name and api_name.lower() != "none":
try:
chunked_text = improved_chunking_process(transcription, chunk_options)
summary_result = perform_summarization(api_name, chunked_text, custom_prompt_input, api_key)
if summary_result:
summary = summary_result
update_progress("Audio summarized successfully.")
except Exception as e:
logging.error(f"Summarization failed: {str(e)}")
summary = "Summary generation failed"

# Add to results
all_transcriptions.append(transcription)
all_summaries.append(summary)

# Add to database
title = custom_title if custom_title else os.path.basename(wav_file_path)
add_media_with_keywords(
url="Uploaded File",
title=title,
media_type='audio',
content=transcription,
keywords=custom_keywords,
prompt=custom_prompt_input,
summary=summary,
transcription_model=whisper_model,
author="Unknown",
ingestion_date=datetime.now().strftime('%Y-%m-%d')
)

processed_count += 1
update_progress("Successfully processed uploaded file")
log_counter("audio_files_processed_total", 1, {"whisper_model": whisper_model, "api_name": api_name})
# Process uploaded files if provided
if audio_files:
# Convert to list if single file
if not isinstance(audio_files, list):
audio_files = [audio_files]

except Exception as e:
failed_count += 1
update_progress(f"Failed to process uploaded file: {str(e)}")
log_counter("audio_files_failed_total", 1, {"whisper_model": whisper_model, "api_name": api_name})
for i, audio_file in enumerate(audio_files, 1):
try:
file_title = f"{custom_title}_{i}" if custom_title else os.path.basename(audio_file.name)
update_progress(f"Processing file {i}/{len(audio_files)}: {file_title}")

if os.path.getsize(audio_file.name) > MAX_FILE_SIZE:
raise ValueError(f"File {file_title} size exceeds maximum limit of {MAX_FILE_SIZE / (1024 * 1024):.2f}MB")

# Process the audio file
reencoded_mp3_path = reencode_mp3(audio_file.name)
temp_files.append(reencoded_mp3_path)

wav_file_path = convert_mp3_to_wav(reencoded_mp3_path)
temp_files.append(wav_file_path)

# Transcribe audio
segments = speech_to_text(wav_file_path, whisper_model=whisper_model, diarize=diarize)

if isinstance(segments, dict) and 'segments' in segments:
segments = segments['segments']

if not isinstance(segments, list):
raise ValueError("Unexpected segments format received from speech_to_text")

transcription = format_transcription_with_timestamps(segments)
if not transcription.strip():
raise ValueError("Empty transcription generated")

# Initialize summary with default value
summary = "No summary available"

# Attempt summarization if API is provided
if api_name and api_name.lower() != "none":
try:
chunked_text = improved_chunking_process(transcription, chunk_options)
summary_result = perform_summarization(api_name, chunked_text, custom_prompt_input, api_key)
if summary_result:
summary = summary_result
update_progress(f"Audio file {i} summarized successfully.")
except Exception as e:
logging.error(f"Summarization failed for file {i}: {str(e)}")
summary = "Summary generation failed"

# Add to results with file identifier
all_transcriptions.append(f"=== {file_title} ===\n{transcription}")
all_summaries.append(f"=== {file_title} ===\n{summary}")

# Add to database
add_media_with_keywords(
url="Uploaded File",
title=file_title,
media_type='audio',
content=transcription,
keywords=custom_keywords,
prompt=custom_prompt_input,
summary=summary,
transcription_model=whisper_model,
author="Unknown",
ingestion_date=datetime.now().strftime('%Y-%m-%d')
)

processed_count += 1
update_progress(f"Successfully processed file {i}")
log_counter("audio_files_processed_total", 1, {"whisper_model": whisper_model, "api_name": api_name})

except Exception as e:
failed_count += 1
update_progress(f"Failed to process file {i}: {str(e)}")
log_counter("audio_files_failed_total", 1, {"whisper_model": whisper_model, "api_name": api_name})
continue

# Cleanup temporary files
if not keep_original:
Expand Down
53 changes: 45 additions & 8 deletions App_Function_Libraries/Audio/Audio_Transcription_Lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import gc
import json
import logging
from memory_profiler import profile
import multiprocessing
import os
import queue
Expand Down Expand Up @@ -43,6 +44,30 @@
# https://www.gyan.dev/ffmpeg/builds/
#

# FIXME
# 1. Implement chunking for large audio files
# def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, chunk_size=30):
# # ... existing code ...
# segments = []
# for segment_chunk in whisper_model_instance.transcribe(audio_file_path, beam_size=10, best_of=10, vad_filter=vad_filter, chunk_size=chunk_size):
# # Process each chunk
# # ... existing code ...
#
# 2. Use generators
# def generate_segments(segments_raw):
# for segment_chunk in segments_raw:
# yield {
# "Time_Start": segment_chunk.start,
# "Time_End": segment_chunk.end,
# "Text": segment_chunk.text
# }
# # Usage
# segments = list(generate_segments(segments_raw))
#
# 3. Use subprocess instead of os.system for ffmpeg
# 4. Adjust CPU threads properly
# 5. Use quantized models - compute_type="int8"


whisper_model_instance = None
config = load_comprehensive_config()
Expand Down Expand Up @@ -108,16 +133,26 @@ def __init__(
# **model_kwargs
)

def get_whisper_model(model_name, device):
# Addresses the FIXME above: allow explicitly releasing the cached Whisper model.
def unload_whisper_model():
    """Release the process-wide cached WhisperModel instance.

    Rebinds the module-level ``whisper_model_instance`` to ``None`` and runs
    a garbage-collection pass so the (potentially multi-GB) model weights are
    reclaimed promptly.  Safe to call when no model is currently loaded.
    """
    global whisper_model_instance
    if whisper_model_instance is not None:
        # Rebinding to None drops the last module-level reference; a separate
        # `del` before the rebind is redundant.  gc.collect() encourages
        # immediate reclamation of the model's memory.
        whisper_model_instance = None
        gc.collect()


def get_whisper_model(model_name, device):
    """Return the process-wide cached WhisperModel, creating it on first use.

    Args:
        model_name: Whisper model size/name (e.g. ``"medium.en"``).
        device: Device string forwarded to faster-whisper (e.g. ``"cpu"``).

    Returns:
        The cached ``WhisperModel`` instance.

    NOTE(review): after the first call the cache ignores ``model_name`` and
    ``device`` — requesting a different model later still returns the one
    loaded first.  Confirm callers expect this; otherwise key the cache on
    ``(model_name, device)``.
    """
    global whisper_model_instance
    if whisper_model_instance is None:
        logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
        # FIXME - add compute_type="int8" for quantized, lower-memory inference
        whisper_model_instance = WhisperModel(model_name, device=device)
    return whisper_model_instance

# os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
#DEBUG
#@profile
@profile
def convert_to_wav(video_file_path, offset=0, overwrite=False):
log_counter("convert_to_wav_attempt", labels={"file_path": video_file_path})
start_time = time.time()
Expand Down Expand Up @@ -186,7 +221,7 @@ def convert_to_wav(video_file_path, offset=0, overwrite=False):

# Transcribe .wav into .segments.json
#DEBUG
#@profile
@profile
# FIXME - consider enabling `vad_filter` by default.
def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
log_counter("speech_to_text_attempt", labels={"file_path": audio_file_path, "model": whisper_model})
Expand All @@ -204,7 +239,6 @@ def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='me
if os.path.exists(out_file):
logging.info("speech-to-text: Segments file already exists: %s", out_file)
with open(out_file) as f:
global segments
segments = json.load(f)
return segments

Expand Down Expand Up @@ -251,13 +285,16 @@ def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='me
if save_json:
logging.info("speech-to-text: Saving segments to JSON file")
output_data = {'segments': segments}
logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
with open(prettified_out_file, 'w') as f:
json.dump(output_data, f, indent=2)

logging.info("speech-to-text: Saving JSON to %s", out_file)
with open(out_file, 'w') as f:
json.dump(output_data, f)
del output_data
gc.collect()
with open(prettified_out_file, 'w') as f:
logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
json.dump({'segments': segments}, f, indent=2)



logging.debug(f"speech-to-text: returning {segments[:500]}")
gc.collect()
Expand Down
21 changes: 13 additions & 8 deletions App_Function_Libraries/Gradio_UI/Audio_ingestion_tab.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,13 @@ def create_audio_processing_tab():
with gr.Row():
with gr.Column():
audio_url_input = gr.Textbox(label="Audio File URL(s)", placeholder="Enter the URL(s) of the audio file(s), one per line")
audio_file_input = gr.File(label="Upload Audio File", file_types=["audio/*"])
custom_title_input = gr.Textbox(label="Custom Title/Name", placeholder="Enter a custom title or name for the audio file")
# Updated to support multiple files
audio_file_input = gr.Files(
label="Upload Audio Files (Supported formats: MP3, WAV, M4A, FLAC, AAC, OGG)",
#file_types=[".mp3", ".ogg", ".aac", ".flac", ".wav", ".m4a", ".flac", ".wma", ".aiff", ".alac"],
file_count="multiple"
)
custom_title_input = gr.Textbox(label="Custom Title Prefix", placeholder="Enter a prefix for the audio files (individual files will be numbered)")
use_cookies_input = gr.Checkbox(label="Use cookies for authenticated download", value=False)
cookies_input = gr.Textbox(
label="Audio Download Cookies",
Expand Down Expand Up @@ -207,7 +212,7 @@ def update_prompts(preset_name):
)
api_key_input = gr.Textbox(label="API Key (if required)", placeholder="Enter your API key here", type="password")
custom_keywords_input = gr.Textbox(label="Custom Keywords", placeholder="Enter custom keywords, comma-separated")
keep_original_input = gr.Checkbox(label="Keep original audio file", value=False)
keep_original_input = gr.Checkbox(label="Keep original audio files", value=False)

chunking_options_checkbox = gr.Checkbox(label="Show Chunking Options", value=False)
with gr.Row(visible=False) as chunking_options_box:
Expand All @@ -229,9 +234,9 @@ def update_prompts(preset_name):
process_audio_button = gr.Button("Process Audio File(s)")

with gr.Column():
audio_progress_output = gr.Textbox(label="Progress")
audio_transcription_output = gr.Textbox(label="Transcription")
audio_summary_output = gr.Textbox(label="Summary")
audio_progress_output = gr.Textbox(label="Progress", lines=10)
audio_transcription_output = gr.Textbox(label="Transcriptions", lines=10)
audio_summary_output = gr.Textbox(label="Summaries", lines=10)
download_transcription = gr.File(label="Download All Transcriptions as JSON")
download_summary = gr.File(label="Download All Summaries as Text")

Expand All @@ -244,8 +249,8 @@ def update_prompts(preset_name):
outputs=[audio_progress_output, audio_transcription_output, audio_summary_output]
)

def on_file_clear(file):
if file is None:
def on_file_clear(files):
    """Purge temporary audio artifacts once the upload field is emptied.

    Only triggers cleanup when ``files`` is empty/None (i.e. the user
    cleared the component); a non-empty selection is left untouched.
    """
    if files:
        return
    cleanup_temp_files()

audio_file_input.clear(
Expand Down
26 changes: 22 additions & 4 deletions Config_Files/Backup_Config.txt
Original file line number Diff line number Diff line change
Expand Up @@ -103,13 +103,29 @@ backup_count = 5

#[Comments]
#OpenAI Models:
# f
# gpt-4o
# gpt-4o-2024-08-06
# gpt-4o-mini
# o1-preview
# o1-mini
# text-embedding-3-large
# text-embedding-3-small
#
#Anthropic Models:
# f
# claude-3-5-sonnet-20241022
# claude-3-5-sonnet-20240620
# claude-3-5-haiku-20241022
# claude-3-opus-20240229
#
#Cohere Models:
# f
# command-r-plus-08-2024
# command-r-plus-04-2024
# command-r-08-2024
# command-r-03-2024
#
#DeepSeek Models:
# f
# deepseek-chat
#
#Groq Models:
# f
#Mistral Models:
Expand All @@ -123,6 +139,8 @@ backup_count = 5
# open-codestral-mamba
# Google's Models (11/15/2024): https://ai.google.dev/gemini-api/docs/models/gemini
# gemini-1.5-pro
# gemini-1.5-pro-2
# LearnLM
# gemini-1.5-flash
# gemini-1.5-flash-8b
# aqa
Expand Down
Loading

0 comments on commit d50376d

Please sign in to comment.