diff --git a/.gitignore b/.gitignore index 2e0134142..ff1d2ca6d 100644 Binary files a/.gitignore and b/.gitignore differ diff --git a/App_Function_Libraries/Article_Summarization_Lib.py b/App_Function_Libraries/Article_Summarization_Lib.py index 811d48708..b33f08c6c 100644 --- a/App_Function_Libraries/Article_Summarization_Lib.py +++ b/App_Function_Libraries/Article_Summarization_Lib.py @@ -24,7 +24,7 @@ # 3rd-Party Imports from tqdm import tqdm -from App_Function_Libraries.Utils import sanitize_filename +from App_Function_Libraries.Utils.Utils import sanitize_filename # Local Imports from Article_Extractor_Lib import scrape_article from Local_Summarization_Lib import summarize_with_llama, summarize_with_oobabooga, summarize_with_tabbyapi, \ @@ -32,7 +32,7 @@ from Summarization_General_Lib import summarize_with_openai, summarize_with_anthropic, summarize_with_cohere, \ summarize_with_groq, summarize_with_openrouter, summarize_with_deepseek, summarize_with_huggingface, \ summarize_with_mistral -from App_Function_Libraries.DB_Manager import ingest_article_to_db +from App_Function_Libraries.DB.DB_Manager import ingest_article_to_db # ####################################################################################################################### # Function Definitions diff --git a/App_Function_Libraries/Audio_Files.py b/App_Function_Libraries/Audio_Files.py index 61e6f0519..3dfe8f31f 100644 --- a/App_Function_Libraries/Audio_Files.py +++ b/App_Function_Libraries/Audio_Files.py @@ -30,11 +30,11 @@ from App_Function_Libraries.Chunk_Lib import improved_chunking_process # # Local Imports -from App_Function_Libraries.DB_Manager import add_media_to_database, add_media_with_keywords, \ +from App_Function_Libraries.DB.DB_Manager import add_media_to_database, add_media_with_keywords, \ check_media_and_whisper_model from App_Function_Libraries.Summarization_General_Lib import save_transcription_and_summary, perform_transcription, \ perform_summarization -from App_Function_Libraries.Utils import create_download_directory, save_segments_to_json, downloaded_files, \ +from App_Function_Libraries.Utils.Utils import create_download_directory, save_segments_to_json, downloaded_files, \ sanitize_filename from App_Function_Libraries.Video_DL_Ingestion_Lib import extract_metadata diff --git a/App_Function_Libraries/Book_Ingestion_Lib.py b/App_Function_Libraries/Book_Ingestion_Lib.py index 41aa06038..f59c2fc0d 100644 --- a/App_Function_Libraries/Book_Ingestion_Lib.py +++ b/App_Function_Libraries/Book_Ingestion_Lib.py @@ -20,7 +20,7 @@ # Import Local -from SQLite_DB import add_media_with_keywords +from App_Function_Libraries.DB.SQLite_DB import add_media_with_keywords ####################################################################################################################### # Function Definitions diff --git a/App_Function_Libraries/Chat.py b/App_Function_Libraries/Chat.py index c7174e9ad..280414e46 100644 --- a/App_Function_Libraries/Chat.py +++ b/App_Function_Libraries/Chat.py @@ -13,13 +13,13 @@ # External Imports # # Local Imports -from App_Function_Libraries.DB_Manager import get_conversation_name, save_chat_history_to_database +from App_Function_Libraries.DB.DB_Manager import get_conversation_name, save_chat_history_to_database from App_Function_Libraries.LLM_API_Calls import chat_with_openai, chat_with_anthropic, chat_with_cohere, \ chat_with_groq, chat_with_openrouter, chat_with_deepseek, chat_with_mistral, chat_with_huggingface#, chat_with_vllm from 
App_Function_Libraries.LLM_API_Calls_Local import chat_with_aphrodite, chat_with_local_llm, chat_with_ollama, \ chat_with_kobold, chat_with_llama, chat_with_oobabooga, chat_with_tabbyapi -from App_Function_Libraries.SQLite_DB import load_media_content -from App_Function_Libraries.Utils import generate_unique_filename +from App_Function_Libraries.DB.SQLite_DB import load_media_content +from App_Function_Libraries.Utils.Utils import generate_unique_filename # #################################################################################################### # diff --git a/App_Function_Libraries/Chunk_Lib.py b/App_Function_Libraries/Chunk_Lib.py index 5444514b2..1520fba34 100644 --- a/App_Function_Libraries/Chunk_Lib.py +++ b/App_Function_Libraries/Chunk_Lib.py @@ -6,24 +6,25 @@ # #### # Import necessary libraries +import hashlib import logging import re - -from typing import List, Optional, Tuple, Dict, Any - -from openai import OpenAI -from tqdm import tqdm +from typing import Any, Dict, List, Optional, Tuple # # Import 3rd party +from openai import OpenAI +from tqdm import tqdm +from langdetect import detect from transformers import GPT2Tokenizer import nltk from nltk.tokenize import sent_tokenize, word_tokenize from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity +import textstat # # Import Local from App_Function_Libraries.Tokenization_Methods_Lib import openai_tokenize -from App_Function_Libraries.Utils import load_comprehensive_config +from App_Function_Libraries.Utils.Utils import load_comprehensive_config # @@ -43,6 +44,15 @@ def ntlk_prep(): config = load_comprehensive_config() openai_api_key = config.get('API', 'openai_api_key', fallback=None) + +def detect_language(text: str) -> str: + try: + return detect(text) + except: + # Default to English if detection fails + return 'en' + + def load_document(file_path): with open(file_path, 'r') as file: text = file.read() @@ -51,20 +61,27 @@ def load_document(file_path): def improved_chunking_process(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]: chunk_method = chunk_options.get('method', 'words') - max_chunk_size = chunk_options.get('max_size', 300) + base_size = chunk_options.get('base_size', 1000) + min_size = chunk_options.get('min_size', 100) + max_size = chunk_options.get('max_size', 2000) overlap = chunk_options.get('overlap', 0) - language = chunk_options.get('language', 'english') + language = chunk_options.get('language', None) adaptive = chunk_options.get('adaptive', False) multi_level = chunk_options.get('multi_level', False) + if language is None: + language = detect_language(text) + if adaptive: - max_chunk_size = adaptive_chunk_size(text, max_chunk_size) + max_chunk_size = adaptive_chunk_size(text, base_size, min_size, max_size) + else: + max_chunk_size = base_size if multi_level: chunks = multi_level_chunking(text, chunk_method, max_chunk_size, overlap, language) else: if chunk_method == 'words': - chunks = chunk_text_by_words(text, max_chunk_size, overlap) + chunks = chunk_text_by_words(text, max_chunk_size, overlap, language) elif chunk_method == 'sentences': chunks = chunk_text_by_sentences(text, max_chunk_size, overlap, language) elif chunk_method == 'paragraphs': @@ -77,15 +94,23 @@ def improved_chunking_process(text: str, chunk_options: Dict[str, Any]) -> List[ # No chunking applied chunks = [text] - return [{'text': chunk, 'metadata': get_chunk_metadata(chunk, text)} for chunk in chunks] + chunks_with_metadata = [] + for i, chunk in 
enumerate(chunks): + metadata = get_chunk_metadata( + chunk, + text, + chunk_type=chunk_method, + language=language + ) + metadata['chunk_index'] = i + metadata['total_chunks'] = len(chunks) + chunks_with_metadata.append({ + 'text': chunk, + 'metadata': metadata + }) -def adaptive_chunk_size(text: str, base_size: int) -> int: - # Simple adaptive logic: adjust chunk size based on text complexity - avg_word_length = sum(len(word) for word in text.split()) / len(text.split()) - if avg_word_length > 6: # Arbitrary threshold for "complex" text - return int(base_size * 0.8) # Reduce chunk size for complex text - return base_size + return chunks_with_metadata def multi_level_chunking(text: str, method: str, max_size: int, overlap: int, language: str) -> List[str]: @@ -96,7 +121,7 @@ def multi_level_chunking(text: str, method: str, max_size: int, overlap: int, la chunks = [] for para in paragraphs: if method == 'words': - chunks.extend(chunk_text_by_words(para, max_size, overlap)) + chunks.extend(chunk_text_by_words(para, max_size, overlap, language)) elif method == 'sentences': chunks.extend(chunk_text_by_sentences(para, max_size, overlap, language)) else: @@ -105,8 +130,20 @@ def multi_level_chunking(text: str, method: str, max_size: int, overlap: int, la return chunks -def chunk_text_by_words(text: str, max_words: int = 300, overlap: int = 0) -> List[str]: - words = text.split() +def chunk_text_by_words(text: str, max_words: int = 300, overlap: int = 0, language: str = None) -> List[str]: + if language is None: + language = detect_language(text) + + if language.startswith('zh'): # Chinese + import jieba + words = list(jieba.cut(text)) + elif language == 'ja': # Japanese + import fugashi + tagger = fugashi.Tagger() + words = [word.surface for word in tagger(text)] + else: # Default to simple splitting for other languages + words = text.split() + chunks = [] for i in range(0, len(words), max_words - overlap): chunk = ' '.join(words[i:i + max_words]) @@ -114,10 +151,22 @@ def chunk_text_by_words(text: str, max_words: int = 300, overlap: int = 0) -> Li return post_process_chunks(chunks) -def chunk_text_by_sentences(text: str, max_sentences: int = 10, overlap: int = 0, language: str = 'english') -> List[ - str]: +def chunk_text_by_sentences(text: str, max_sentences: int = 10, overlap: int = 0, language: str = None) -> List[str]: + if language is None: + language = detect_language(text) + nltk.download('punkt', quiet=True) - sentences = nltk.sent_tokenize(text, language=language) + + if language.startswith('zh'): # Chinese + import jieba + sentences = list(jieba.cut(text, cut_all=False)) + elif language == 'ja': # Japanese + import fugashi + tagger = fugashi.Tagger() + sentences = [word.surface for word in tagger(text) if word.feature.pos1 in ['記号', '補助記号'] and word.surface.strip()] + else: # Default to NLTK for other languages + sentences = sent_tokenize(text, language=language) + chunks = [] for i in range(0, len(sentences), max_sentences - overlap): chunk = ' '.join(sentences[i:i + max_sentences]) @@ -162,25 +211,51 @@ def post_process_chunks(chunks: List[str]) -> List[str]: return [chunk.strip() for chunk in chunks if chunk.strip()] -def get_chunk_metadata(chunk: str, full_text: str, chunk_type: str = "generic", chapter_number: Optional[int] = None, chapter_pattern: Optional[str] = None) -> Dict[str, Any]: +def get_chunk_metadata(chunk: str, full_text: str, chunk_type: str = "generic", + chapter_number: Optional[int] = None, + chapter_pattern: Optional[str] = None, + language: str = None) -> 
Dict[str, Any]:
     try:
         start_index = full_text.index(chunk)
+        end_index = start_index + len(chunk)
+
+        # Calculate a hash for the chunk
+        chunk_hash = hashlib.md5(chunk.encode()).hexdigest()
+
         metadata = {
             'start_index': start_index,
-            'end_index': start_index + len(chunk),
+            'end_index': end_index,
             'word_count': len(chunk.split()),
             'char_count': len(chunk),
-            'chunk_type': chunk_type
+            'chunk_type': chunk_type,
+            'language': language,
+            'chunk_hash': chunk_hash,
+            'relative_position': start_index / len(full_text),
+            'readability_score': textstat.flesch_reading_ease(chunk)
         }
+
         if chunk_type == "chapter":
             metadata['chapter_number'] = chapter_number
             metadata['chapter_pattern'] = chapter_pattern
+
         return metadata
     except ValueError as e:
         logging.error(f"Chunk not found in full_text: {chunk[:50]}... Full text length: {len(full_text)}")
         raise
+def process_document_with_metadata(text: str, chunk_options: Dict[str, Any],
+                                   document_metadata: Dict[str, Any]) -> Dict[str, Any]:
+    chunks = improved_chunking_process(text, chunk_options)
+
+    return {
+        'document_metadata': document_metadata,
+        'chunks': chunks
+    }
+
+
 # Hybrid approach, chunk each sentence while ensuring total token size does not exceed a maximum number
 def chunk_text_hybrid(text, max_tokens=1000):
     sentences = nltk.tokenize.sent_tokenize(text)
@@ -215,6 +290,9 @@ def chunk_on_delimiter(input_string: str,
         combined_chunks = [f"{chunk}{delimiter}" for chunk in combined_chunks]
     return combined_chunks
+
+
+
 # ????FIXME
 def recursive_summarize_chunks(chunks, summarize_func, custom_prompt, temp=None, system_prompt=None):
     summarized_chunks = []
@@ -273,7 +351,7 @@ def recursive_summarize_chunks(chunks, summarize_func, custom_prompt, temp=None,
 #
 # Chunk text into segments based on semantic similarity
-def count_units(text, unit='tokens'):
+def count_units(text, unit='words'):
     if unit == 'words':
         return len(text.split())
     elif unit == 'tokens':
@@ -321,21 +399,20 @@ def semantic_chunking(text, max_chunk_size=2000, unit='words'):
     return chunks
-def semantic_chunk_long_file(file_path, max_chunk_size=1000, overlap=100):
+def semantic_chunk_long_file(file_path, max_chunk_size=1000, overlap=100, unit='words'):
     try:
         with open(file_path, 'r', encoding='utf-8') as file:
             content = file.read()
-        chunks = semantic_chunking(content, max_chunk_size, overlap)
+        chunks = semantic_chunking(content, max_chunk_size, unit)
         return chunks
     except Exception as e:
         logging.error(f"Error chunking text file: {str(e)}")
         return None
-#######################################################################################################################
-
-
-
+#
+#
+#######################################################################################################################
 #######################################################################################################################
@@ -361,7 +438,7 @@ def combine_chunks_with_no_minimum(
         chunk_delimiter="\n\n",
         header: Optional[str] = None,
         add_ellipsis_for_overflow=False,
-) -> Tuple[List[str], List[int]]:
+) -> Tuple[List[str], List[List[int]], int]:
     dropped_chunk_count = 0
     output = []  # list to hold the final combined chunks
     output_indices = []  # list to hold the indices of the final combined chunks
@@ -579,7 +656,108 @@ def chunk_ebook_by_chapters(text: str, chunk_options: Dict[str, Any]) -> List[Di
 # print(chunk['text'])
 # print(f"Metadata: {chunk['metadata']}\n")
+#
+# End of ebook chapter chunking
+#######################################################################################################################
+
+#######################################################################################################################
+#
+# Functions for adaptive chunking:
+
+# FIXME - punkt
+def adaptive_chunk_size(text: str, base_size: int = 1000, min_size: int = 500, max_size: int = 2000) -> int:
+    # Ensure NLTK data is downloaded
+    nltk.download('punkt', quiet=True)
+
+    # Tokenize the text into sentences
+    sentences = sent_tokenize(text)
+
+    # Calculate average sentence length
+    avg_sentence_length = sum(len(s.split()) for s in sentences) / len(sentences)
+
+    # Adjust chunk size based on average sentence length
+    if avg_sentence_length < 10:
+        size_factor = 1.2  # Increase chunk size for short sentences
+    elif avg_sentence_length > 20:
+        size_factor = 0.8  # Decrease chunk size for long sentences
+    else:
+        size_factor = 1.0
+
+    # Calculate adaptive chunk size
+    adaptive_size = int(base_size * size_factor)
+
+    # Ensure chunk size is within bounds
+    return max(min_size, min(adaptive_size, max_size))
+
+# Non-Punkt version
+# def adaptive_chunk_size(text: str, base_size: int, min_size: int = 100, max_size: int = 2000) -> int:
+#     # Adaptive logic: adjust chunk size based on text complexity
+#     words = text.split()
+#     if not words:
+#         return base_size  # Return base_size if text is empty
+#
+#     avg_word_length = sum(len(word) for word in words) / len(words)
+#
+#     if avg_word_length > 6:  # Threshold for "complex" text
+#         adjusted_size = int(base_size * 0.8)  # Reduce chunk size for complex text
+#     elif avg_word_length < 4:  # Threshold for "simple" text
+#         adjusted_size = int(base_size * 1.2)  # Increase chunk size for simple text
+#     else:
+#         adjusted_size = base_size
+#
+#     # Ensure the chunk size is within the specified range
+#     return max(min_size, min(adjusted_size, max_size))
+
+
+def adaptive_chunking(text: str, base_size: int = 1000, min_size: int = 500, max_size: int = 2000) -> List[str]:
+    chunk_size = adaptive_chunk_size(text, base_size, min_size, max_size)
+    words = text.split()
+    chunks = []
+    current_chunk = []
+    current_length = 0
+
+    for word in words:
+        if current_length + len(word) > chunk_size and current_chunk:
+            chunks.append(' '.join(current_chunk))
+            current_chunk = []
+            current_length = 0
+        current_chunk.append(word)
+        current_length += len(word) + 1  # +1 for space
+
+    if current_chunk:
+        chunks.append(' '.join(current_chunk))
+    return chunks
+
+# FIXME - usage example
+# chunk_options = {
+#     'method': 'words',  # or any other method
+#     'base_size': 1000,
+#     'min_size': 100,
+#     'max_size': 2000,
+#     'adaptive': True,
+#     'language': 'en'
# }
+#chunks = improved_chunking_process(your_text, chunk_options)
+
+
+# Example of chunking a document with metadata
+# document_metadata = {
+#     'title': 'Example Document',
+#     'author': 'John Doe',
+#     'creation_date': '2023-06-14',
+#     'source': 'https://example.com/document',
+#     'document_type': 'article'
+# }
+#
+# chunk_options = {
+#     'method': 'sentences',
+#     'base_size': 1000,
+#     'adaptive': True,
+#     'language': 'en'
+# }
+#
+# processed_document = process_document_with_metadata(your_text, chunk_options, document_metadata)
 #
diff --git a/App_Function_Libraries/DB_Manager.py b/App_Function_Libraries/DB/DB_Manager.py
similarity index 81%
rename from App_Function_Libraries/DB_Manager.py
rename to App_Function_Libraries/DB/DB_Manager.py
index 817f70ca6..77d48880b 100644
--- 
a/App_Function_Libraries/DB_Manager.py +++ b/App_Function_Libraries/DB/DB_Manager.py @@ -3,7 +3,7 @@ import os from contextlib import contextmanager from time import sleep -from typing import Tuple +from typing import Tuple, List import sqlite3 # 3rd-Party Libraries from elasticsearch import Elasticsearch @@ -23,7 +23,7 @@ #### # Import your existing SQLite functions -from SQLite_DB import ( +from App_Function_Libraries.DB.SQLite_DB import ( update_media_content as sqlite_update_media_content, list_prompts as sqlite_list_prompts, search_and_display as sqlite_search_and_display, @@ -58,7 +58,9 @@ get_conversation_name as sqlite_get_conversation_name, add_media_with_keywords as sqlite_add_media_with_keywords, check_media_and_whisper_model as sqlite_check_media_and_whisper_model, - DatabaseError + DatabaseError, create_document_version as sqlite_create_document_version, + get_document_version as sqlite_get_document_version, sqlite_search_db, sqlite_add_media_chunk, + sqlite_update_fts_for_media, sqlite_get_unprocessed_media ) class Database: @@ -154,6 +156,15 @@ def get_db_config(): # # DB-Searching functions +def search_db(search_query: str, search_fields: List[str], keywords: str, page: int = 1, results_per_page: int = 10): + if db_type == 'sqlite': + return sqlite_search_db(search_query, search_fields, keywords, page, results_per_page) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version when available + raise NotImplementedError("Elasticsearch version of search_db not yet implemented") + else: + raise ValueError(f"Unsupported database type: {db_type}") + def view_database(*args, **kwargs): if db_type == 'sqlite': return sqlite_view_database(*args, **kwargs) @@ -179,6 +190,7 @@ def search_and_display(*args, **kwargs): # End of DB-Searching functions ############################################################################################################ + ############################################################################################################ # # Transcript-related Functions @@ -194,16 +206,38 @@ def get_transcripts(*args, **kwargs): # End of Transcript-related Functions ############################################################################################################ + ############################################################################################################ # # DB-Ingestion functions def add_media_to_database(*args, **kwargs): if db_type == 'sqlite': - return sqlite_add_media_to_database(*args, **kwargs) + result = sqlite_add_media_to_database(*args, **kwargs) + + # Extract content + segments = args[2] + if isinstance(segments, list): + content = ' '.join([segment.get('Text', '') for segment in segments if 'Text' in segment]) + elif isinstance(segments, dict): + content = segments.get('text', '') or segments.get('content', '') + else: + content = str(segments) + + # Extract media_id from the result + # Assuming the result is in the format "Media 'Title' added/updated successfully with ID: {media_id}" + import re + match = re.search(r"with ID: (\d+)", result) + if match: + media_id = int(match.group(1)) + + # Create initial document version + sqlite_create_document_version(media_id, content) + + return result elif db_type == 'elasticsearch': # Implement Elasticsearch version - raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + raise NotImplementedError("Elasticsearch version of add_media_to_database not yet implemented") def import_obsidian_note_to_db(*args, **kwargs): @@ 
-213,12 +247,27 @@ def import_obsidian_note_to_db(*args, **kwargs): # Implement Elasticsearch version raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + def update_media_content(*args, **kwargs): if db_type == 'sqlite': - return sqlite_update_media_content(*args, **kwargs) + result = sqlite_update_media_content(*args, **kwargs) + + # Extract media_id and content + selected_item = args[0] + item_mapping = args[1] + content_input = args[2] + + if selected_item and item_mapping and selected_item in item_mapping: + media_id = item_mapping[selected_item] + + # Create new document version + sqlite_create_document_version(media_id, content_input) + + return result elif db_type == 'elasticsearch': # Implement Elasticsearch version - raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + raise NotImplementedError("Elasticsearch version of update_media_content not yet implemented") + def add_media_with_keywords(*args, **kwargs): if db_type == 'sqlite': @@ -241,6 +290,36 @@ def ingest_article_to_db(url, title, author, content, keywords, summary, ingesti else: raise ValueError(f"Unsupported database type: {db_type}") + +def add_media_chunk(media_id: int, chunk_text: str, start_index: int, end_index: int, chunk_id: str): + if db_type == 'sqlite': + sqlite_add_media_chunk(db, media_id, chunk_text, start_index, end_index, chunk_id) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version not yet implemented") + else: + raise ValueError(f"Unsupported database type: {db_type}") + +def update_fts_for_media(media_id: int): + if db_type == 'sqlite': + sqlite_update_fts_for_media(db, media_id) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version not yet implemented") + else: + raise ValueError(f"Unsupported database type: {db_type}") + + +def get_unprocessed_media(): + if db_type == 'sqlite': + return sqlite_get_unprocessed_media(db) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of get_unprocessed_media not yet implemented") + else: + raise ValueError(f"Unsupported database type: {db_type}") + + # # End of DB-Ingestion functions ############################################################################################################ @@ -443,6 +522,7 @@ def empty_trash(*args, **kwargs): # End of Trash-related Functions ############################################################################################################ + ############################################################################################################ # # DB-Backup Functions @@ -458,6 +538,31 @@ def create_automated_backup(*args, **kwargs): # End of DB-Backup Functions ############################################################################################################ + +############################################################################################################ +# +# Document Versioning Functions + +def create_document_version(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_create_document_version(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of create_document_version not yet implemented") + +def get_document_version(*args, **kwargs): + if db_type == 'sqlite': + return 
sqlite_get_document_version(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of get_document_version not yet implemented") + +# +# End of Document Versioning Functions +############################################################################################################ + + + ############################################################################################################ # # Function to close the database connection for SQLite diff --git a/App_Function_Libraries/SQLite_DB.py b/App_Function_Libraries/DB/SQLite_DB.py similarity index 94% rename from App_Function_Libraries/SQLite_DB.py rename to App_Function_Libraries/DB/SQLite_DB.py index 84f3ac097..3fcba2578 100644 --- a/App_Function_Libraries/SQLite_DB.py +++ b/App_Function_Libraries/DB/SQLite_DB.py @@ -58,7 +58,7 @@ from datetime import datetime, timedelta from typing import List, Tuple, Dict, Any # Local Libraries -from App_Function_Libraries.Utils import is_valid_url +from App_Function_Libraries.Utils.Utils import is_valid_url # Third-Party Libraries import gradio as gr import pandas as pd @@ -337,10 +337,9 @@ def create_tables(db) -> None: chunk_text TEXT, start_index INTEGER, end_index INTEGER, - vector_embedding BLOB, + chunk_id TEXT, FOREIGN KEY (media_id) REFERENCES Media(id) - ) - ''', + )''', ''' CREATE TABLE IF NOT EXISTS UnvectorizedMediaChunks ( id INTEGER PRIMARY KEY AUTOINCREMENT, @@ -356,7 +355,17 @@ def create_tables(db) -> None: metadata TEXT, FOREIGN KEY (media_id) REFERENCES Media(id) ) + ''', ''' + CREATE TABLE IF NOT EXISTS DocumentVersions ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + media_id INTEGER NOT NULL, + version_number INTEGER NOT NULL, + content TEXT, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (media_id) REFERENCES Media(id) + ) + ''', ] index_queries = [ @@ -379,7 +388,9 @@ def create_tables(db) -> None: 'CREATE INDEX IF NOT EXISTS idx_unvectorized_media_chunks_chunk_type ON UnvectorizedMediaChunks(chunk_type)', # CREATE UNIQUE INDEX statements 'CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_media_url ON Media(url)', - 'CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_media_keyword ON MediaKeywords(media_id, keyword_id)' + 'CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_media_keyword ON MediaKeywords(media_id, keyword_id)', + 'CREATE INDEX IF NOT EXISTS idx_document_versions_media_id ON DocumentVersions(media_id)', + 'CREATE INDEX IF NOT EXISTS idx_document_versions_version_number ON DocumentVersions(version_number)', ] virtual_table_queries = [ @@ -481,6 +492,30 @@ def check_media_and_whisper_model(title=None, url=None, current_whisper_model=No return False, f"Media found with same whisper model (ID: {media_id})" +def sqlite_add_media_chunk(db, media_id: int, chunk_text: str, start_index: int, end_index: int, chunk_id: str): + with db.get_connection() as conn: + cursor = conn.cursor() + cursor.execute( + "INSERT INTO MediaChunks (media_id, chunk_text, start_index, end_index, chunk_id) VALUES (?, ?, ?, ?, ?)", + (media_id, chunk_text, start_index, end_index, chunk_id) + ) + conn.commit() + +def sqlite_update_fts_for_media(db, media_id: int): + with db.get_connection() as conn: + cursor = conn.cursor() + cursor.execute("INSERT OR REPLACE INTO media_fts (rowid, title, content) SELECT id, title, content FROM Media WHERE id = ?", (media_id,)) + conn.commit() + + +def sqlite_get_unprocessed_media(db): + with db.get_connection() as conn: + cursor = conn.cursor() + cursor.execute("SELECT id, 
content, type FROM Media WHERE id NOT IN (SELECT DISTINCT media_id FROM MediaChunks)") + return cursor.fetchall() + +# +# End of Media-related Functions ####################################################################################################################### # Keyword-related Functions # @@ -823,7 +858,7 @@ def add_media_version(media_id: int, prompt: str, summary: str) -> None: # Function to search the database with advanced options, including keyword search and full-text search -def search_db(search_query: str, search_fields: List[str], keywords: str, page: int = 1, results_per_page: int = 10): +def sqlite_search_db(search_query: str, search_fields: List[str], keywords: str, page: int = 1, results_per_page: int = 10): if page < 1: raise ValueError("Page number must be 1 or greater.") @@ -874,7 +909,7 @@ def search_db(search_query: str, search_fields: List[str], keywords: str, page: # Gradio function to handle user input and display results with pagination, with better feedback def search_and_display(search_query, search_fields, keywords, page): - results = search_db(search_query, search_fields, keywords, page) + results = sqlite_search_db(search_query, search_fields, keywords, page) if isinstance(results, pd.DataFrame): # Convert DataFrame to a list of tuples or lists @@ -952,7 +987,7 @@ def format_results(results): # Function to export search results to CSV or markdown with pagination def export_to_file(search_query: str, search_fields: List[str], keyword: str, page: int = 1, results_per_file: int = 1000, export_format: str = 'csv'): try: - results = search_db(search_query, search_fields, keyword, page, results_per_file) + results = sqlite_search_db(search_query, search_fields, keyword, page, results_per_file) if not results: return "No results found to export." 
@@ -999,6 +1034,7 @@ def is_valid_date(date_string: str) -> bool: except ValueError: return False + # Add ingested media to DB def add_media_to_database(url, info_dict, segments, summary, keywords, custom_prompt_input, whisper_model, media_type='video'): try: @@ -1076,6 +1112,9 @@ def add_media_to_database(url, info_dict, segments, summary, keywords, custom_pr logging.info("Adding new media version") add_media_version(media_id, custom_prompt_input, summary) + # Create initial document version + create_document_version(media_id, content) + conn.commit() logging.info(f"Media '{info_dict.get('title', 'Untitled')}' successfully added/updated with ID: {media_id}") @@ -1088,44 +1127,6 @@ def add_media_to_database(url, info_dict, segments, summary, keywords, custom_pr except Exception as e: logging.error(f"Unexpected Error: {e}") raise DatabaseError(f"Unexpected error: {e}") -# def add_media_to_database(url, info_dict, segments, summary, keywords, custom_prompt_input, whisper_model, media_type='video'): -# try: -# # Extract content from segments -# if isinstance(segments, list): -# content = ' '.join([segment.get('Text', '') for segment in segments if 'Text' in segment]) -# elif isinstance(segments, dict): -# content = segments.get('text', '') or segments.get('content', '') -# else: -# content = str(segments) -# -# logging.debug(f"Extracted content (first 500 chars): {content[:500]}") -# -# # Set default custom prompt if not provided -# if custom_prompt_input is None: -# custom_prompt_input = """No Custom Prompt Provided or Was Used.""" -# -# logging.info(f"Adding media to database: URL={url}, Title={info_dict.get('title', 'Untitled')}, Type={media_type}") -# -# result = add_media_with_keywords( -# url=url, -# title=info_dict.get('title', 'Untitled'), -# media_type=media_type, -# content=content, -# keywords=','.join(keywords) if isinstance(keywords, list) else keywords, -# prompt=custom_prompt_input or 'No prompt provided', -# summary=summary or 'No summary provided', -# transcription_model=whisper_model, -# author=info_dict.get('uploader', 'Unknown'), -# ingestion_date=datetime.now().strftime('%Y-%m-%d') -# ) -# -# logging.info(f"Media added successfully: {result}") -# return result -# -# except Exception as e: -# logging.error(f"Error in add_media_to_database: {str(e)}") -# raise - # # End of .... @@ -1400,15 +1401,19 @@ def update_media_content(selected_item, item_mapping, content_input, prompt_inpu VALUES (?, ?, ?, CURRENT_TIMESTAMP) """, (media_id, prompt_input, summary_input)) + # Create new document version + new_version = create_document_version(media_id, content_input) + conn.commit() - return f"Content updated successfully for media ID: {media_id}" + return f"Content updated successfully for media ID: {media_id}. 
New version: {new_version}" else: return "No item selected or invalid selection" except Exception as e: logging.error(f"Error updating media content: {e}") return f"Error updating content: {str(e)}" + def search_media_database(query: str) -> List[Tuple[int, str, str]]: try: with db.get_connection() as conn: @@ -2012,4 +2017,73 @@ def user_delete_item(media_id: int, force: bool = False) -> str: # # End of Functions to handle deletion of media items ####################################################################################################################### +# +# Functions to manage document versions + +def create_document_version(media_id: int, content: str) -> int: + try: + with db.get_connection() as conn: + cursor = conn.cursor() + + # Get the latest version number + cursor.execute(''' + SELECT MAX(version_number) + FROM DocumentVersions + WHERE media_id = ? + ''', (media_id,)) + + latest_version = cursor.fetchone()[0] or 0 + new_version = latest_version + 1 + + # Insert new version + cursor.execute(''' + INSERT INTO DocumentVersions (media_id, version_number, content) + VALUES (?, ?, ?) + ''', (media_id, new_version, content)) + + conn.commit() + return new_version + except sqlite3.Error as e: + logging.error(f"Error creating document version: {e}") + raise DatabaseError(f"Error creating document version: {e}") + +def get_document_version(media_id: int, version_number: int = None) -> Dict[str, Any]: + try: + with db.get_connection() as conn: + cursor = conn.cursor() + + if version_number is None: + # Get the latest version + cursor.execute(''' + SELECT id, version_number, content, created_at + FROM DocumentVersions + WHERE media_id = ? + ORDER BY version_number DESC + LIMIT 1 + ''', (media_id,)) + else: + cursor.execute(''' + SELECT id, version_number, content, created_at + FROM DocumentVersions + WHERE media_id = ? AND version_number = ? 
+ ''', (media_id, version_number)) + + result = cursor.fetchone() + + if result: + return { + 'id': result[0], + 'version_number': result[1], + 'content': result[2], + 'created_at': result[3] + } + else: + return None + except sqlite3.Error as e: + logging.error(f"Error retrieving document version: {e}") + raise DatabaseError(f"Error retrieving document version: {e}") + +# +# End of Functions to manage document versions +####################################################################################################################### \ No newline at end of file diff --git a/App_Function_Libraries/DB/__init__.py b/App_Function_Libraries/DB/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/App_Function_Libraries/Gradio_Related.py b/App_Function_Libraries/Gradio_Related.py index d587a12d8..6df7d591a 100644 --- a/App_Function_Libraries/Gradio_Related.py +++ b/App_Function_Libraries/Gradio_Related.py @@ -14,7 +14,7 @@ import gradio as gr # # Local Imports -from App_Function_Libraries.DB_Manager import get_db_config +from App_Function_Libraries.DB.DB_Manager import get_db_config from App_Function_Libraries.Gradio_UI.Audio_ingestion_tab import create_audio_processing_tab from App_Function_Libraries.Gradio_UI.Chat_ui import chat_workflows_tab, create_chat_management_tab, \ create_chat_interface_four, create_chat_interface_multi_api, create_chat_interface_stacked, create_chat_interface @@ -33,7 +33,8 @@ from App_Function_Libraries.Gradio_UI.Podcast_tab import create_podcast_tab from App_Function_Libraries.Gradio_UI.Re_summarize_tab import create_resummary_tab from App_Function_Libraries.Gradio_UI.Search_Tab import create_prompt_view_tab, create_prompt_search_tab, \ - create_search_summaries_tab, create_viewing_tab, create_embeddings_tab, create_rag_tab, create_search_tab + create_search_summaries_tab, create_viewing_tab, create_embeddings_tab, create_rag_tab, create_search_tab, \ + create_view_embeddings_tab from App_Function_Libraries.Gradio_UI.Trash import create_view_trash_tab, create_empty_trash_tab, \ create_delete_trash_tab from App_Function_Libraries.Gradio_UI.Utilities import create_utilities_yt_timestamp_tab, create_utilities_yt_audio_tab, \ @@ -250,6 +251,7 @@ def launch_ui(share_public=None, server_mode=False): create_search_tab() create_rag_tab() create_embeddings_tab() + create_view_embeddings_tab() create_viewing_tab() create_search_summaries_tab() create_prompt_search_tab() diff --git a/App_Function_Libraries/Gradio_UI/Audio_ingestion_tab.py b/App_Function_Libraries/Gradio_UI/Audio_ingestion_tab.py index d89a556d3..5ca761069 100644 --- a/App_Function_Libraries/Gradio_UI/Audio_ingestion_tab.py +++ b/App_Function_Libraries/Gradio_UI/Audio_ingestion_tab.py @@ -8,7 +8,7 @@ # # Local Imports from App_Function_Libraries.Audio_Files import process_audio_files -from App_Function_Libraries.DB_Manager import load_preset_prompts +from App_Function_Libraries.DB.DB_Manager import load_preset_prompts from App_Function_Libraries.Gradio_UI.Chat_ui import update_user_prompt from App_Function_Libraries.Gradio_UI.Gradio_Shared import whisper_models # diff --git a/App_Function_Libraries/Gradio_UI/Chat_ui.py b/App_Function_Libraries/Gradio_UI/Chat_ui.py index 95c338261..7c4a2164d 100644 --- a/App_Function_Libraries/Gradio_UI/Chat_ui.py +++ b/App_Function_Libraries/Gradio_UI/Chat_ui.py @@ -15,7 +15,7 @@ # # Local Imports from App_Function_Libraries.Chat import chat, save_chat_history, update_chat_content, save_chat_history_to_db_wrapper -from App_Function_Libraries.DB_Manager 
import add_chat_message, search_chat_conversations, create_chat_conversation, \ +from App_Function_Libraries.DB.DB_Manager import add_chat_message, search_chat_conversations, create_chat_conversation, \ get_chat_messages, update_chat_message, delete_chat_message, load_preset_prompts, db from App_Function_Libraries.Gradio_UI.Gradio_Shared import update_dropdown, update_user_prompt @@ -941,12 +941,6 @@ def parse_formatted_content(formatted_content): return search_query, search_button, conversation_list, conversation_mapping, chat_content, save_button, result_message, chat_preview -# FIXME - busted and incomplete -# Mock function to simulate LLM processing -def process_with_llm(workflow, context, prompt): - return f"LLM output for {workflow} with context: {context[:30]}... and prompt: {prompt[:30]}..." - - # Load workflows from a JSON file json_path = Path('./Helper_Scripts/Workflows/Workflows.json') with json_path.open('r') as f: @@ -956,61 +950,139 @@ def process_with_llm(workflow, context, prompt): # FIXME - broken Completely. Doesn't work. def chat_workflows_tab(): with gr.TabItem("Chat Workflows"): - with gr.Blocks() as chat_workflows_block: - gr.Markdown("# Workflows using LLMs") + gr.Markdown("# Workflows using LLMs") + with gr.Row(): workflow_selector = gr.Dropdown(label="Select Workflow", choices=[wf['name'] for wf in workflows]) - context_input = gr.Textbox(label="Context", lines=5) + api_selector = gr.Dropdown( + label="Select API Endpoint", + choices=["OpenAI", "Anthropic", "Cohere", "Groq", "DeepSeek", "Mistral", "OpenRouter", + "Llama.cpp", "Kobold", "Ooba", "Tabbyapi", "VLLM", "ollama", "HuggingFace"], + value="OpenAI" + ) + api_key_input = gr.Textbox(label="API Key (optional)", type="password") - # Create lists to hold UI components - prompt_inputs = [] - process_buttons = [] + context_input = gr.Textbox(label="Initial Context (optional)", lines=5) + + # Create a container for dynamic components + with gr.Column() as dynamic_components: + prompt_displays = [] + user_inputs = [] output_boxes = [] - max_prompts = max(len(wf['prompts']) for wf in workflows) - - # Pre-create the maximum number of prompt sections - for i in range(max_prompts): - prompt_input = gr.Textbox(label=f"Prompt {i + 1}", lines=2, visible=False) - output_box = gr.Textbox(label=f"Output {i + 1}", lines=5, visible=False) - process_button = gr.Button(f"Process Prompt {i + 1}", visible=False) - - prompt_inputs.append(prompt_input) - output_boxes.append(output_box) - process_buttons.append(process_button) - - process_button.click( - fn=lambda context, prompt, workflow_name, step=i: process(context, prompt, workflow_name, step), - inputs=[context_input, prompt_input, workflow_selector], - outputs=[output_box] - ) + process_buttons = [] + regenerate_buttons = [] + + # Create the maximum number of components needed + max_steps = max(len(wf['prompts']) for wf in workflows) + for i in range(max_steps): + prompt_displays.append(gr.Markdown(visible=False)) + user_inputs.append(gr.Textbox(label=f"Your Response", lines=2, visible=False)) + output_boxes.append(gr.Textbox(label=f"AI Output", lines=5, visible=False)) + with gr.Row(): + process_buttons.append(gr.Button(f"Process Step {i + 1}", visible=False)) + regenerate_buttons.append(gr.Button(f"🔄 Regenerate", visible=False)) + + def update_workflow_ui(workflow_name): + selected_workflow = next(wf for wf in workflows if wf['name'] == workflow_name) + num_prompts = len(selected_workflow['prompts']) + + prompt_updates = [] + input_updates = [] + output_updates = [] + 
button_updates = [] + regenerate_updates = [] + + for i in range(max_steps): + if i < num_prompts: + prompt_updates.append( + gr.update(value=f"**Step {i + 1}:** {selected_workflow['prompts'][i]}", visible=True)) + input_updates.append(gr.update(value="", visible=True, interactive=(i == 0))) + output_updates.append(gr.update(value="", visible=True)) + button_updates.append(gr.update(visible=(i == 0))) + regenerate_updates.append(gr.update(visible=False)) + else: + prompt_updates.append(gr.update(visible=False)) + input_updates.append(gr.update(visible=False)) + output_updates.append(gr.update(visible=False)) + button_updates.append(gr.update(visible=False)) + regenerate_updates.append(gr.update(visible=False)) + + return prompt_updates + input_updates + output_updates + button_updates + regenerate_updates + + def process(context, user_inputs, workflow_name, api_endpoint, api_key, step): + selected_workflow = next(wf for wf in workflows if wf['name'] == workflow_name) + + # Build up the context from previous steps + full_context = context + "\n\n" + for i in range(step + 1): + full_context += f"Question: {selected_workflow['prompts'][i]}\n" + full_context += f"Answer: {user_inputs[i]}\n" + if i < step: + full_context += f"AI Output: {output_boxes[i].value}\n\n" + + result = process_with_llm(workflow_name, full_context, selected_workflow['prompts'][step], api_endpoint, + api_key) + + prompt_updates = [gr.update() for _ in range(max_steps)] + input_updates = [] + output_updates = [gr.update() for _ in range(max_steps)] + button_updates = [] + regenerate_updates = [] + + for i in range(len(selected_workflow['prompts'])): + if i == step: + regenerate_updates.append(gr.update(visible=True)) + elif i == step + 1: + input_updates.append(gr.update(interactive=True)) + button_updates.append(gr.update(visible=True)) + regenerate_updates.append(gr.update(visible=False)) + elif i > step + 1: + input_updates.append(gr.update(interactive=False)) + button_updates.append(gr.update(visible=False)) + regenerate_updates.append(gr.update(visible=False)) + else: + input_updates.append(gr.update(interactive=False)) + button_updates.append(gr.update(visible=False)) + regenerate_updates.append(gr.update(visible=True)) + + return [result] + prompt_updates + input_updates + output_updates + button_updates + regenerate_updates + + # Set up event handlers + workflow_selector.change( + update_workflow_ui, + inputs=[workflow_selector], + outputs=prompt_displays + user_inputs + output_boxes + process_buttons + regenerate_buttons + ) + + # Set up process button click events + for i, button in enumerate(process_buttons): + button.click( + fn=lambda context, *user_inputs, wf_name, api_endpoint, api_key, step=i: process(context, user_inputs, + wf_name, api_endpoint, + api_key, step), + inputs=[context_input] + user_inputs + [workflow_selector, api_selector, api_key_input], + outputs=[output_boxes[ + i]] + prompt_displays + user_inputs + output_boxes + process_buttons + regenerate_buttons + ) - def process(context, prompt, workflow_name, step): - selected_workflow = next(wf for wf in workflows if wf['name'] == workflow_name) - # Update context with previous outputs - for j in range(step): - context += f"\n\n{output_boxes[j].value}" - result = process_with_llm(selected_workflow['name'], context, prompt) - return result - - def update_prompt_sections(workflow_name): - selected_workflow = next(wf for wf in workflows if wf['name'] == workflow_name) - num_prompts = len(selected_workflow['prompts']) - - for i in range(max_prompts): 
- if i < num_prompts: - prompt_inputs[i].visible = True - prompt_inputs[i].value = selected_workflow['prompts'][i] - process_buttons[i].visible = True - output_boxes[i].visible = True - else: - prompt_inputs[i].visible = False - process_buttons[i].visible = False - output_boxes[i].visible = False - - # Bind the workflow selector to update the UI - workflow_selector.change(update_prompt_sections, inputs=[workflow_selector], outputs=[]) - - return chat_workflows_block + # Set up regenerate button click events + for i, button in enumerate(regenerate_buttons): + button.click( + fn=lambda context, *user_inputs, wf_name, api_endpoint, api_key, step=i: process(context, user_inputs, + wf_name, api_endpoint, + api_key, step), + inputs=[context_input] + user_inputs + [workflow_selector, api_selector, api_key_input], + outputs=[output_boxes[ + i]] + prompt_displays + user_inputs + output_boxes + process_buttons + regenerate_buttons + ) + + return workflow_selector, api_selector, api_key_input, context_input, dynamic_components + + +# Mock function to simulate LLM processing +def process_with_llm(workflow, context, prompt, api_endpoint, api_key): + api_key_snippet = api_key[:5] + "..." if api_key else "Not provided" + return f"LLM output using {api_endpoint} (API Key: {api_key_snippet}) for {workflow} with context: {context[:30]}... and prompt: {prompt[:30]}..." # # End of Chat_ui.py diff --git a/App_Function_Libraries/Gradio_UI/Export_Functionality.py b/App_Function_Libraries/Gradio_UI/Export_Functionality.py index a3cfb6f60..4926095d4 100644 --- a/App_Function_Libraries/Gradio_UI/Export_Functionality.py +++ b/App_Function_Libraries/Gradio_UI/Export_Functionality.py @@ -8,7 +8,7 @@ import tempfile from typing import List, Dict, Optional, Tuple import gradio as gr -from App_Function_Libraries.DB_Manager import DatabaseError, create_automated_backup, db_path, backup_dir +from App_Function_Libraries.DB.DB_Manager import DatabaseError, create_automated_backup, db_path, backup_dir from App_Function_Libraries.Gradio_UI.Gradio_Shared import fetch_item_details, fetch_items_by_keyword, browse_items logger = logging.getLogger(__name__) diff --git a/App_Function_Libraries/Gradio_UI/Gradio_Shared.py b/App_Function_Libraries/Gradio_UI/Gradio_Shared.py index ea80aa78f..5420bb44b 100644 --- a/App_Function_Libraries/Gradio_UI/Gradio_Shared.py +++ b/App_Function_Libraries/Gradio_UI/Gradio_Shared.py @@ -12,9 +12,9 @@ import gradio as gr # # Local Imports -from App_Function_Libraries.DB_Manager import list_prompts, db, search_and_display, fetch_prompt_details -from App_Function_Libraries.SQLite_DB import DatabaseError -from App_Function_Libraries.Utils import format_transcription +from App_Function_Libraries.DB.DB_Manager import list_prompts, db, search_and_display, fetch_prompt_details +from App_Function_Libraries.DB.SQLite_DB import DatabaseError +from App_Function_Libraries.Utils.Utils import format_transcription # ############################################################################################################## # diff --git a/App_Function_Libraries/Gradio_UI/Import_Functionality.py b/App_Function_Libraries/Gradio_UI/Import_Functionality.py index dbdf70f02..09d07f8de 100644 --- a/App_Function_Libraries/Gradio_UI/Import_Functionality.py +++ b/App_Function_Libraries/Gradio_UI/Import_Functionality.py @@ -16,7 +16,7 @@ import pypandoc # # Local Imports -from App_Function_Libraries.DB_Manager import insert_prompt_to_db, load_preset_prompts, import_obsidian_note_to_db, \ +from 
App_Function_Libraries.DB.DB_Manager import insert_prompt_to_db, load_preset_prompts, import_obsidian_note_to_db, \ add_media_to_database from App_Function_Libraries.Prompt_Handling import import_prompt_from_file, import_prompts_from_zip# from App_Function_Libraries.Summarization_General_Lib import perform_summarization diff --git a/App_Function_Libraries/Gradio_UI/Introduction_tab.py b/App_Function_Libraries/Gradio_UI/Introduction_tab.py index 7f1125f09..f6d22b06f 100644 --- a/App_Function_Libraries/Gradio_UI/Introduction_tab.py +++ b/App_Function_Libraries/Gradio_UI/Introduction_tab.py @@ -6,7 +6,7 @@ # External Imports import gradio as gr -from App_Function_Libraries.DB_Manager import get_db_config +from App_Function_Libraries.DB.DB_Manager import get_db_config # diff --git a/App_Function_Libraries/Gradio_UI/Keywords.py b/App_Function_Libraries/Gradio_UI/Keywords.py index 8e821be6f..3a6579e87 100644 --- a/App_Function_Libraries/Gradio_UI/Keywords.py +++ b/App_Function_Libraries/Gradio_UI/Keywords.py @@ -10,7 +10,7 @@ import gradio as gr # # Internal Imports -from App_Function_Libraries.DB_Manager import add_keyword, delete_keyword, keywords_browser_interface, export_keywords_to_csv +from App_Function_Libraries.DB.DB_Manager import add_keyword, delete_keyword, keywords_browser_interface, export_keywords_to_csv # # ###################################################################################################################### diff --git a/App_Function_Libraries/Gradio_UI/Media_edit.py b/App_Function_Libraries/Gradio_UI/Media_edit.py index 3f1246656..105ee9719 100644 --- a/App_Function_Libraries/Gradio_UI/Media_edit.py +++ b/App_Function_Libraries/Gradio_UI/Media_edit.py @@ -9,10 +9,10 @@ import gradio as gr # # Local Imports -from App_Function_Libraries.DB_Manager import add_prompt, update_media_content, db, add_or_update_prompt, \ +from App_Function_Libraries.DB.DB_Manager import add_prompt, update_media_content, db, add_or_update_prompt, \ load_prompt_details from App_Function_Libraries.Gradio_UI.Gradio_Shared import update_dropdown, update_prompt_dropdown -from App_Function_Libraries.SQLite_DB import fetch_item_details +from App_Function_Libraries.DB.SQLite_DB import fetch_item_details def create_media_edit_tab(): @@ -173,7 +173,7 @@ def save_cloned_item(selected_item, item_mapping, content, prompt, summary, new_ def create_prompt_edit_tab(): - with gr.TabItem("Edit Prompts"): + with gr.TabItem("Add & Edit Prompts"): with gr.Row(): with gr.Column(): prompt_dropdown = gr.Dropdown( diff --git a/App_Function_Libraries/Gradio_UI/PDF_ingestion_tab.py b/App_Function_Libraries/Gradio_UI/PDF_ingestion_tab.py index 21d816a8e..b8f0db50d 100644 --- a/App_Function_Libraries/Gradio_UI/PDF_ingestion_tab.py +++ b/App_Function_Libraries/Gradio_UI/PDF_ingestion_tab.py @@ -10,7 +10,7 @@ import gradio as gr # # Local Imports -from App_Function_Libraries.DB_Manager import load_preset_prompts +from App_Function_Libraries.DB.DB_Manager import load_preset_prompts from App_Function_Libraries.Gradio_UI.Chat_ui import update_user_prompt from App_Function_Libraries.PDF_Ingestion_Lib import extract_metadata_from_pdf, extract_text_and_format_from_pdf, \ process_and_cleanup_pdf diff --git a/App_Function_Libraries/Gradio_UI/Podcast_tab.py b/App_Function_Libraries/Gradio_UI/Podcast_tab.py index 96a8550b2..210b76691 100644 --- a/App_Function_Libraries/Gradio_UI/Podcast_tab.py +++ b/App_Function_Libraries/Gradio_UI/Podcast_tab.py @@ -9,7 +9,7 @@ # # Local Imports from App_Function_Libraries.Audio_Files import 
process_podcast -from App_Function_Libraries.DB_Manager import load_preset_prompts +from App_Function_Libraries.DB.DB_Manager import load_preset_prompts from App_Function_Libraries.Gradio_UI.Gradio_Shared import whisper_models, update_user_prompt diff --git a/App_Function_Libraries/Gradio_UI/Re_summarize_tab.py b/App_Function_Libraries/Gradio_UI/Re_summarize_tab.py index c3c419953..9ad8dd9f1 100644 --- a/App_Function_Libraries/Gradio_UI/Re_summarize_tab.py +++ b/App_Function_Libraries/Gradio_UI/Re_summarize_tab.py @@ -10,12 +10,12 @@ # # Local Imports from App_Function_Libraries.Chunk_Lib import improved_chunking_process -from App_Function_Libraries.DB_Manager import update_media_content, load_preset_prompts +from App_Function_Libraries.DB.DB_Manager import update_media_content, load_preset_prompts from App_Function_Libraries.Gradio_UI.Chat_ui import update_user_prompt from App_Function_Libraries.Gradio_UI.Gradio_Shared import fetch_item_details, fetch_items_by_keyword, \ fetch_items_by_content, fetch_items_by_title_or_url from App_Function_Libraries.Summarization_General_Lib import summarize_chunk -from App_Function_Libraries.Utils import load_comprehensive_config +from App_Function_Libraries.Utils.Utils import load_comprehensive_config # # ###################################################################################################################### diff --git a/App_Function_Libraries/Gradio_UI/Search_Tab.py b/App_Function_Libraries/Gradio_UI/Search_Tab.py index 637881778..50a540eef 100644 --- a/App_Function_Libraries/Gradio_UI/Search_Tab.py +++ b/App_Function_Libraries/Gradio_UI/Search_Tab.py @@ -10,9 +10,11 @@ # External Imports import gradio as gr -from App_Function_Libraries.DB_Manager import view_database, search_and_display_items +from App_Function_Libraries.DB.DB_Manager import view_database, search_and_display_items from App_Function_Libraries.Gradio_UI.Gradio_Shared import update_dropdown, update_detailed_view -from App_Function_Libraries.RAG_Libary_2 import rag_search +from App_Function_Libraries.RAG.ChromaDB_Library import get_all_content_from_database, chroma_client, \ + store_in_chroma, create_embedding +from App_Function_Libraries.RAG.RAG_Libary_2 import rag_search # # Local Imports @@ -24,9 +26,6 @@ logger = logging.getLogger() - - - # FIXME - SQL functions to be moved to DB_Manager def search_prompts(query): try: @@ -42,16 +41,6 @@ def search_prompts(query): return [] - - - - - - - - - - def create_rag_tab(): with gr.TabItem("RAG Search"): gr.Markdown("# Retrieval-Augmented Generation (RAG) Search") @@ -83,28 +72,203 @@ def create_embeddings_tab(): with gr.Row(): with gr.Column(): - embedding_api_choice = gr.Dropdown( - choices=["OpenAI", "Local", "HuggingFace"], + embedding_api_choice = gr.Radio( + choices=["Llama.cpp", "OpenAI"], label="Select API for Embeddings", value="OpenAI" ) + openai_model_choice = gr.Radio( + choices=["text-embedding-3-small", "text-embedding-3-large"], + label="OpenAI Embedding Model (Assumes you have your API key set up in 'config.txt')", + value="text-embedding-3-small", + visible=True + ) + llamacpp_url = gr.Textbox( + label="Llama.cpp Embedding API URL", + placeholder="http://localhost:8080/embedding", + value="http://localhost:8080/embedding", # Default value + visible=False + ) create_button = gr.Button("Create Embeddings") with gr.Column(): status_output = gr.Textbox(label="Status", lines=10) - def create_embeddings(api_choice): + def update_api_options(api_choice): + return ( + gr.update(visible=api_choice == "OpenAI"), + 
gr.update(visible=api_choice == "Llama.cpp") + ) + + embedding_api_choice.change( + fn=update_api_options, + inputs=[embedding_api_choice], + outputs=[openai_model_choice, llamacpp_url] + ) + + def create_embeddings(api_choice, openai_model, llamacpp_url): try: - # Assuming you have a function that handles the creation of embeddings - from App_Function_Libraries.ChromaDB_Library import create_all_embeddings - status = create_all_embeddings(api_choice) + from App_Function_Libraries.RAG.ChromaDB_Library import create_all_embeddings + if api_choice == "OpenAI": + status = create_all_embeddings("openai", openai_model) + else: # Llama.cpp + status = create_all_embeddings("llamacpp", llamacpp_url) return status except Exception as e: return f"Error: {str(e)}" - create_button.click(create_embeddings, inputs=[embedding_api_choice], outputs=status_output) + create_button.click( + fn=create_embeddings, + inputs=[embedding_api_choice, openai_model_choice, llamacpp_url], + outputs=status_output + ) + +def create_view_embeddings_tab(): + with gr.TabItem("View/Update Embeddings"): + gr.Markdown("# View and Update Embeddings") + item_mapping = gr.State({}) + with gr.Row(): + with gr.Column(): + item_dropdown = gr.Dropdown(label="Select Item", choices=[], interactive=True) + refresh_button = gr.Button("Refresh Item List") + embedding_status = gr.Textbox(label="Embedding Status", interactive=False) + embedding_preview = gr.Textbox(label="Embedding Preview", interactive=False, lines=5) + + with gr.Column(): + create_new_embedding_button = gr.Button("Create New Embedding") + embedding_provider = gr.Radio( + choices=["openai", "local", "huggingface"], + label="Embedding Provider", + value="openai" + ) + embedding_model = gr.Textbox( + label="Embedding Model", + value="text-embedding-3-small", + visible=True + ) + embedding_api_url = gr.Textbox( + label="API URL (for local provider)", + value="http://localhost:8080/embedding", + visible=False + ) + + def get_items_with_embedding_status(): + try: + items = get_all_content_from_database() + collection = chroma_client.get_or_create_collection(name="all_content_embeddings") + choices = [] + new_item_mapping = {} + for item in items: + try: + result = collection.get(ids=[f"doc_{item['id']}"]) + embedding_exists = result is not None and result.get('ids') and len(result['ids']) > 0 + status = "Embedding exists" if embedding_exists else "No embedding" + except Exception as e: + print(f"Error checking embedding for item {item['id']}: {str(e)}") + status = "Error checking" + choice = f"{item['title']} ({status})" + choices.append(choice) + new_item_mapping[choice] = item['id'] + return gr.update(choices=choices), new_item_mapping + except Exception as e: + print(f"Error in get_items_with_embedding_status: {str(e)}") + return gr.update(choices=["Error: Unable to fetch items"]), {} + + def check_embedding_status(selected_item, item_mapping): + if not selected_item: + return "Please select an item", "" + + try: + item_id = item_mapping.get(selected_item) + if item_id is None: + return f"Invalid item selected: {selected_item}", "" + + item_title = selected_item.rsplit(' (', 1)[0] + collection = chroma_client.get_or_create_collection(name="all_content_embeddings") + + try: + result = collection.get(ids=[f"doc_{item_id}"]) + except Exception as e: + print(f"Error getting embedding for item {item_id}: {str(e)}") + return f"Error retrieving embedding for item '{item_title}' (ID: {item_id})", "" + + if result is None: + return f"No result returned for item '{item_title}' (ID: 
{item_id})", "" + + if not result.get('ids'): + return f"No embedding found for item '{item_title}' (ID: {item_id})", "" + + if not result.get('embeddings'): + return f"Embedding data missing for item '{item_title}' (ID: {item_id})", "" + + embedding = result['embeddings'][0] + embedding_preview = str(embedding[:500]) # Convert first 500 elements to string + status = f"Embedding exists for item '{item_title}' (ID: {item_id})" + return status, f"First 500 elements of embedding:\n{embedding_preview}" + except Exception as e: + print(f"Error in check_embedding_status: {str(e)}") + return f"Error processing item: {selected_item}. Details: {str(e)}", "" + + def create_new_embedding(selected_item, provider, model, api_url, item_mapping): + if not selected_item: + return "Please select an item", "" + + try: + item_id = item_mapping.get(selected_item) + if item_id is None: + return f"Invalid item selected: {selected_item}", "" + + item_title = selected_item.rsplit(' (', 1)[0] + items = get_all_content_from_database() + item = next((item for item in items if item['id'] == item_id), None) + if not item: + return f"Item not found: {item_title}", "" + + global embedding_provider, embedding_model, embedding_api_url + embedding_provider = provider + embedding_model = model + embedding_api_url = api_url + + embedding = create_embedding(item['content']) + + collection_name = "all_content_embeddings" + store_in_chroma(collection_name, [item['content']], [embedding], [f"doc_{item_id}"]) + + embedding_preview = str(embedding[:500]) # Convert first 500 elements to string + status = f"New embedding created and stored for item: {item_title} (ID: {item_id})" + return status, f"First 500 elements of new embedding:\n{embedding_preview}" + except Exception as e: + print(f"Error in create_new_embedding: {str(e)}") + return f"Error creating embedding: {str(e)}", "" + + def update_provider_options(provider): + return ( + gr.update(visible=True), + gr.update(visible=provider == "local") + ) + + refresh_button.click( + get_items_with_embedding_status, + outputs=[item_dropdown, item_mapping] + ) + item_dropdown.change( + check_embedding_status, + inputs=[item_dropdown, item_mapping], + outputs=[embedding_status, embedding_preview] + ) + create_new_embedding_button.click( + create_new_embedding, + inputs=[item_dropdown, embedding_provider, embedding_model, embedding_api_url, item_mapping], + outputs=[embedding_status, embedding_preview] + ) + embedding_provider.change( + update_provider_options, + inputs=[embedding_provider], + outputs=[embedding_model, embedding_api_url] + ) + return item_dropdown, refresh_button, embedding_status, embedding_preview, create_new_embedding_button, embedding_provider, embedding_model, embedding_api_url def create_search_tab(): diff --git a/App_Function_Libraries/Gradio_UI/Transcript_comparison.py b/App_Function_Libraries/Gradio_UI/Transcript_comparison.py index 3bbf08d48..46333fefc 100644 --- a/App_Function_Libraries/Gradio_UI/Transcript_comparison.py +++ b/App_Function_Libraries/Gradio_UI/Transcript_comparison.py @@ -8,9 +8,9 @@ # External Imports import gradio as gr -from App_Function_Libraries.DB_Manager import get_transcripts +from App_Function_Libraries.DB.DB_Manager import get_transcripts from App_Function_Libraries.Gradio_UI.Gradio_Shared import browse_items -from App_Function_Libraries.Utils import format_transcription +from App_Function_Libraries.Utils.Utils import format_transcription # diff --git a/App_Function_Libraries/Gradio_UI/Trash.py 
b/App_Function_Libraries/Gradio_UI/Trash.py index 9060c446f..47781fed5 100644 --- a/App_Function_Libraries/Gradio_UI/Trash.py +++ b/App_Function_Libraries/Gradio_UI/Trash.py @@ -9,7 +9,7 @@ import gradio as gr # # Local Imports -from App_Function_Libraries.DB_Manager import delete_prompt, empty_trash, get_trashed_items, user_delete_item +from App_Function_Libraries.DB.DB_Manager import delete_prompt, empty_trash, get_trashed_items, user_delete_item def delete_item(media_id, force): diff --git a/App_Function_Libraries/Gradio_UI/Utilities.py b/App_Function_Libraries/Gradio_UI/Utilities.py index 77454c6f4..e0ea3a62a 100644 --- a/App_Function_Libraries/Gradio_UI/Utilities.py +++ b/App_Function_Libraries/Gradio_UI/Utilities.py @@ -6,7 +6,7 @@ import gradio as gr import yt_dlp -from App_Function_Libraries.Utils import sanitize_filename, downloaded_files +from App_Function_Libraries.Utils.Utils import sanitize_filename, downloaded_files def create_utilities_yt_video_tab(): diff --git a/App_Function_Libraries/Gradio_UI/Video_transcription_tab.py b/App_Function_Libraries/Gradio_UI/Video_transcription_tab.py index f78ccc3cd..b6adc0c59 100644 --- a/App_Function_Libraries/Gradio_UI/Video_transcription_tab.py +++ b/App_Function_Libraries/Gradio_UI/Video_transcription_tab.py @@ -13,12 +13,12 @@ from App_Function_Libraries.Confabulation_check import simplified_geval # # Local Imports -from App_Function_Libraries.DB_Manager import load_preset_prompts, add_media_to_database +from App_Function_Libraries.DB.DB_Manager import load_preset_prompts, add_media_to_database from App_Function_Libraries.Gradio_UI.Gradio_Shared import whisper_models, update_user_prompt from App_Function_Libraries.Gradio_UI.Gradio_Shared import error_handler from App_Function_Libraries.Summarization_General_Lib import perform_transcription, perform_summarization, \ save_transcription_and_summary -from App_Function_Libraries.Utils import convert_to_seconds, safe_read_file, format_transcription, \ +from App_Function_Libraries.Utils.Utils import convert_to_seconds, safe_read_file, format_transcription, \ create_download_directory, generate_unique_identifier, extract_text_from_segments from App_Function_Libraries.Video_DL_Ingestion_Lib import parse_and_expand_urls, extract_metadata, download_video # diff --git a/App_Function_Libraries/Gradio_UI/Website_scraping_tab.py b/App_Function_Libraries/Gradio_UI/Website_scraping_tab.py index d0c741345..76decb71b 100644 --- a/App_Function_Libraries/Gradio_UI/Website_scraping_tab.py +++ b/App_Function_Libraries/Gradio_UI/Website_scraping_tab.py @@ -7,7 +7,7 @@ import gradio as gr from App_Function_Libraries.Article_Summarization_Lib import scrape_and_summarize_multiple -from App_Function_Libraries.DB_Manager import load_preset_prompts +from App_Function_Libraries.DB.DB_Manager import load_preset_prompts from App_Function_Libraries.Gradio_UI.Chat_ui import update_user_prompt diff --git a/App_Function_Libraries/Gradio_UI/Writing_tab.py b/App_Function_Libraries/Gradio_UI/Writing_tab.py index 5f7f159ee..fe99f662e 100644 --- a/App_Function_Libraries/Gradio_UI/Writing_tab.py +++ b/App_Function_Libraries/Gradio_UI/Writing_tab.py @@ -304,7 +304,7 @@ def create_document_feedback_tab(): "Virginia Woolf", "Virginia Woolf", "Zadie Smith"], - label="Compare Multiple Persona's Feedback at Once" + label="Compare Multiple Persona's Feedback at Once(Compares existing feedback, doesn't create new ones)" ) with gr.Row(): compare_button = gr.Button("Compare Feedback") diff --git a/App_Function_Libraries/LLM_API_Calls.py 
b/App_Function_Libraries/LLM_API_Calls.py index 678386c85..b64fce499 100644 --- a/App_Function_Libraries/LLM_API_Calls.py +++ b/App_Function_Libraries/LLM_API_Calls.py @@ -25,13 +25,15 @@ import logging import os import time +from typing import List + import requests # # Import 3rd-Party Libraries from requests import RequestException # # Import Local libraries -from App_Function_Libraries.Utils import load_and_log_configs +from App_Function_Libraries.Utils.Utils import load_and_log_configs # ####################################################################################################################### # Function Definitions @@ -60,6 +62,63 @@ def extract_text_from_segments(segments): +def get_openai_embeddings(input_data: str, model: str) -> List[float]: + """ + Get embeddings for the input text from OpenAI API. + + Args: + input_data (str): The input text to get embeddings for. + model (str): The model to use for generating embeddings. + + Returns: + List[float]: The embeddings generated by the API. + """ + loaded_config_data = load_and_log_configs() + api_key = loaded_config_data['api_keys']['openai'] + + if not api_key: + logging.error("OpenAI: API key not found or is empty") + raise ValueError("OpenAI: API Key Not Provided/Found in Config file or is empty") + + logging.debug(f"OpenAI: Using API Key: {api_key[:5]}...{api_key[-5:]}") + logging.debug(f"OpenAI: Raw input data (first 500 chars): {str(input_data)[:500]}...") + logging.debug(f"OpenAI: Using model: {model}") + + headers = { + 'Authorization': f'Bearer {api_key}', + 'Content-Type': 'application/json' + } + + request_data = { + "input": input_data, + "model": model, + } + + try: + logging.debug("OpenAI: Posting request to embeddings API") + response = requests.post('https://api.openai.com/v1/embeddings', headers=headers, json=request_data) + + if response.status_code == 200: + response_data = response.json() + if 'data' in response_data and len(response_data['data']) > 0: + embedding = response_data['data'][0]['embedding'] + logging.debug("OpenAI: Embeddings retrieved successfully") + return embedding + else: + logging.warning("OpenAI: Embedding data not found in the response") + raise ValueError("OpenAI: Embedding data not available in the response") + else: + logging.error(f"OpenAI: Embeddings request failed with status code {response.status_code}") + logging.error(f"OpenAI: Error response: {response.text}") + raise ValueError(f"OpenAI: Failed to retrieve embeddings. 
Status code: {response.status_code}") + except requests.RequestException as e: + logging.error(f"OpenAI: Error making API request: {str(e)}", exc_info=True) + raise ValueError(f"OpenAI: Error making API request: {str(e)}") + except Exception as e: + logging.error(f"OpenAI: Unexpected error: {str(e)}", exc_info=True) + raise ValueError(f"OpenAI: Unexpected error occurred: {str(e)}") + + def chat_with_openai(api_key, input_data, custom_prompt_arg, temp=None, system_message=None): loaded_config_data = load_and_log_configs() openai_api_key = api_key @@ -612,7 +671,7 @@ def chat_with_huggingface(api_key, input_data, custom_prompt_arg, system_prompt= response = requests.post(API_URL, headers=headers, json=data) if response.status_code == 200: - summary = response.json()[0]['summary_text'] + summary = response.json()[0]['generated_text'].strip() logging.debug("huggingface: Chat request successful") print("Chat request successful.") return summary diff --git a/App_Function_Libraries/LLM_API_Calls_Local.py b/App_Function_Libraries/LLM_API_Calls_Local.py index 893e07456..b3201e735 100644 --- a/App_Function_Libraries/LLM_API_Calls_Local.py +++ b/App_Function_Libraries/LLM_API_Calls_Local.py @@ -4,10 +4,11 @@ # This library is used to perform summarization with a 'local' inference engine. # #### +from typing import Union #################### # Function List -# FIXME - UPDATE Function Arguments +# FIXME - UPDATE # 1. chat_with_local_llm(text, custom_prompt_arg) # 2. chat_with_llama(api_url, text, token, custom_prompt) # 3. chat_with_kobold(api_url, text, kobold_api_token, custom_prompt) @@ -20,7 +21,7 @@ #################### # Import necessary libraries # Import Local -from Utils import * +from App_Function_Libraries.Utils.Utils import * # ####################################################################################################################### # Function Definitions @@ -399,8 +400,205 @@ def chat_with_aphrodite(input_data, custom_prompt_input, api_key=None, api_IP="h return "Error summarizing with Aphrodite." 
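# A minimal, self-contained sketch of a non-streaming chat call against Ollama.
# Assumptions (taken from Ollama's public API documentation, not from this
# module): the chat endpoint lives at /api/chat, "stream": False returns a
# single JSON object, and the reply text sits under response["message"]["content"].
# The model name below is a placeholder. The rewritten chat_with_ollama that
# follows posts a chat-style payload to /api/generate instead, so treat this
# purely as a reference shape for the messages-based flow.
import requests

def ollama_chat_sketch(prompt: str, model: str = "llama3",
                       api_url: str = "http://127.0.0.1:11434/api/chat",
                       system_message: str = "You are a helpful AI assistant.") -> str:
    payload = {
        "model": model,
        "stream": False,  # request one JSON object rather than a JSONL stream
        "messages": [
            {"role": "system", "content": system_message},
            {"role": "user", "content": prompt},
        ],
    }
    response = requests.post(api_url, json=payload, timeout=120)
    response.raise_for_status()
    return response.json()["message"]["content"].strip()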
-def chat_with_ollama(input_data, prompt, temp, system_message): - pass +# FIXME +def chat_with_ollama(input_data, custom_prompt, api_url="http://127.0.0.1:11434/api/generate", api_key=None, temp=None, system_message=None, model=None): + try: + logging.debug("ollama: Loading and validating configurations") + loaded_config_data = load_and_log_configs() + if loaded_config_data is None: + logging.error("Failed to load configuration data") + ollama_api_key = None + else: + # Prioritize the API key passed as a parameter + if api_key and api_key.strip(): + ollama_api_key = api_key + logging.info("Ollama: Using API key provided as parameter") + else: + # If no parameter is provided, use the key from the config + ollama_api_key = loaded_config_data['api_keys'].get('ollama') + if ollama_api_key: + logging.info("Ollama: Using API key from config file") + else: + logging.warning("Ollama: No API key found in config file") + + model = model or loaded_config_data['models']['ollama'] + + # Load transcript + logging.debug("Ollama: Loading JSON data") + if isinstance(input_data, str) and os.path.isfile(input_data): + logging.debug("Ollama: Loading json data for summarization") + with open(input_data, 'r') as file: + data = json.load(file) + else: + logging.debug("Ollama: Using provided string data for summarization") + data = input_data + + logging.debug(f"Ollama: Loaded data: {data}") + logging.debug(f"Ollama: Type of data: {type(data)}") + + if isinstance(data, dict) and 'summary' in data: + # If the loaded data is a dictionary and already contains a summary, return it + logging.debug("Ollama: Summary already exists in the loaded data") + return data['summary'] + + # If the loaded data is a list of segment dictionaries or a string, proceed with summarization + if isinstance(data, list): + segments = data + text = extract_text_from_segments(segments) + elif isinstance(data, str): + text = data + else: + raise ValueError("Ollama: Invalid input data format") + + headers = { + 'accept': 'application/json', + 'content-type': 'application/json', + } + if ollama_api_key and len(ollama_api_key) > 5: + headers['Authorization'] = f'Bearer {ollama_api_key}' + + ollama_prompt = f"{custom_prompt} \n\n\n\n{text}" + if system_message is None: + system_message = "You are a helpful AI assistant." + logging.debug(f"Ollama: Prompt being sent is {ollama_prompt}") 
+ + data = { + "model": model, + "messages": [ + {"role": "system", + "content": system_message + }, + {"role": "user", + "content": ollama_prompt + } + ], + } + + logging.debug("Ollama: Submitting request to API endpoint") + print("Ollama: Submitting request to API endpoint") + response = requests.post(api_url, headers=headers, json=data) + response_data = response.json() + logging.debug("API Response Data: %s", response_data) + + if response.status_code == 200: + logging.debug(response_data) + # Handle both chat-style ('message' -> 'content') and generate-style ('response') payloads + summary = (response_data.get('message', {}).get('content') or response_data.get('response') or '').strip() + logging.debug("Ollama: Chat request successful") + print("\n\nChat request successful.") + return summary + else: + logging.error(f"\n\nOllama: API request failed with status code {response.status_code}: {response.text}") + return f"Ollama: API request failed: {response.text}" + + except Exception as e: + logging.error("\n\nOllama: Error in processing: %s", str(e)) + return f"Ollama: Error occurred while processing summary with ollama: {str(e)}" + +def chat_with_vllm( + input_data: Union[str, dict, list], + custom_prompt_input: str, + api_key: str = None, + vllm_api_url: str = "http://127.0.0.1:8000/v1/chat/completions", + model: str = None, + system_prompt: str = None, + temp: float = 0.7 +) -> str: + logging.debug("vLLM: Summarization process starting...") + try: + logging.debug("vLLM: Loading and validating configurations") + loaded_config_data = load_and_log_configs() + if loaded_config_data is None: + logging.error("Failed to load configuration data") + vllm_api_key = None + else: + # Prioritize the API key passed as a parameter + if api_key and api_key.strip(): + vllm_api_key = api_key + logging.info("vLLM: Using API key provided as parameter") + else: + # If no parameter is provided, use the key from the config + vllm_api_key = loaded_config_data['api_keys'].get('vllm') + if vllm_api_key: + logging.info("vLLM: Using API key from config file") + else: + logging.warning("vLLM: No API key found in config file") + + if vllm_api_key: + logging.debug(f"vLLM: Using API Key: {vllm_api_key[:5]}...{vllm_api_key[-5:]}") + # Process input data + if isinstance(input_data, str) and os.path.isfile(input_data): + logging.debug("vLLM: Loading json data for summarization") + with open(input_data, 'r') as file: + data = json.load(file) + else: + logging.debug("vLLM: Using provided data for summarization") + data = input_data + + logging.debug(f"vLLM: Type of data: {type(data)}") + + # Extract text for summarization + if isinstance(data, dict) and 'summary' in data: + logging.debug("vLLM: Summary already exists in the loaded data") + return data['summary'] + elif isinstance(data, list): + text = extract_text_from_segments(data) + elif isinstance(data, str): + text = data + elif isinstance(data, dict): + text = json.dumps(data) + else: + raise ValueError("Invalid input data format") + + logging.debug(f"vLLM: Extracted text (showing first 500 chars): {text[:500]}...") + + if system_prompt is None: + system_prompt = "You are a helpful AI assistant." + + model = model or loaded_config_data['models']['vllm'] 
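# Note: vllm_api_key is resolved above but never attached to the request below
# (the headers only carry Content-Type). If the vLLM server is started with
# API-key enforcement, the key would typically need to be sent as a Bearer
# token, for example headers["Authorization"] = f"Bearer {vllm_api_key}";
# this version does not do that.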
+ + # Prepare the API request + headers = { + "Content-Type": "application/json" + } + + payload = { + "model": model, + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": f"{custom_prompt_input}\n\n{text}"} + ] + } + + # Make the API call + logging.debug(f"vLLM: Sending request to {vllm_api_url}") + response = requests.post(vllm_api_url, headers=headers, json=payload) + + # Check for successful response + response.raise_for_status() + + # Extract and return the summary + response_data = response.json() + if 'choices' in response_data and len(response_data['choices']) > 0: + summary = response_data['choices'][0]['message']['content'] + logging.debug("vLLM: Summarization successful") + logging.debug(f"vLLM: Summary (first 500 chars): {summary[:500]}...") + return summary + else: + raise ValueError("Unexpected response format from vLLM API") + + except requests.RequestException as e: + logging.error(f"vLLM: API request failed: {str(e)}") + return f"Error: vLLM API request failed - {str(e)}" + except json.JSONDecodeError as e: + logging.error(f"vLLM: Failed to parse API response: {str(e)}") + return f"Error: Failed to parse vLLM API response - {str(e)}" + except Exception as e: + logging.error(f"vLLM: Unexpected error during summarization: {str(e)}") + return f"Error: Unexpected error during vLLM summarization - {str(e)}" def save_summary_to_file(summary, file_path): diff --git a/App_Function_Libraries/Local_File_Processing_Lib.py b/App_Function_Libraries/Local_File_Processing_Lib.py index 2b58c0b95..dd324e025 100644 --- a/App_Function_Libraries/Local_File_Processing_Lib.py +++ b/App_Function_Libraries/Local_File_Processing_Lib.py @@ -23,7 +23,7 @@ from App_Function_Libraries.Audio_Transcription_Lib import convert_to_wav from App_Function_Libraries.Video_DL_Ingestion_Lib import * from App_Function_Libraries.Video_DL_Ingestion_Lib import get_youtube -from App_Function_Libraries.Utils import normalize_title, create_download_directory +from App_Function_Libraries.Utils.Utils import normalize_title, create_download_directory ####################################################################################################################### # Function Definitions diff --git a/App_Function_Libraries/Local_LLM_Inference_Engine_Lib.py b/App_Function_Libraries/Local_LLM_Inference_Engine_Lib.py index c6fbc8bd8..66a4ab510 100644 --- a/App_Function_Libraries/Local_LLM_Inference_Engine_Lib.py +++ b/App_Function_Libraries/Local_LLM_Inference_Engine_Lib.py @@ -27,7 +27,7 @@ import sys import time -from App_Function_Libraries.Utils import download_file +from App_Function_Libraries.Utils.Utils import download_file # Import 3rd-pary Libraries # # Import Local diff --git a/App_Function_Libraries/Local_Summarization_Lib.py b/App_Function_Libraries/Local_Summarization_Lib.py index 2fb21a066..62b53a60f 100644 --- a/App_Function_Libraries/Local_Summarization_Lib.py +++ b/App_Function_Libraries/Local_Summarization_Lib.py @@ -26,8 +26,7 @@ import requests # Import 3rd-party Libraries # Import Local -from App_Function_Libraries.Utils import load_and_log_configs -from App_Function_Libraries.Utils import extract_text_from_segments +from App_Function_Libraries.Utils.Utils import load_and_log_configs, extract_text_from_segments # ####################################################################################################################### # Function Definitions diff --git a/App_Function_Libraries/PDF_Ingestion_Lib.py b/App_Function_Libraries/PDF_Ingestion_Lib.py index 
6a19e4875..86e89e85a 100644 --- a/App_Function_Libraries/PDF_Ingestion_Lib.py +++ b/App_Function_Libraries/PDF_Ingestion_Lib.py @@ -168,7 +168,7 @@ import pymupdf -from App_Function_Libraries.DB_Manager import add_media_with_keywords +from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords def extract_text_and_format_from_pdf(pdf_path): diff --git a/App_Function_Libraries/ChromaDB_Library.py b/App_Function_Libraries/RAG/ChromaDB_Library.py similarity index 70% rename from App_Function_Libraries/ChromaDB_Library.py rename to App_Function_Libraries/RAG/ChromaDB_Library.py index 21a62a780..1bcbaeabe 100644 --- a/App_Function_Libraries/ChromaDB_Library.py +++ b/App_Function_Libraries/RAG/ChromaDB_Library.py @@ -8,6 +8,8 @@ from chromadb import Settings from App_Function_Libraries.Chunk_Lib import improved_chunking_process +from App_Function_Libraries.DB.DB_Manager import add_media_chunk, update_fts_for_media +from App_Function_Libraries.LLM_API_Calls import get_openai_embeddings ####################################################################################################################### # @@ -75,14 +77,12 @@ def process_and_store_content(content: str, collection_name: str, media_id: int) # Store the texts, embeddings, and IDs in ChromaDB store_in_chroma(collection_name, texts, embeddings, ids) - # Store the chunks in SQLite FTS as well - from App_Function_Libraries.DB_Manager import db - with db.get_connection() as conn: - cursor = conn.cursor() - for text in texts: - cursor.execute("INSERT INTO media_fts (content) VALUES (?)", (text,)) - conn.commit() + # Store the chunk metadata in SQLite + for i, chunk in enumerate(chunks): + add_media_chunk(media_id, chunk['text'], chunk['start'], chunk['end'], ids[i]) + # Update the FTS table + update_fts_for_media(media_id) # Function to store documents and their embeddings in ChromaDB def store_in_chroma(collection_name: str, texts: List[str], embeddings: List[List[float]], ids: List[str]): @@ -105,20 +105,17 @@ def vector_search(collection_name: str, query: str, k: int = 10) -> List[str]: def create_embedding(text: str) -> List[float]: + global embedding_provider, embedding_model, embedding_api_url, embedding_api_key + if embedding_provider == 'openai': - import openai - openai.api_key = embedding_api_key - response = openai.Embedding.create(input=text, model=embedding_model) - return response['data'][0]['embedding'] + return get_openai_embeddings(text, embedding_model) elif embedding_provider == 'local': - # FIXME - This is a placeholder for API calls to a local embedding model response = requests.post( embedding_api_url, json={"text": text, "model": embedding_model}, headers={"Authorization": f"Bearer {embedding_api_key}"} ) return response.json()['embedding'] - # FIXME - this seems correct, but idk.... 
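# Note: this 'local' branch posts {"text": ..., "model": ...} with a Bearer
# token, while create_llamacpp_embedding() further down posts {"input": text}
# with no auth header, and the Gradio embeddings tab defaults its URL to
# http://localhost:8080/embedding. A single llama.cpp-style server will only
# accept one of these payload shapes, so the two call sites presumably need to
# converge on the same request format.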
elif embedding_provider == 'huggingface': from transformers import AutoTokenizer, AutoModel import torch @@ -137,11 +134,8 @@ def create_embedding(text: str) -> List[float]: raise ValueError(f"Unsupported embedding provider: {embedding_provider}") -def create_all_embeddings(api_choice: str) -> str: +def create_all_embeddings(api_choice: str, model_or_url: str) -> str: try: - global embedding_provider - embedding_provider = api_choice - all_content = get_all_content_from_database() if not all_content: @@ -167,7 +161,10 @@ def create_all_embeddings(api_choice: str) -> str: continue # Skip if embedding already exists # Create the embedding - embedding = create_embedding(text) + if api_choice == "openai": + embedding = create_openai_embedding(text, model_or_url) + else: # Llama.cpp + embedding = create_llamacpp_embedding(text, model_or_url) # Collect the text, embedding, and ID for batch storage texts_to_embed.append(text) @@ -184,6 +181,23 @@ def create_all_embeddings(api_choice: str) -> str: return f"Error: {str(e)}" +def create_openai_embedding(text: str, model: str) -> List[float]: + openai_api_key = config['API']['openai_api_key'] + embedding = get_openai_embeddings(text, model) + return embedding + + +def create_llamacpp_embedding(text: str, api_url: str) -> List[float]: + response = requests.post( + api_url, + json={"input": text} + ) + if response.status_code == 200: + return response.json()['embedding'] + else: + raise Exception(f"Error from Llama.cpp API: {response.text}") + + def get_all_content_from_database() -> List[Dict[str, Any]]: """ Retrieve all media content from the database that requires embedding. @@ -192,7 +206,7 @@ def get_all_content_from_database() -> List[Dict[str, Any]]: List[Dict[str, Any]]: A list of dictionaries, each containing the media ID, content, title, and other relevant fields. """ try: - from App_Function_Libraries.DB_Manager import db + from App_Function_Libraries.DB.DB_Manager import db with db.get_connection() as conn: cursor = conn.cursor() cursor.execute(""" @@ -218,9 +232,56 @@ def get_all_content_from_database() -> List[Dict[str, Any]]: except sqlite3.Error as e: logging.error(f"Error retrieving all content from database: {e}") - from App_Function_Libraries.SQLite_DB import DatabaseError + from App_Function_Libraries.DB.SQLite_DB import DatabaseError raise DatabaseError(f"Error retrieving all content from database: {e}") + +def store_in_chroma_with_citation(collection_name: str, texts: List[str], embeddings: List[List[float]], ids: List[str], sources: List[str]): + collection = chroma_client.get_or_create_collection(name=collection_name) + collection.add( + documents=texts, + embeddings=embeddings, + ids=ids, + metadatas=[{'source': source} for source in sources] + ) + + +def check_embedding_status(selected_item): + if not selected_item: + return "Please select an item", "" + item_id = selected_item.split('(')[0].strip() + collection = chroma_client.get_or_create_collection(name="all_content_embeddings") + result = collection.get(ids=[f"doc_{item_id}"]) + if result['ids']: + embedding = result['embeddings'][0] + embedding_preview = str(embedding[:50]) # Convert first 50 elements to string + return f"Embedding exists for item: {item_id}", f"Embedding preview: {embedding_preview}..." 
+ else: + return f"No embedding found for item: {item_id}", "" + + +def create_new_embedding(selected_item, api_choice, openai_model, llamacpp_url): + if not selected_item: + return "Please select an item" + item_id = selected_item.split('(')[0].strip() + items = get_all_content_from_database() + item = next((item for item in items if item['title'] == item_id), None) + if not item: + return f"Item not found: {item_id}" + + try: + if api_choice == "OpenAI": + embedding = create_embedding(item['content']) + else: # Llama.cpp + embedding = create_embedding(item['content']) + + collection_name = "all_content_embeddings" + store_in_chroma(collection_name, [item['content']], [embedding], [f"doc_{item['id']}"]) + return f"New embedding created and stored for item: {item_id}" + except Exception as e: + return f"Error creating embedding: {str(e)}" + + # # End of Functions for ChromaDB ####################################################################################################################### \ No newline at end of file diff --git a/App_Function_Libraries/RAG_Libary_2.py b/App_Function_Libraries/RAG/RAG_Examples.md similarity index 75% rename from App_Function_Libraries/RAG_Libary_2.py rename to App_Function_Libraries/RAG/RAG_Examples.md index 4c0d80ae1..0ca8b3936 100644 --- a/App_Function_Libraries/RAG_Libary_2.py +++ b/App_Function_Libraries/RAG/RAG_Examples.md @@ -1,147 +1,8 @@ -# Import necessary modules and functions -import configparser -from typing import Dict, Any -# Local Imports -from App_Function_Libraries.ChromaDB_Library import process_and_store_content, vector_search, chroma_client -from Article_Extractor_Lib import scrape_article -from SQLite_DB import search_db, db -# 3rd-Party Imports -import openai -# Initialize OpenAI client (adjust this based on your API key management) -openai.api_key = "your-openai-api-key" - - -# Main RAG pipeline function -def rag_pipeline(url: str, query: str, api_choice=None) -> Dict[str, Any]: - # Extract content - article_data = scrape_article(url) - content = article_data['content'] - - # Process and store content - collection_name = "article_" + str(hash(url)) - process_and_store_content(content, collection_name) - - # Perform searches - vector_results = vector_search(collection_name, query, k=5) - fts_results = search_db(query, ["content"], "", page=1, results_per_page=5) - - # Combine results - all_results = vector_results + [result['content'] for result in fts_results] - context = "\n".join(all_results) - - # Generate answer using the selected API - answer = generate_answer(api_choice, context, query) - - return { - "answer": answer, - "context": context - } - -config = configparser.ConfigParser() -config.read('config.txt') - -def generate_answer(api_choice: str, context: str, query: str) -> str: - prompt = f"Context: {context}\n\nQuestion: {query}" - if api_choice == "OpenAI": - from App_Function_Libraries.Summarization_General_Lib import summarize_with_openai - return summarize_with_openai(config['API']['openai_api_key'], prompt, "") - elif api_choice == "Anthropic": - from App_Function_Libraries.Summarization_General_Lib import summarize_with_anthropic - return summarize_with_anthropic(config['API']['anthropic_api_key'], prompt, "") - elif api_choice == "Cohere": - from App_Function_Libraries.Summarization_General_Lib import summarize_with_cohere - return summarize_with_cohere(config['API']['cohere_api_key'], prompt, "") - elif api_choice == "Groq": - from App_Function_Libraries.Summarization_General_Lib import summarize_with_groq - 
return summarize_with_groq(config['API']['groq_api_key'], prompt, "") - elif api_choice == "OpenRouter": - from App_Function_Libraries.Summarization_General_Lib import summarize_with_openrouter - return summarize_with_openrouter(config['API']['openrouter_api_key'], prompt, "") - elif api_choice == "HuggingFace": - from App_Function_Libraries.Summarization_General_Lib import summarize_with_huggingface - return summarize_with_huggingface(config['API']['huggingface_api_key'], prompt, "") - elif api_choice == "DeepSeek": - from App_Function_Libraries.Summarization_General_Lib import summarize_with_deepseek - return summarize_with_deepseek(config['API']['deepseek_api_key'], prompt, "") - elif api_choice == "Mistral": - from App_Function_Libraries.Summarization_General_Lib import summarize_with_mistral - return summarize_with_mistral(config['API']['mistral_api_key'], prompt, "") - elif api_choice == "Local-LLM": - from App_Function_Libraries.Local_Summarization_Lib import summarize_with_local_llm - return summarize_with_local_llm(config['API']['local_llm_path'], prompt, "") - elif api_choice == "Llama.cpp": - from App_Function_Libraries.Local_Summarization_Lib import summarize_with_llama - return summarize_with_llama(config['API']['llama_api_key'], prompt, "") - elif api_choice == "Kobold": - from App_Function_Libraries.Local_Summarization_Lib import summarize_with_kobold - return summarize_with_kobold(config['API']['kobold_api_key'], prompt, "") - elif api_choice == "Ooba": - from App_Function_Libraries.Local_Summarization_Lib import summarize_with_oobabooga - return summarize_with_oobabooga(config['API']['ooba_api_key'], prompt, "") - elif api_choice == "TabbyAPI": - from App_Function_Libraries.Local_Summarization_Lib import summarize_with_tabbyapi - return summarize_with_tabbyapi(config['API']['tabby_api_key'], prompt, "") - elif api_choice == "vLLM": - from App_Function_Libraries.Local_Summarization_Lib import summarize_with_vllm - return summarize_with_vllm(config['API']['vllm_api_key'], prompt, "") - elif api_choice == "ollama": - from App_Function_Libraries.Local_Summarization_Lib import summarize_with_ollama - return summarize_with_ollama(config['API']['ollama_api_key'], prompt, "") - else: - raise ValueError(f"Unsupported API choice: {api_choice}") - -# Function to preprocess and store all existing content in the database -def preprocess_all_content(): - with db.get_connection() as conn: - cursor = conn.cursor() - cursor.execute("SELECT id, content FROM Media") - for row in cursor.fetchall(): - process_and_store_content(row[1], f"media_{row[0]}") - - -# Function to perform RAG search across all stored content -def rag_search(query: str, api_choice: str) -> Dict[str, Any]: - # Perform vector search across all collections - all_collections = chroma_client.list_collections() - vector_results = [] - for collection in all_collections: - vector_results.extend(vector_search(collection.name, query, k=2)) - - # Perform FTS search - fts_results = search_db(query, ["content"], "", page=1, results_per_page=10) - - # Combine results - all_results = vector_results + [result['content'] for result in fts_results] - context = "\n".join(all_results[:10]) # Limit to top 10 results - - # Generate answer using the selected API - answer = generate_answer(api_choice, context, query) - - return { - "answer": answer, - "context": context - } - - -# Example usage: -# 1. 
Initialize the system: -# create_tables(db) # Ensure FTS tables are set up -# preprocess_all_content() # Process and store all existing content - -# 2. Perform RAG on a specific URL: -# result = rag_pipeline("https://example.com/article", "What is the main topic of this article?") -# print(result['answer']) - -# 3. Perform RAG search across all content: -# result = rag_search("What are the key points about climate change?") -# print(result['answer']) - - - +``` ################################################################################################################## # RAG Pipeline 1 -#0.62 0.61 0.75 63402.0 +# 0.62 0.61 0.75 63402.0 # from langchain_openai import ChatOpenAI # # from langchain_community.document_loaders import WebBaseLoader @@ -202,7 +63,7 @@ def rag_search(query: str, api_choice: str) -> Dict[str, Any]: # print(f"An error occurred: {e}") -##To get the answer and context, use the following code +# To get the answer and context, use the following code # res=rag_pipeline().invoke("your prompt here") # print(res["answer"]) # print(res["context"]) @@ -210,11 +71,10 @@ def rag_search(query: str, api_choice: str) -> Dict[str, Any]: ############################################################################################################ - ############################################################################################################ # RAG Pipeline 2 -#0.6 0.73 0.68 3125.0 +# 0.6 0.73 0.68 3125.0 # from langchain_openai import ChatOpenAI # # from langchain_community.document_loaders import WebBaseLoader @@ -282,17 +142,14 @@ def rag_search(query: str, api_choice: str) -> Dict[str, Any]: # print(f"An error occurred: {e}") -##To get the answer and context, use the following code +# To get the answer and context, use the following code # res=rag_pipeline().invoke("your prompt here") # print(res["answer"]) # print(res["context"]) - - - - - - +# +# +# ############################################################################################################ # Plain bm25 retriever # class BM25Retriever(BaseRetriever): @@ -696,25 +553,4 @@ def rag_search(query: str, api_choice: str) -> Dict[str, Any]: # """ # return _unique_documents(documents) ############################################################################################################ - - - - - - - - -############################################################################################################ -# ElasticSearch Retriever - -# https://github.com/langchain-ai/langchain/tree/44e3e2391c48bfd0a8e6a20adde0b6567f4f43c3/templates/rag-elasticsearch -# -# https://github.com/langchain-ai/langchain/tree/44e3e2391c48bfd0a8e6a20adde0b6567f4f43c3/templates/rag-self-query - - - - - - - - +``` \ No newline at end of file diff --git a/App_Function_Libraries/RAG/RAG_Libary_2.py b/App_Function_Libraries/RAG/RAG_Libary_2.py new file mode 100644 index 000000000..24bb755e0 --- /dev/null +++ b/App_Function_Libraries/RAG/RAG_Libary_2.py @@ -0,0 +1,172 @@ +# RAG_Library_2.py +# Description: This script contains the main RAG pipeline function and related functions for the RAG pipeline. 
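# The functions in this module read provider keys from a config.txt in the
# working directory via configparser (config['API']['<provider>_api_key']).
# A minimal sketch of the expected section, with key names inferred from the
# generate_answer() dispatch below and obviously placeholder values:
#
#   [API]
#   openai_api_key = <your-openai-key>
#   anthropic_api_key = <your-anthropic-key>
#   cohere_api_key = <your-cohere-key>
#   groq_api_key = <your-groq-key>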
+# +# Import necessary modules and functions +import configparser +from typing import Dict, Any +# Local Imports +from App_Function_Libraries.RAG.ChromaDB_Library import process_and_store_content, vector_search, chroma_client +from App_Function_Libraries.Article_Extractor_Lib import scrape_article +from App_Function_Libraries.DB.DB_Manager import add_media_to_database, search_db, get_unprocessed_media +# 3rd-Party Imports +import openai +# +######################################################################################################################## +# +# Functions: + +# Initialize OpenAI client (adjust this based on your API key management) +openai.api_key = "your-openai-api-key" + +config = configparser.ConfigParser() +config.read('config.txt') + +# Main RAG pipeline function +def rag_pipeline(url: str, query: str, api_choice=None) -> Dict[str, Any]: + # Extract content + article_data = scrape_article(url) + content = article_data['content'] + title = article_data['title'] + + # Store the article in the database and get the media_id + media_id = add_media_to_database(url, title, 'article', content) + + # Process and store content + collection_name = f"article_{media_id}" + process_and_store_content(content, collection_name, media_id) + + # Perform searches + vector_results = vector_search(collection_name, query, k=5) + fts_results = search_db(query, ["content"], "", page=1, results_per_page=5) + + # Combine results + all_results = vector_results + [result['content'] for result in fts_results] + context = "\n".join(all_results) + + # Generate answer using the selected API + answer = generate_answer(api_choice, context, query) + + return { + "answer": answer, + "context": context + } + + +def generate_answer(api_choice: str, context: str, query: str) -> str: + prompt = f"Context: {context}\n\nQuestion: {query}" + if api_choice == "OpenAI": + from App_Function_Libraries.Summarization_General_Lib import summarize_with_openai + return summarize_with_openai(config['API']['openai_api_key'], prompt, "") + elif api_choice == "Anthropic": + from App_Function_Libraries.Summarization_General_Lib import summarize_with_anthropic + return summarize_with_anthropic(config['API']['anthropic_api_key'], prompt, "") + elif api_choice == "Cohere": + from App_Function_Libraries.Summarization_General_Lib import summarize_with_cohere + return summarize_with_cohere(config['API']['cohere_api_key'], prompt, "") + elif api_choice == "Groq": + from App_Function_Libraries.Summarization_General_Lib import summarize_with_groq + return summarize_with_groq(config['API']['groq_api_key'], prompt, "") + elif api_choice == "OpenRouter": + from App_Function_Libraries.Summarization_General_Lib import summarize_with_openrouter + return summarize_with_openrouter(config['API']['openrouter_api_key'], prompt, "") + elif api_choice == "HuggingFace": + from App_Function_Libraries.Summarization_General_Lib import summarize_with_huggingface + return summarize_with_huggingface(config['API']['huggingface_api_key'], prompt, "") + elif api_choice == "DeepSeek": + from App_Function_Libraries.Summarization_General_Lib import summarize_with_deepseek + return summarize_with_deepseek(config['API']['deepseek_api_key'], prompt, "") + elif api_choice == "Mistral": + from App_Function_Libraries.Summarization_General_Lib import summarize_with_mistral + return summarize_with_mistral(config['API']['mistral_api_key'], prompt, "") + elif api_choice == "Local-LLM": + from App_Function_Libraries.Local_Summarization_Lib import 
summarize_with_local_llm + return summarize_with_local_llm(config['API']['local_llm_path'], prompt, "") + elif api_choice == "Llama.cpp": + from App_Function_Libraries.Local_Summarization_Lib import summarize_with_llama + return summarize_with_llama(config['API']['llama_api_key'], prompt, "") + elif api_choice == "Kobold": + from App_Function_Libraries.Local_Summarization_Lib import summarize_with_kobold + return summarize_with_kobold(config['API']['kobold_api_key'], prompt, "") + elif api_choice == "Ooba": + from App_Function_Libraries.Local_Summarization_Lib import summarize_with_oobabooga + return summarize_with_oobabooga(config['API']['ooba_api_key'], prompt, "") + elif api_choice == "TabbyAPI": + from App_Function_Libraries.Local_Summarization_Lib import summarize_with_tabbyapi + return summarize_with_tabbyapi(config['API']['tabby_api_key'], prompt, "") + elif api_choice == "vLLM": + from App_Function_Libraries.Local_Summarization_Lib import summarize_with_vllm + return summarize_with_vllm(config['API']['vllm_api_key'], prompt, "") + elif api_choice == "ollama": + from App_Function_Libraries.Local_Summarization_Lib import summarize_with_ollama + return summarize_with_ollama(config['API']['ollama_api_key'], prompt, "") + else: + raise ValueError(f"Unsupported API choice: {api_choice}") + +# Function to preprocess and store all existing content in the database +def preprocess_all_content(): + unprocessed_media = get_unprocessed_media() + for row in unprocessed_media: + media_id = row[0] + content = row[1] + media_type = row[2] + collection_name = f"{media_type}_{media_id}" + process_and_store_content(content, collection_name, media_id) + + +# Function to perform RAG search across all stored content +def rag_search(query: str, api_choice: str) -> Dict[str, Any]: + # Perform vector search across all collections + all_collections = chroma_client.list_collections() + vector_results = [] + for collection in all_collections: + vector_results.extend(vector_search(collection.name, query, k=2)) + + # Perform FTS search + fts_results = search_db(query, ["content"], "", page=1, results_per_page=10) + + # Combine results + all_results = vector_results + [result['content'] for result in fts_results] + context = "\n".join(all_results[:10]) # Limit to top 10 results + + # Generate answer using the selected API + answer = generate_answer(api_choice, context, query) + + return { + "answer": answer, + "context": context + } + + +# Example usage: +# 1. Initialize the system: +# create_tables(db) # Ensure FTS tables are set up +# +# 2. Create ChromaDB +# chroma_client = ChromaDBClient() +# +# 3. Create Embeddings +# Store embeddings in ChromaDB +# preprocess_all_content() or create_embeddings() +# +# 4. Perform RAG search across all content: +# result = rag_search("What are the key points about climate change?") +# print(result['answer']) +# +# (Extra)5. 
Perform RAG on a specific URL: +# result = rag_pipeline("https://example.com/article", "What is the main topic of this article?") +# print(result['answer']) +# +######################################################################################################################## + + +############################################################################################################ +# +# ElasticSearch Retriever + +# https://github.com/langchain-ai/langchain/tree/44e3e2391c48bfd0a8e6a20adde0b6567f4f43c3/templates/rag-elasticsearch +# +# https://github.com/langchain-ai/langchain/tree/44e3e2391c48bfd0a8e6a20adde0b6567f4f43c3/templates/rag-self-query + +# +# End of RAG_Library_2.py +############################################################################################################ diff --git a/App_Function_Libraries/RAG_Library.py b/App_Function_Libraries/RAG/RAG_Library.py similarity index 100% rename from App_Function_Libraries/RAG_Library.py rename to App_Function_Libraries/RAG/RAG_Library.py diff --git a/App_Function_Libraries/RAG/RAPTOR-Skeleton.py b/App_Function_Libraries/RAG/RAPTOR-Skeleton.py new file mode 100644 index 000000000..3f24eff6b --- /dev/null +++ b/App_Function_Libraries/RAG/RAPTOR-Skeleton.py @@ -0,0 +1,361 @@ +# Requirements +# scikit-learn umap-learn +from itertools import chain +from typing import List, Dict + +from App_Function_Libraries.RAG.ChromaDB_Library import store_in_chroma, create_embedding, vector_search, chroma_client +from App_Function_Libraries.Chunk_Lib import improved_chunking_process, recursive_summarize_chunks +import logging +from sklearn.mixture import GaussianMixture +import umap +from nltk.corpus import wordnet + + +# Logging setup +logging.basicConfig(filename='raptor.log', level=logging.DEBUG) + +# FIXME +MAX_LEVELS = 3 + + +def log_and_summarize(text, prompt): + logging.debug(f"Summarizing text: {text[:100]} with prompt: {prompt}") + return dummy_summarize(text, prompt) + +# 1. Data Preparation +def prepare_data(content: str, media_id: int, chunk_options: dict): + chunks = improved_chunking_process(content, chunk_options) + embeddings = [create_embedding(chunk['text']) for chunk in chunks] + return chunks, embeddings + +# 2. Recursive Summarization +def recursive_summarization(chunks, summarize_func, custom_prompt): + summarized_chunks = recursive_summarize_chunks( + [chunk['text'] for chunk in chunks], + summarize_func=summarize_func, + custom_prompt=custom_prompt + ) + return summarized_chunks + +# Initial gen +# 3. 
Tree Organization +#def build_tree_structure(chunks, embeddings, collection_name, level=0): +# if len(chunks) <= 1: +# return chunks # Base case: if chunks are small enough, return as is + + # Recursive case: cluster and summarize +# summarized_chunks = recursive_summarization(chunks, summarize_func=dummy_summarize, custom_prompt="Summarize:") +# new_chunks, new_embeddings = prepare_data(' '.join(summarized_chunks), media_id, chunk_options) + + # Store in ChromaDB +# ids = [f"{media_id}_L{level}_chunk_{i}" for i in range(len(new_chunks))] +# store_in_chroma(collection_name, [chunk['text'] for chunk in new_chunks], new_embeddings, ids) + + # Recursively build tree +# return build_tree_structure(new_chunks, new_embeddings, collection_name, level+1) + +# Second iteration +def build_tree_structure(chunks, collection_name, level=0): + # Dynamic clustering + clustered_texts = dynamic_clustering([chunk['text'] for chunk in chunks]) + + # Summarize each cluster + summarized_clusters = {} + for cluster_id, cluster_texts in clustered_texts.items(): + summary = dummy_summarize(' '.join(cluster_texts), custom_prompt="Summarize:") + summarized_clusters[cluster_id] = summary + + # Store summaries at current level + ids = [] + embeddings = [] + summaries = [] + for cluster_id, summary in summarized_clusters.items(): + ids.append(f"{collection_name}_L{level}_C{cluster_id}") + embeddings.append(create_embedding(summary)) + summaries.append(summary) + + store_in_chroma(collection_name, summaries, embeddings, ids) + + # Recursively build tree structure if necessary + if level < MAX_LEVELS: + for cluster_id, cluster_texts in clustered_texts.items(): + build_tree_structure(cluster_texts, collection_name, level + 1) + + + + +# Dummy summarize function (replace with actual summarization) +def dummy_summarize(text, custom_prompt, temp=None, system_prompt=None): + return text # Replace this with actual call to summarization model (like GPT-3.5-turbo) + +# 4. 
Retrieval +def raptor_retrieve(query, collection_name, level=0): + results = vector_search(collection_name, query, k=5) + return results + +# Main function integrating RAPTOR +def raptor_pipeline(media_id, content, chunk_options): + collection_name = f"media_{media_id}_raptor" + + # Step 1: Prepare Data + chunks, embeddings = prepare_data(content, media_id, chunk_options) + + # Step 2: Build Tree + build_tree_structure(chunks, embeddings, collection_name) + + # Step 3: Retrieve Information + query = "Your query here" + result = raptor_retrieve(query, collection_name) + print(result) + +# Example usage +content = "Your long document content here" +chunk_options = { + 'method': 'sentences', + 'max_size': 300, + 'overlap': 50 +} +media_id = 1 +raptor_pipeline(media_id, content, chunk_options) + + +# +# +################################################################################################################### +# +# Additions: + + +def dynamic_clustering(texts, n_components=2): + # Step 1: Convert text to embeddings + embeddings = [create_embedding(text) for text in texts] + + # Step 2: Dimensionality reduction (UMAP) + reducer = umap.UMAP(n_components=n_components) + reduced_embeddings = reducer.fit_transform(embeddings) + + # Step 3: Find optimal number of clusters using BIC + best_gmm = None + best_bic = float('inf') + n_clusters = range(2, 10) + for n in n_clusters: + gmm = GaussianMixture(n_components=n, covariance_type='full') + gmm.fit(reduced_embeddings) + bic = gmm.bic(reduced_embeddings) + if bic < best_bic: + best_bic = bic + best_gmm = gmm + + # Step 4: Cluster the reduced embeddings + cluster_labels = best_gmm.predict(reduced_embeddings) + clustered_texts = {i: [] for i in range(best_gmm.n_components)} + for label, text in zip(cluster_labels, texts): + clustered_texts[label].append(text) + + return clustered_texts + + +def tree_traversal_retrieve(query, collection_name, max_depth=3): + logging.info(f"Starting tree traversal for query: {query}") + results = [] + current_level = 0 + current_nodes = [collection_name + '_L0'] + + while current_level <= max_depth and current_nodes: + next_level_nodes = [] + for node_id in current_nodes: + documents = vector_search(node_id, query, k=5) + results.extend(documents) + next_level_nodes.extend([doc['id'] for doc in documents]) # Assuming your doc structure includes an 'id' field + current_nodes = next_level_nodes + current_level += 1 + + logging.info(f"Tree traversal completed with {len(results)} results") + return results + + +def collapsed_tree_retrieve(query, collection_name): + all_layers = [f"{collection_name}_L{level}" for level in range(MAX_LEVELS)] + all_results = [] + + for layer in all_layers: + all_results.extend(vector_search(layer, query, k=5)) + + # Sort and rank results by relevance + sorted_results = sorted(all_results, key=lambda x: x['relevance'], reverse=True) # Assuming 'relevance' is a key + return sorted_results[:5] # Return top 5 results + +# Test collaped tree retrieval +query = "Your broad query here" +results = collapsed_tree_retrieve(query, collection_name=f"media_{media_id}_raptor") +print(results) + + +# Parallel processing +# pip install joblib +from joblib import Parallel, delayed + +def parallel_process_chunks(chunks): + return Parallel(n_jobs=-1)(delayed(create_embedding)(chunk['text']) for chunk in chunks) + +def build_tree_structure(chunks, collection_name, level=0): + clustered_texts = dynamic_clustering([chunk['text'] for chunk in chunks]) + + summarized_clusters = {} + for cluster_id, 
cluster_texts in clustered_texts.items(): + summary = dummy_summarize(' '.join(cluster_texts), custom_prompt="Summarize:") + summarized_clusters[cluster_id] = summary + + # Parallel processing of embeddings + embeddings = parallel_process_chunks([{'text': summary} for summary in summarized_clusters.values()]) + + ids = [f"{collection_name}_L{level}_C{cluster_id}" for cluster_id in summarized_clusters.keys()] + store_in_chroma(collection_name, list(summarized_clusters.values()), embeddings, ids) + + if len(summarized_clusters) > 1 and level < MAX_LEVELS: + build_tree_structure(summarized_clusters.values(), collection_name, level + 1) + +# Asynchronous processing +import asyncio + +async def async_create_embedding(text): + return create_embedding(text) # Assuming create_embedding is now async + +async def build_tree_structure_async(chunks, collection_name, level=0): + clustered_texts = dynamic_clustering([chunk['text'] for chunk in chunks]) + + summarized_clusters = {} + for cluster_id, cluster_texts in clustered_texts.items(): + summary = await async_create_embedding(' '.join(cluster_texts)) + summarized_clusters[cluster_id] = summary + + embeddings = await asyncio.gather(*[async_create_embedding(summary) for summary in summarized_clusters.values()]) + + ids = [f"{collection_name}_L{level}_C{cluster_id}" for cluster_id in summarized_clusters.keys()] + store_in_chroma(collection_name, list(summarized_clusters.values()), embeddings, ids) + + if len(summarized_clusters) > 1 and level < MAX_LEVELS: + await build_tree_structure_async(summarized_clusters.values(), collection_name, level + 1) + + +# User feedback Loop +def get_user_feedback(results): + print("Please review the following results:") + for i, result in enumerate(results): + print(f"{i + 1}: {result['text'][:100]}...") + + feedback = input("Enter the numbers of the results that were relevant (comma-separated): ") + relevant_indices = [int(i.strip()) - 1 for i in feedback.split(",")] + return relevant_indices + + +def raptor_pipeline_with_feedback(media_id, content, chunk_options): + # ... Existing pipeline steps ... + + query = "Your query here" + initial_results = tree_traversal_retrieve(query, collection_name=f"media_{media_id}_raptor") + relevant_indices = get_user_feedback(initial_results) + + if relevant_indices: + relevant_results = [initial_results[i] for i in relevant_indices] + refined_query = " ".join([res['text'] for res in relevant_results]) + try: + final_results = tree_traversal_retrieve(refined_query, collection_name=f"media_{media_id}_raptor") + except Exception as e: + logging.error(f"Error during retrieval: {str(e)}") + raise + print("Refined Results:", final_results) + else: + print("No relevant results were found in the initial search.") + + +def identify_uncertain_results(results): + threshold = 0.5 # Define a confidence threshold + uncertain_results = [res for res in results if res['confidence'] < threshold] + return uncertain_results + + +def raptor_pipeline_with_active_learning(media_id, content, chunk_options): + # ... Existing pipeline steps ... + + query = "Your query here" + initial_results = tree_traversal_retrieve(query, collection_name=f"media_{media_id}_raptor") + uncertain_results = identify_uncertain_results(initial_results) + + if uncertain_results: + print("The following results are uncertain. 
Please provide feedback:") + feedback_indices = get_user_feedback(uncertain_results) + # Use feedback to adjust retrieval or refine the query + refined_query = " ".join([uncertain_results[i]['text'] for i in feedback_indices]) + final_results = tree_traversal_retrieve(refined_query, collection_name=f"media_{media_id}_raptor") + print("Refined Results:", final_results) + else: + print("No uncertain results were found.") + + +# Query Expansion +def expand_query_with_synonyms(query): + words = query.split() + expanded_query = [] + for word in words: + synonyms = wordnet.synsets(word) + lemmas = set(chain.from_iterable([syn.lemma_names() for syn in synonyms])) + expanded_query.append(" ".join(lemmas)) + return " ".join(expanded_query) + + +def contextual_query_expansion(query, context): + # FIXME: Replace with actual contextual model + expanded_terms = some_contextual_model.get_expansions(query, context) + return query + " " + " ".join(expanded_terms) + + +def raptor_pipeline_with_query_expansion(media_id, content, chunk_options): + # ... Existing pipeline steps ... + + query = "Your initial query" + expanded_query = expand_query_with_synonyms(query) + initial_results = tree_traversal_retrieve(expanded_query, collection_name=f"media_{media_id}_raptor") + # ... Continue with feedback loop ... + + +def generate_summary_with_citations(query: str, collection_name: str): + results = vector_search_with_citation(collection_name, query) + # FIXME + summary = summarize([res['text'] for res in results]) + # Deduplicate sources + sources = list(set(res['source'] for res in results)) + return f"{summary}\n\nCitations:\n" + "\n".join(sources) + + +def vector_search_with_citation(collection_name: str, query: str, k: int = 10) -> List[Dict[str, str]]: + query_embedding = create_embedding(query) + collection = chroma_client.get_collection(name=collection_name) + results = collection.query( + query_embeddings=[query_embedding], + n_results=k + ) + return [{'text': doc, 'source': meta['source']} for doc, meta in zip(results['documents'], results['metadatas'])] + + +def generate_summary_with_footnotes(query: str, collection_name: str): + results = vector_search_with_citation(collection_name, query) + summary_parts = [] + citations = [] + for i, res in enumerate(results): + summary_parts.append(f"{res['text']} [{i + 1}]") + citations.append(f"[{i + 1}] {res['source']}") + return " ".join(summary_parts) + "\n\nFootnotes:\n" + "\n".join(citations) + + +def generate_summary_with_hyperlinks(query: str, collection_name: str): + results = vector_search_with_citation(collection_name, query) + summary_parts = [] + for res in results: + summary_parts.append(f'{res["text"][:100]}...') + return " ".join(summary_parts) + + +# +# End of Additions +############################################3############################################3############################## \ No newline at end of file diff --git a/App_Function_Libraries/RAG/__init__.py b/App_Function_Libraries/RAG/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/App_Function_Libraries/Summarization_General_Lib.py b/App_Function_Libraries/Summarization_General_Lib.py index d4a8bb9b1..eb2bf3a12 100644 --- a/App_Function_Libraries/Summarization_General_Lib.py +++ b/App_Function_Libraries/Summarization_General_Lib.py @@ -30,9 +30,9 @@ from App_Function_Libraries.Diarization_Lib import combine_transcription_and_diarization from App_Function_Libraries.Local_Summarization_Lib import summarize_with_llama, summarize_with_kobold, \ 
summarize_with_oobabooga, summarize_with_tabbyapi, summarize_with_vllm, summarize_with_local_llm -from App_Function_Libraries.DB_Manager import add_media_to_database +from App_Function_Libraries.DB.DB_Manager import add_media_to_database # Import Local -from App_Function_Libraries.Utils import load_and_log_configs, load_comprehensive_config, sanitize_filename, \ +from App_Function_Libraries.Utils.Utils import load_and_log_configs, load_comprehensive_config, sanitize_filename, \ clean_youtube_url, create_download_directory, is_valid_url from App_Function_Libraries.Video_DL_Ingestion_Lib import download_video, extract_video_info @@ -777,7 +777,7 @@ def summarize_with_huggingface(api_key, input_data, custom_prompt_arg, temp=None response = requests.post(API_URL, headers=headers, json=data) if response.status_code == 200: - summary = response.json()[0]['summary_text'] + summary = response.json()[0]['generated_text'].strip() logging.debug("huggingface: Summarization successful") print("Summarization successful.") return summary diff --git a/App_Function_Libraries/System_Checks_Lib.py b/App_Function_Libraries/Utils/System_Checks_Lib.py similarity index 100% rename from App_Function_Libraries/System_Checks_Lib.py rename to App_Function_Libraries/Utils/System_Checks_Lib.py diff --git a/App_Function_Libraries/Utils.py b/App_Function_Libraries/Utils/Utils.py similarity index 99% rename from App_Function_Libraries/Utils.py rename to App_Function_Libraries/Utils/Utils.py index 9529a4b97..c8213b9ce 100644 --- a/App_Function_Libraries/Utils.py +++ b/App_Function_Libraries/Utils/Utils.py @@ -93,11 +93,12 @@ def cleanup_downloads(): # Config loading # + def load_comprehensive_config(): # Get the directory of the current script current_dir = os.path.dirname(os.path.abspath(__file__)) - # Go up one level to the project root directory - project_root = os.path.dirname(current_dir) + # Go up two levels to the project root directory + project_root = os.path.dirname(os.path.dirname(current_dir)) # Construct the path to the config file in the project root directory config_path = os.path.join(project_root, 'config.txt') # Create a ConfigParser object diff --git a/App_Function_Libraries/Utils/__init__.py b/App_Function_Libraries/Utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/App_Function_Libraries/Video_DL_Ingestion_Lib.py b/App_Function_Libraries/Video_DL_Ingestion_Lib.py index 92347b278..752b4d011 100644 --- a/App_Function_Libraries/Video_DL_Ingestion_Lib.py +++ b/App_Function_Libraries/Video_DL_Ingestion_Lib.py @@ -33,7 +33,7 @@ # 3rd-Party Imports import yt_dlp -from App_Function_Libraries.DB_Manager import check_media_and_whisper_model +from App_Function_Libraries.DB.DB_Manager import check_media_and_whisper_model # Import Local diff --git a/Docs/RAG_Notes.md b/Docs/RAG_Notes.md new file mode 100644 index 000000000..0ac81f10e --- /dev/null +++ b/Docs/RAG_Notes.md @@ -0,0 +1,606 @@ +# RAG Notes + + +RAG 101 + https://www.youtube.com/watch?v=nc0BupOkrhI + https://arxiv.org/abs/2401.08406 + https://github.com/NirDiamant/RAG_Techniques?tab=readme-ov-file + https://github.com/jxnl/n-levels-of-rag + https://winder.ai/llm-architecture-rag-implementation-design-patterns/ + https://medium.com/@yufan1602/modular-rag-and-rag-flow-part-%E2%85%B0-e69b32dc13a3 + + +201 + https://medium.com/@cdg2718/why-your-rag-doesnt-work-9755726dd1e9 + https://www.cazton.com/blogs/technical/advanced-rag-techniques + 
https://medium.com/@krtarunsingh/advanced-rag-techniques-unlocking-the-next-level-040c205b95bc + https://pub.towardsai.net/advanced-rag-techniques-an-illustrated-overview-04d193d8fec6 + https://winder.ai/llm-architecture-rag-implementation-design-patterns/ + https://towardsdatascience.com/17-advanced-rag-techniques-to-turn-your-rag-app-prototype-into-a-production-ready-solution-5a048e36cdc8 + https://medium.com/@samarrana407/mastering-rag-advanced-methods-to-enhance-retrieval-augmented-generation-4b611f6ca99a + https://generativeai.pub/advanced-rag-retrieval-strategy-query-rewriting-a1dd61815ff0 + https://medium.com/@yufan1602/modular-rag-and-rag-flow-part-%E2%85%B0-e69b32dc13a3 + https://pub.towardsai.net/rag-architecture-advanced-rag-3fea83e0d189?gi=47c0b76dbee0 + +Articles + https://posts.specterops.io/summoning-ragnarok-with-your-nemesis-7c4f0577c93b?gi=7318858af6c3 + https://blog.demir.io/advanced-rag-implementing-advanced-techniques-to-enhance-retrieval-augmented-generation-systems-0e07301e46f4 + https://arxiv.org/abs/2312.10997 + https://jxnl.co/writing/2024/05/22/systematically-improving-your-rag/ + https://www.arcus.co/blog/rag-at-planet-scale + https://d-star.ai/embeddings-are-not-all-you-need + +Architecture Design + - https://medium.com/@yufan1602/modular-rag-and-rag-flow-part-ii-77b62bf8a5d3 + - https://www.anyscale.com/blog/a-comprehensive-guide-for-building-rag-based-llm-applications-part-1 + * https://github.com/ray-project/llm-applications + +Papers + - Rags to Riches - https://huggingface.co/papers/2406.12824 + * LLMs will use foreign knowledge sooner than parametric information. + +Building + https://techcommunity.microsoft.com/t5/microsoft-developer-community/building-the-ultimate-nerdland-podcast-chatbot-with-rag-and-llm/ba-p/4175577 + https://medium.com/@LakshmiNarayana_U/advanced-rag-techniques-in-ai-retrieval-a-deep-dive-into-the-chroma-course-d8b06118cde3 + https://rito.hashnode.dev/building-a-multi-hop-qa-with-dspy-and-qdrant + https://blog.gopenai.com/advanced-retrieval-augmented-generation-rag-techniques-5abad385ac66?gi=09e684acab4d + https://www.youtube.com/watch?v=bNqSRNMgwhQ + https://www.youtube.com/watch?v=7h6uDsfD7bg + https://www.youtube.com/watch?v=Balro-DxFyk&list=PLwPYSl1MQp4FpIzn48ypesKYzLvUBQpPF&index=5 + https://github.com/jxnl/n-levels-of-rag + https://rito.hashnode.dev/building-a-multi-hop-qa-with-dspy-and-qdrant + + + + + +### Building my RAG Solution +- **Outline** + * Modular architecture design +- **Pre-Retrieval** + * F +- **Retrieval** + * F +- **Post-Retrieval** + * +- **Generation & Post-Generation** + - Prompt Compression + * https://github.com/microsoft/LLMLingua + - **Citations** + * Contextcite: https://github.com/MadryLab/context-cite + + + +### RAG Process +1. Pre-Retrieval + - Raw data creation / Preparation + 1. Prepare data so that text-chunks are self-explanatory +2. **Retrieval** + 1. **Chunk Optimization** + - Naive - Fixed-size (in characters) Overlapping Sliding windows + * `limitations include imprecise control over context size, the risk of cutting words or sentences, and a lack of semantic consideration. Suitable for exploratory analysis but not recommended for tasks requiring deep semantic understanding.` + - Recursive Structure Aware Splitting + * `A hybrid method combining fixed-size sliding window and structure-aware splitting. It attempts to balance fixed chunk sizes with linguistic boundaries, offering precise context control. Implementation complexity is higher, with a risk of variable chunk sizes. 
Effective for tasks requiring granularity and semantic integrity but not recommended for quick tasks or unclear structural divisions.` + - Structure Aware Splitting (by sentence/paragraph) + * ` Respecting linguistic boundaries preserves semantic integrity, but challenges arise with varying structural complexity. Effective for tasks requiring context and semantics, but unsuitable for texts lacking defined structural divisions.` + - Context-Aware Splitting (Markdown/LaTeX/HTML) + * `ensures content types are not mixed within chunks, maintaining integrity. Challenges include understanding specific syntax and unsuitability for unstructured documents. Useful for structured documents but not recommended for unstructured content.` + - NLP Chunking: Tracking Topic Changes + * `based on semantic understanding, dividing text into chunks by detecting significant shifts in topics. Ensures semantic consistency but demands advanced NLP techniques. Effective for tasks requiring semantic context and topic continuity but not suitable for high topic overlap or simple chunking tasks.` + 2. **Enhancing Data Quality** + - Abbreviations/technical terms/links + * `To mitigate that issue, we can try to ingest that necessary additional context while processing the data, e.g. replace abbreviations with the full text by using an abbreviation translation table.` + 3. **Meta-data** + - You can add metadata to your vector data in all vector databases. Metadata can later help to (pre-)filter the entire vector database before we perform a vector search. + 4. **Optimize Indexing Structure** + * `Full Search vs. Approximate Nearest Neighbor, HNSW vs. IVFPQ` + 1. Chunk Optimization + - Semantic splitter - optimize chunk size used for embedding + - Small-to-Big + - Sliding Window + - Summary of chunks + - Metadata Attachment + 2. **Multi-Representation Indexing** - Convert into compact retrieval units (i.e. summaries) + 1. Parent Document + 2. Dense X + 3. **Specialized Embeddings** + 1. Fine-tuned + 2. ColBERT + 4. **Heirarchical Indexing** - Tree of document summarization at various abstraction levels + 1. **RAPTOR** - Recursive Abstractive Processing for Tree-Organized Retrieval + * https://arxiv.org/pdf/2401.18059 + * `RAPTOR is a novel tree-based retrieval system designed for recursively embedding, clustering, and summarizing text segments. It constructs a tree from the bottom up, offering varying levels of summarization. During inference, RAPTOR retrieves information from this tree, incorporating data from longer documents at various levels of abstraction.` + * https://archive.is/Zgb13 - README + 5. **Knowledge Graphs / GraphRAG** - Use an LLM to construct a graph-based text index + * https://arxiv.org/pdf/2404.16130 + * https://github.com/microsoft/graphrag + - Occurs in two steps: + 1. Derives a knowledge graph from the source documents + 2. Generates community summaries for all closely connected entity groups + * Given a query, each community summary contributes to a partial response. These partial responses are then aggregated to form the final global answer. + - Workflow: + 1. Chunk Source documents + 2. Construct a knowledge graph by extracting entities and their relationships from each chunk. + 3. Simultaneously, Graph RAG employs a multi-stage iterative process. This process requires the LLM to determine if all entities have been extracted, similar to a binary classification problem. + 4. 
Element Instances → Element Summaries → Graph Communities → Community Summaries + * Graph RAG employs community detection algorithms to identify community structures within the graph, incorporating closely linked entities into the same community. + * `In this scenario, even if LLM fails to identify all variants of an entity consistently during extraction, community detection can help establish the connections between these variants. Once grouped into a community, it signifies that these variants refer to the same entity connotation, just with different expressions or synonyms. This is akin to entity disambiguation in the field of knowledge graph.` + * `After identifying the community, we can generate report-like summaries for each community within the Leiden hierarchy. These summaries are independently useful in understanding the global structure and semantics of the dataset. They can also be used to comprehend the corpus without any problems.` + 5. Community Summaries → Community Answers → Global Answer + 6. **HippoRAG** + * https://github.com/OSU-NLP-Group/HippoRAG + * https://arxiv.org/pdf/2405.14831 + * https://archive.is/Zgb13#selection-2093.24-2093.34 + 7. **spRAG/dsRAG** - README + * https://github.com/D-Star-AI/dsRAG + 5. **Choosing the right embedding model** + * F. + 6. **Self query** + * https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/self_query/ + 7. **Hybrid & Filtered Vector Search** + * Perform multiple search methods and combine results together + 1. Keyword Search(BM25) + Vector + 2. f + 8. **Query Construction** + - Create a query to interact with a specific DB + 1. Text-to-SQL + * Relational DBs + * Rewrite a query into a SQL query + 2. Text-to-Cypher + * Graph DBs + * Rewrite a query into a Cypher query + 3. Self-query Retriever + * Vector DBs + * Auto-generate metadata filters from query + 9. **Query Translation** + 1. Query Decomposition - Decompose or re-phrase the input question + 1. Multi-Query + * https://archive.is/5y4iI + - Sub-Question Querying + * `The core idea of the sub-question strategy is to generate and propose sub-questions related to the main question during the question-answering process to better understand and answer the main question. These sub-questions are usually more specific and can help the system to understand the main question more deeply, thereby improving retrieval accuracy and providing correct answers.` + 1. First, the sub-question strategy generates multiple sub-questions from the user query using LLM (Large Language Model). + 2. Then, each sub-question undergoes the RAG process to obtain its own answer (retrieval generation). + 3. Finally, the answers to all sub-questions are merged to obtain the final answer. + 2. Step-Back Prompting + * http://arxiv.org/pdf/2310.06117 + * `technique that guides LLM to extract advanced concepts and basic principles from specific instances through abstraction, using these concepts and principles to guide reasoning. This approach significantly improves LLM’s ability to follow the correct reasoning path to solve problems.` + - Flow: + 1. Take in a question - `Estella Leopold went to what school in Aug 1954 and Nov 1954?` + 2. Create a (or multiple) step-back question - `What was Estella Leopold's education history?` + 3. Answer the step-back question + 4. Perform reasoning using the step-back question + answer to create the final answer (see the sketch below)
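+A minimal sketch of the step-back flow described above, assuming a generic `llm` callable (prompt in, answer out); the helper name and prompt wording are illustrative, not part of this codebase:
+```python
+def step_back_answer(question: str, llm) -> str:
+    """Answer `question` via step-back prompting: abstract first, then reason."""
+    # 1. Derive a more general, step-back question from the specific one
+    step_back_q = llm(f"Rewrite this as a more general question about the underlying concept:\n{question}")
+    # 2. Answer the step-back question (optionally grounded with retrieved context)
+    step_back_a = llm(f"Answer concisely:\n{step_back_q}")
+    # 3. Reason over the step-back Q/A to answer the original question
+    return llm(f"Background:\n{step_back_q}\n{step_back_a}\n\nUsing the background above, answer:\n{question}")
+```
+ 3. RAG-Fusion - Combining multiple data sources in one RAG (Walking RAG?) + - 3 parts: + 1. 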
Query Generation - Generate multiple sub-queries from the user’s input to capture diverse perspectives and fully understand the user’s intent. + 2. Sub-query Retrieval - Retrieve relevant information for each sub-query from large datasets and repositories, ensuring comprehensive and in-depth search results. + 3. Reciprocal Rank Fusion - Merge the retrieved documents using Reciprocal Rank Fusion (RRF) to combine their ranks, prioritizing the most relevant and comprehensive results. + 2. Pseudo-Documents - Hypothetical documents + 1. HyDE + * https://arxiv.org/abs/2212.10496 + 10. **Query Enhancement / Rewriting** + - Replacing Acronyms with full phrasing + - Providing synonyms to industry terms + - Literally just ask the LLM to do it. + 11. **Query Extension** + 12. **Query Expansion** + 1. Query Expansion with generated answers + * `We use the LLM to generate an answer, before performing the similarity search. If it is a question that can only be answered using our internal knowledge, we indirectly ask the model to hallucinate, and use the hallucinated answer to search for content that is similar to the answer and not the user query itself.` + - Implementations: + - HyDE (Hypothetical Document Embeddings) + - Rewrite-Retrieve-Read + - Step-Back Prompting + - Query2Doc + - ITER-RETGEN + - Others? + 13. **Multiple System Prompts** + * Generate multiple prompts, consolidate answer + 14. **Query Routing** - Let LLM decide which datastore to use for information retrieval based on user's query + 1. Logical Routing - Let LLM choose DB based on question + 2. Semantic Routing - embed question and choose prompt based on similarity + 15. **Response Summarization** - Using summaries of returned items + 16. **Ranking** + 1. Re-Rank + * https://div.beehiiv.com/p/advanced-rag-series-retrieval + 2. RankGPT + 3. RAG-Fusion + 17. **Refinement** + 1. CRAG + * https://arxiv.org/pdf/2401.15884 + * https://medium.com/@kbdhunga/corrective-rag-c-rag-and-control-flow-in-langgraph-d9edad7b5a2c + * https://medium.com/@djangoist/how-to-create-accurate-llm-responses-on-large-code-repositories-presenting-cgrag-a-new-feature-of-e77c0ffe432d + 18. **Active Retrieval** - re-retrieve and/or retrieve from new data sources if retrieved documents are not relevant. + 1. CRAG +3. **Post-Retrieval** + 1. **Context Enrichment** + 1. Sentence Window Retriever + * `The text chunk with the highest similarity score represents the best-matching content found. Before sending the content to the LLM we add the k-sentences before and after the text chunk found. This makes sense since the information has a high probability to be connected to the middle part and maybe the piece of information in the middle text chunk is not complete.` (see the sketch after this list) + 2. Auto-Merging Retriever + * `The text chunk with the highest similarity score represents the best-matching content found. Before sending the content to the LLM we add each small text chunk's assigned “parent” chunks, which do not necessarily have to be the chunk before and after the text chunk found.` + * We can build on top of that concept and set up a whole hierarchy like a decision tree with different levels of Parent Nodes, Child Nodes and Leaf Nodes. We could for example have 3 levels, with different chunk sizes - See https://docs.llamaindex.ai/en/stable/examples/retrievers/auto_merging_retriever/ +4. **Generation & Post-Generation** + 1. **Self-RAG** + * https://github.com/AkariAsai/self-rag + 2. **Rewrite-Retrieve-Read (RRR)** + * https://arxiv.org/pdf/2305.14283
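+A minimal sketch of the Sentence Window Retriever idea described under Context Enrichment above: match individual sentences by embedding similarity, then hand the LLM a window of neighboring sentences around the best match. The `embed` helper is a placeholder for whatever embedding function is in use, not a function from this repo:
+```python
+import numpy as np
+
+def sentence_window_retrieve(query, sentences, embed, top_k=1, window=2):
+    """sentences: list[str]; embed: callable returning a vector; window: sentences kept on each side."""
+    q = np.array(embed(query))
+    sims = [float(np.dot(q, np.array(embed(s)))) for s in sentences]  # cosine similarity if vectors are normalized
+    best = sorted(range(len(sentences)), key=lambda i: sims[i], reverse=True)[:top_k]
+    windows = []
+    for i in best:
+        lo, hi = max(0, i - window), min(len(sentences), i + window + 1)
+        windows.append(" ".join(sentences[lo:hi]))
+    return windows
+```
+ 3. 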
**Choosing the appropriate/correct model** + 4. **Agents** + 5. **Evaluation** + - Metrics: + - Generation + 1. Faithfulness - How factually accurate is the generated answer? + 2. Answer Relevancy - How relevant is the generated answer to the question? + - Retrieval + 1. Context Precision + 2. Context Recall + - Others + 1. Answer semantic Similarity + 2. Answer correctness + 1. Normalized Discounted Cumulative Gain (NDCG) + * https://www.evidentlyai.com/ranking-metrics/ndcg-metric#:~:text=DCG%20measures%20the%20total%20item,ranking%20quality%20in%20the%20dataset. + 2. Existing RAG Eval Frameworks + * RAGAS - https://archive.is/I8f2w + 3. LLM as a Judge + * We generate an evaluation dataset -> Then define a so-called critique agent with suitable criteria we want to evaluate -> Set up a test pipeline that automatically evaluates the responses of the LLMs based on the defined criteria. + 4. Usage Metrics + * Nothing beats real-world data. +5. **Delivery** + + + + +RAG-Fusion - Combining multiple data source in one RAG search + + +JSON file store Vector indexing + +### Chunking - https://github.com/D-Star-AI/dsRAG' +- **Improvements/Ideas** + * As part of chunk header summary, include where in the document this chunk is located, besides chunk #x, so instead this comes from the portion of hte document talking about XYZ in the greater context +- Chunk Headers + * The idea here is to add in higher-level context to the chunk by prepending a chunk header. This chunk header could be as simple as just the document title, or it could use a combination of document title, a concise document summary, and the full hierarchy of section and sub-section titles. +- Chunks -> segments* + * Large chunks provide better context to the LLM than small chunks, but they also make it harder to precisely retrieve specific pieces of information. Some queries (like simple factoid questions) are best handled by small chunks, while other queries (like higher-level questions) require very large chunks. + * We break documents up into chunks with metadata at the head of each chunk to help categorize it to the document/align it with the greater context +- **Semantic Sectioning** + * Semantic sectioning uses an LLM to break a document into sections. It works by annotating the document with line numbers and then prompting an LLM to identify the starting and ending lines for each “semantically cohesive section.” These sections should be anywhere from a few paragraphs to a few pages long. The sections then get broken into smaller chunks if needed. The LLM is also prompted to generate descriptive titles for each section. These section titles get used in the contextual chunk headers created by AutoContext, which provides additional context to the ranking models (embeddings and reranker), enabling better retrieval. + 1. Identify sections + 2. Split sections into chunks + 3. Add metadata header to each chunk + * `Document: X` + * `Section: X1` + * Alt: `Concise parent document summary` + * Other approaches/bits of info can help/experiment... +- **AutoContext** + * `AutoContext creates contextual chunk headers that contain document-level and section-level context, and prepends those chunk headers to the chunks prior to embedding them. This gives the embeddings a much more accurate and complete representation of the content and meaning of the text. In our testing, this feature leads to a dramatic improvement in retrieval quality. 
In addition to increasing the rate at which the correct information is retrieved, AutoContext also substantially reduces the rate at which irrelevant results show up in the search results. This reduces the rate at which the LLM misinterprets a piece of text in downstream chat and generation applications.` +- **Relevant Segment Extraction** + * Relevant Segment Extraction (RSE) is a query-time post-processing step that takes clusters of relevant chunks and intelligently combines them into longer sections of text that we call segments. These segments provide better context to the LLM than any individual chunk can. For simple factual questions, the answer is usually contained in a single chunk; but for more complex questions, the answer usually spans a longer section of text. The goal of RSE is to intelligently identify the section(s) of text that provide the most relevant information, without being constrained to fixed length chunks. +- **Topic Aware Chunking by Sentence** + * https://blog.gopenai.com/mastering-rag-chunking-techniques-for-enhanced-document-processing-8d5fd88f6b72?gi=2f39fdede29b + + +### Vector DBs +- Indexing mechanisms + * Locality-Sensitive Hashing (LSH) + * Hierarchical Graph Structure + * Inverted File Indexing + * Product Quantization + * Spatial Hashing + * Tree-Based Indexing variations +- Embedding algos + * Word2Vec + * GloVe + * Ada + * BERT + * Instructor +- Similarity Measurement Algos + * Cosine similarity - measuring the cosine of two angles + * Euclidean distance - measuring the distance between two points +- Indexing and Searching Algos + - Approximate Nearest Neighbor (ANN) + * FAISS + * Annoy + * IVF + * HNSW (Heirarchical Navigable small worlds) +- Vector Similarity Search + - `Inverted File (IVF)` - `indexes are used in vector similarity search to map the query vector to a smaller subset of the vector space, reducing the number of vectors compared to the query vector and speeding up Approximate Nearest Neighbor (ANN) search. IVF vectors are efficient and scalable, making them suitable for large-scale datasets. However, the results provided by IVF vectors are approximate, not exact, and creating an IVF index can be resource-intensive, especially for large datasets.` + - `Hierarchical Navigable Small World (HNSW)` - `graphs are among the top-performing indexes for vector similarity search. HNSW is a robust algorithm that produces state-of-the-art performance with fast search speeds and excellent recall. It creates a multi-layered graph, where each layer represents a subset of the data, to quickly traverse these layers to find approximate nearest neighbors. HNSW vectors are versatile and suitable for a wide range of applications, including those that require high-dimensional data spaces. However, the parameters of the HNSW algorithm can be tricky to tune for optimal performance, and creating an HNSW index can also be resource intensive.` +- **Vectorization Process** + - Usually several stages: + 1. Data Pre-processing + * `The initial stage involves preparing the raw data. For text, this might include tokenization (breaking down text into words or phrases), removing stop words, and normalizing the text (like lowercasing). For images, preprocessing might involve resizing, normalization, or augmentation.` + 2. Feature Extraction + * `The system extracts features from the preprocessed data. In text, features could be the frequency of words or the context in which they appear. 
For images, features could be various visual elements like edges, textures, or color histograms.` + 3. Embedding Generation + * `Using algorithms like Word2Vec for text or CNNs for images, the extracted features are transformed into numerical vectors. These vectors capture the essential qualities of the data in a dense format, typically in a high-dimensional space.` + 4. Dimensionality Reduction + * `Sometimes, the generated vectors might be very high-dimensional, which can be computationally intensive to process. Techniques like PCA (Principal Component Analysis) or t-SNE (t-Distributed Stochastic Neighbor Embedding) are used to reduce the dimensionality while preserving as much of the significant information as possible.` + 5. Normalization + * `Finally, the vectors are often normalized to have a uniform length. This step ensures consistency across the dataset and is crucial for accurately measuring distances or similarities between vectors.` + + + +### Semantic Re-Ranker +* `enhances retrieval quality by re-ranking search results based on deep learning models, ensuring the most relevant results are prioritized.` +- General Steps + 1. Initial Retrieval: a query is processed, and a set of potentially relevant results is fetched. This set is usually larger and broader, encompassing a wide array of documents or data points that might be relevant to the query. + 2. LLM / ML model used to identify relevance + 3. Re-Ranking Process: In this stage, the retrieved results are fed into the deep learning model along with the query. The model assesses each result for its relevance, considering factors such as semantic similarity, context matching, and the query's intent. + 4. Generating a Score: Each result is assigned a relevance score by the model. This scoring is based on how well the content of the result matches the query in terms of meaning, context, and intent. + 5. Sorting Results: Based on the scores assigned, the results are then sorted in descending order of relevance. The top-scoring results are deemed most relevant to the query and are presented to the user. + 6. Continuous Learning and Adaptation: Many Semantic Rankers are designed to learn and adapt over time. By analyzing user interactions with the search results (like which links are clicked), the Ranker can refine its scoring and sorting algorithms, enhancing its accuracy and relevance. +- **Relevance Metrics** +- List of: + 1. Precision and Recall: These are fundamental metrics in information retrieval. Precision measures the proportion of retrieved documents that are relevant, while recall measures the proportion of relevant documents that were retrieved. High precision means that most of the retrieved items are relevant, and high recall means that most of the relevant items are retrieved. + 2. F1 Score: The F1 Score is the harmonic mean of precision and recall. It provides a single metric that balances both precision and recall, useful in scenarios where it's important to find an equilibrium between finding as many relevant items as possible (recall) and ensuring that the retrieved items are mostly relevant (precision). + 3. Normalized Discounted Cumulative Gain (NDCG): Particularly useful in scenarios where the order of results is important (like web search), NDCG takes into account the position of relevant documents in the result list. The more relevant documents appearing higher in the search results, the better the NDCG. + 4. 
Mean Average Precision (MAP): MAP considers the order of retrieval and the precision at each rank in the result list. It’s especially useful in tasks where the order of retrieval is important but the user is likely to view only the top few results. + + + +### Issues in RAG +1. Indexing + - Issues: + 1. Chunking + 1. Relevance & Precision + * `Properly chunked documents ensure that the retrieved information is highly relevant to the query. If the chunks are too large, they may contain a lot of irrelevant information, diluting the useful content. Conversely, if they are too small, they might miss the broader context, leading to accurate responses but not sufficiently comprehensive.` + 2. Efficiency & Performance + * `The size and structure of the chunks affect the efficiency of the retrieval process. Smaller chunks can be retrieved and processed more quickly, reducing the overall latency of the system. However, there is a balance to be struck, as too many small chunks can overwhelm the retrieval system and negatively impact performance.` + 3. Quality of Generation + * `The quality of the generated output heavily depends on the input retrieved. Well-chunked documents ensure that the generator has access to coherent and contextually rich information, which leads to more informative, coherent, and contextually appropriate responses.` + 4. Scalability + * `As the corpus size grows, chunking becomes even more critical. A well-thought-out chunking strategy ensures that the system can scale effectively, managing more documents without a significant drop in retrieval speed or quality.` + 1. Incomplete Content Representation + * `The semantic information of chunks is influenced by the segmentation method, resulting in the loss or submergence of important information within longer contexts.` + 2. Inaccurate Chunk Similarity Search + * `As data volume increases, noise in retrieval grows, leading to frequent matching with erroneous data, making the retrieval system fragile and unreliable.` + 3. Unclear Reference Trajectory + * `The retrieved chunks may originate from any document, devoid of citation trails, potentially resulting in the presence of chunks from multiple different documents that, despite being semantically similar, contain content on entirely different topics.` + - Potential Solutions + - Chunk Optimization + - Sliding window + * overlapping chunks (see the sketch at the end of this subsection) + - Small to Big + * Retrieve small chunks, then collect the parent chunk from metadata + - Enhance data granularity - apply data cleaning techniques, like removing irrelevant information, confirming factual accuracy, updating outdated information, etc. + - Adding metadata, such as dates, purposes, or chapters, for filtering purposes. + - Structural Organization + - Hierarchical Index + * `In the hierarchical structure of documents, nodes are arranged in parent-child relationships, with chunks linked to them. Data summaries are stored at each node, aiding in the swift traversal of data and assisting the RAG system in determining which chunks to extract. This approach can also mitigate hallucinations caused by block extraction issues.` + - Methods for constructing index: + 1. Structural awareness - paragraph and sentence segmentation in docs + 2. Content Awareness - inherent structure in PDF, HTML, LaTeX + 3. Semantic Awareness - Semantic recognition and segmentation of text based on NLP techniques, such as leveraging NLTK. + 4. Knowledge Graphs
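+A minimal sketch of the sliding-window chunk optimization listed above: fixed-size chunks with overlap so information at chunk boundaries is not lost. Sizes are in characters purely for illustration:
+```python
+def sliding_window_chunks(text: str, chunk_size: int = 1000, overlap: int = 200) -> list[str]:
+    """Split text into fixed-size chunks that overlap by `overlap` characters."""
+    if overlap >= chunk_size:
+        raise ValueError("overlap must be smaller than chunk_size")
+    step = chunk_size - overlap
+    return [text[i:i + chunk_size] for i in range(0, max(len(text) - overlap, 1), step)]
+```
+2. 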
Pre-Retrieval + - Issues: + - Poorly worded queries + - Language complexity and ambiguity + - Potential Solutions: + - Multi-Query - Expand original question into multiple + - Sub-Query - `The process of sub-question planning represents the generation of the necessary sub-questions to contextualize and fully answer the original question when combined.` + - Chain-of-Verification (CoVe) - The expanded queries undergo validation by LLM to achieve the effect of reducing hallucinations. Validated expanded queries typically exhibit higher reliability. + * https://arxiv.org/abs/2309.11495 + - Query Transformation + - Rewrite + * The original queries are not always optimal for LLM retrieval, especially in real-world scenarios. Therefore, we can prompt LLM to rewrite the queries. + - HyDE + * `When responding to queries, LLM constructs hypothetical documents (assumed answers) instead of directly searching the query and its computed vectors in the vector database. It focuses on embedding similarity from answer to answer rather than seeking embedding similarity for the problem or query. In addition, it also includes Reverse HyDE, which focuses on retrieval from query to query.` + * https://medium.aiplanet.com/advanced-rag-improving-retrieval-using-hypothetical-document-embeddings-hyde-1421a8ec075a?gi=b7fa45dc0f32&source=post_page-----e69b32dc13a3-------------------------------- + - Reverse HyDE + * + - Step-back prompting + * https://arxiv.org/abs/2310.06117 + * https://cobusgreyling.medium.com/a-new-prompt-engineering-technique-has-been-introduced-called-step-back-prompting-b00e8954cacb + - Query Routing + * Based on varying queries, routing to a distinct RAG pipeline, which is suitable for a versatile RAG system designed to accommodate diverse scenarios. + - Metadata Router/Filter + * `involves extracting keywords (entity) from the query, followed by filtering based on the keywords and metadata within the chunks to narrow down the search scope.` + - Semantic Router + * https://medium.com/ai-insights-cobet/beyond-basic-chatbots-how-semantic-router-is-changing-the-game-783dd959a32d + - CoVe + * https://sourajit16-02-93.medium.com/chain-of-verification-cove-understanding-implementation-e7338c7f4cb5 + * https://www.domingosenise.com/artificial-intelligence/chain-of-verification-cove-an-approach-for-reducing-hallucinations-in-llm-outcomes.html + - Multi-Query + - SubQuery + - Query Construction + - Text-to-Cypher + - Text-to-SQL + * https://blog.langchain.dev/query-construction/?source=post_page-----e69b32dc13a3-------------------------------- +3. Retrieval + - 3 Main considerations: + 1. Retrieval Efficiency + 2. Embedding Quality + 3. Alignment of tasks, data and models + - Sparse Retriever + * EX: BM25, TF-IDF + - Dense Retriever + * ColBERT + * BGE/Cohere embedding/OpenAI-Ada-002 + - Retriever Fine-tuning + - SFT + - LSR (LM-Supervised Retriever) + - Reinforcement learning + - Adapter + * https://arxiv.org/pdf/2310.18347 + * https://arxiv.org/abs/2305.17331 +4. Post-Retrieval + - Primary Challenges: + 1. Lost in the middle + 2. Noise/anti-fact chunks + 3. Context windows + - Potential Solutions + - Re-Rank + * Re-rank implementation: https://towardsdatascience.com/enhancing-rag-pipelines-in-haystack-45f14e2bc9f5 + - Rule-based re-rank + * According to certain rules, metrics are calculated to rerank chunks. 
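+A minimal sketch of a rule-based re-rank as just described: start from the retriever's similarity score and apply simple hand-written boosts. The metadata fields and weights here are illustrative assumptions, not part of this repo:
+```python
+from datetime import datetime
+
+def rule_based_rerank(results, keyword=None, recency_weight=0.1, keyword_boost=0.2):
+    """results: list of dicts with 'text', 'score', and optional 'metadata' {'year': int}."""
+    def adjusted(res):
+        score = res["score"]
+        year = res.get("metadata", {}).get("year")
+        if year:  # favor newer documents
+            score += recency_weight * (year - 2000) / (datetime.now().year - 2000)
+        if keyword and keyword.lower() in res["text"].lower():  # favor exact keyword hits
+            score += keyword_boost
+        return score
+    return sorted(results, key=adjusted, reverse=True)
+```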
+ * Some: Diversity; Relevance; MMR (Maximal Marginal Relevance, 1998) + - Model based rerank + * Utilize a language model to reorder the document chunks + - Compression & Selection + - LLMLingua + * https://github.com/microsoft/LLMLingua + * https://llmlingua.com/ + - RECOMP + * https://arxiv.org/pdf/2310.04408 + - Selective Context + * https://aclanthology.org/2023.emnlp-main.391.pdf + - Tagging Filter + * https://python.langchain.com/v0.1/docs/use_cases/tagging/ + - LLM Critique +5. Generator + * Utilize the LLM to generate answers based on the user’s query and the retrieved context information. + - Finetuning + * SFT + * RL + * Distillation + - Dual FT + * `In the RAG system, fine-tuning both the retriever and the generator simultaneously is a unique feature of the RAG system. It is important to note that the emphasis of system fine-tuning is on the coordination between the retriever and the generator. Fine-tuning the retriever and the generator separately belongs to the combination of the former two, rather than being part of Dual FT.` + * https://arxiv.org/pdf/2310.01352 +6. Orchestration + * `Orchestration refers to the modules used to control the RAG process. RAG no longer follows a fixed process, and it involves making decisions at key points and dynamically selecting the next step based on the results.` + - Scheduling + * `The Judge module assesses critical points in the RAG process, determining the need to retrieve external document repositories, the satisfaction of the answer, and the necessity of further exploration. It is typically used in recursive, iterative, and adaptive retrieval.` + - `Rule-base` + * `The next course of action is determined based on predefined rules. Typically, the generated answers are scored, and then the decision to continue or stop is made based on whether the scores meet predefined thresholds. Common thresholds include confidence levels for tokens.` + - `Prompt-base` + * `LLM autonomously determines the next course of action. There are primarily two approaches to achieve this. The first involves prompting LLM to reflect or make judgments based on the conversation history, as seen in the ReACT framework. The benefit here is the elimination of the need for fine-tuning the model. However, the output format of the judgment depends on the LLM’s adherence to instructions.` + * https://arxiv.org/pdf/2305.06983 + - Tuning based + * The second approach entails LLM generating specific tokens to trigger particular actions, a method that can be traced back to Toolformer and is applied in RAG, such as in Self-RAG. + * https://arxiv.org/pdf/2310.11511 + - Fusion + * `This concept originates from RAG Fusion. As mentioned in the previous section on Query Expansion, the current RAG process is no longer a singular pipeline. It often requires the expansion of retrieval scope or diversity through multiple branches. Therefore, following the expansion to multiple branches, the Fusion module is relied upon to merge multiple answers.` + - Possibility Ensemble + * `The fusion method is based on the weighted values of different tokens generated from multiple branches, leading to the comprehensive selection of the final output. Weighted averaging is predominantly employed.` + * https://arxiv.org/pdf/2301.12652
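+A minimal sketch of Reciprocal Rank Fusion, which the next bullet describes: each retriever contributes 1/(k + rank) per document, and the summed scores produce the fused ranking (k=60 is the constant used in the original RRF paper):
+```python
+def reciprocal_rank_fusion(rankings: list[list[str]], k: int = 60) -> list[str]:
+    """rankings: several ranked lists of document IDs (best first). Returns the fused order."""
+    scores: dict[str, float] = {}
+    for ranking in rankings:
+        for rank, doc_id in enumerate(ranking, start=1):
+            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
+    return sorted(scores, key=scores.get, reverse=True)
+```
+ - Reciprocal Rank Fusion + * `RRF, is a technique that combines the rankings of multiple search result lists to generate a single unified ranking. 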
Developed in collaboration with the University of Waterloo (CAN) and Google, RRF produces results that are more effective than reordering chunks under any single branch.` + * https://towardsdatascience.com/forget-rag-the-future-is-rag-fusion-1147298d8ad1 +- Semantic dissonance + * `the discordance between your task’s intended meaning, the RAG’s understanding of it, and the underlying knowledge that’s stored.` +- Poor explainability of embeddings +- Semantic Search tends to be directionally correct but inherently fuzzy + * Good for finding top-k results +- Significance of Dimensionality in Vector Embeddings + * `The dimensionality of a vector, which is the length of the vector, plays a crucial role. Higher-dimensional vectors can capture more information and subtle nuances of the data, leading to more accurate models. However, higher dimensionality also increases computational complexity. Therefore, finding the right balance in vector dimensionality is key to efficient and effective model performance.` + + +### Potential Improvements when building +https://gist.github.com/Donavan/62e238aa0a40ca88191255a070e356a2 +- **Chunking** + - Relevance & Precision + - Efficiency and Performance + - Quality of Generation + - Scalability +- **Embeddings** + 1. **Encoder Fine-Tuning** + * `Despite the high efficiency of modern Transformer Encoders, fine-tuning can still yield modest improvements in retrieval quality, especially when tailored to specific domains.` + 2. Ranker Fine-Tuning + * `Employing a cross-encoder for re-ranking can refine the selection of context, ensuring that only the most relevant text chunks are considered.` + 3. LLM Fine-Tuning + * `The advent of LLM fine-tuning APIs allows for the adaptation of models to specific datasets or tasks, enhancing their effectiveness and accuracy in generating responses.` +- **Constructing the Search Index** + 1. **Vector store index** + 2. **Heirarchical Indices** + * Two-tiered index, one for doc summaries the other for detailed chunks + * Filter through the summaries first then search the chunks + 3. **Hypothetical Questions and HyDE approach** + * A novel approach involves the generation of hypothetical questions for each text chunk. These questions are then vectorized and stored, replacing the traditional text vectors in the index. This method enhances semantic alignment between user queries and stored data, potentially leading to more accurate retrievals. The HyDE method reverses this process by generating hypothetical responses to queries, using these as additional data points to refine search accuracy. +- **Context Enrichment** + 1. **Sentence-Window retrieval** + * `This technique enhances search precision by embedding individual sentences and extending the search context to include neighboring sentences. This not only improves the relevance of the retrieved data but also provides the LLM with a richer context for generating responses.` + 2. **Auto-merging Retriever** (Parent Document Retriever) + * `Similar to the Sentence Window Retrieval, this method focuses on granularity but extends the context more broadly. Documents are segmented into a hierarchy of chunks, and smaller, more relevant pieces are initially retrieved. If multiple small chunks relate to a larger segment, they are merged to form a comprehensive context, which is then presented to the LLM.` + 3. **Fusion Retrieval** + * `The concept of fusion retrieval combines traditional keyword-based search methods, like TF-IDF or BM25, with modern vector-based search techniques. 
This hybrid approach, often implemented using algorithms like Reciprocal Rank Fusion (RRF), optimizes retrieval by integrating diverse similarity measures.` +- **Re-Ranking & Filtering** + * `After the initial retrieval of results using any of the aforementioned sophisticated algorithms, the focus shifts to refining these results through various post-processing techniques.` + * `Various systems enabling the fine-tuning of retrieval outcomes based on similarity scores, keywords, metadata, or through re-ranking with additional models. These models could include an LLM, a sentence-transformer cross-encoder, or even external reranking services like Cohere. Moreover, filtering can also be adjusted based on metadata attributes, such as the recency of the data, ensuring that the most relevant and timely information is prioritized. This stage is critical as it prepares the retrieved data for the final step — feeding it into an LLM to generate the precise answer.` + 1. f + 2. f +- **Query Transformations** + 1. **(Sub-)Query Decomposition** + * `For complex queries that are unlikely to yield direct comparisons or results from existing data (e.g., comparing GitHub stars between Langchain and LlamaIndex), an LLM can break down the query into simpler, more manageable sub-queries. Each sub-query can then be processed independently, with their results synthesized later to form a comprehensive response.` + * Multi Query Retriever and Sub Question Query Engine + - Step-back Prompting + * `method involves using an LLM to generate a broader or more general query from the original, complex query. The aim is to retrieve a higher-level context that can serve as a foundation for answering the more specific original query. The contexts from both the original and the generalized queries are then combined to enhance the final answer generation.` + - Query Rewriting + * https://archive.is/FCiaW + * `Another technique involves using an LLM to reformulate the initial query to improve the retrieval process` + 2. **Reference Citations** + - Direct Source Mention + * Require mention of source IDs directly in generated response. + - Fuzzy Matching + * Align portions of the response with their corresponding text chunks in the index. + - Research: + - Attribution Bench: https://osu-nlp-group.github.io/AttributionBench/ + * Finetuning T5 models outperform otherwise SOTA models. + * Complexity of questions and data are issues. + - ContextCite: https://gradientscience.org/contextcite/ + * Hot shit? + * https://gradientscience.org/contextcite-applications/ + - Metrics - Enabling LLMs to generate text with citations paper + * https://arxiv.org/abs/2305.14627 +- **Chat Engine** + 1. ContextChatEngine: + * `A straightforward approach where the LLM retrieves context relevant to the user’s query along with any previous chat history. This history is then used to inform the LLM’s response, ensuring continuity and relevance in the dialogue.` + 2. CondensePlusContextMode + * ` A more advanced technique where each interaction’s chat history and the last message are condensed into a new query. This refined query is used to retrieve relevant context, which, along with the original user message, is passed to the LLM for generating a response.` +- **Query Routing** + * `Query routing involves strategic decision-making powered by an LLM to determine the most effective subsequent action based on the user’s query. 
This could include decisions to summarize information, search specific data indices, or explore multiple routes to synthesize a comprehensive answer. Query routers are crucial for selecting the appropriate data source or index, especially in systems where data is stored across multiple platforms, such as vector stores, graph databases, or relational databases.` + - Query Routers + * F +- **Agents in RAG Systems** + 1. **Multi-Document Agent Scheme** + 2. **Walking RAG** - Multi-shot retrieval + - Have the LLM ask for more information as needed and perform searches for said information, to loop back in to asking the LLM if there's enough info. + - Things necessary to facillitate: + * We need to extract partial information from retrieved pieces of source data, so we can learn as we go. + * We need to find new places to look, informed by the source data as well as the question. + * We need to retrieve information from those specific places. + * Links: + * https://olickel.com/retrieval-augmented-research-1-basics + * https://olickel.com/retrieval-augmented-research-2-walking + * https://olickel.com/retrieval-augmented-research-3-use-the-whole-brain + 3. F +- **Response Synthesizer** + * `The simplest method might involve merely concatenating all relevant context with the query and processing it through an LLM. However, more nuanced approaches involve multiple LLM interactions to refine the context and enhance the quality of the final answer.` + 1. Iterative Refinement + * `Breaking down the retrieved context into manageable chunks and sequentially refining the response through multiple LLM interactions.` + 2. Context Summarization + * `Compressing the extensive retrieved context to fit within an LLM’s prompt limitations.` + 3. Multi-Answer Generation + * `Producing several responses from different context segments and then synthesizing these into a unified answer.` +- **Evaluating RAG Performance** + + + +- Semantic + Relevance Ranking + - One example: + * `rank = (cosine similarity) + (weight) x (relevance score)` +- Embedding models need to be fine-tuned to your data for best results + * `For your Q&A system built on support docs, you very well may find that question→question comparisons will materially improve performance opposed to question→support doc. Pragmatically, you can ask ChatGPT to generate example questions for each support doc and have a human expert curate them. In essence you’d be pre-populating your own Stack Overflow.` + - Can create semi-synthetic training data based on your documents - Want to take this “Stack Overflow” methodology one step further? + 1. For each document, ask ChatGPT to generate a list of 100 questions it can answer + 2. These questions won’t be perfect, so for each question you generate, compute cosine similarities with each other document + 3. Filter those questions which would rank the correct document #1 against every other document + 4. Identify the highest-quality questions by sorting those which have the highest difference between cosine similarity of the correct document and the second ranked document + 5. Send to human for further curation +- **Balancing Precision vs Recall** + - List of: + 1. Threshold Tuning: Adjusting the threshold for deciding whether a document is relevant or not can shift the balance between precision and recall. Lowering the threshold may increase recall but decrease precision, and vice versa. + 2. 
Query Expansion and Refinement: Enhancing the query with additional keywords (query expansion) can increase recall by retrieving a broader set of documents. Conversely, refining the query by adding more specific terms can improve precision. + 3. Relevance Feedback: Incorporating user feedback into the retrieval process can help refine the search results. Users' interactions with the results (clicks, time spent on a document, etc.) can provide valuable signals to adjust the balance between precision and recall. + 4. Use of Advanced Models: Employing more sophisticated models like deep neural networks can improve both precision and recall. These models are better at understanding complex queries and documents, leading to more accurate retrieval. + 5. Customizing Based on Use Case: Different applications may require a different balance of precision and recall. For instance, in a legal document search, precision might be more important to ensure that all retrieved documents are highly relevant. In a medical research scenario, recall might be prioritized to ensure no relevant studies are missed. + + + +- **Prompt Complexity** + 1. Single Fact Retrieval + 2. Multi-Fact Retrieval + 3. Discontigous multi-fact retrieval + 4. Simple Analysis questions + 5. Complex Analysis + 6. Research Level Questions diff --git a/HF/app.py b/HF/app.py index df8ee8779..af0a331d8 100644 --- a/HF/app.py +++ b/HF/app.py @@ -16,20 +16,17 @@ from App_Function_Libraries.Chunk_Lib import semantic_chunk_long_file#, rolling_summarize_function, from App_Function_Libraries.Gradio_Related import launch_ui from App_Function_Libraries.Local_LLM_Inference_Engine_Lib import cleanup_process, local_llm_function -from App_Function_Libraries.Local_Summarization_Lib import summarize_with_llama, summarize_with_kobold, \ - summarize_with_oobabooga, summarize_with_tabbyapi, summarize_with_vllm, summarize_with_local_llm +from App_Function_Libraries.Local_Summarization_Lib import summarize_with_local_llm from App_Function_Libraries.Summarization_General_Lib import summarize_with_openai, summarize_with_anthropic, \ - summarize_with_cohere, summarize_with_groq, summarize_with_openrouter, summarize_with_deepseek, \ - summarize_with_huggingface, perform_transcription, perform_summarization -from App_Function_Libraries.Audio_Transcription_Lib import convert_to_wav, speech_to_text + summarize_with_cohere, summarize_with_groq, perform_transcription, perform_summarization +from App_Function_Libraries.Audio_Transcription_Lib import speech_to_text from App_Function_Libraries.Local_File_Processing_Lib import read_paths_from_file, process_local_file from App_Function_Libraries.DB_Manager import add_media_to_database -from App_Function_Libraries.System_Checks_Lib import cuda_check, platform_check, check_ffmpeg -from App_Function_Libraries.Utils import load_and_log_configs, create_download_directory, extract_text_from_segments +from App_Function_Libraries.Utils.System_Checks_Lib import cuda_check, platform_check, check_ffmpeg +from App_Function_Libraries.Utils.Utils import load_and_log_configs, create_download_directory, extract_text_from_segments from App_Function_Libraries.Video_DL_Ingestion_Lib import download_video, extract_video_info # # 3rd-Party Module Imports -import requests # OpenAI Tokenizer support # # Other Tokenizers diff --git a/HF/requirements.txt b/HF/requirements.txt index 14f5de190..2afc557aa 100644 --- a/HF/requirements.txt +++ b/HF/requirements.txt @@ -5,40 +5,31 @@ annotated-types==0.6.0 anyio==4.3.0 attrs==23.2.0 av==11.0.0 
-Babel==2.15.0 -beautifulsoup4==4.12.3 Brotli==1.1.0 -bs4==0.0.2 -certifi==2024.2.2 -cffi==1.16.0 +bs4 +certifi==2024.7.4 charset-normalizer==3.3.2 click==8.1.7 colorama==0.4.6 coloredlogs==15.0.1 contourpy==1.2.1 -courlan==1.1.0 ctranslate2==4.2.1 cycler==0.12.1 -dateparser==1.2.0 -Deprecated==1.2.14 -distro==1.9.0 dnspython==2.6.1 email_validator==2.1.1 fastapi==0.111.0 fastapi-cli==0.0.3 faster-whisper==1.0.1 -ffmpeg +ffmpeg==1.4 ffmpy==0.3.2 filelock==3.13.4 fire==0.6.0 flatbuffers==24.3.25 fonttools==4.51.0 fsspec==2024.3.1 -gradio -gradio_client -greenlet==3.0.3 +gradio==4.29.0 +gradio_client==0.16.1 h11==0.14.0 -htmldate==1.8.1 httpcore==1.0.5 httptools==0.6.1 httpx==0.27.0 @@ -46,58 +37,51 @@ huggingface-hub==0.22.2 humanfriendly==10.0 idna==3.7 importlib_resources==6.4.0 -Jinja2==3.1.3 +Jinja2==3.1.4 jsonschema==4.22.0 jsonschema-specifications==2023.12.1 -jusText==3.0.1 kiwisolver==1.4.5 -lxml==5.1.1 markdown-it-py==3.0.0 MarkupSafe==2.1.5 matplotlib==3.8.4 mdurl==0.1.2 mpmath==1.3.0 -mss==9.0.1 mutagen==1.47.0 networkx==3.3 -nltk -nodriver==0.29rc2 numpy==1.26.4 onnxruntime==1.17.3 -openai==1.30.1 +openai orjson==3.10.3 -outcome==1.3.0.post0 packaging==24.0 pandas==2.2.2 pillow==10.3.0 -playwright==1.44.0 +playwright protobuf==5.26.1 psutil==5.9.8 pyannote.audio -pycparser==2.22 pycryptodomex==3.20.0 pydantic==2.7.1 pydantic_core==2.18.2 pydub==0.25.1 pyee==11.1.0 Pygments==2.18.0 +pypandoc_binary +pypandoc pyparsing==3.1.2 pyreadline3==3.4.1 -PySocks==1.7.1 python-dateutil==2.9.0.post0 python-dotenv==1.0.1 -python-ffmpeg +python-ffmpeg==2.0.12 python-multipart==0.0.9 pytz==2024.1 PyYAML==6.0.1 referencing==0.35.1 regex==2023.12.25 -requests==2.31.0 +requests==2.32.2 rich==13.7.1 rpds-py==0.18.1 ruff==0.4.3 safetensors==0.4.2 -selenium==4.21.0 semantic-version==2.10.0 sentencepiece==0.2.0 setuptools==69.5.1 @@ -105,37 +89,40 @@ shellingham==1.5.4 six==1.16.0 scikit-learn sniffio==1.3.1 -sortedcontainers==2.4.0 -soupsieve==2.5 starlette==0.37.2 sympy==1.12 termcolor==2.4.0 -tiktoken==0.7.0 timm==0.9.16 -tld==0.13 tokenizers==0.15.2 tomlkit==0.12.0 toolz==0.12.1 - ---extra-index-url https://download.pytorch.org/whl/cu113 -torch -torchaudio torchvision==0.17.2 -tqdm==4.66.2 -trafilatura==1.9.0 +tqdm==4.66.3 +trafilatura transformers==4.39.3 -trio==0.25.1 -trio-websocket==0.11.1 typer==0.12.3 typing_extensions==4.11.0 tzdata==2024.1 -tzlocal==5.2 ujson==5.9.0 -undetected-chromedriver==3.5.5 -urllib3==2.2.1 +urllib3==2.2.2 uvicorn==0.29.0 watchfiles==0.21.0 -websockets -wrapt==1.16.0 -wsproto==1.2.0 +websockets==11.0.3 yt-dlp +nltk +torch +torchaudio +openai~=1.28.1 +tiktoken~=0.6.0 +pymupdf +chromadb +protobuf==5.26.1 +textstat +elasticsearch +lxml_html_clean + +#torch==2.2.2+cu121 +#torchaudio==2.2.2+cu121 +#websockets +#yt-dlp==2024.4.9 +#--extra-index-url https://download.pytorch.org/whl/cu113 \ No newline at end of file diff --git a/New.md b/New.md new file mode 100644 index 000000000..381cc7a81 --- /dev/null +++ b/New.md @@ -0,0 +1,3 @@ + +Chinese + Japanese language support for chunking: +pip install jieba fugashi langdetect textstat \ No newline at end of file diff --git a/README.md b/README.md index a75228bed..7302cc4f6 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,8 @@ # **tl/dw: Too Long, Didnt Watch** ## Download, Transcribe, Summarize & Chat with Video+Audio+Documents+Articles & Books. ### All automated. All local. All yours. 
-## [Public Demo](https://huggingface.co/spaces/oceansweep/Vid-Summarizer) +## [Public Demo](https://huggingface.co/spaces/oceansweep/Vid-Summarizer) +- The demo is now blocked by Youtube unless you provide a session cookie for a logged in session. Additionally, there's an ongoing issue with file permissions relating to HF spaces, so the Demo is more of a 'poke around and look' vs 'use it as intended' at the moment. #### More: Full-Text-Search across everything ingested (RAG is wip), Local LLM inference as part of it(llamafile) for those who don't want to mess with setting up an LLM, and a WebApp(gradio as PoC) to interact with the script in a more user-friendly manner. #### The original scripts by `the-crypt-keeper` are available here: [scripts here](https://github.com/the-crypt-keeper/tldw/tree/main/tldw-original-scripts) diff --git a/Server_API/Dockerfile b/Server_API/Dockerfile new file mode 100644 index 000000000..eb5075d0c --- /dev/null +++ b/Server_API/Dockerfile @@ -0,0 +1,10 @@ +FROM python:3.11 + +WORKDIR /code + +COPY ./requirements.txt /code/requirements.txt +RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt + +COPY ./app /code/app + +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "80"] \ No newline at end of file diff --git a/Server_API/app/api/v1/endpoints/video_processing.py b/Server_API/app/api/v1/endpoints/video_processing.py new file mode 100644 index 000000000..6a1915282 --- /dev/null +++ b/Server_API/app/api/v1/endpoints/video_processing.py @@ -0,0 +1,25 @@ +from fastapi import APIRouter, BackgroundTasks, HTTPException +from typing import List, Optional +from Server_API.app.services.video_processing_service import process_video_task + +router = APIRouter() + +# @router.post("/process-video", summary="Process a video", description="Download, transcribe, and summarize a video from the given URL.") +# async def process_video( +# url: str = Query(..., description="URL of the video to process"), +# whisper_model: str = Query(..., description="Whisper model to use for transcription"), +# custom_prompt: Optional[str] = Query(None, description="Custom prompt for summarization"), +# api_name: str = Query(..., description="Name of the API to use for summarization"), +# api_key: str = Query(..., description="API key for the summarization service"), +# keywords: List[str] = Query(default=[], description="Keywords to associate with the video"), +# diarize: bool = Query(False, description="Whether to perform speaker diarization"), +# start_time: Optional[str] = Query(None, description="Start time for processing (format: HH:MM:SS)"), +# end_time: Optional[str] = Query(None, description="End time for processing (format: HH:MM:SS)"), +# include_timestamps: bool = Query(True, description="Whether to include timestamps in the transcription"), +# keep_original_video: bool = Query(False, description="Whether to keep the original video file after processing"), +# background_tasks: BackgroundTasks = BackgroundTasks() +# ): +# task_id = f"task_{url.replace('://', '_').replace('/', '_')}" +# background_tasks.add_task(process_video_task, url, whisper_model, custom_prompt, api_name, api_key, +# keywords, diarize, start_time, end_time, include_timestamps, keep_original_video) +# return {"task_id": task_id, "message": "Video processing started"} \ No newline at end of file diff --git a/Server_API/app/core/config.py b/Server_API/app/core/config.py new file mode 100644 index 000000000..1050ef74c --- /dev/null +++ b/Server_API/app/core/config.py @@ -0,0 +1,9 @@ +import 
os + +def get_settings(): + return { + "DATABASE_URL": os.getenv("DATABASE_URL", "sqlite:///./tldw.db"), + # Add other configuration variables as needed + } + +settings = get_settings() \ No newline at end of file diff --git a/Server_API/app/core/exceptions.py b/Server_API/app/core/exceptions.py new file mode 100644 index 000000000..a3466e29e --- /dev/null +++ b/Server_API/app/core/exceptions.py @@ -0,0 +1,14 @@ +from fastapi import HTTPException, Request +from fastapi.responses import JSONResponse + +class VideoProcessingError(Exception): + pass + +async def video_processing_exception_handler(request: Request, exc: VideoProcessingError): + return JSONResponse( + status_code=500, + content={"message": f"An error occurred during video processing: {str(exc)}"}, + ) + +def setup_exception_handlers(app): + app.add_exception_handler(VideoProcessingError, video_processing_exception_handler) \ No newline at end of file diff --git a/Server_API/app/core/logging.py b/Server_API/app/core/logging.py new file mode 100644 index 000000000..ec6586d7c --- /dev/null +++ b/Server_API/app/core/logging.py @@ -0,0 +1,12 @@ +import logging + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + handlers=[ + logging.StreamHandler(), + logging.FileHandler("app.log") + ] +) + +logger = logging.getLogger(__name__) \ No newline at end of file diff --git a/Server_API/app/db/database.py b/Server_API/app/db/database.py new file mode 100644 index 000000000..c137faddb --- /dev/null +++ b/Server_API/app/db/database.py @@ -0,0 +1,17 @@ +from sqlalchemy import create_engine +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import sessionmaker + +SQLALCHEMY_DATABASE_URL = "sqlite:///./tldw.db" + +engine = create_engine(SQLALCHEMY_DATABASE_URL, connect_args={"check_same_thread": False}) +SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) + +Base = declarative_base() + +def get_db(): + db = SessionLocal() + try: + yield db + finally: + db.close() \ No newline at end of file diff --git a/Server_API/app/main.py b/Server_API/app/main.py new file mode 100644 index 000000000..578c50127 --- /dev/null +++ b/Server_API/app/main.py @@ -0,0 +1,11 @@ +from fastapi import FastAPI +from Server_API.app.api.v1.endpoints import video_processing +from Server_API.app.core.exceptions import setup_exception_handlers + +app = FastAPI(title="TLDW API", version="1.0.0") +setup_exception_handlers(app) +app.include_router(video_processing.router, prefix="/api/v1") + +@app.get("/") +async def root(): + return {"message": "Welcome to the TLDW API"} diff --git a/Server_API/app/services/media.py b/Server_API/app/services/media.py new file mode 100644 index 000000000..ff8310eea --- /dev/null +++ b/Server_API/app/services/media.py @@ -0,0 +1,12 @@ +from sqlalchemy import Column, Integer, String, Boolean, DateTime +from Server_API.app.db.database import Base + +class Media(Base): + __tablename__ = "media" + + id = Column(Integer, primary_key=True, index=True) + title = Column(String, index=True) + url = Column(String) + content = Column(String) + is_trash = Column(Boolean, default=False) + trash_date = Column(DateTime, nullable=True) \ No newline at end of file diff --git a/Server_API/app/services/video_processing_service.py b/Server_API/app/services/video_processing_service.py new file mode 100644 index 000000000..bbb2a9d03 --- /dev/null +++ b/Server_API/app/services/video_processing_service.py @@ -0,0 +1,56 @@ +from Server_API.app.core.logging import logger +from 
App_Function_Libraries.Video_DL_Ingestion_Lib import extract_metadata, download_video +from App_Function_Libraries.Summarization_General_Lib import perform_transcription, perform_summarization, save_transcription_and_summary +from App_Function_Libraries.Utils.Utils import convert_to_seconds, create_download_directory, extract_text_from_segments +from App_Function_Libraries.DB.DB_Manager import add_media_to_database + +async def process_video_task(url, whisper_model, custom_prompt, api_name, api_key, keywords, diarize, + start_time, end_time, include_timestamps, keep_original_video): + try: + # Create download path + download_path = create_download_directory("Video_Downloads") + logger.info(f"Download path created at: {download_path}") + + # Extract video information + video_metadata = extract_metadata(url, use_cookies=False, cookies=None) + if not video_metadata: + raise ValueError(f"Failed to extract metadata for {url}") + + # Download video + video_file_path = download_video(url, download_path, video_metadata, False, whisper_model) + if not video_file_path: + raise ValueError(f"Failed to download video/audio from {url}") + + # Perform transcription + start_seconds = convert_to_seconds(start_time) if start_time else 0 + end_seconds = convert_to_seconds(end_time) if end_time else None + audio_file_path, segments = perform_transcription(video_file_path, start_seconds, whisper_model, False, diarize) + + if audio_file_path is None or segments is None: + raise ValueError("Transcription failed or segments not available.") + + # Process segments and extract text + if not include_timestamps: + segments = [{'Text': segment['Text']} for segment in segments] + transcription_text = extract_text_from_segments(segments) + + # Perform summarization + full_text_with_metadata = f"{video_metadata}\n\n{transcription_text}" + summary_text = perform_summarization(api_name, full_text_with_metadata, custom_prompt, api_key) + + # Save transcription and summary + json_file_path, summary_file_path = save_transcription_and_summary(full_text_with_metadata, summary_text, download_path, video_metadata) + + # Add to database + add_media_to_database(video_metadata['webpage_url'], video_metadata, full_text_with_metadata, summary_text, keywords, custom_prompt, whisper_model) + + # Clean up files if not keeping original video + if not keep_original_video: + # Add cleanup logic here + pass + + logger.info(f"Video processing completed for {url}") + return True + except Exception as e: + logger.error(f"Error processing video for {url}: {str(e)}") + return False \ No newline at end of file diff --git a/Server_API/requirements.txt b/Server_API/requirements.txt new file mode 100644 index 000000000..7fd0995d1 --- /dev/null +++ b/Server_API/requirements.txt @@ -0,0 +1,4 @@ +fastapi +uvicorn + +# Run the API with: uvicorn app.main:app --reload \ No newline at end of file diff --git a/Tests/Chat_APIs/.env b/Tests/Chat_APIs/.env new file mode 100644 index 000000000..48422386f --- /dev/null +++ b/Tests/Chat_APIs/.env @@ -0,0 +1,28 @@ +# .env file template for LLM API keys +# Replace the placeholder values with your actual API keys + +# OpenAI API Key +OPENAI_API_KEY=your_openai_api_key_here + +# Anthropic API Key +ANTHROPIC_API_KEY=your_anthropic_api_key_here + +# Cohere API Key +COHERE_API_KEY=your_cohere_api_key_here + +# Groq API Key +GROQ_API_KEY=your_groq_api_key_here + +# OpenRouter API Key +OPENROUTER_API_KEY=your_openrouter_api_key_here + +# HuggingFace API Key +HUGGINGFACE_API_KEY=your_huggingface_api_key_here + +# DeepSeek API Key 
+DEEPSEEK_API_KEY=your_deepseek_api_key_here + +# Mistral API Key +MISTRAL_API_KEY=your_mistral_api_key_here + +# Add any other API keys or configuration variables as needed \ No newline at end of file diff --git a/Tests/Chat_APIs/Chat_APIs_Integration_test.py b/Tests/Chat_APIs/Chat_APIs_Integration_test.py new file mode 100644 index 000000000..da4643b53 --- /dev/null +++ b/Tests/Chat_APIs/Chat_APIs_Integration_test.py @@ -0,0 +1,100 @@ +# Chat_APIs_Integration_test.py +# Test file for testing the integration of the LLM API calls with the Chat APIs. +# +# Usage: +# First set up the API keys as environment variables: +# export OPENAI_API_KEY=your_openai_key +# export ANTHROPIC_API_KEY=your_anthropic_key +# ... set other API keys similarly +# Then run it from the repo root: +# python -m unittest Tests/Chat_APIs/Chat_APIs_Integration_test.py + +import unittest +import os +from dotenv import load_dotenv +from App_Function_Libraries.LLM_API_Calls import ( + chat_with_openai, + chat_with_anthropic, + chat_with_cohere, + chat_with_groq, + chat_with_openrouter, + chat_with_huggingface, + chat_with_deepseek, + chat_with_mistral +) + +class TestLLMAPICallsIntegration(unittest.TestCase): + + @classmethod + def setUpClass(cls): + # Load environment variables from .env file + load_dotenv() + + # Load API keys from environment variables + cls.openai_api_key = os.getenv('OPENAI_API_KEY') + cls.anthropic_api_key = os.getenv('ANTHROPIC_API_KEY') + cls.cohere_api_key = os.getenv('COHERE_API_KEY') + cls.groq_api_key = os.getenv('GROQ_API_KEY') + cls.openrouter_api_key = os.getenv('OPENROUTER_API_KEY') + cls.huggingface_api_key = os.getenv('HUGGINGFACE_API_KEY') + cls.deepseek_api_key = os.getenv('DEEPSEEK_API_KEY') + cls.mistral_api_key = os.getenv('MISTRAL_API_KEY') + + def test_chat_with_openai(self): + if not self.openai_api_key: + self.skipTest("OpenAI API key not available") + response = chat_with_openai(self.openai_api_key, "Hello, how are you?", "Respond briefly") + self.assertIsInstance(response, str) + self.assertTrue(len(response) > 0) + + def test_chat_with_anthropic(self): + if not self.anthropic_api_key: + self.skipTest("Anthropic API key not available") + response = chat_with_anthropic(self.anthropic_api_key, "Hello, how are you?", "claude-2", "Respond briefly") + self.assertIsInstance(response, str) + self.assertTrue(len(response) > 0) + + def test_chat_with_cohere(self): + if not self.cohere_api_key: + self.skipTest("Cohere API key not available") + response = chat_with_cohere(self.cohere_api_key, "Hello, how are you?", "command", "Respond briefly") + self.assertIsInstance(response, str) + self.assertTrue(len(response) > 0) + + def test_chat_with_groq(self): + if not self.groq_api_key: + self.skipTest("Groq API key not available") + response = chat_with_groq(self.groq_api_key, "Hello, how are you?", "Respond briefly") + self.assertIsInstance(response, str) + self.assertTrue(len(response) > 0) + + def test_chat_with_openrouter(self): + if not self.openrouter_api_key: + self.skipTest("OpenRouter API key not available") + response = chat_with_openrouter(self.openrouter_api_key, "Hello, how are you?", "Respond briefly") + self.assertIsInstance(response, str) + self.assertTrue(len(response) > 0) + + def test_chat_with_huggingface(self): + if not self.huggingface_api_key: + self.skipTest("HuggingFace API key not available") + response = chat_with_huggingface(self.huggingface_api_key, "Hello, how are you?", "Respond briefly") + self.assertIsInstance(response, str) + self.assertTrue(len(response) > 0) + + def test_chat_with_deepseek(self): + 
if not self.deepseek_api_key: + self.skipTest("DeepSeek API key not available") + response = chat_with_deepseek(self.deepseek_api_key, "Hello, how are you?", "Respond briefly") + self.assertIsInstance(response, str) + self.assertTrue(len(response) > 0) + + def test_chat_with_mistral(self): + if not self.mistral_api_key: + self.skipTest("Mistral API key not available") + response = chat_with_mistral(self.mistral_api_key, "Hello, how are you?", "Respond briefly") + self.assertIsInstance(response, str) + self.assertTrue(len(response) > 0) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/requirements-no-versions-pinned.txt b/requirements-no-versions-pinned.txt deleted file mode 100644 index c7ac4152f..000000000 --- a/requirements-no-versions-pinned.txt +++ /dev/null @@ -1,122 +0,0 @@ -accelerate -aiofiles -altair -annotated-types -anyio -attrs -av -Brotli -bs4 -certifi -charset-normalizer -click -colorama -coloredlogs -contourpy -ctranslate2 -cycler -dnspython -email_validator -fastapi -fastapi-cli -faster-whisper -ffmpeg -ffmpy -filelock -fire -flatbuffers -fonttools -fsspec -gradio -gradio_client -h11 -httpcore -httptools -httpx -huggingface-hub -humanfriendly -idna -importlib_resources -Jinja2 -jsonschema -jsonschema-specifications -kiwisolver -markdown-it-py -MarkupSafe -matplotlib -mdurl -mpmath -mutagen -networkx -numpy -onnxruntime -openai -orjson -packaging -pandas -pillow -playwright -protobuf -psutil -pyannote.audio -pycryptodomex -pydantic -pydantic_core -pydub -pyee -Pygments -pymupdf -pypandoc -pyparsing -pyreadline3 -python-dateutil -python-dotenv -python-ffmpeg -python-multipart -pytz -PyYAML -referencing -regex -requests -rich -rpds-py -ruff -safetensors -scikit-learn -semantic-version -sentencepiece -setuptools -shellingham -six -sniffio -starlette -sympy -termcolor -textstat -timm -tiktoken -tokenizers -tomlkit -toolz -torchvision -tqdm -trafilatura -transformers -typer -typing_extensions -tzdata -ujson -urllib3 -uvicorn -watchfiles -websockets -yt-dlp -nltk -torch -torchaudio -chromadb -protobuf -textstat -elasticsearch - - diff --git a/requirements.txt b/requirements.txt index e3a30c941..ff9c6ba1d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -117,7 +117,10 @@ pymupdf chromadb protobuf==4.25.4 textstat +langdetect elasticsearch +jieba +fugashi #torch==2.2.2+cu121 #torchaudio==2.2.2+cu121 #websockets diff --git a/summarize.py b/summarize.py index 670cf51fc..9f6f1bea0 100644 --- a/summarize.py +++ b/summarize.py @@ -21,9 +21,9 @@ summarize_with_cohere, summarize_with_groq, perform_transcription, perform_summarization from App_Function_Libraries.Audio_Transcription_Lib import speech_to_text from App_Function_Libraries.Local_File_Processing_Lib import read_paths_from_file, process_local_file -from App_Function_Libraries.DB_Manager import add_media_to_database -from App_Function_Libraries.System_Checks_Lib import cuda_check, platform_check, check_ffmpeg -from App_Function_Libraries.Utils import load_and_log_configs, create_download_directory, extract_text_from_segments, \ +from App_Function_Libraries.DB.DB_Manager import add_media_to_database +from App_Function_Libraries.Utils.System_Checks_Lib import cuda_check, platform_check, check_ffmpeg +from App_Function_Libraries.Utils.Utils import load_and_log_configs, create_download_directory, extract_text_from_segments, \ cleanup_downloads from App_Function_Libraries.Video_DL_Ingestion_Lib import download_video, extract_video_info #
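For orientation, here is a minimal client-side sketch of how the new Server_API could be exercised once it is running (for example via the Dockerfile's `uvicorn` command, or `uvicorn app.main:app --reload` from the `Server_API` directory). Only the root route is active in this changeset; the `/api/v1/process-video` call below assumes the commented-out endpoint in `Server_API/app/api/v1/endpoints/video_processing.py` has been enabled, and the host, port, and parameter values are illustrative placeholders, not part of the PR.

```python
# Sketch only: assumes the TLDW API is reachable at localhost:8000 and that the
# /api/v1/process-video route (currently commented out in the PR) has been enabled.
import requests

BASE_URL = "http://localhost:8000"

# The root route exists in this changeset and returns a welcome message.
print(requests.get(f"{BASE_URL}/").json())  # {"message": "Welcome to the TLDW API"}

# Hypothetical call to the process-video endpoint; the query parameters mirror
# the commented-out Query(...) definitions in video_processing.py.
resp = requests.post(
    f"{BASE_URL}/api/v1/process-video",
    params={
        "url": "https://www.youtube.com/watch?v=example",  # placeholder video URL
        "whisper_model": "base",
        "api_name": "openai",
        "api_key": "your_openai_api_key_here",
        "diarize": False,
        "include_timestamps": True,
    },
)
print(resp.json())  # e.g. {"task_id": "...", "message": "Video processing started"}
```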