Skip to content

Commit

Permalink
Merge pull request #267 from rmusser01/dev
Browse files: browse the repository at this point in the history
Fix embeddings generation/chunking
Also updated gradio/torchvision versions
  • Loading branch information
rmusser01 authored Sep 15, 2024
2 parents 6ddea4a + 79c4c81 commit 80a3b99
Show file tree
Hide file tree
Showing 14 changed files with 845 additions and 206 deletions.
55 changes: 34 additions & 21 deletions App_Function_Libraries/Chunk_Lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
####
# Import necessary libraries
import hashlib
import json
import logging
import re
from typing import Any, Dict, List, Optional, Tuple
Expand Down Expand Up @@ -72,42 +73,53 @@ def load_document(file_path):

def improved_chunking_process(text: str, custom_chunk_options: Dict[str, Any] = None) -> List[Dict[str, Any]]:
logging.debug("Improved chunking process started...")

# Extract JSON metadata if present
json_content = {}
try:
json_end = text.index("}\n") + 1
json_content = json.loads(text[:json_end])
text = text[json_end:].strip()
logging.debug(f"Extracted JSON metadata: {json_content}")
except (ValueError, json.JSONDecodeError):
logging.debug("No JSON metadata found at the beginning of the text")

# Extract any additional header text
header_match = re.match(r"(This text was transcribed using.*?)\n\n", text, re.DOTALL)
header_text = ""
if header_match:
header_text = header_match.group(1)
text = text[len(header_text):].strip()
logging.debug(f"Extracted header text: {header_text}")

options = chunk_options.copy()
if custom_chunk_options:
options.update(custom_chunk_options)

chunk_method = options.get('method', 'words')
base_size = options.get('base_size', 1000)
min_size = options.get('min_size', 100)
max_size = options.get('max_size', 2000)
overlap = options.get('overlap', 0)
language = options.get('language', None)
adaptive = options.get('adaptive', False)
multi_level = options.get('multi_level', False)

if language is None:
language = detect_language(text)

if adaptive:
max_chunk_size = adaptive_chunk_size(text, base_size, min_size, max_size)
else:
max_chunk_size = base_size

if multi_level:
chunks = multi_level_chunking(text, chunk_method, max_chunk_size, overlap, language)
else:
chunks = chunk_text(text, chunk_method, max_chunk_size, overlap, language)
chunks = chunk_text(text, chunk_method, max_size, overlap, language)

chunks_with_metadata = []
total_chunks = len(chunks)
for i, chunk in enumerate(chunks):
metadata = get_chunk_metadata(
chunk,
text,
chunk_type=chunk_method,
language=language
)
metadata['chunk_index'] = i
metadata['total_chunks'] = len(chunks)
metadata = {
'chunk_index': i,
'total_chunks': total_chunks,
'chunk_method': chunk_method,
'max_size': max_size,
'overlap': overlap,
'language': language,
'relative_position': i / total_chunks
}
metadata.update(json_content) # Add the extracted JSON content to metadata
metadata['header_text'] = header_text # Add the header text to metadata

chunks_with_metadata.append({
'text': chunk,
Expand All @@ -117,6 +129,7 @@ def improved_chunking_process(text: str, custom_chunk_options: Dict[str, Any] =
return chunks_with_metadata



def multi_level_chunking(text: str, method: str, max_size: int, overlap: int, language: str) -> List[str]:
logging.debug("Multi-level chunking process started...")
# First level: chunk by paragraphs
Expand Down
14 changes: 9 additions & 5 deletions App_Function_Libraries/Gradio_Related.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,9 @@
from App_Function_Libraries.Gradio_UI.Re_summarize_tab import create_resummary_tab
from App_Function_Libraries.Gradio_UI.Search_Tab import create_prompt_view_tab, create_prompt_search_tab, \
create_search_summaries_tab, create_viewing_tab, create_search_tab
from App_Function_Libraries.Gradio_UI.RAG_Chat_tab import create_embeddings_tab, create_rag_tab, \
create_view_embeddings_tab
from App_Function_Libraries.Gradio_UI.RAG_Chat_tab import create_rag_tab
from App_Function_Libraries.Gradio_UI.Embeddings_tab import create_embeddings_tab, create_view_embeddings_tab, \
create_purge_embeddings_tab
from App_Function_Libraries.Gradio_UI.Trash import create_view_trash_tab, create_empty_trash_tab, \
create_delete_trash_tab, create_search_and_mark_trash_tab
from App_Function_Libraries.Gradio_UI.Utilities import create_utilities_yt_timestamp_tab, create_utilities_yt_audio_tab, \
Expand Down Expand Up @@ -260,11 +261,9 @@ def launch_ui(share_public=None, server_mode=False):
create_search_tab()
create_search_summaries_tab()

with gr.TabItem("RAG Search / Embeddings"):
with gr.TabItem("RAG Search"):
create_rag_tab()
create_rag_qa_chat_tab()
create_embeddings_tab()
create_view_embeddings_tab()

with gr.TabItem("Chat with an LLM"):
create_chat_interface()
Expand Down Expand Up @@ -295,6 +294,11 @@ def launch_ui(share_public=None, server_mode=False):
# FIXME
#create_compare_transcripts_tab()

with gr.TabItem("Embeddings Management"):
create_embeddings_tab()
create_view_embeddings_tab()
create_purge_embeddings_tab()

with gr.TabItem("Writing Tools"):
with gr.Tabs():
from App_Function_Libraries.Gradio_UI.Writing_tab import create_document_feedback_tab
Expand Down
Loading

0 comments on commit 80a3b99

Please sign in to comment.