From c82fac1a824d4a3522499fcbda6048362020b7f0 Mon Sep 17 00:00:00 2001
From: Robert
Date: Wed, 15 May 2024 22:15:36 -0700
Subject: [PATCH] And it's back to working. Ok. So, rolling summary works with
 chatGPT via CLI. Passing in a list works via CLI. Summarization of said list
 works via CLI. Demo GUI works. Dark/Light mode toggle does not work.
 Simple/Advanced mode toggle does not work. Detail slider in the GUI does not
 work. No current option for rolling summarization in the GUI. Lack of
 're-summarize/ask a question about the transcription' box in the GUI.

---
 Long_Summarize_openai.py | 227 ---------------------------------------
 summarize.py             |  21 ++--
 2 files changed, 10 insertions(+), 238 deletions(-)
 delete mode 100644 Long_Summarize_openai.py

diff --git a/Long_Summarize_openai.py b/Long_Summarize_openai.py
deleted file mode 100644
index e0b16ace8..000000000
--- a/Long_Summarize_openai.py
+++ /dev/null
@@ -1,227 +0,0 @@
-from typing import List, Tuple, Optional
-from openai import OpenAI
-import tiktoken
-from tqdm import tqdm
-
-
-# script from: https://github.com/openai/openai-cookbook/blob/main/examples/Summarizing_long_documents.ipynb
-
-
-# Open dataset
-with open(".\\tldw-original-scripts\\Samples\\ai_wikipedia.txt", "r") as file:
-    artificial_intelligence = file.read()
-
-# load encoding and check length of dataset
-encoding = tiktoken.encoding_for_model('gpt-4-turbo')
-print(len(encoding.encode(artificial_intelligence)))
-
-# Client wrapper for OpenAI
-client = OpenAI(api_key="")
-
-
-def get_chat_completion(messages, model='gpt-4-turbo'):
-    response = client.chat.completions.create(
-        model=model,
-        messages=messages,
-        temperature=0,
-    )
-    return response.choices[0].message.content
-
-
-# Message Chunking <----- THE JUICY STUFF
-def tokenize(text: str) -> List[int]:
-    encoding = tiktoken.encoding_for_model('gpt-4-turbo')
-    return encoding.encode(text)
-
-
-# This function chunks a text into smaller pieces based on a maximum token count and a delimiter
-def chunk_on_delimiter(input_string: str,
-                       max_tokens: int,
-                       delimiter: str) -> List[str]:
-    chunks = input_string.split(delimiter)
-    combined_chunks, _, dropped_chunk_count = combine_chunks_with_no_minimum(
-        chunks, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True
-    )
-    if dropped_chunk_count > 0:
-        print(f"Warning: {dropped_chunk_count} chunks were dropped due to exceeding the token limit.")
-    combined_chunks = [f"{chunk}{delimiter}" for chunk in combined_chunks]
-    return combined_chunks
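For intuition, the approach used above (split on a delimiter, then greedily repack the pieces under a token budget) can be sketched standalone. The following is a minimal illustration, not code from the repo; it counts whitespace-separated words as a stand-in for tiktoken tokens so it runs offline, and chunk_text/count_tokens are hypothetical names:

from typing import List


def count_tokens(text: str) -> int:
    # Stand-in for len(encoding.encode(text)); real token counts will differ.
    return len(text.split())


def chunk_text(text: str, max_tokens: int, delimiter: str = ".") -> List[str]:
    pieces = [p for p in text.split(delimiter) if p.strip()]
    chunks: List[str] = []
    current = ""
    for piece in pieces:
        candidate = current + piece + delimiter
        # Flush the current chunk once adding the next piece would overflow.
        if current and count_tokens(candidate) > max_tokens:
            chunks.append(current)
            current = piece + delimiter
        else:
            current = candidate
    if current:
        chunks.append(current)
    return chunks


print(chunk_text("one two. three four five. six seven.", max_tokens=4))
# ['one two.', ' three four five.', ' six seven.']

Note that, like chunk_on_delimiter above, the sketch re-appends the delimiter so that joining the chunks reproduces the original text.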
-
-
-# This function combines text chunks into larger blocks without exceeding a specified token count.
-# It returns the combined chunks, their original indices, and the number of dropped chunks due to overflow.
-def combine_chunks_with_no_minimum(
-        chunks: List[str],
-        max_tokens: int,
-        chunk_delimiter="\n\n",
-        header: Optional[str] = None,
-        add_ellipsis_for_overflow=False,
-) -> Tuple[List[str], List[int], int]:
-    dropped_chunk_count = 0
-    output = []  # list to hold the final combined chunks
-    output_indices = []  # list to hold the indices of the final combined chunks
-    candidate = (
-        [] if header is None else [header]
-    )  # list to hold the current combined chunk candidate
-    candidate_indices = []
-    for chunk_i, chunk in enumerate(chunks):
-        chunk_with_header = [chunk] if header is None else [header, chunk]
-        if len(tokenize(chunk_delimiter.join(chunk_with_header))) > max_tokens:
-            print("warning: chunk overflow")
-            if (
-                    add_ellipsis_for_overflow
-                    and len(tokenize(chunk_delimiter.join(candidate + ["..."]))) <= max_tokens
-            ):
-                candidate.append("...")
-                dropped_chunk_count += 1
-            continue  # this case would break downstream assumptions
-        # estimate token count with the current chunk added
-        extended_candidate_token_count = len(tokenize(chunk_delimiter.join(candidate + [chunk])))
-        # If the token count exceeds max_tokens, add the current candidate to output and start a new candidate
-        if extended_candidate_token_count > max_tokens:
-            output.append(chunk_delimiter.join(candidate))
-            output_indices.append(candidate_indices)
-            candidate = chunk_with_header  # re-initialize candidate
-            candidate_indices = [chunk_i]
-        # otherwise keep extending the candidate
-        else:
-            candidate.append(chunk)
-            candidate_indices.append(chunk_i)
-    # add the remaining candidate to output if it's not empty
-    if (header is not None and len(candidate) > 1) or (header is None and len(candidate) > 0):
-        output.append(chunk_delimiter.join(candidate))
-        output_indices.append(candidate_indices)
-    return output, output_indices, dropped_chunk_count
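The overflow branch above is the subtle part: a piece that exceeds max_tokens on its own is dropped (and counted) rather than emitted, optionally leaving an "..." marker so the gap stays visible in the combined text. Here is a simplified, self-contained sketch of just that rule, again using word counts as a stand-in for tokens; unlike the real function, it does not check that the marker itself still fits the budget, and pack is a hypothetical name:

from typing import List, Tuple


def pack(pieces: List[str], max_tokens: int) -> Tuple[List[str], int]:
    packed: List[str] = []
    current: List[str] = []
    dropped = 0
    for piece in pieces:
        if len(piece.split()) > max_tokens:
            # The piece can never fit on its own: drop it, leave a marker.
            current.append("...")
            dropped += 1
            continue
        if current and len(" ".join(current + [piece]).split()) > max_tokens:
            packed.append(" ".join(current))
            current = [piece]
        else:
            current.append(piece)
    if current:
        packed.append(" ".join(current))
    return packed, dropped


print(pack(["a b", "c d e f g h i j", "c d"], max_tokens=4))
# (['a b ...', 'c d'], 1)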
-
-
-def summarize(text: str,
-              detail: float = 0,
-              model: str = 'gpt-4-turbo',
-              additional_instructions: Optional[str] = None,
-              minimum_chunk_size: Optional[int] = 500,
-              chunk_delimiter: str = ".",
-              summarize_recursively=False,
-              verbose=False):
-    """
-    Summarizes a given text by splitting it into chunks, each of which is summarized individually.
-    The level of detail in the summary can be adjusted, and the process can optionally be made recursive.
-
-    Parameters:
-        text (str): The text to be summarized.
-        detail (float, optional): A value between 0 and 1 indicating the desired level of detail in the
-            summary. 0 leads to a higher-level summary, and 1 results in a more detailed summary. Defaults to 0.
-        model (str, optional): The model to use for generating summaries. Defaults to 'gpt-4-turbo'.
-        additional_instructions (Optional[str], optional): Additional instructions to provide to the
-            model for customizing summaries.
-        minimum_chunk_size (Optional[int], optional): The minimum size for text chunks. Defaults to 500.
-        chunk_delimiter (str, optional): The delimiter used to split the text into chunks. Defaults to ".".
-        summarize_recursively (bool, optional): If True, summaries are generated recursively,
-            using previous summaries for context.
-        verbose (bool, optional): If True, prints detailed information about the chunking process.
-
-    Returns:
-        str: The final compiled summary of the text.
-
-    The function first determines the number of chunks by interpolating between a minimum and a maximum chunk
-    count based on the `detail` parameter. It then splits the text into chunks and summarizes each chunk. If
-    `summarize_recursively` is True, each summary is based on the previous summaries, adding more context to the
-    summarization process. The function returns a compiled summary of all chunks.
-    """
-
-    # check that detail is set correctly
-    assert 0 <= detail <= 1
-
-    # interpolate the number of chunks based on the detail parameter
-    max_chunks = len(chunk_on_delimiter(text, minimum_chunk_size, chunk_delimiter))
-    min_chunks = 1
-    num_chunks = int(min_chunks + detail * (max_chunks - min_chunks))
-
-    # adjust chunk_size based on interpolated number of chunks
-    document_length = len(tokenize(text))
-    chunk_size = max(minimum_chunk_size, document_length // num_chunks)
-    text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter)
-    if verbose:
-        print(f"Splitting the text into {len(text_chunks)} chunks to be summarized.")
-        print(f"Chunk lengths are {[len(tokenize(x)) for x in text_chunks]}")
-
-    # set system message
-    system_message_content = "Rewrite this text in summarized form."
-    if additional_instructions is not None:
-        system_message_content += f"\n\n{additional_instructions}"
-
-    accumulated_summaries = []
-    for chunk in tqdm(text_chunks):
-        if summarize_recursively and accumulated_summaries:
-            # Creating a structured prompt for recursive summarization
-            accumulated_summaries_string = '\n\n'.join(accumulated_summaries)
-            user_message_content = f"Previous summaries:\n\n{accumulated_summaries_string}\n\nText to summarize next:\n\n{chunk}"
-        else:
-            # Directly passing the chunk for summarization without recursive context
-            user_message_content = chunk
-
-        # Constructing messages based on whether recursive summarization is applied
-        messages = [
-            {"role": "system", "content": system_message_content},
-            {"role": "user", "content": user_message_content}
-        ]
-
-        # Assuming this function gets the completion and works as expected
-        response = get_chat_completion(messages, model=model)
-        accumulated_summaries.append(response)
-
-    # Compile final summary from partial summaries
-    final_summary = '\n\n'.join(accumulated_summaries)
-
-    return final_summary
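The detail knob works purely through the linear interpolation in the function above: num_chunks runs from 1 (detail=0) up to the maximum chunk count (detail=1), and the token budget per chunk follows from it. A quick worked example with hypothetical numbers, a 10,000-token document whose minimum-size (500-token) chunking yields at most 20 chunks:

max_chunks, min_chunks = 20, 1
document_length, minimum_chunk_size = 10_000, 500

for detail in (0.0, 0.25, 0.5, 1.0):
    num_chunks = int(min_chunks + detail * (max_chunks - min_chunks))
    chunk_size = max(minimum_chunk_size, document_length // num_chunks)
    print(f"detail={detail}: {num_chunks} chunk(s) of ~{chunk_size} tokens")
# detail=0.0: 1 chunk(s) of ~10000 tokens   (one shot, most compressed)
# detail=0.25: 5 chunk(s) of ~2000 tokens
# detail=0.5: 10 chunk(s) of ~1000 tokens
# detail=1.0: 20 chunk(s) of ~500 tokens    (most granular)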
-
-
-# Summary at 0 detail
-summary_with_detail_0 = summarize(artificial_intelligence, detail=0, verbose=True)
-
-
-# Summary at 0.25 detail
-summary_with_detail_pt25 = summarize(artificial_intelligence, detail=0.25, verbose=True)
-
-
-# Summary at 0.5 detail
-summary_with_detail_pt5 = summarize(artificial_intelligence, detail=0.5, verbose=True)
-
-
-# Summary at 0.75 detail
-summary_with_detail_pt75 = summarize(artificial_intelligence, detail=0.75, verbose=True)
-
-
-# Summary at 1 detail
-summary_with_detail_1 = summarize(artificial_intelligence, detail=1, verbose=True)
-
-
-# Lengths of summaries:
-[len(tokenize(x)) for x in
- [summary_with_detail_0, summary_with_detail_pt25, summary_with_detail_pt5, summary_with_detail_pt75, summary_with_detail_1]]
-
-# print 0 detail summary
-print(summary_with_detail_0)
-
-
-# print 0.25 detail summary
-print(summary_with_detail_pt25)
-
-
-# print 0.5 detail summary
-print(summary_with_detail_pt5)
-
-
-# print 0.75 detail summary
-print(summary_with_detail_pt75)
-
-
-# print 1.0 detail summary
-print(summary_with_detail_1)
-
-
-# Print summary using additional instructions:
-summary_with_additional_instructions = summarize(artificial_intelligence, detail=0.1,
-                                                 additional_instructions="Write in point form and focus on numerical data.")
-print(summary_with_additional_instructions)
-
-
-# Print summary using recursive summarization:
-recursive_summary = summarize(artificial_intelligence,
-                              detail=0.1, summarize_recursively=True)
-print(recursive_summary)

diff --git a/summarize.py b/summarize.py
index 88c7f953b..e6acc786a 100644
--- a/summarize.py
+++ b/summarize.py
@@ -318,12 +318,7 @@ def read_paths_from_file(file_path):
     """ Reads a file containing URLs or local file paths and returns them as a list. """
     paths = []  # Initialize paths as an empty list
     with open(file_path, 'r') as file:
-        for line in file:
-            line = line.strip()
-            if line and not os.path.exists(
-                    os.path.join('results', normalize_title(line.split('/')[-1].split('.')[0]) + '.json')):
-                logging.debug("line successfully imported from file and added to list to be transcribed")
-                paths.append(line)
+        paths = [line.strip() for line in file]
     return paths


@@ -331,10 +326,12 @@ def process_path(path):
     """ Decides whether the path is a URL or a local file and processes accordingly. """
     if path.startswith('http'):
         logging.debug("file is a URL")
-        return get_youtube(path)  # For YouTube URLs, modify to download and extract info
+        # For YouTube URLs, modify to download and extract info
+        return get_youtube(path)
     elif os.path.exists(path):
         logging.debug("File is a path")
-        return process_local_file(path)  # For local files, define a function to handle them
+        # For local files, define a function to handle them
+        return process_local_file(path)
     else:
         logging.error(f"Path does not exist: {path}")
         return None
@@ -1668,7 +1665,7 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model=
     #     except requests.exceptions.ConnectionError:
     #         requests.status_code = "Connection: "
     # Perform summarization based on the specified API
-    elif api_name and api_key:
+    elif api_name:
         logging.debug(f"MAIN: Summarization being performed by {api_name}")
         json_file_path = audio_file.replace('.wav', '.segments.json')
         if api_name.lower() == 'openai':
@@ -1758,7 +1755,7 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model=
     # end_time = time.monotonic()
     # print("Total program execution time: " + timedelta(seconds=end_time - start_time))

-    return results
+    return results


 if __name__ == "__main__":
@@ -1793,6 +1790,7 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model=
     logging.basicConfig(level=getattr(logging, log_level), format='%(asctime)s - %(levelname)s - %(message)s')

     custom_prompt = args.custom_prompt
+
     if custom_prompt == "":
         logging.debug(f"Custom prompt defined, will use \n\nf{custom_prompt} \n\nas the prompt")
         print(f"Custom Prompt has been defined. Custom prompt: \n\n {args.custom_prompt}")
@@ -1808,7 +1806,6 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model=
         print("No custom prompt defined, will use default")

     if args.user_interface:
-
         launch_ui(demo_mode=False)
     else:
         if not args.input_path:
@@ -1835,6 +1832,8 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model=
         # Get all API keys from the config
         api_keys = {key: value for key, value in config.items('API') if key.endswith('_api_key')}

+        api_name = args.api_name
+
         # Rolling Summarization will only be performed if an API is specified and the API key is available
         # and the rolling summarization flag is set
         #
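For reference, the "rolling summary" flow the commit message says now works via CLI follows the same recursive pattern as the deleted script: each chunk is summarized with the accumulated summaries prepended as context. A minimal sketch under that assumption; rolling_summarize is a hypothetical helper, not code from the repo, and summarize_chunk stands in for whatever chat-completion call the pipeline actually uses:

from typing import Callable, List


def rolling_summarize(chunks: List[str],
                      summarize_chunk: Callable[[str], str]) -> str:
    accumulated: List[str] = []
    for chunk in chunks:
        if accumulated:
            # Fold every summary so far into the prompt for the next chunk.
            context = "\n\n".join(accumulated)
            prompt = (f"Previous summaries:\n\n{context}\n\n"
                      f"Text to summarize next:\n\n{chunk}")
        else:
            prompt = chunk
        accumulated.append(summarize_chunk(prompt))
    return "\n\n".join(accumulated)


# Works with any summarizer; a real one would wrap an LLM chat-completion call.
print(rolling_summarize(["chunk one.", "chunk two."],
                        lambda p: p.splitlines()[-1]))

The trade-off of this design is that later chunks see earlier summaries for continuity, at the cost of spending part of each prompt's token budget on context.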