From 588de15613df17506178e5a14e687a5f4aeb4ff2 Mon Sep 17 00:00:00 2001
From: LatekVon
Date: Mon, 15 Apr 2024 23:10:54 +0200
Subject: [PATCH] added steps to black.yml, reformatted all files

---
 .github/workflows/black.yml   |  8 +++++
 core/lookup.py                |  4 ++-
 .../embedder_configuration.py |  2 +-
 .../llm_configuration.py      |  2 +-
 core/models/configurations.py |  8 ++---
 core/tools/scraper.py         | 36 ++++++++++++-------
 terminal_gui.py               | 12 +++---
 7 files changed, 46 insertions(+), 26 deletions(-)

diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
index c8a5099..c925eff 100644
--- a/.github/workflows/black.yml
+++ b/.github/workflows/black.yml
@@ -12,3 +12,11 @@ jobs:
         uses: cytopia/docker-black@0.8
         with:
           path: 'core/'
+      - name: Python Black (main)
+        uses: cytopia/docker-black@0.8
+        with:
+          path: 'main.py'
+      - name: Python Black (crawler)
+        uses: cytopia/docker-black@0.8
+        with:
+          path: 'crawler.py'
diff --git a/core/lookup.py b/core/lookup.py
index 292b256..18aeb19 100644
--- a/core/lookup.py
+++ b/core/lookup.py
@@ -13,7 +13,8 @@
     web_news_lookup,
     web_docs_lookup_prompt,
     web_news_lookup_prompt,
-    web_wiki_lookup_prompt)
+    web_wiki_lookup_prompt,
+)
 
 from core.tools.dbops import get_db_by_name
 from core.tools.model_loader import load_model
@@ -56,6 +57,7 @@ def interpret_prompt_mode():
         return web_wiki_lookup_prompt()
     else:
         return web_docs_lookup_prompt()
+
 web_interpret_prompt_mode = interpret_prompt_mode()
 # NOTE: a detour has been performed here, more details:
 # web_chain_function will soon become just a tool playing a part of a larger mechanism.
diff --git a/core/models/configuration_objects/embedder_configuration.py b/core/models/configuration_objects/embedder_configuration.py
index 089955d..7737eca 100644
--- a/core/models/configuration_objects/embedder_configuration.py
+++ b/core/models/configuration_objects/embedder_configuration.py
@@ -4,7 +4,7 @@
 
 @dataclass
 class EmbedderConfiguration:
-    supplier: Literal['ollama', 'hugging_face']
+    supplier: Literal["ollama", "hugging_face"]
     model_name: str
     model_token_limit: int
     article_limit: int
diff --git a/core/models/configuration_objects/llm_configuration.py b/core/models/configuration_objects/llm_configuration.py
index 0844e84..96be422 100644
--- a/core/models/configuration_objects/llm_configuration.py
+++ b/core/models/configuration_objects/llm_configuration.py
@@ -4,7 +4,7 @@
 
 @dataclass
 class LlmConfiguration:
-    supplier: Literal['ollama', 'hugging_face']
+    supplier: Literal["ollama", "hugging_face"]
     model_name: str
     model_token_limit: int
     model_file: Optional[str] = None
diff --git a/core/models/configurations.py b/core/models/configurations.py
index 8b95ca8..7c17533 100644
--- a/core/models/configurations.py
+++ b/core/models/configurations.py
@@ -1,19 +1,20 @@
 from core.models.configuration_objects.llm_configuration import LlmConfiguration
-from core.models.configuration_objects.embedder_configuration import EmbedderConfiguration
+from core.models.configuration_objects.embedder_configuration import (
+    EmbedderConfiguration,
+)
 from terminal_gui import USE_HUGGING_FACE
 
 llm_ollama_heavy = LlmConfiguration(
     supplier="ollama",
     model_name="zephyr:7b-beta-q5_K_M",
     model_token_limit=4096,
-    model_file=""
+    model_file="",
 )
 
 embedder_ollama_heavy = EmbedderConfiguration(
     supplier="ollama",
     model_name="nomic-embed-text",
     model_token_limit=4096,
-
     # chunk splitter options
     article_limit=10,
     buffer_stops=["\n\n\n", "\n\n", "\n", ". ", ", ", " ", ""],
@@ -32,7 +33,6 @@
     model_name="nomic-embed-text-v1.5.Q6_K.gguf",
     model_file="nomic-ai/nomic-embed-text-v1.5-GGUF",
     model_token_limit=4096,
-
     # chunk splitter options
     article_limit=10,
     buffer_stops=["\n\n\n", "\n\n", "\n", ". ", ", ", " ", ""],
diff --git a/core/tools/scraper.py b/core/tools/scraper.py
index 83309c8..7baf29d 100644
--- a/core/tools/scraper.py
+++ b/core/tools/scraper.py
@@ -35,7 +35,9 @@ def docs_to_context(docs_and_scores: List[Document], token_limit: int) -> str:
         if document_index >= len(docs_and_scores):
             break
 
-    print(f"{Fore.CYAN}Used {document_index + 1} snippets with a total of {token_count} tokens as context.{Fore.RESET}")
+    print(
+        f"{Fore.CYAN}Used {document_index + 1} snippets with a total of {token_count} tokens as context.{Fore.RESET}"
+    )
     print(f"{Fore.CYAN}Context itself: {Fore.RESET}", context_text)
 
     return context_text
@@ -49,11 +51,12 @@ def query_for_urls(query: WebQuery, url_amount=embed_config.article_limit) -> Li
 
     url_list = search(
         query=query.web_query,
-        stop= url_amount,
-        lang='en',
-        safe='off',
+        stop=url_amount,
+        lang="en",
+        safe="off",
         tbs=query.web_tbs,
-        extra_params=query.web_extra_params)
+        extra_params=query.web_extra_params,
+    )
 
     print(f"{Fore.CYAN}Web search completed.{Fore.RESET}")
     return url_list
@@ -79,7 +82,8 @@ def populate_db_with_google_search(database: FAISS, query: WebQuery):
         chunk_size=query.db_chunk_size,
         chunk_overlap=embed_config.chunk_overlap,
         keep_separator=False,
-        strip_whitespace=True)
+        strip_whitespace=True,
+    )
 
     chunks = text_splitter.split_documents(document)
 
@@ -88,21 +92,25 @@ def populate_db_with_google_search(database: FAISS, query: WebQuery):
             chunks.remove(chunk)
             continue
 
-        chunk.page_content = remove(chunk.page_content, ['\n', '`'])
-        chunk.page_content = (query.db_embedding_prefix +
-                              chunk.page_content +
-                              query.db_embedding_postfix)
+        chunk.page_content = remove(chunk.page_content, ["\n", "`"])
+        chunk.page_content = (
+            query.db_embedding_prefix
+            + chunk.page_content
+            + query.db_embedding_postfix
+        )
 
     if len(chunks) != 0:
         database.add_documents(documents=chunks, embeddings=embeddings)
 
     db_name = embedding_model_safe_name + query.db_save_file_extension
-    database.save_local(folder_path='store/vector', index_name=db_name)
+    database.save_local(folder_path="store/vector", index_name=db_name)
 
     print(f"{Fore.CYAN}Document vectorization completed.{Fore.RESET}")
 
 
-def web_query_google_lookup(query: WebQuery, token_limit: int = embed_config.model_token_limit):
+def web_query_google_lookup(
+    query: WebQuery, token_limit: int = embed_config.model_token_limit
+):
     db_name = embedding_model_safe_name + query.db_save_file_extension
     db = get_db_by_name(db_name, embeddings)
 
@@ -110,7 +118,9 @@ def web_query_google_lookup(query: WebQuery, token_limit: int = embed_config.mod
 
     # return the document with the highest prompt similarity score (for now only browsing the first search result)
     embedding_vector = embeddings.embed_query(query.db_embed_query)
-    docs_and_scores = db.similarity_search_by_vector(embedding_vector, k=round(token_limit / 64))
+    docs_and_scores = db.similarity_search_by_vector(
+        embedding_vector, k=round(token_limit / 64)
+    )
 
     print(f"{Fore.CYAN}Database search completed.{Fore.RESET}")
 
diff --git a/terminal_gui.py b/terminal_gui.py
index 07a2e2c..9e25725 100644
--- a/terminal_gui.py
+++ b/terminal_gui.py
@@ -109,15 +109,15 @@ def get_input():
 
 parser = argparse.ArgumentParser()
 parser.add_argument(
-    '-H',
-    '--use-hugging-face',
-    dest='use_hugging_face',
+    "-H",
+    "--use-hugging-face",
dest="use_hugging_face", action="store_true", - help='Use Hugging Face as the model provider' + help="Use Hugging Face as the model provider", ) USE_HUGGING_FACE = parser.parse_args().use_hugging_face -''' +""" parser.add_argument( '-O', '--use-ollama', @@ -126,4 +126,4 @@ def get_input(): help='Use Ollama as the model provider' ) USE_OLLAMA = parser.parse_args().use_ollama -''' +"""