From ea927a588be7e7304cb0e261724a5ace359e0b00 Mon Sep 17 00:00:00 2001
From: LatekVon
Date: Fri, 24 May 2024 12:54:47 +0200
Subject: [PATCH] remove last legacy config distributors

---
 arguments.py               |  6 ------
 core/tools/model_loader.py | 14 +++++++++++---
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/arguments.py b/arguments.py
index de06791..ab5f052 100644
--- a/arguments.py
+++ b/arguments.py
@@ -113,9 +113,3 @@ def get_runtime_config():
         llm_config=llm_config,
         embedder_config=embedder_config,
     )
-
-
-# todo: this is a legacy method of distributing config, remove ASAP
-LLM_CHOICE = args.llm_choice
-EMBEDDER_CHOICE = args.embed_choice
-USE_HUGGING_FACE = False  # temporarily disabled HF
diff --git a/core/tools/model_loader.py b/core/tools/model_loader.py
index 8d34fa1..f2177f7 100644
--- a/core/tools/model_loader.py
+++ b/core/tools/model_loader.py
@@ -9,19 +9,26 @@
 llm_config = runtime_configuration.llm_config
 embedder_config = runtime_configuration.embedder_config
 
+# problem with the caching todos below: the cached singletons have to be shared across instances
+
 
 def load_ollama_model():
+    # todo: same caching concern as below, use singletons to avoid crashes
     llm = Ollama(model=llm_config.model_name)
     embeddings = OllamaEmbeddings(model=embedder_config.model_name)
     return llm, embeddings
 
 
 def load_hugging_face_model():
-    # todo: for this to be error-proof, we have to cache returns as singletons, and serve them
+    # todo: for this to be memory-efficient, the return values
+    # have to be cached as singletons and served from that cache
+
     base_model_path = hf_hub_download(
         llm_config.model_file, filename=llm_config.model_name
     )
-    # Instantiate model from downloaded file
+
+    # fixme: n_gpu_layers=-1 is a poor approach; it can and will cause crashes.
+    # with llama.cpp we have to calculate and set this number manually
     llm = Llama(
         model_path=base_model_path,
         n_gpu_layers=-1,
@@ -43,7 +50,8 @@
 
 
 def load_model():
-    if USE_HUGGING_FACE:
+    # todo: split this up into separate llm and embedder loaders
+    if llm_config.supplier == "hugging_face":
         return load_hugging_face_model()
     else:
         return load_ollama_model()
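
Note on the caching todos: one lightweight way to get the singleton behaviour the
comments ask for is to memoize the loader, e.g. with the standard-library
functools.lru_cache. A minimal sketch under that assumption (get_models is a
hypothetical name, not part of this patch):

    from functools import lru_cache

    @lru_cache(maxsize=1)
    def get_models():
        # First call performs the expensive load; every later call returns
        # the same (llm, embeddings) tuple, process-wide.
        if llm_config.supplier == "hugging_face":
            return load_hugging_face_model()
        return load_ollama_model()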
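
On the n_gpu_layers fixme: a hedged sketch of the manual calculation, querying
free VRAM via nvidia-smi and offloading only as many layers as fit. Both
total_layers and layer_size_mb are model-specific assumptions the caller would
have to supply; nothing below comes from the repo:

    import subprocess

    def estimate_gpu_layers(total_layers: int, layer_size_mb: int) -> int:
        # Returns how many transformer layers fit in ~80% of free VRAM,
        # or 0 (pure CPU inference) when no NVIDIA GPU is reachable.
        try:
            out = subprocess.check_output(
                ["nvidia-smi", "--query-gpu=memory.free",
                 "--format=csv,noheader,nounits"],
                text=True,
            )
            free_mb = int(out.splitlines()[0])
        except (OSError, subprocess.CalledProcessError, ValueError):
            return 0
        usable_mb = int(free_mb * 0.8)  # headroom for KV cache / scratch buffers
        return min(total_layers, usable_mb // layer_size_mb)

load_hugging_face_model() could then pass n_gpu_layers=estimate_gpu_layers(32, 350)
instead of -1, with both numbers read from the model's metadata in a real
implementation.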
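
On the remaining todo in load_model(): splitting it would let each config pick its
supplier independently. A sketch with hypothetical names (load_llm / load_embedder
are not in this patch), assuming embedder_config grows the same supplier field and
load_hugging_face_model() keeps returning an (llm, embeddings) pair:

    def load_llm():
        # hypothetical split of load_model(): the LLM follows llm_config only
        if llm_config.supplier == "hugging_face":
            llm, _ = load_hugging_face_model()
            return llm
        return Ollama(model=llm_config.model_name)

    def load_embedder():
        # ...and the embedder follows embedder_config only
        if embedder_config.supplier == "hugging_face":
            _, embeddings = load_hugging_face_model()
            return embeddings
        return OllamaEmbeddings(model=embedder_config.model_name)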