From ea927a588be7e7304cb0e261724a5ace359e0b00 Mon Sep 17 00:00:00 2001
From: LatekVon
Date: Fri, 24 May 2024 12:54:47 +0200
Subject: [PATCH] remove last legacy config distributors

---
 arguments.py               |  6 ------
 core/tools/model_loader.py | 14 +++++++++++---
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/arguments.py b/arguments.py
index de06791..ab5f052 100644
--- a/arguments.py
+++ b/arguments.py
@@ -113,9 +113,3 @@ def get_runtime_config():
         llm_config=llm_config,
         embedder_config=embedder_config,
     )
-
-
-# todo: this is a legacy method of distributing config, remove ASAP
-LLM_CHOICE = args.llm_choice
-EMBEDDER_CHOICE = args.embed_choice
-USE_HUGGING_FACE = False  # temporarily disabled HF
diff --git a/core/tools/model_loader.py b/core/tools/model_loader.py
index 8d34fa1..f2177f7 100644
--- a/core/tools/model_loader.py
+++ b/core/tools/model_loader.py
@@ -9,19 +9,26 @@
 llm_config = runtime_configuration.llm_config
 embedder_config = runtime_configuration.embedder_config
 
+# problem with the caching todos below: the cached singletons have to be shared across instances
+
 
 def load_ollama_model():
+    # todo: same caching concern as below, use singletons to avoid crashes
     llm = Ollama(model=llm_config.model_name)
     embeddings = OllamaEmbeddings(model=embedder_config.model_name)
     return llm, embeddings
 
 
 def load_hugging_face_model():
-    # todo: for this to be error-proof, we have to cache returns as singletons, and serve them
+    # todo: for this to be memory-efficient, the return values
+    # have to be cached as singletons and served from that cache
+
     base_model_path = hf_hub_download(
         llm_config.model_file, filename=llm_config.model_name
     )
-    # Instantiate model from downloaded file
+
+    # fixme: n_gpu_layers=-1 is a poor approach; it can and will cause crashes.
+    # with llama.cpp we have to calculate and set this number manually
     llm = Llama(
         model_path=base_model_path,
         n_gpu_layers=-1,
@@ -43,7 +50,8 @@
 
 
 def load_model():
-    if USE_HUGGING_FACE:
+    # todo: split this up into separate llm and embedder loaders
+    if llm_config.supplier == "hugging_face":
         return load_hugging_face_model()
     else:
         return load_ollama_model()
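
Note on the caching todos: one lightweight way to get the singleton behaviour the
comments ask for is to memoize the loader, e.g. with the standard-library
functools.lru_cache. A minimal sketch under that assumption (get_models is a
hypothetical name, not part of this patch):

    from functools import lru_cache

    @lru_cache(maxsize=1)
    def get_models():
        # First call performs the expensive load; every later call returns
        # the same (llm, embeddings) tuple, process-wide.
        if llm_config.supplier == "hugging_face":
            return load_hugging_face_model()
        return load_ollama_model()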
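
On the n_gpu_layers fixme: a hedged sketch of the manual calculation, querying
free VRAM via nvidia-smi and offloading only as many layers as fit. Both
total_layers and layer_size_mb are model-specific assumptions the caller would
have to supply; nothing below comes from the repo:

    import subprocess

    def estimate_gpu_layers(total_layers: int, layer_size_mb: int) -> int:
        # Returns how many transformer layers fit in ~80% of free VRAM,
        # or 0 (pure CPU inference) when no NVIDIA GPU is reachable.
        try:
            out = subprocess.check_output(
                ["nvidia-smi", "--query-gpu=memory.free",
                 "--format=csv,noheader,nounits"],
                text=True,
            )
            free_mb = int(out.splitlines()[0])
        except (OSError, subprocess.CalledProcessError, ValueError):
            return 0
        usable_mb = int(free_mb * 0.8)  # headroom for KV cache / scratch buffers
        return min(total_layers, usable_mb // layer_size_mb)

load_hugging_face_model() could then pass n_gpu_layers=estimate_gpu_layers(32, 350)
instead of -1, with both numbers read from the model's metadata in a real
implementation.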
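
On the remaining todo in load_model(): splitting it would let each config pick its
supplier independently. A sketch with hypothetical names (load_llm / load_embedder
are not in this patch), assuming embedder_config grows the same supplier field and
load_hugging_face_model() keeps returning an (llm, embeddings) pair:

    def load_llm():
        # hypothetical split of load_model(): the LLM follows llm_config only
        if llm_config.supplier == "hugging_face":
            llm, _ = load_hugging_face_model()
            return llm
        return Ollama(model=llm_config.model_name)

    def load_embedder():
        # ...and the embedder follows embedder_config only
        if embedder_config.supplier == "hugging_face":
            _, embeddings = load_hugging_face_model()
            return embeddings
        return OllamaEmbeddings(model=embedder_config.model_name)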