From ff5aa211eabe47575893a3b4658e182bd8e167ae Mon Sep 17 00:00:00 2001
From: janvanlooyml6 <56920221+janvanlooyml6@users.noreply.github.com>
Date: Thu, 18 Jan 2024 14:04:00 +0100
Subject: [PATCH] add total number of runs to logs & small fixes (#72)

---
 src/parameter_search.ipynb | 31 +++++++++++++++++--------------
 src/utils.py               | 40 ++++++++++++++++------------------------
 2 files changed, 33 insertions(+), 38 deletions(-)

diff --git a/src/parameter_search.ipynb b/src/parameter_search.ipynb
index cdda530..6cdb36b 100644
--- a/src/parameter_search.ipynb
+++ b/src/parameter_search.ipynb
@@ -129,7 +129,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "**Check if GPU is available**"
+    "Check if **GPU** is available"
    ]
   },
   {
@@ -165,7 +165,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install -q -r ../requirements.txt --disable-pip-version-check && echo \"Success\""
+    "%pip install -q -r ../requirements.txt --disable-pip-version-check && echo \"Success\""
    ]
   },
   {
@@ -320,11 +320,11 @@
    "outputs": [],
    "source": [
     "searchable_index_params = {\n",
-    "    'chunk_size' : [192, 256, 320],\n",
-    "    'chunk_overlap' : [64, 128, 192],\n",
+    "    'chunk_size' : [128, 256, 384],\n",
+    "    'chunk_overlap' : [16, 64, 128],\n",
     "}\n",
     "searchable_shared_params = {\n",
-    "    'embed_model' : [(\"huggingface\",\"all-MiniLM-L6-v2\")]\n",
+    "    'embed_model' : [(\"huggingface\",\"all-MiniLM-L6-v2\")] # add more as tuples: ,(\"huggingface\", \"BAAI/bge-base-en-v1.5\")\n",
     "}\n",
     "searchable_eval_params = {\n",
     "    'retrieval_top_k' : [2, 4, 8]\n",
@@ -355,7 +355,7 @@
     "shared_args = {\n",
     "    \"base_path\" : \"./data\", # where data goes\n",
     "    \"embed_api_key\" : {},\n",
-    "    \"weaviate_url\" : f\"http://{get_host_ip()}:8081\" # IP address\n",
+    "    \"weaviate_url\" : f\"http://{get_host_ip()}:8081\"\n",
     "}\n",
     "index_args = {\n",
     "    \"n_rows_to_load\" : 1000,\n",
@@ -366,8 +366,8 @@
     "    \"llm_module_name\": \"langchain.chat_models\",\n",
     "    \"llm_class_name\": \"ChatOpenAI\",\n",
     "    \"llm_kwargs\": {\n",
-    "        \"openai_api_key\": \"\" , # TODO: Update with your key or use a different model\n",
-    "        \"model_name\" : \"gpt-3.5-turbo\"\n",
+    "        \"openai_api_key\": \"\" , # TODO: update with your key or use a different model\n",
+    "        \"model_name\" : \"gpt-3.5-turbo\" # choose model, e.g. \"gpt-4\", \"gpt-3.5-turbo\"\n",
     "    },\n",
     "    \"evaluation_metrics\" : [\"context_precision\", \"context_relevancy\"]\n",
     "}\n",
@@ -400,7 +400,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
     "from utils import ParameterSearch\n",
@@ -417,6 +419,7 @@
     "    search_method = search_method,\n",
     "    target_metric = target_metric,\n",
     "    evaluation_set_path=evaluation_set_path,\n",
+    "    debug=False,\n",
     ")\n",
     "\n",
     "results = mysearch.run(weaviate_client)"
@@ -465,7 +468,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install -q \"plotly\" --disable-pip-version-check && echo \"Plotly installed successfully\""
+    "%pip install -q \"plotly\" --disable-pip-version-check && echo \"Plotly installed successfully\""
    ]
   },
   {
@@ -483,8 +486,8 @@
    "source": [
     "from utils import add_embed_model_numerical_column, show_legend_embed_models\n",
     "\n",
-    "parameter_search_results = add_embed_model_numerical_column(parameter_search_results)\n",
-    "show_legend_embed_models(parameter_search_results)"
+    "results = add_embed_model_numerical_column(results)\n",
+    "show_legend_embed_models(results)"
    ]
   },
   {
@@ -503,7 +506,7 @@
     "import plotly.express as px\n",
     "\n",
     "dimensions = ['chunk_size', 'chunk_overlap', 'embed_model_numerical', 'retrieval_top_k', 'context_precision']\n",
-    "fig = px.parallel_coordinates(parameter_search_results, color=\"context_precision\",\n",
+    "fig = px.parallel_coordinates(results, color=\"context_precision\",\n",
     "                              dimensions=dimensions,\n",
     "                              color_continuous_scale=px.colors.sequential.Bluered)\n",
     "fig.show()"
@@ -614,7 +617,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.9.13"
   }
  },
  "nbformat": 4,
diff --git a/src/utils.py b/src/utils.py
index 2edd307..6e34f09 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -173,11 +173,12 @@ def run(self, weaviate_client: weaviate.Client):
         while True:
             configs = self.create_configs(run_count)
 
+            # stop running when out of configs
             if configs is None:
                 break
 
             # create configs
-            indexing_config, evaluation_config = configs
+            indexing_config, evaluation_config, n_runs = configs
 
             # create pipeline objects
             indexing_pipeline, evaluation_pipeline = self.create_pipelines(
@@ -186,7 +187,10 @@ def run(self, weaviate_client: weaviate.Client):
             )
 
             # run indexing pipeline
-            self.run_indexing_pipeline(run_count, indexing_config, indexing_pipeline)
+            logger.info(
+                f"Starting indexing pipeline of run {run_count}/{n_runs} with {indexing_config}",
+            )
+            self.runner.run(indexing_pipeline)
 
             check_weaviate_class_exists(
                 weaviate_client,
@@ -194,10 +198,13 @@ def run(self, weaviate_client: weaviate.Client):
             )
 
             # run evaluation pipeline
-            self.run_evaluation_pipeline(
-                run_count,
-                evaluation_config,
-                evaluation_pipeline,
+            logger.info(
+                f"Starting evaluation pipeline of run {run_count}/{n_runs} "
+                f"with {evaluation_config}",
+            )
+            self.runner.run(
+                input=evaluation_pipeline,
+                extra_volumes=self.extra_volumes,
             )
 
             # read metrics from pipeline output
@@ -222,6 +229,7 @@ def create_configs(self, run_count: int):
         if self.search_method == "grid_search":
             # all possible combinations of parameters
             all_combinations = list(cartesian_product(self.searchable_params))
+            n_runs = len(all_combinations)
 
             # when all combinations have been tried, stop searching
             if run_count > len(all_combinations) - 1:
@@ -255,6 +263,7 @@ def create_configs(self, run_count: int):
             variations_to_try = [
                 {keys_to_try[i]: values_to_try[i]} for i in range(len(keys_to_try))
             ]
+            n_runs = len(variations_to_try) + 1
 
             # if there are no variations to try, just schedule one run
             if len(variations_to_try) == 0:
@@ -315,7 +324,7 @@ def create_configs(self, run_count: int):
             "embed_model"
         ] = indexing_config["embed_model"][1]
 
-        return indexing_config, evaluation_config
+        return indexing_config, evaluation_config, n_runs
 
     def create_pipelines(self, indexing_config, evaluation_config):
         # create indexing pipeline
@@ -352,20 +361,3 @@ def create_pipelines(self, indexing_config, evaluation_config):
         logger.info({**self.shared_args, **self.eval_args, **evaluation_config})
 
         return indexing_pipeline, evaluation_pipeline
-
-    def run_indexing_pipeline(self, run_count, indexing_config, indexing_pipeline):
-        logger.info(
-            f"Starting indexing pipeline of run #{run_count} with {indexing_config}",
-        )
-        self.runner.run(indexing_pipeline)
-
-    def run_evaluation_pipeline(
-        self,
-        run_count,
-        evaluation_config,
-        evaluation_pipeline,
-    ):
-        logger.info(
-            f"Starting evaluation pipeline of run #{run_count} with {evaluation_config}",
-        )
-        self.runner.run(input=evaluation_pipeline, extra_volumes=self.extra_volumes)
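
---

Note (illustration only, not part of the commit): the `n_runs` that now appears in both
log lines is just the size of the search space enumerated by `create_configs`. A minimal
sketch of the grid-search count, assuming `cartesian_product` expands a dict of lists the
way `itertools.product` does — the helper below is a hypothetical stand-in for the one
actually used in utils.py:

    import itertools

    def cartesian_product(params):
        # yield one {parameter: value} dict per combination of values
        keys = list(params)
        for values in itertools.product(*(params[k] for k in keys)):
            yield dict(zip(keys, values))

    searchable = {
        "chunk_size": [128, 256, 384],
        "chunk_overlap": [16, 64, 128],
    }
    all_combinations = list(cartesian_product(searchable))
    n_runs = len(all_combinations)  # 3 * 3 = 9 runs reported in the logs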