From ff5aa211eabe47575893a3b4658e182bd8e167ae Mon Sep 17 00:00:00 2001
From: janvanlooyml6 <56920221+janvanlooyml6@users.noreply.github.com>
Date: Thu, 18 Jan 2024 14:04:00 +0100
Subject: [PATCH] add total number of runs to logs & small fixes (#72)

---
 src/parameter_search.ipynb | 31 +++++++++++++++++--------------
 src/utils.py               | 40 ++++++++++++++++------------------------
 2 files changed, 33 insertions(+), 38 deletions(-)

diff --git a/src/parameter_search.ipynb b/src/parameter_search.ipynb
index cdda530..6cdb36b 100644
--- a/src/parameter_search.ipynb
+++ b/src/parameter_search.ipynb
@@ -129,7 +129,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "**Check if GPU is available**"
+    "Check if **GPU** is available"
    ]
   },
   {
@@ -165,7 +165,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install -q -r ../requirements.txt --disable-pip-version-check && echo \"Success\""
+    "%pip install -q -r ../requirements.txt --disable-pip-version-check && echo \"Success\""
    ]
   },
   {
@@ -320,11 +320,11 @@
    "outputs": [],
    "source": [
     "searchable_index_params = {\n",
-    "    'chunk_size' : [192, 256, 320],\n",
-    "    'chunk_overlap' : [64, 128, 192],\n",
+    "    'chunk_size' : [128, 256, 384],\n",
+    "    'chunk_overlap' : [16, 64, 128],\n",
     "}\n",
     "searchable_shared_params = {\n",
-    "    'embed_model' : [(\"huggingface\",\"all-MiniLM-L6-v2\")]\n",
+    "    'embed_model' : [(\"huggingface\",\"all-MiniLM-L6-v2\")] # add more as tuples: ,(\"huggingface\", \"BAAI/bge-base-en-v1.5\")\n",
     "}\n",
     "searchable_eval_params = {\n",
     "    'retrieval_top_k' : [2, 4, 8]\n",
@@ -355,7 +355,7 @@
     "shared_args = {\n",
     "    \"base_path\" : \"./data\", # where data goes\n",
     "    \"embed_api_key\" : {},\n",
-    "    \"weaviate_url\" : f\"http://{get_host_ip()}:8081\" # IP address\n",
+    "    \"weaviate_url\" : f\"http://{get_host_ip()}:8081\"\n",
     "}\n",
     "index_args = {\n",
     "    \"n_rows_to_load\" : 1000,\n",
@@ -366,8 +366,8 @@
     "    \"llm_module_name\": \"langchain.chat_models\",\n",
     "    \"llm_class_name\": \"ChatOpenAI\",\n",
     "    \"llm_kwargs\": {\n",
-    "        \"openai_api_key\": \"\" , # TODO: Update with your key or use a different model\n",
-    "        \"model_name\" : \"gpt-3.5-turbo\"\n",
+    "        \"openai_api_key\": \"\" , # TODO: update with your key or use a different model\n",
+    "        \"model_name\" : \"gpt-3.5-turbo\" # choose model, e.g. \"gpt-4\", \"gpt-3.5-turbo\"\n",
     "    },\n",
     "    \"evaluation_metrics\" : [\"context_precision\", \"context_relevancy\"]\n",
     "}\n",
@@ -400,7 +400,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
     "from utils import ParameterSearch\n",
@@ -417,6 +419,7 @@
     "    search_method = search_method,\n",
     "    target_metric = target_metric,\n",
     "    evaluation_set_path=evaluation_set_path,\n",
+    "    debug=False,\n",
     ")\n",
     "\n",
     "results = mysearch.run(weaviate_client)"
@@ -465,7 +468,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install -q \"plotly\" --disable-pip-version-check && echo \"Plotly installed successfully\""
+    "%pip install -q \"plotly\" --disable-pip-version-check && echo \"Plotly installed successfully\""
    ]
   },
   {
@@ -483,8 +486,8 @@
    "source": [
     "from utils import add_embed_model_numerical_column, show_legend_embed_models\n",
     "\n",
-    "parameter_search_results = add_embed_model_numerical_column(parameter_search_results)\n",
-    "show_legend_embed_models(parameter_search_results)"
+    "results = add_embed_model_numerical_column(results)\n",
+    "show_legend_embed_models(results)"
    ]
   },
   {
@@ -503,7 +506,7 @@
     "import plotly.express as px\n",
     "\n",
     "dimensions = ['chunk_size', 'chunk_overlap', 'embed_model_numerical', 'retrieval_top_k', 'context_precision']\n",
-    "fig = px.parallel_coordinates(parameter_search_results, color=\"context_precision\",\n",
+    "fig = px.parallel_coordinates(results, color=\"context_precision\",\n",
     "                              dimensions=dimensions,\n",
     "                              color_continuous_scale=px.colors.sequential.Bluered)\n",
     "fig.show()"
@@ -614,7 +617,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.9.13"
   }
  },
  "nbformat": 4,
diff --git a/src/utils.py b/src/utils.py
index 2edd307..6e34f09 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -173,11 +173,12 @@ def run(self, weaviate_client: weaviate.Client):
         while True:
             configs = self.create_configs(run_count)
 
+            # stop running when out of configs
             if configs is None:
                 break
 
             # create configs
-            indexing_config, evaluation_config = configs
+            indexing_config, evaluation_config, n_runs = configs
 
             # create pipeline objects
             indexing_pipeline, evaluation_pipeline = self.create_pipelines(
@@ -186,7 +187,10 @@ def run(self, weaviate_client: weaviate.Client):
             )
 
             # run indexing pipeline
-            self.run_indexing_pipeline(run_count, indexing_config, indexing_pipeline)
+            logger.info(
+                f"Starting indexing pipeline of run {run_count}/{n_runs} with {indexing_config}",
+            )
+            self.runner.run(indexing_pipeline)
 
             check_weaviate_class_exists(
                 weaviate_client,
@@ -194,10 +198,13 @@ def run(self, weaviate_client: weaviate.Client):
             )
 
             # run evaluation pipeline
-            self.run_evaluation_pipeline(
-                run_count,
-                evaluation_config,
-                evaluation_pipeline,
+            logger.info(
+                f"Starting evaluation pipeline of run {run_count}/{n_runs} "
+                f"with {evaluation_config}",
+            )
+            self.runner.run(
+                input=evaluation_pipeline,
+                extra_volumes=self.extra_volumes,
             )
 
             # read metrics from pipeline output
@@ -222,6 +229,7 @@ def create_configs(self, run_count: int):
         if self.search_method == "grid_search":
             # all possible combinations of parameters
             all_combinations = list(cartesian_product(self.searchable_params))
+            n_runs = len(all_combinations)
 
             # when all combinations have been tried, stop searching
             if run_count > len(all_combinations) - 1:
@@ -255,6 +263,7 @@ def create_configs(self, run_count: int):
             variations_to_try = [
                 {keys_to_try[i]: values_to_try[i]} for i in range(len(keys_to_try))
             ]
+            n_runs = len(variations_to_try) + 1
 
             # if there are no variations to try, just schedule one run
             if len(variations_to_try) == 0:
@@ -315,7 +324,7 @@ def create_configs(self, run_count: int):
             "embed_model"
         ] = indexing_config["embed_model"][1]
 
-        return indexing_config, evaluation_config
+        return indexing_config, evaluation_config, n_runs
 
     def create_pipelines(self, indexing_config, evaluation_config):
         # create indexing pipeline
@@ -352,20 +361,3 @@ def create_pipelines(self, indexing_config, evaluation_config):
         logger.info({**self.shared_args, **self.eval_args, **evaluation_config})
 
         return indexing_pipeline, evaluation_pipeline
-
-    def run_indexing_pipeline(self, run_count, indexing_config, indexing_pipeline):
-        logger.info(
-            f"Starting indexing pipeline of run #{run_count} with {indexing_config}",
-        )
-        self.runner.run(indexing_pipeline)
-
-    def run_evaluation_pipeline(
-        self,
-        run_count,
-        evaluation_config,
-        evaluation_pipeline,
-    ):
-        logger.info(
-            f"Starting evaluation pipeline of run #{run_count} with {evaluation_config}",
-        )
-        self.runner.run(input=evaluation_pipeline, extra_volumes=self.extra_volumes)
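
---

Note (illustration only, not part of the commit): the `n_runs` that now appears in both
log lines is just the size of the search space enumerated by `create_configs`. A minimal
sketch of the grid-search count, assuming `cartesian_product` expands a dict of lists the
way `itertools.product` does — the helper below is a hypothetical stand-in for the one
actually used in utils.py:

    import itertools

    def cartesian_product(params):
        # yield one {parameter: value} dict per combination of values
        keys = list(params)
        for values in itertools.product(*(params[k] for k in keys)):
            yield dict(zip(keys, values))

    searchable = {
        "chunk_size": [128, 256, 384],
        "chunk_overlap": [16, 64, 128],
    }
    all_combinations = list(cartesian_product(searchable))
    n_runs = len(all_combinations)  # 3 * 3 = 9 runs reported in the logs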