From b91071019a0d10d596dad1742a72dba06b1f4c37 Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Tue, 16 Jan 2024 10:59:26 +0100 Subject: [PATCH] Bump to 0.9.0 (#66) --- requirements.txt | 5 +- .../aggregate_eval_results/requirements.txt | 2 +- src/components/text_cleaning/requirements.txt | 2 +- src/evaluation.ipynb | 65 ++++++++++--------- src/parameter_search.ipynb | 55 +++++++++------- src/pipeline.ipynb | 33 ++++++++-- src/pipeline_eval.py | 12 ++-- src/pipeline_index.py | 6 +- src/utils.py | 31 ++++++++- .../docker-compose.yaml | 0 10 files changed, 135 insertions(+), 76 deletions(-) rename src/{weaviate => weaviate_service}/docker-compose.yaml (100%) diff --git a/requirements.txt b/requirements.txt index 78929e9..609e239 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ -fondant==0.8.0 -notebook==7.0.6 \ No newline at end of file +fondant==0.9.0 +notebook==7.0.6 +weaviate-client==3.25.3 diff --git a/src/components/aggregate_eval_results/requirements.txt b/src/components/aggregate_eval_results/requirements.txt index 6d86734..53e5d83 100644 --- a/src/components/aggregate_eval_results/requirements.txt +++ b/src/components/aggregate_eval_results/requirements.txt @@ -1 +1 @@ -fondant[component]==0.8.dev6 \ No newline at end of file +fondant[component]==0.9.0 \ No newline at end of file diff --git a/src/components/text_cleaning/requirements.txt b/src/components/text_cleaning/requirements.txt index 29abcb4..f2c5454 100644 --- a/src/components/text_cleaning/requirements.txt +++ b/src/components/text_cleaning/requirements.txt @@ -1 +1 @@ -fondant[component]==0.8.dev4 +fondant[component]==0.9.0 diff --git a/src/evaluation.ipynb b/src/evaluation.ipynb index bb3085c..4bfd424 100644 --- a/src/evaluation.ipynb +++ b/src/evaluation.ipynb @@ -65,9 +65,7 @@ }, { "cell_type": "markdown", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, + "metadata": {}, "source": [ "## Set up environment" ] @@ -177,9 +175,7 @@ }, { "cell_type": "markdown", - 
"metadata": { - "jp-MarkdownHeadingCollapsed": true - }, + "metadata": {}, "source": [ "## Spin up the Weaviate vector store" ] @@ -217,7 +213,7 @@ "metadata": {}, "outputs": [], "source": [ - "!docker compose -f weaviate/docker-compose.yaml up --detach" + "!docker compose -f weaviate_service/docker-compose.yaml up --detach" ] }, { @@ -227,15 +223,6 @@ "Make sure you have **Weaviate client v3**" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install -q \"weaviate-client==3.*\" --disable-pip-version-check && echo \"Weaviate client installed successfully\"" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -263,7 +250,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Indexing pipeline" + "#### Indexing pipeline" ] }, { @@ -329,8 +316,7 @@ "# Parameters for the indexing pipeline\n", "indexing_args = {\n", " \"n_rows_to_load\": 1000,\n", - " \"chunk_size\": 1024,\n", - " \"chunk_overlap\": 8,\n", + " \"chunk_args\": {\"chunk_size\": 512, \"chunk_overlap\": 32}\n", "}\n", "\n", "# Parameters for the GPU resources\n", @@ -421,15 +407,14 @@ "import os\n", "import pipeline_eval\n", "\n", - "os.environ[\"OPENAI_API_KEY\"] = \"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\"\n", "\n", "evaluation_args = {\n", " \"retrieval_top_k\": 2,\n", - " \"evaluation_module\": \"langchain.chat_models\",\n", - " \"evaluation_llm\": \"ChatOpenAI\",\n", - " \"evaluation_llm_kwargs\": {\n", - " \"openai_api_key\": os.environ[\"OPENAI_API_KEY\"], # TODO: Update with your key or use a different model\n", - " \"model_name\" : \"gpt-3.5-turbo\"\n", + " \"llm_module_name\": \"langchain.chat_models\",\n", + " \"llm_class_name\": \"ChatOpenAI\",\n", + " \"llm_kwargs\": {\n", + " \"openai_api_key\":\"\" , # TODO: Update with your key or use a different model\n", + " \"model_name\" : \"gpt-3.5-turbo\"\n", " },\n", " \"evaluation_metrics\": [\"context_precision\", \"context_relevancy\"]\n", "}\n", @@ -450,9 
+435,13 @@ "metadata": {}, "outputs": [], "source": [ - "runner = DockerRunner()\n", - "extra_volumes = [str(os.path.join(os.path.abspath('.'), \"evaluation_datasets\")) + \":/evaldata\"]\n", - "runner.run(evaluation_pipeline, extra_volumes=extra_volumes)" + "if utils.check_weaviate_class_exists(\n", + " local_weaviate_client,\n", + " shared_args[\"weaviate_class\"]\n", + "): \n", + " runner = DockerRunner()\n", + " extra_volumes = [str(os.path.join(os.path.abspath('.'), \"evaluation_datasets\")) + \":/evaldata\"]\n", + " runner.run(evaluation_pipeline, extra_volumes=extra_volumes)" ] }, { @@ -507,6 +496,24 @@ "run_explorer_app(base_path=BASE_PATH)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To stop the Explorer, run the cell below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from fondant.explore import stop_explorer_app\n", + "\n", + "stop_explorer_app()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -565,7 +572,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/src/parameter_search.ipynb b/src/parameter_search.ipynb index 38676b7..cdda530 100644 --- a/src/parameter_search.ipynb +++ b/src/parameter_search.ipynb @@ -59,9 +59,7 @@ }, { "cell_type": "markdown", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, + "metadata": {}, "source": [ "## Set up environment" ] @@ -172,9 +170,7 @@ }, { "cell_type": "markdown", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, + "metadata": {}, "source": [ "## Spin up the Weaviate vector store" ] @@ -219,16 +215,7 @@ "metadata": {}, "outputs": [], "source": [ - "!docker compose -f weaviate/docker-compose.yaml up --detach" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install -q \"weaviate-client==3.*\"
--disable-pip-version-check && echo \"Weaviate client installed successfully\"" + "!docker compose -f weaviate_service/docker-compose.yaml up --detach" ] }, { @@ -337,7 +324,7 @@ " 'chunk_overlap' : [64, 128, 192],\n", "}\n", "searchable_shared_params = {\n", - " 'embed_model' : [(\"huggingface\",\"all-MiniLM-L6-v2\"), (\"huggingface\", \"BAAI/bge-base-en-v1.5\")]\n", + " 'embed_model' : [(\"huggingface\",\"all-MiniLM-L6-v2\")]\n", "}\n", "searchable_eval_params = {\n", " 'retrieval_top_k' : [2, 4, 8]\n", @@ -363,7 +350,6 @@ "source": [ "from utils import get_host_ip\n", "\n", - "os.environ[\"OPENAI_API_KEY\"] = \"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\"\n", "\n", "# configurable parameters\n", "shared_args = {\n", @@ -377,10 +363,12 @@ "eval_args = {\n", " \"evaluation_set_filename\" : \"wikitext_1000_q.csv\",\n", " \"evaluation_set_separator\" : \";\",\n", - " \"evaluation_module\": \"langchain.chat_models\",\n", - " \"evaluation_llm\": \"ChatOpenAI\",\n", - " \"evaluation_llm_kwargs\": {\"openai_api_key\": os.environ[\"OPENAI_API_KEY\"], #TODO Specify your key if you're using OpenAI\n", - " \"model_name\" : \"gpt-3.5-turbo\"}, # e.g. 
\"gpt-4\" or \"gpt-3.5-turbo\"\n", + " \"llm_module_name\": \"langchain.chat_models\",\n", + " \"llm_class_name\": \"ChatOpenAI\",\n", + " \"llm_kwargs\": {\n", + " \"openai_api_key\": \"\" , # TODO: Update with your key or use a different model\n", + " \"model_name\" : \"gpt-3.5-turbo\"\n", + " },\n", " \"evaluation_metrics\" : [\"context_precision\", \"context_relevancy\"]\n", "}\n", "\n", @@ -416,6 +404,7 @@ "outputs": [], "source": [ "from utils import ParameterSearch\n", + "from utils import check_weaviate_class_exists\n", "\n", "mysearch = ParameterSearch(\n", " searchable_index_params = searchable_index_params,\n", @@ -430,7 +419,7 @@ " evaluation_set_path=evaluation_set_path,\n", ")\n", "\n", - "results = mysearch.run()" + "results = mysearch.run(weaviate_client)" ] }, { @@ -549,6 +538,24 @@ "run_explorer_app(base_path=shared_args[\"base_path\"])" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To stop the Explorer, run the cell below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from fondant.explore import stop_explorer_app\n", + "\n", + "stop_explorer_app()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -607,7 +614,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/src/pipeline.ipynb b/src/pipeline.ipynb index e5f3f50..5735e42 100644 --- a/src/pipeline.ipynb +++ b/src/pipeline.ipynb @@ -130,7 +130,7 @@ "from pathlib import Path\n", "from fondant.pipeline import Pipeline, Resources\n", "\n", - "BASE_PATH = \"./data-dir\"\n", + "BASE_PATH = \"./data\"\n", "Path(BASE_PATH).mkdir(parents=True, exist_ok=True)\n", "\n", "pipeline = Pipeline(\n", @@ -187,8 +187,7 @@ "chunks = text.apply(\n", " \"chunk_text\",\n", " arguments={\n", - " \"chunk_size\": 512,\n", - " \"chunk_overlap\": 32,\n", + " \"chunk_args\": {\"chunk_size\": 512, \"chunk_overlap\":
32}\n", " }\n", ")\n", "\n", @@ -252,7 +251,7 @@ "metadata": {}, "outputs": [], "source": [ - "!docker compose -f weaviate/docker-compose.yaml up --detach --quiet-pull" + "!docker compose -f weaviate_service/docker-compose.yaml up --detach --quiet-pull" ] }, { @@ -307,7 +306,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To stop the Explorer and continue the notebook, press the stop button at the top of the notebook." + "To stop the Explorer, run the cell below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from fondant.explore import stop_explorer_app\n", + "\n", + "stop_explorer_app()" ] }, { @@ -435,7 +445,7 @@ "outputs": [], "source": [ "%%writefile components/text_cleaning/requirements.txt\n", - "fondant[component]==0.8.dev4" + "fondant[component]==0.9.0" ] }, { @@ -555,6 +565,15 @@ "!docker compose -f weaviate/docker-compose.yaml down" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_explorer_app()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -581,7 +600,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/src/pipeline_eval.py b/src/pipeline_eval.py index 87c2c54..05db011 100644 --- a/src/pipeline_eval.py +++ b/src/pipeline_eval.py @@ -16,9 +16,9 @@ def create_pipeline( embed_model: str = "all-MiniLM-L6-v2", embed_api_key: dict = {}, retrieval_top_k: int = 3, - evaluation_module: str = "langchain.llms", - evaluation_llm: str = "OpenAI", - evaluation_llm_kwargs: dict = {"model_name": "gpt-3.5-turbo"}, + llm_module_name: str = "langchain.chat_models", + llm_class_name: str = "ChatOpenAI", + llm_kwargs: dict = {"model_name": "gpt-3.5-turbo"}, evaluation_metrics: list = ["context_precision", "context_relevancy"], number_of_accelerators=None, accelerator_name=None, @@ -72,9 +72,9 @@ def
create_pipeline( retriever_eval = retrieve_chunks.apply( "evaluate_ragas", arguments={ - "module": evaluation_module, - "llm_name": evaluation_llm, - "llm_kwargs": evaluation_llm_kwargs, + "llm_module_name": llm_module_name, + "llm_class_name": llm_class_name, + "llm_kwargs": llm_kwargs, }, produces={metric: pa.float32() for metric in evaluation_metrics}, ) diff --git a/src/pipeline_index.py b/src/pipeline_index.py index e1926e6..999fddb 100644 --- a/src/pipeline_index.py +++ b/src/pipeline_index.py @@ -13,8 +13,7 @@ def create_pipeline( embed_model_provider: str = "huggingface", embed_model: str = "all-MiniLM-L6-v2", embed_api_key: dict = {}, - chunk_size: int = 512, - chunk_overlap: int = 32, + chunk_args: dict = {"chunk_size": 512, "chunk_overlap": 32}, number_of_accelerators=None, accelerator_name=None, ): @@ -40,8 +39,7 @@ def create_pipeline( chunks = text.apply( "chunk_text", arguments={ - "chunk_size": chunk_size, - "chunk_overlap": chunk_overlap, + "chunk_args": chunk_args, }, ) diff --git a/src/utils.py b/src/utils.py index 1b28e79..91285be 100644 --- a/src/utils.py +++ b/src/utils.py @@ -9,12 +9,28 @@ import pandas as pd import pipeline_eval import pipeline_index +import weaviate from fondant.pipeline.runner import DockerRunner logger = logging.getLogger() logger.setLevel(logging.INFO) +def check_weaviate_class_exists( + weaviate_client: weaviate.Client, + weaviate_class: str, +) -> bool: + """Check if a class exists in Weaviate.""" + classes = weaviate_client.schema.get()["classes"] + available_classes = [_class["class"] for _class in classes] + if weaviate_class not in available_classes: + logger.error(f"Class {weaviate_class} does not exist in Weaviate.") + return False + + logger.info(f"Class {weaviate_class} exists in Weaviate.") + return True + + def get_host_ip(): try: # Create a socket object and connect to an external server @@ -151,7 +167,7 @@ def __init__( # list of dicts to store all params & results self.results = [] - def run(self): + def 
run(self, weaviate_client: weaviate.Client): run_count = 0 while True: @@ -172,6 +188,10 @@ def run(self): # run indexing pipeline self.run_indexing_pipeline(run_count, indexing_config, indexing_pipeline) + check_weaviate_class_exists( + weaviate_client, indexing_config["weaviate_class"] + ) + # run evaluation pipeline self.run_evaluation_pipeline( run_count, @@ -299,10 +319,17 @@ def create_configs(self, run_count: int): def create_pipelines(self, indexing_config, evaluation_config): # create indexing pipeline + indexing_config_copy = indexing_config.copy() + + indexing_config_copy["chunk_args"] = { + "chunk_size": indexing_config_copy.pop("chunk_size"), + "chunk_overlap": indexing_config_copy.pop("chunk_overlap"), + } + indexing_pipeline = pipeline_index.create_pipeline( **self.shared_args, **self.index_args, - **indexing_config, + **indexing_config_copy, **self.resource_args, ) diff --git a/src/weaviate/docker-compose.yaml b/src/weaviate_service/docker-compose.yaml similarity index 100% rename from src/weaviate/docker-compose.yaml rename to src/weaviate_service/docker-compose.yaml