From b91071019a0d10d596dad1742a72dba06b1f4c37 Mon Sep 17 00:00:00 2001
From: Philippe Moussalli <philippe.moussalli95@gmail.com>
Date: Tue, 16 Jan 2024 10:59:26 +0100
Subject: [PATCH] Bump to 0.9.0 (#66)

---
 requirements.txt                              |  5 +-
 .../aggregate_eval_results/requirements.txt   |  2 +-
 src/components/text_cleaning/requirements.txt |  2 +-
 src/evaluation.ipynb                          | 65 ++++++++++---------
 src/parameter_search.ipynb                    | 55 +++++++++-------
 src/pipeline.ipynb                            | 33 ++++++++--
 src/pipeline_eval.py                          | 12 ++--
 src/pipeline_index.py                         |  6 +-
 src/utils.py                                  | 31 ++++++++-
 .../docker-compose.yaml                       |  0
 10 files changed, 135 insertions(+), 76 deletions(-)
 rename src/{weaviate => weaviate_service}/docker-compose.yaml (100%)

diff --git a/requirements.txt b/requirements.txt
index 78929e9..609e239 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
-fondant==0.8.0
-notebook==7.0.6
\ No newline at end of file
+fondant==0.9.0
+notebook==7.0.6
+weaviate-client==3.25.3
diff --git a/src/components/aggregate_eval_results/requirements.txt b/src/components/aggregate_eval_results/requirements.txt
index 6d86734..53e5d83 100644
--- a/src/components/aggregate_eval_results/requirements.txt
+++ b/src/components/aggregate_eval_results/requirements.txt
@@ -1 +1 @@
-fondant[component]==0.8.dev6
\ No newline at end of file
+fondant[component]==0.9.0
\ No newline at end of file
diff --git a/src/components/text_cleaning/requirements.txt b/src/components/text_cleaning/requirements.txt
index 29abcb4..f2c5454 100644
--- a/src/components/text_cleaning/requirements.txt
+++ b/src/components/text_cleaning/requirements.txt
@@ -1 +1 @@
-fondant[component]==0.8.dev4
+fondant[component]==0.9.0
diff --git a/src/evaluation.ipynb b/src/evaluation.ipynb
index bb3085c..4bfd424 100644
--- a/src/evaluation.ipynb
+++ b/src/evaluation.ipynb
@@ -65,9 +65,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "jp-MarkdownHeadingCollapsed": true
-   },
+   "metadata": {},
    "source": [
     "## Set up environment"
    ]
@@ -177,9 +175,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "jp-MarkdownHeadingCollapsed": true
-   },
+   "metadata": {},
    "source": [
     "## Spin up the Weaviate vector store"
    ]
@@ -217,7 +213,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!docker compose -f weaviate/docker-compose.yaml up --detach"
+    "!docker compose -f weaviate_service/docker-compose.yaml up --detach"
    ]
   },
   {
@@ -227,15 +223,6 @@
     "Make sure you have **Weaviate client v3**"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!pip install -q \"weaviate-client==3.*\" --disable-pip-version-check && echo \"Weaviate client installed successfully\""
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -263,7 +250,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Indexing pipeline"
+    "#### Indexing pipeline"
    ]
   },
   {
@@ -329,8 +316,7 @@
     "# Parameters for the indexing pipeline\n",
     "indexing_args = {\n",
     "    \"n_rows_to_load\": 1000,\n",
-    "    \"chunk_size\": 1024,\n",
-    "    \"chunk_overlap\": 8,\n",
+    "    \"chunk_args\": {\"chunk_size\": 512, \"chunk_overlap\": 32}\n",
     "}\n",
     "\n",
     "# Parameters for the GPU resources\n",
@@ -421,15 +407,14 @@
     "import os\n",
     "import pipeline_eval\n",
     "\n",
-    "os.environ[\"OPENAI_API_KEY\"] = \"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\"\n",
     "\n",
     "evaluation_args = {\n",
     "    \"retrieval_top_k\": 2,\n",
-    "    \"evaluation_module\": \"langchain.chat_models\",\n",
-    "    \"evaluation_llm\": \"ChatOpenAI\",\n",
-    "    \"evaluation_llm_kwargs\": {\n",
-    "                              \"openai_api_key\": os.environ[\"OPENAI_API_KEY\"],   # TODO: Update with your key or use a different model\n",
-    "                              \"model_name\" : \"gpt-3.5-turbo\"\n",
+    "    \"llm_module_name\": \"langchain.chat_models\",\n",
+    "    \"llm_class_name\": \"ChatOpenAI\",\n",
+    "    \"llm_kwargs\": {\n",
+    "      \"openai_api_key\":\"\" ,   # TODO: Update with your key or use a different model\n",
+    "      \"model_name\" : \"gpt-3.5-turbo\"\n",
     "    },\n",
     "    \"evaluation_metrics\": [\"context_precision\", \"context_relevancy\"]\n",
     "}\n",
@@ -450,9 +435,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "runner = DockerRunner()\n",
-    "extra_volumes = [str(os.path.join(os.path.abspath('.'), \"evaluation_datasets\")) + \":/evaldata\"]\n",
-    "runner.run(evaluation_pipeline, extra_volumes=extra_volumes)"
+    "if utils.check_weaviate_class_exists(\n",
+    "    local_weaviate_client,\n",
+    "    shared_args[\"weaviate_class\"]\n",
+    "): \n",
+    "    runner = DockerRunner()\n",
+    "    extra_volumes = [str(os.path.join(os.path.abspath('.'), \"evaluation_datasets\")) + \":/evaldata\"]\n",
+    "    runner.run(evaluation_pipeline, extra_volumes=extra_volumes)"
    ]
   },
   {
@@ -507,6 +496,24 @@
     "run_explorer_app(base_path=BASE_PATH)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To stop the Explorer, run the cell below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from fondant.explore import stop_explorer_app\n",
+    "\n",
+    "stop_explorer_app()"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -565,7 +572,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.13"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,
diff --git a/src/parameter_search.ipynb b/src/parameter_search.ipynb
index 38676b7..cdda530 100644
--- a/src/parameter_search.ipynb
+++ b/src/parameter_search.ipynb
@@ -59,9 +59,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "jp-MarkdownHeadingCollapsed": true
-   },
+   "metadata": {},
    "source": [
     "## Set up environment"
    ]
@@ -172,9 +170,7 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "jp-MarkdownHeadingCollapsed": true
-   },
+   "metadata": {},
    "source": [
     "## Spin up the Weaviate vector store"
    ]
@@ -219,16 +215,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!docker compose -f weaviate/docker-compose.yaml up --detach"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!pip install -q \"weaviate-client==3.*\" --disable-pip-version-check && echo \"Weaviate client installed successfully\""
+    "!docker compose -f weaviate_service/docker-compose.yaml up --detach"
    ]
   },
   {
@@ -337,7 +324,7 @@
     "    'chunk_overlap' : [64, 128, 192],\n",
     "}\n",
     "searchable_shared_params = {\n",
-    "    'embed_model' : [(\"huggingface\",\"all-MiniLM-L6-v2\"), (\"huggingface\", \"BAAI/bge-base-en-v1.5\")]\n",
+    "    'embed_model' : [(\"huggingface\",\"all-MiniLM-L6-v2\")]\n",
     "}\n",
     "searchable_eval_params = {\n",
     "    'retrieval_top_k' : [2, 4, 8]\n",
@@ -363,7 +350,6 @@
    "source": [
     "from utils import get_host_ip\n",
     "\n",
-    "os.environ[\"OPENAI_API_KEY\"] = \"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\"\n",
     "\n",
     "# configurable parameters\n",
     "shared_args = {\n",
@@ -377,10 +363,12 @@
     "eval_args = {\n",
     "    \"evaluation_set_filename\" : \"wikitext_1000_q.csv\",\n",
     "    \"evaluation_set_separator\" : \";\",\n",
-    "    \"evaluation_module\": \"langchain.chat_models\",\n",
-    "    \"evaluation_llm\": \"ChatOpenAI\",\n",
-    "    \"evaluation_llm_kwargs\": {\"openai_api_key\": os.environ[\"OPENAI_API_KEY\"], #TODO Specify your key if you're using OpenAI\n",
-    "                              \"model_name\" : \"gpt-3.5-turbo\"}, # e.g. \"gpt-4\" or \"gpt-3.5-turbo\"\n",
+    "    \"llm_module_name\": \"langchain.chat_models\",\n",
+    "    \"llm_class_name\": \"ChatOpenAI\",\n",
+    "    \"llm_kwargs\": {\n",
+    "      \"openai_api_key\": \"\" ,   # TODO: Update with your key or use a different model\n",
+    "      \"model_name\" : \"gpt-3.5-turbo\"\n",
+    "    },\n",
     "    \"evaluation_metrics\" : [\"context_precision\", \"context_relevancy\"]\n",
     "}\n",
     "\n",
@@ -416,6 +404,7 @@
    "outputs": [],
    "source": [
     "from utils import ParameterSearch\n",
+    "from utils import check_weaviate_class_exists\n",
     "\n",
     "mysearch = ParameterSearch(\n",
     "    searchable_index_params = searchable_index_params,\n",
@@ -430,7 +419,7 @@
     "    evaluation_set_path=evaluation_set_path,\n",
     ")\n",
     "\n",
-    "results = mysearch.run()"
+    "results = mysearch.run(weaviate_client)"
    ]
   },
   {
@@ -549,6 +538,24 @@
     "run_explorer_app(base_path=shared_args[\"base_path\"])"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To stop the Explorer, run the cell below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from fondant.explore import stop_explorer_app\n",
+    "\n",
+    "stop_explorer_app()"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -607,7 +614,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.13"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,
diff --git a/src/pipeline.ipynb b/src/pipeline.ipynb
index e5f3f50..5735e42 100644
--- a/src/pipeline.ipynb
+++ b/src/pipeline.ipynb
@@ -130,7 +130,7 @@
     "from pathlib import Path\n",
     "from fondant.pipeline import Pipeline, Resources\n",
     "\n",
-    "BASE_PATH = \"./data-dir\"\n",
+    "BASE_PATH = \"./data\"\n",
     "Path(BASE_PATH).mkdir(parents=True, exist_ok=True)\n",
     "\n",
     "pipeline = Pipeline(\n",
@@ -187,8 +187,7 @@
     "chunks = text.apply(\n",
     "    \"chunk_text\",\n",
     "    arguments={\n",
-    "        \"chunk_size\": 512,\n",
-    "        \"chunk_overlap\": 32,\n",
+    "        \"chunk_args\": {\"chunk_size\": 512, \"chunk_overlap\": 32}\n",
     "    }\n",
     ")\n",
     "\n",
@@ -252,7 +251,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!docker compose -f weaviate/docker-compose.yaml up --detach --quiet-pull"
+    "!docker compose -f weaviate_service/docker-compose.yaml up --detach --quiet-pull"
    ]
   },
   {
@@ -307,7 +306,18 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "To stop the Explorer and continue the notebook, press the stop button at the top of the notebook."
+    "To stop the Explorer, run the cell below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from fondant.explore import stop_explorer_app\n",
+    "\n",
+    "stop_explorer_app()"
    ]
   },
   {
@@ -435,7 +445,7 @@
    "outputs": [],
    "source": [
     "%%writefile components/text_cleaning/requirements.txt\n",
-    "fondant[component]==0.8.dev4"
+    "fondant[component]==0.9.0"
    ]
   },
   {
@@ -555,6 +565,15 @@
     "!docker compose -f weaviate/docker-compose.yaml down"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "stop_explorer_app()"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -581,7 +600,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.13"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,
diff --git a/src/pipeline_eval.py b/src/pipeline_eval.py
index 87c2c54..05db011 100644
--- a/src/pipeline_eval.py
+++ b/src/pipeline_eval.py
@@ -16,9 +16,9 @@ def create_pipeline(
     embed_model: str = "all-MiniLM-L6-v2",
     embed_api_key: dict = {},
     retrieval_top_k: int = 3,
-    evaluation_module: str = "langchain.llms",
-    evaluation_llm: str = "OpenAI",
-    evaluation_llm_kwargs: dict = {"model_name": "gpt-3.5-turbo"},
+    llm_module_name: str = "langchain.chat_models",
+    llm_class_name: str = "ChatOpenAI",
+    llm_kwargs: dict = {"model_name": "gpt-3.5-turbo"},
     evaluation_metrics: list = ["context_precision", "context_relevancy"],
     number_of_accelerators=None,
     accelerator_name=None,
@@ -72,9 +72,9 @@ def create_pipeline(
     retriever_eval = retrieve_chunks.apply(
         "evaluate_ragas",
         arguments={
-            "module": evaluation_module,
-            "llm_name": evaluation_llm,
-            "llm_kwargs": evaluation_llm_kwargs,
+            "llm_module_name": llm_module_name,
+            "llm_class_name": llm_class_name,
+            "llm_kwargs": llm_kwargs,
         },
         produces={metric: pa.float32() for metric in evaluation_metrics},
     )
diff --git a/src/pipeline_index.py b/src/pipeline_index.py
index e1926e6..999fddb 100644
--- a/src/pipeline_index.py
+++ b/src/pipeline_index.py
@@ -13,8 +13,7 @@ def create_pipeline(
     embed_model_provider: str = "huggingface",
     embed_model: str = "all-MiniLM-L6-v2",
     embed_api_key: dict = {},
-    chunk_size: int = 512,
-    chunk_overlap: int = 32,
+    chunk_args: dict = {"chunk_size": 512, "chunk_overlap": 32},
     number_of_accelerators=None,
     accelerator_name=None,
 ):
@@ -40,8 +39,7 @@ def create_pipeline(
     chunks = text.apply(
         "chunk_text",
         arguments={
-            "chunk_size": chunk_size,
-            "chunk_overlap": chunk_overlap,
+            "chunk_args": chunk_args,
         },
     )
 
diff --git a/src/utils.py b/src/utils.py
index 1b28e79..91285be 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -9,12 +9,28 @@
 import pandas as pd
 import pipeline_eval
 import pipeline_index
+import weaviate
 from fondant.pipeline.runner import DockerRunner
 
 logger = logging.getLogger()
 logger.setLevel(logging.INFO)
 
 
+def check_weaviate_class_exists(
+    weaviate_client: weaviate.Client,
+    weaviate_class: str,
+) -> bool:
+    """Return True if `weaviate_class` is in the client's schema; log the outcome."""
+    classes = weaviate_client.schema.get()["classes"]
+    available_classes = [_class["class"] for _class in classes]  # schema class names
+    if weaviate_class not in available_classes:
+        logger.error(f"Class {weaviate_class} does not exist in Weaviate.")
+        return False
+
+    logger.info(f"Class {weaviate_class} exists in Weaviate.")
+    return True
+
+
 def get_host_ip():
     try:
         # Create a socket object and connect to an external server
@@ -151,7 +167,7 @@ def __init__(
         # list of dicts to store all params & results
         self.results = []
 
-    def run(self):
+    def run(self, weaviate_client: weaviate.Client):
         run_count = 0
 
         while True:
@@ -172,6 +188,10 @@ def run(self):
             # run indexing pipeline
             self.run_indexing_pipeline(run_count, indexing_config, indexing_pipeline)
 
+            check_weaviate_class_exists(
+                weaviate_client, indexing_config["weaviate_class"]
+            )
+
             # run evaluation pipeline
             self.run_evaluation_pipeline(
                 run_count,
@@ -299,10 +319,17 @@ def create_configs(self, run_count: int):
     def create_pipelines(self, indexing_config, evaluation_config):
         # create indexing pipeline
 
+        indexing_config_copy = indexing_config.copy()
+
+        indexing_config_copy["chunk_args"] = {
+            "chunk_size": indexing_config_copy.pop("chunk_size"),
+            "chunk_overlap": indexing_config_copy.pop("chunk_overlap"),
+        }
+
         indexing_pipeline = pipeline_index.create_pipeline(
             **self.shared_args,
             **self.index_args,
-            **indexing_config,
+            **indexing_config_copy,
             **self.resource_args,
         )
 
diff --git a/src/weaviate/docker-compose.yaml b/src/weaviate_service/docker-compose.yaml
similarity index 100%
rename from src/weaviate/docker-compose.yaml
rename to src/weaviate_service/docker-compose.yaml