Add fixes (#63)
Added small fixes to the SPS_merge branch, mainly formatting and
linting-related fixes. Tested all notebooks end-to-end; they seem to
work fine.
PhilippeMoussalli authored Jan 10, 2024
1 parent dce26fd commit ff6e686
Showing 5 changed files with 289 additions and 138 deletions.
61 changes: 44 additions & 17 deletions src/evaluation.ipynb
@@ -65,9 +65,7 @@
},
{
"cell_type": "markdown",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"metadata": {},
"source": [
"## Set up environment"
]
@@ -110,8 +108,7 @@
"metadata": {},
"outputs": [],
"source": [
"!docker compose version\n",
"!docker ps && echo \"Docker running\""
"!docker compose version"
]
},
{
@@ -151,9 +148,34 @@
},
{
"cell_type": "markdown",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"metadata": {},
"source": [
"**Check if GPU is available**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"import subprocess\n",
"\n",
"try:\n",
" subprocess.check_output('nvidia-smi')\n",
" logging.info(\"Found GPU, using it!\")\n",
" number_of_accelerators = 1\n",
" accelerator_name = \"GPU\"\n",
"except Exception:\n",
" logging.warning(\"We recommend to run this pipeline on a GPU, but none could be found, using CPU instead\")\n",
" number_of_accelerators = None\n",
" accelerator_name = None"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Spin up the Weaviate vector store"
]
@@ -227,7 +249,7 @@
"import weaviate\n",
"\n",
"try:\n",
" local_weaviate_client = weaviate.Client(\"http://localhost:8080\")\n",
" local_weaviate_client = weaviate.Client(\"http://localhost:8081\")\n",
" logging.info(\"Connected to Weaviate instance\")\n",
"except weaviate.WeaviateStartUpError:\n",
" logging.error(\"Cannot connect to weaviate instance, is it running?\")"
@@ -296,7 +318,7 @@
" \"embed_model_provider\": \"huggingface\",\n",
" \"embed_model\": \"all-MiniLM-L6-v2\",\n",
" \"embed_api_key\": {},\n",
" \"weaviate_url\": f\"http://{utils.get_host_ip()}:8080\",\n",
" \"weaviate_url\": f\"http://{utils.get_host_ip()}:8081\",\n",
" \"weaviate_class\": \"Pipeline1\", # Capitalized, avoid special characters (_, =, -, etc.)\n",
"}\n",
"\n",
@@ -307,7 +329,7 @@
" \"chunk_overlap\": 8,\n",
"}\n",
"\n",
"indexing_pipeline = pipeline_index.create_pipeline(**shared_args, **indexing_args)"
"# Parameters for the GPU resources\n",
"resources_args = {\n",
" \"number_of_accelerators\": number_of_accelerators,\n",
" \"accelerator_name\": accelerator_name,\n",
"}\n",
"\n",
"indexing_pipeline = pipeline_index.create_pipeline(**shared_args, **indexing_args, **resources_args)"
]
},
{
@@ -389,13 +417,12 @@
"import os\n",
"import pipeline_eval\n",
"\n",
"os.environ[\"OPENAI_API_KEY\"] = \"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\"\n",
"\n",
"evaluation_args = {\n",
" \"retrieval_top_k\": 2,\n",
" \"evaluation_set_path\" : \"./evaluation_datasets\",\n",
" \"evaluation_set_filename\" : \"wikitext_1000_q.csv\",\n",
" \"evaluation_set_separator\" : \";\",\n",
" \"evaluation_module\": \"langchain.llms\",\n",
" \"evaluation_llm\": \"OpenAI\",\n",
" \"evaluation_module\": \"langchain.chat_models\",\n",
" \"evaluation_llm\": \"ChatOpenAI\",\n",
" \"evaluation_llm_kwargs\": {\n",
" \"openai_api_key\": os.environ[\"OPENAI_API_KEY\"], # TODO: Update with your key or use a different model\n",
" \"model_name\" : \"gpt-3.5-turbo\"\n",
@@ -534,7 +561,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
"version": "3.10.12"
}
},
"nbformat": 4,
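Note on the LLM switch in the evaluation diff above: the evaluation arguments now point at langchain.chat_models / ChatOpenAI instead of the completion-style langchain.llms / OpenAI. As a rough sketch of what those two string arguments imply, the evaluation component presumably resolves them dynamically along these lines (an illustration under that assumption, not the component's actual code):

import importlib
import os

# Hypothetical resolution of the evaluation LLM from the evaluation_module
# and evaluation_llm strings; the real component code is not part of this diff.
module = importlib.import_module("langchain.chat_models")
llm_class = getattr(module, "ChatOpenAI")
llm = llm_class(
    openai_api_key=os.environ["OPENAI_API_KEY"],
    model_name="gpt-3.5-turbo",
)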
67 changes: 50 additions & 17 deletions src/parameter_search.ipynb
@@ -59,9 +59,7 @@
},
{
"cell_type": "markdown",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"metadata": {},
"source": [
"## Set up environment"
]
@@ -127,6 +125,33 @@
"logging.info(\"test\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Check if GPU is available**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"import subprocess\n",
"\n",
"try:\n",
" subprocess.check_output('nvidia-smi')\n",
" logging.info(\"Found GPU, using it!\")\n",
" number_of_accelerators = 1\n",
" accelerator_name = \"GPU\"\n",
"except Exception:\n",
" logging.warning(\"We recommend to run this pipeline on a GPU, but none could be found, using CPU instead\")\n",
" number_of_accelerators = None\n",
" accelerator_name = None"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -145,9 +170,7 @@
},
{
"cell_type": "markdown",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"metadata": {},
"source": [
"## Spin up the Weaviate vector store"
]
@@ -180,19 +203,19 @@
]
},
{
"cell_type": "code",
"execution_count": null,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"!docker compose -f weaviate/docker-compose.yaml up --detach"
"Make sure you have **Weaviate client v3**"
]
},
{
"cell_type": "markdown",
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"Make sure you have **Weaviate client v3**"
"!docker compose -f weaviate/docker-compose.yaml up --detach"
]
},
{
@@ -316,6 +339,7 @@
" 'retrieval_top_k' : [2, 4, 8]\n",
"}\n",
"\n",
"evaluation_set_path = \"./evaluation_datasets\"\n",
"search_method = 'progressive_search' # 'grid_search', 'progressive_search'\n",
"target_metric = 'context_precision' # relevant for 'smart' methods that use previous results to determine params, e.g. progressive search"
]
@@ -335,6 +359,8 @@
"source": [
"from utils import get_host_ip\n",
"\n",
"os.environ[\"OPENAI_API_KEY\"] = \"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\"\n",
"\n",
"# configurable parameters\n",
"shared_args = {\n",
" \"base_path\" : \"./data\", # where data goes\n",
@@ -345,14 +371,19 @@
" \"n_rows_to_load\" : 1000,\n",
"}\n",
"eval_args = {\n",
" \"evaluation_set_path\" : \"./evaluation_datasets\",\n",
" \"evaluation_set_filename\" : \"wikitext_1000_q.csv\",\n",
" \"evaluation_set_separator\" : \";\",\n",
" \"evaluation_module\": \"langchain.chat_models\",\n",
" \"evaluation_llm\": \"ChatOpenAI\",\n",
" \"evaluation_llm_kwargs\": {\"openai_api_key\": os.environ[\"OPENAI_API_KEY\"], #TODO Specify your key if you're using OpenAI\n",
" \"model_name\" : \"gpt-4\"}, # e.g. \"gpt-4\" or \"gpt-3.5-turbo\"\n",
" \"model_name\" : \"gpt-3.5-turbo\"}, # e.g. \"gpt-4\" or \"gpt-3.5-turbo\"\n",
" \"evaluation_metrics\" : [\"context_precision\", \"context_relevancy\"]\n",
"}\n",
"\n",
"# Parameters for the GPU resources\n",
"resource_args = {\n",
" \"number_of_accelerators\": number_of_accelerators,\n",
" \"accelerator_name\": accelerator_name,\n",
"}"
]
},
@@ -389,12 +420,14 @@
" shared_args = shared_args,\n",
" index_args = index_args,\n",
" eval_args = eval_args,\n",
" resource_args = resource_args,\n",
" search_method = search_method,\n",
" target_metric = target_metric,\n",
" evaluation_set_path=evaluation_set_path,\n",
" debug = True # set to False if you do not want to see intermediary results and evolving parameters printed out\n",
")\n",
"\n",
"parameter_search_results = mysearch.run()"
"results = mysearch.run()"
]
},
{
Expand All @@ -417,7 +450,7 @@
"metadata": {},
"outputs": [],
"source": [
"parameter_search_results"
"results"
]
},
{
@@ -571,7 +604,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
"version": "3.10.12"
}
},
"nbformat": 4,
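The parameter_search notebook above chooses between 'grid_search' and 'progressive_search', the latter being a 'smart' method that uses previous results for the target metric to fix parameters one at a time. A minimal sketch of that idea, assuming a run_trial callable that returns a dict of metric scores where higher is better; the repo's actual ParameterSearch class is not shown in this diff:

def progressive_search(param_grid, run_trial, target_metric):
    # Fix one parameter at a time, carrying forward the best value found
    # so far instead of trying the full cartesian product.
    best_params = {}
    for name, candidates in param_grid.items():
        scores = {}
        for value in candidates:
            trial = {**best_params, name: value}
            scores[value] = run_trial(trial)[target_metric]
        best_params[name] = max(scores, key=scores.get)
    return best_params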
16 changes: 12 additions & 4 deletions src/pipeline_eval.py
@@ -1,16 +1,16 @@
"""Fondant pipeline to evaluate a RAG pipeline."""

import pyarrow as pa
from fondant.pipeline import Pipeline
from fondant.pipeline import Pipeline, Resources


def create_pipeline(
*,
base_path: str = "./data",
weaviate_url="http://host.docker.internal:8080",
weaviate_class: str = "Pipeline1",
evaluation_set_path = "./evaluation_sets",
evaluation_set_filename = "wikitext_1000_q.csv",
evaluation_set_path="./evaluation_datasets",
evaluation_set_filename="wikitext_1000_q.csv",
evaluation_set_separator: str = ";",
embed_model_provider: str = "huggingface",
embed_model: str = "all-MiniLM-L6-v2",
@@ -20,6 +20,8 @@ def create_pipeline(
evaluation_llm: str = "OpenAI",
evaluation_llm_kwargs: dict = {"model_name": "gpt-3.5-turbo"},
evaluation_metrics: list = ["context_precision", "context_relevancy"],
number_of_accelerators=None,
accelerator_name=None,
):
"""Create a Fondant pipeline based on the provided arguments."""
evaluation_pipeline = Pipeline(
@@ -31,7 +33,8 @@
load_from_csv = evaluation_pipeline.read(
"load_from_csv",
arguments={
"dataset_uri": '/evaldata/' + evaluation_set_filename, # mounted dir from within docker as extra_volumes
"dataset_uri": "/evaldata/" + evaluation_set_filename,
# mounted dir from within docker as extra_volumes
"column_separator": evaluation_set_separator,
},
produces={
@@ -49,6 +52,11 @@
consumes={
"text": "question",
},
resources=Resources(
accelerator_number=number_of_accelerators,
accelerator_name=accelerator_name,
),
cluster_type="local" if number_of_accelerators is not None else "default",
)

retrieve_chunks = embed_text_op.apply(
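Both pipeline files now attach the identical Resources block and cluster_type switch at their embed_text apply() call. A possible refactoring sketch (the committed code inlines these arguments at each call site instead) would share that logic in a small helper:

from fondant.pipeline import Resources

def accelerator_kwargs(number_of_accelerators=None, accelerator_name=None):
    # Resource-related kwargs used identically in pipeline_eval.py and
    # pipeline_index.py; with no accelerator, fall back to the default cluster.
    return {
        "resources": Resources(
            accelerator_number=number_of_accelerators,
            accelerator_name=accelerator_name,
        ),
        "cluster_type": "local" if number_of_accelerators is not None else "default",
    }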
11 changes: 9 additions & 2 deletions src/pipeline_index.py
@@ -1,20 +1,22 @@
"""Fondant pipeline to index a RAG system."""
import pyarrow as pa
from fondant.pipeline import Pipeline
from fondant.pipeline import Pipeline, Resources


def create_pipeline(
*,
weaviate_url: str,
base_path: str = "./data",
n_rows_to_load: int = 1000,
weaviate_url: str = "http://host.docker.internal:8080",
weaviate_class: str = "Pipeline1",
weaviate_overwrite: bool = True,
embed_model_provider: str = "huggingface",
embed_model: str = "all-MiniLM-L6-v2",
embed_api_key: dict = {},
chunk_size: int = 512,
chunk_overlap: int = 32,
number_of_accelerators=None,
accelerator_name=None,
):
"""Create a Fondant pipeline based on the provided arguments."""
indexing_pipeline = Pipeline(
@@ -50,6 +52,11 @@ def create_pipeline(
"model": embed_model,
"api_keys": embed_api_key,
},
resources=Resources(
accelerator_number=number_of_accelerators,
accelerator_name=accelerator_name,
),
cluster_type="local" if number_of_accelerators is not None else "default",
)

embeddings.write(
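For reference, the indexing notebook earlier in this diff wires these parameters together roughly as follows; values are taken from the evaluation.ipynb cells above (get_host_ip comes from the repo's utils module), and the CPU fallback mirrors the GPU-availability check:

import pipeline_index
import utils

shared_args = {
    "embed_model_provider": "huggingface",
    "embed_model": "all-MiniLM-L6-v2",
    "embed_api_key": {},
    "weaviate_url": f"http://{utils.get_host_ip()}:8081",
    "weaviate_class": "Pipeline1",
}
indexing_args = {"chunk_overlap": 8}  # other chunking arguments keep their defaults
resources_args = {  # from the GPU-availability check; None means run on CPU
    "number_of_accelerators": None,
    "accelerator_name": None,
}

indexing_pipeline = pipeline_index.create_pipeline(
    **shared_args, **indexing_args, **resources_args
)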
