diff --git a/LLM/llama_index/notebooks/Mistral7b_llama_index_RAG.ipynb b/LLM/llama_index/notebooks/Mistral7b_llama_index_RAG.ipynb new file mode 100644 index 00000000..4abba052 --- /dev/null +++ b/LLM/llama_index/notebooks/Mistral7b_llama_index_RAG.ipynb @@ -0,0 +1,1008 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " # LLAMAIndex + W&B: RAG with Evaluation\n", + " \n", + " \n", + " This Jupyter Notebook demonstrates how to use LLAMAIndex with Weights & Biases (W&B) for Retrieval-Augmented Generation (RAG). \n", + "We will set up the environment, read document, initialize W&B, perform queries, and evaluate the results.." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## 0️⃣ | Initial Setup\n", + " First, we need to import necessary libraries and set up our environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install -r requirements.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Importing required libraries\n", + "import warnings\n", + "import os\n", + "import openai\n", + "from pathlib import Path\n", + "from dotenv import load_dotenv\n", + "from llama_index.llms import OpenAI\n", + "import wandb\n", + "\n", + "# Configuring warnings and environmental variables\n", + "warnings.filterwarnings(\"ignore\")\n", + "WANDB_PROJECT = \"test_local_v2\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ### 📋 Read Documents\n", + " We will now load the documents for our RAG setup. In this example, we use a PDF file named 'Mixtral.pdf'." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Loading the PDFReader from llama_index\n", + "from llama_index import VectorStoreIndex, download_loader\n", + "\n", + "PDFReader = download_loader(\"PDFReader\")\n", + "loader = PDFReader()\n", + "documents = loader.load_data(file=Path(\"./Mixtral.pdf\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "documents[:3]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ### 📉 Initialize W&B\n", + " Weights & Biases (W&B) is used for tracking experiments, visualizing data, and sharing insights. We initialize it here for our project." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize W&B for tracking and visualizations\n", + "from llama_index import ServiceContext\n", + "from llama_index.callbacks import CallbackManager, WandbCallbackHandler\n", + "\n", + "wandb_args = {\"project\": WANDB_PROJECT, \"name\": \"baseline-rag\"}\n", + "wandb_callback = WandbCallbackHandler(run_args=wandb_args)\n", + "callback_manager = CallbackManager([wandb_callback])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 🏎️ Setup Local LLM" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we used the Mistral-7b model. In order to make this experimetns even faster i will use the quantised version of the model. I higly recommed to check QuIP qunatised models, since they support up to 2bit qunatisation with extramly low loss in quality. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import (\n", + " SimpleDirectoryReader,\n", + " VectorStoreIndex,\n", + " ServiceContext,\n", + ")\n", + "from llama_index.llms import LlamaCPP\n", + "from llama_index.llms.llama_utils import (\n", + " messages_to_prompt,\n", + " completion_to_prompt,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that if you installed llama.cpp propperly it will use the available GPU by default. Either 'cuda' or 'metal'." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "llm = LlamaCPP(\n", + " # You can pass in the URL to a GGML model to download it automatically\n", + " model_url=None,\n", + " # optionally, you can set the path to a pre-downloaded model instead of model_url\n", + " model_path=\"/Users/nkise/Documents/projects/Courses 📜/RAG/llama.cpp/models/mistral-instruct-7b-q3k-small.gguf\",\n", + " temperature=0.1,\n", + " max_new_tokens=512,\n", + " # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room\n", + " context_window=3900,\n", + " # kwargs to pass to __call__()\n", + " generate_kwargs={},\n", + " # kwargs to pass to __init__()\n", + " # set to at least 1 to use GPU\n", + " model_kwargs={\"n_gpu_layers\": 1},\n", + " # transform inputs into Llama2 format\n", + " messages_to_prompt=messages_to_prompt,\n", + " completion_to_prompt=completion_to_prompt,\n", + " verbose=False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Otherwise, if you want to use GPT-3.5 Api just uncomment the following cell and comment the previous ones" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "load_dotenv()\n", + "openai.api_key = os.getenv(\n", + " \"OPENAI_API_KEY\"\n", + ") ## DON'T FORGET TO SET YOUR API KEY AS AN ENVIRONMENTAL VARIABLE\n", + "\n", + "llm = OpenAI(model=\"gpt-3.5-turbo\", temperature=0.1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's test if the model is working" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = llm.complete(\"Hello! Can you tell me a poem about cats and dogs?\")\n", + "print(response.text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1️⃣ Baseline RAG" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ### Setup ServiceContext\n", + " The ServiceContext in LLAMAIndex is used to manage the lifecycle of services like models and callbacks. We set it up with the required configurations." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that as an embedding model we will use also the local model. LlamaIndex will also automatically detect the necessray GPU , so don't worry about it if you are usig Mac with M-processor. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Setting up the ServiceContext with the language model and embedding model\n", + "embed_model = \"local:BAAI/bge-small-en-v1.5\"\n", + "service_context = ServiceContext.from_defaults(\n", + " llm=llm, \n", + " embed_model=embed_model, \n", + " callback_manager=callback_manager\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ### Create VectorStore\n", + " The VectorStore in LLAMAIndex is responsible for chunking, embedding, and storing document vectors. We create and configure it here." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating the VectorStoreIndex for document handling\n", + "index = VectorStoreIndex.from_documents(documents, service_context=service_context)\n", + "\n", + "# Converting the index to a query engine for retrieval\n", + "query_engine = index.as_query_engine()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ### Testing the Query Engine\n", + " Let's test our query engine by asking a few questions related to the loaded documents." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Defining a function to display responses\n", + "from llama_index.response.notebook_utils import display_response\n", + "\n", + "\n", + "def query_and_display(question):\n", + " response = query_engine.query(question)\n", + " display_response(response)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Testing the query engine with different questions\n", + "query_and_display(\"Who wrote Mixtral paper?\")\n", + "query_and_display(\"What is Sparse MoE?\")\n", + "query_and_display(\"How many experts are used in Sparse MoE?\")\n", + "query_and_display(\"Where can I find the code?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Closing the W&B run after queries\n", + "wandb_callback.finish()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## 2️⃣ Evaluation\n", + " We now move to the evaluation phase where we will assess the performance of our RAG setup using different metrics." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ❓ Generating Eval questions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To evaluate - we need questions. Let's be honest - we are lazy to write them by ourselves. So let's already available QuestionsGenerator inside llamaindex + GPT-3.5 Api to generate them for us. Alternatively you can use your local llm model. Just simply repplace the llm object" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Importing necessary modules for evaluation\n", + "import copy\n", + "import random\n", + "import nest_asyncio\n", + "import pandas as pd\n", + "from llama_index.evaluation import (\n", + " DatasetGenerator,\n", + " RelevancyEvaluator,\n", + " ResponseEvaluator,\n", + " RetrieverEvaluator,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize W&B for evaluation\n", + "wandb_args = {\"project\": WANDB_PROJECT, \"name\": \"eval-questions-generation\"}\n", + "wandb_callback = WandbCallbackHandler(run_args=wandb_args)\n", + "callback_manager = CallbackManager([wandb_callback])\n", + "llm_eval = OpenAI(temperature=0, model=\"gpt-3.5-turbo\")\n", + "service_context = ServiceContext.from_defaults(\n", + " llm=llm_eval, \n", + " embed_model=embed_model, \n", + " callback_manager=callback_manager\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Setting up the documents and generating questions for evaluation\n", + "random_documents = copy.deepcopy(documents)\n", + "\n", + "# Shuffling the documents and selecting 5 random documents. Just to make the evaluation quicker\n", + "random.shuffle(random_documents)\n", + "random_documents = random_documents[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Generating questions from the documents for evaluation\n", + "data_generator = DatasetGenerator.from_documents(\n", + " random_documents, service_context=service_context, num_questions_per_chunk=2\n", + ")\n", + "\n", + "# Applying nest_asyncio to run async code in Jupyter\n", + "nest_asyncio.apply()\n", + "eval_questions = data_generator.generate_questions_from_nodes()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "eval_questions[:3]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ideally you want to save evaluation questions as an artifact in W&B. This way you can easily show them, share and re-use. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import wandb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Persisting the questions to a CSV file using W&B, for further loading\n", + "# Create an artifact object\n", + "artifact = wandb.Artifact(name=\"eval-questions\", type=\"text\")\n", + "\n", + "# Add the list of questions as a file to the artifact\n", + "with artifact.new_file(\"questions.txt\", mode=\"w\") as f:\n", + " f.write(\"\\n\".join(eval_questions))\n", + "\n", + "# Log the artifact to W&B\n", + "wandb.log_artifact(artifact)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can easily load them for later use" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # Lookup the artifact\n", + "# artifact = wandb.use_artifact(\"eval-questions:v0\")\n", + "\n", + "# # Get the file containing the list of questions\n", + "# file = artifact.get_file(\"questions.txt\")\n", + "\n", + "# # Read the list of questions from the file\n", + "# with file.open(\"r\") as f:\n", + "# questions = f.read().split(\"\\n\")\n", + "\n", + "# # Print the list of questions\n", + "# print(questions)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wandb_callback.finish()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ### 🔎 Evaluation on the validation set\n", + " We evaluate the responses on a validation set to measure the effectiveness of our setup." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize W&B for response evaluation\n", + "wandb_args = {\"project\": WANDB_PROJECT, \"name\": \"baseline-evaluation\"}\n", + "wandb_callback = WandbCallbackHandler(run_args=wandb_args)\n", + "callback_manager = CallbackManager([wandb_callback])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Preparing the data for evaluation\n", + "question_df = pd.DataFrame(columns=[\"questions\"], data=eval_questions)\n", + "question_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Setup for evaluating the responses\n", + "llm_eval = OpenAI(temperature=0, model=\"gpt-3.5-turbo\")\n", + "service_context_eval = ServiceContext.from_defaults(\n", + " llm=llm_eval, \n", + " callback_manager=callback_manager\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Running the evaluation using BatchEvalRunner\n", + "from llama_index.evaluation import (\n", + " BatchEvalRunner,\n", + " FaithfulnessEvaluator,\n", + " RelevancyEvaluator,\n", + ")\n", + "\n", + "faithfulness_evaluator = FaithfulnessEvaluator(service_context=service_context_eval)\n", + "relevancy_evaluator = RelevancyEvaluator(service_context=service_context_eval)\n", + "runner = BatchEvalRunner(\n", + " {\"faithfulness\": faithfulness_evaluator, \"relevancy\": relevancy_evaluator},\n", + " workers=8,\n", + ")\n", + "\n", + "eval_results = await runner.aevaluate_queries(\n", + " index.as_query_engine(), queries=eval_questions\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "So here is the thing, current integration of wandb and llamaindex is not perfect. So we will need to do some workarounds in order to propperly log our information. But, its fairly easy. We just need to use the weandb library itself. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Make a dataframe from the results.\n", + "faithfulness_df = pd.DataFrame.from_records(\n", + " [eval_result.dict() for eval_result in eval_results[\"faithfulness\"]]\n", + ")\n", + "relevancy_df = pd.DataFrame.from_records(\n", + " [eval_result.dict() for eval_result in eval_results[\"relevancy\"]]\n", + ")\n", + "relevancy_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# save questions , faithfulness_df and relevancy_df to csv. Drop none columns from faithfulness_df and relevancy_df\n", + "question_df.to_csv(\"questions.csv\", index=False)\n", + "faithfulness_df.dropna(axis=1).to_csv(\"faithfulness.csv\", index=False)\n", + "relevancy_df.dropna(axis=1).to_csv(\"relevancy.csv\", index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Make 2 new tables in Wandb for Faithfulness and Relevancy. Log the results.\n", + "# Firstly, create a table for Faithfulness.\n", + "import wandb\n", + "\n", + "faithfulness_table = wandb.Table(dataframe=faithfulness_df)\n", + "relevancy_table = wandb.Table(dataframe=relevancy_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wandb.log({\"faithfulness\": faithfulness_table, \"relevancy\": relevancy_table})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# wandb log scalr mean of faithfulness and relevancy scores\n", + "wandb.log({\"faithfulness_mean\": faithfulness_df[\"score\"].mean()})\n", + "wandb.log({\"relevancy_mean\": relevancy_df[\"score\"].mean()})\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "faithfulness_df[\"score\"].mean(), relevancy_df[\"score\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wandb_callback.finish()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## 🚀 Advanced RAG\n", + "Now let's ramp up RAG quality. Our baseline RAG already can answer questions, but not particularly well. Let's explore an advanced setup with hierarchical node parsing and re-ranking for better context merging and retrieval prioritization." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The method here is the one which described in the course of Advanced RAG on the DeeepLearning.AI. Highly recommend to check it!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.node_parser import HierarchicalNodeParser\n", + "\n", + "# create the hierarchical node parser. Note we have to specify the chunk sizes LYERS\n", + "node_parser = HierarchicalNodeParser.from_defaults(\n", + " chunk_sizes=[2048, 512, 256]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get the nodes from the documents\n", + "nodes = node_parser.get_nodes_from_documents(documents)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Printing the leaf node**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.node_parser import get_leaf_nodes\n", + "\n", + "leaf_nodes = get_leaf_nodes(nodes)\n", + "print(leaf_nodes[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Now the 1st layer of the Parent node***" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nodes_by_id = {node.node_id: node for node in nodes}\n", + "\n", + "parent_node = nodes_by_id[leaf_nodes[1].parent_node.node_id]\n", + "print(parent_node.text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# initialise WandbCallbackHandler and pass any wandb.init args\n", + "wandb_args = {\"project\":\"test\", \"name\":\"adv-rag\"}\n", + "wandb_callback = WandbCallbackHandler(run_args=wandb_args)\n", + "# pass wandb_callback to the service context\n", + "callback_manager = CallbackManager([wandb_callback])\n", + "\n", + "# Creating the specification for the context retrieval\n", + "auto_merging_context = ServiceContext.from_defaults(\n", + " llm=llm,\n", + " embed_model=\"local:BAAI/bge-small-en-v1.5\",\n", + " node_parser=node_parser, # Note that hierarchical node parser in here\n", + " callback_manager=callback_manager\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import VectorStoreIndex, StorageContext\n", + "\n", + "# StorageContext is an utility conteinr for nodes, graphs and other doc types\n", + "storage_context = StorageContext.from_defaults()\n", + "storage_context.docstore.add_documents(nodes)\n", + "\n", + "# Creating the Index from the configuration\n", + "automerging_index = VectorStoreIndex(\n", + " leaf_nodes, storage_context=storage_context, service_context=auto_merging_context\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.indices.postprocessor import SentenceTransformerRerank\n", + "from llama_index.retrievers import AutoMergingRetriever\n", + "from llama_index.query_engine import RetrieverQueryEngine\n", + "\n", + "# Getting retriever from the index\n", + "automerging_retriever = automerging_index.as_retriever(\n", + " similarity_top_k=12\n", + ")\n", + "\n", + "# Creating AutoMergingRetriever\n", + "# Note we pass the retriever from Index with hierarchical node parser\n", + "retriever = AutoMergingRetriever(\n", + " automerging_retriever, \n", + " automerging_index.storage_context, \n", + " verbose=True,\n", + ")\n", + "\n", + "# Creating the re-ranker, we will need it later for merged chunks\n", + "rerank = SentenceTransformerRerank(top_n=4, model=\"BAAI/bge-reranker-base\")\n", + "\n", + "# Creating the query engine wrapper. We need wrapper to put postprocessors in it.\n", + "auto_merging_engine = RetrieverQueryEngine.from_args(\n", + " automerging_retriever, node_postprocessors=[rerank], verbose=True, service_context=auto_merging_context\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run the query engine on a user question.\n", + "response = auto_merging_engine.query(\"Who wrote Mixtral paper?\")\n", + "display_response(response)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = auto_merging_engine.query(\"What is Sparse MoE?\")\n", + "display_response(response)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = auto_merging_engine.query(\"How many experts are used in Sparse MoE?\")\n", + "display_response(response)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = auto_merging_engine.query(\"Where I can find a code?\")\n", + "display_response(response)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# close wandb run\n", + "wandb_callback.finish()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " #### Evaluation of Advanced RAG\n", + " We perform a similar evaluation as before" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize W&B for response evaluation\n", + "wandb_args = {\"project\": WANDB_PROJECT, \"name\":\"evaluation-adv-rag\"}\n", + "wandb_callback = WandbCallbackHandler(run_args=wandb_args)\n", + "callback_manager = CallbackManager([wandb_callback])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Setup for evaluating the responses\n", + "llm_eval = OpenAI(temperature=0, model=\"gpt-3.5-turbo\")\n", + "service_context_eval = ServiceContext.from_defaults(\n", + " llm=llm_eval, \n", + " callback_manager=callback_manager\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.evaluation import BatchEvalRunner\n", + "from llama_index.evaluation import FaithfulnessEvaluator, RelevancyEvaluator\n", + "faithfulness_evaluator = FaithfulnessEvaluator(service_context=service_context_eval)\n", + "relevancy_evaluator = RelevancyEvaluator(service_context=service_context_eval)\n", + "\n", + "runner = BatchEvalRunner(\n", + " {\"faithfulness\": faithfulness_evaluator, \"relevancy\": relevancy_evaluator},\n", + " workers=1,\n", + ")\n", + "\n", + "eval_results = await runner.aevaluate_queries(\n", + " auto_merging_engine, queries=eval_questions,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Make a dataframe from the results.\n", + "faithfulness_df = pd.DataFrame.from_records(\n", + " [eval_result.dict() for eval_result in eval_results[\"faithfulness\"]]\n", + ")\n", + "relevancy_df = pd.DataFrame.from_records(\n", + " [eval_result.dict() for eval_result in eval_results[\"relevancy\"]]\n", + ")\n", + "relevancy_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Make 2 new tables in Wandb for Faithfulness and Relevancy. Log the results.\n", + "# Firstly, create a table for Faithfulness.\n", + "faithfulness_table = wandb.Table(dataframe=faithfulness_df)\n", + "relevancy_table = wandb.Table(dataframe=relevancy_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wandb.log({\"faithfulness\": faithfulness_table, \"relevancy\": relevancy_table})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# wandb log scalr mean of faithfulness and relevancy scores\n", + "wandb.log({\"faithfulness_mean\": faithfulness_df[\"score\"].mean()})\n", + "wandb.log({\"relevancy_mean\": relevancy_df[\"score\"].mean()})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note how the scores improved! " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "faithfulness_df[\"score\"].mean(), relevancy_df[\"score\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wandb_callback.finish()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = auto_merging_engine.query(\"How many experts are used in Sparse MoE?\")\n", + "display_response(response)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "rag", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}