From b69293df6d9b5112d297bbbd33c946a64e2a32f9 Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Thu, 14 Nov 2024 19:30:11 +0000 Subject: [PATCH 01/14] fix ragas metrics for crag eval --- .../agent_eval/crag_eval/docker/build_image.sh | 2 +- .../crag_eval/docker/launch_eval_container.sh | 2 +- .../agent_eval/crag_eval/run_benchmark/run_grading.sh | 4 ++-- evals/metrics/ragas/ragas.py | 11 ++++++----- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/evals/evaluation/agent_eval/crag_eval/docker/build_image.sh b/evals/evaluation/agent_eval/crag_eval/docker/build_image.sh index a743900f..14600925 100644 --- a/evals/evaluation/agent_eval/crag_eval/docker/build_image.sh +++ b/evals/evaluation/agent_eval/crag_eval/docker/build_image.sh @@ -5,7 +5,7 @@ dockerfile=Dockerfile docker build \ -f ${dockerfile} . \ - -t crag-eval:latest \ + -t crag-eval:v1.1 \ --network=host \ --build-arg http_proxy=${http_proxy} \ --build-arg https_proxy=${https_proxy} \ diff --git a/evals/evaluation/agent_eval/crag_eval/docker/launch_eval_container.sh b/evals/evaluation/agent_eval/crag_eval/docker/launch_eval_container.sh index 8698f452..681e79e9 100644 --- a/evals/evaluation/agent_eval/crag_eval/docker/launch_eval_container.sh +++ b/evals/evaluation/agent_eval/crag_eval/docker/launch_eval_container.sh @@ -4,4 +4,4 @@ volume=$WORKDIR host_ip=$(hostname -I | awk '{print $1}') -docker run -it -v $volume:/home/user/ -e WORKDIR=/home/user -e HF_HOME=/home/user/hf_cache -e host_ip=$host_ip -e http_proxy=$http_proxy -e https_proxy=$https_proxy crag-eval:latest +docker run -it --name rag_eval -v $volume:/home/user/ -e WORKDIR=/home/user -e HF_HOME=/home/user/hf_cache -e host_ip=$host_ip -e http_proxy=$http_proxy -e https_proxy=$https_proxy crag-eval:v1.1 diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_grading.sh b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_grading.sh index 5431d39b..d519e897 100644 --- a/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_grading.sh +++ b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_grading.sh @@ -1,8 +1,8 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -FILEDIR=$WORKDIR/datasets/crag_results/ -FILENAME=crag_music_sampled_results.csv +FILEDIR=$WORKDIR/datasets/ragagent_eval/ +FILENAME=rag_llama3.1-70B-instruct_92queries.csv #crag_music_sampled_results.csv LLM_ENDPOINT=http://${host_ip}:8085 # change host_ip to the IP of LLM endpoint python3 grade_answers.py \ diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index c31cb632..e71ab38d 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -52,11 +52,12 @@ def measure(self, test_case: Dict): # sends to server try: from ragas import evaluate - from ragas.metrics import ALL_METRICS - - self.metric_names = [metric.__class__.__name__ for metric in ALL_METRICS] - self.metric_names = [re.sub(r"(? 
Date: Thu, 14 Nov 2024 11:40:49 -0800 Subject: [PATCH 02/14] fix ragas code --- .../agent_eval/crag_eval/docker/Dockerfile | 3 +- .../crag_eval/docker/requirements.txt | 1 + evals/metrics/ragas/ragas.py | 102 ++++++------------ 3 files changed, 36 insertions(+), 70 deletions(-) diff --git a/evals/evaluation/agent_eval/crag_eval/docker/Dockerfile b/evals/evaluation/agent_eval/crag_eval/docker/Dockerfile index a3a97c5b..d3fc4d74 100644 --- a/evals/evaluation/agent_eval/crag_eval/docker/Dockerfile +++ b/evals/evaluation/agent_eval/crag_eval/docker/Dockerfile @@ -10,7 +10,8 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ git \ poppler-utils \ libmkl-dev \ - curl + curl \ + nano COPY requirements.txt /home/user/requirements.txt diff --git a/evals/evaluation/agent_eval/crag_eval/docker/requirements.txt b/evals/evaluation/agent_eval/crag_eval/docker/requirements.txt index b32606b7..85d99818 100644 --- a/evals/evaluation/agent_eval/crag_eval/docker/requirements.txt +++ b/evals/evaluation/agent_eval/crag_eval/docker/requirements.txt @@ -6,3 +6,4 @@ langchain-huggingface pandas ragas sentence_transformers +nltk diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index e71ab38d..2eed49c5 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -12,7 +12,18 @@ from langchain_huggingface import HuggingFaceEndpoint # import * is only allowed at module level according to python syntax -from ragas.metrics import * +try: + from ragas.metrics import * + from ragas import evaluate + from ragas.metrics import __all__ +except ModuleNotFoundError: + raise ModuleNotFoundError("Please install ragas to use this metric. `pip install ragas`.") + +try: + from datasets import Dataset +except ModuleNotFoundError: + raise ModuleNotFoundError("Please install dataset") + def format_ragas_metric_name(name: str): @@ -34,65 +45,21 @@ def __init__( self.embeddings = embeddings self.metrics = metrics - # self.validated_list = [ - # "answer_correctness", - # "answer_relevancy", - # "answer_similarity", - # "context_precision", - # "context_recall", - # "faithfulness", - # "context_utilization", - # # "reference_free_rubrics_score", - # ] + ALL_METRICS=__all__ + + print("ALL METRICS: ", ALL_METRICS) + self.metric_names = list(set(ALL_METRICS)) + # Note - summarization score metric is not working with best open-source LLMs + # Note - which is why we are removing it from our offering at the moment. + self.metric_names.remove("summarization_score") + self.metric_instances = {} + for metric in self.metric_names: + try: + self.metric_instances[metric] = eval(metric) + except: + pass - async def a_measure(self, test_case: Dict): - return self.measure(test_case) - def measure(self, test_case: Dict): - # sends to server - try: - from ragas import evaluate - from ragas.metrics import __all__ - ALL_METRICS=__all__ - print("ALL METRICS: ", ALL_METRICS) - #self.metric_names = [metric.__class__.__name__ for metric in ALL_METRICS] - #self.metric_names = [re.sub(r"(? 
Date: Thu, 14 Nov 2024 14:41:39 -0800 Subject: [PATCH 03/14] fix score list bug --- evals/evaluation/agent_eval/crag_eval/README.md | 10 +++++----- .../crag_eval/run_benchmark/grade_answers.py | 10 ++++++---- .../agent_eval/crag_eval/run_benchmark/run_grading.sh | 2 +- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/evals/evaluation/agent_eval/crag_eval/README.md b/evals/evaluation/agent_eval/crag_eval/README.md index b458a930..96d8b9c8 100644 --- a/evals/evaluation/agent_eval/crag_eval/README.md +++ b/evals/evaluation/agent_eval/crag_eval/README.md @@ -46,7 +46,7 @@ cd $WORKDIR/GenAIEval/evals/evaluation/agent_eval/crag_eval/preprocess_data bash run_data_preprocess.sh ``` **Note**: This is an example of data processing. You can develop and optimize your own data processing for this benchmark. -3. Sample queries for benchmark +3. (Optional) Sample queries for benchmark The CRAG dataset has more than 4000 queries, and running all of them can be very expensive and time-consuming. You can sample a subset for benchmark. Here we provide a script to sample up to 5 queries per question_type per dynamism in each domain. For example, we were able to get 92 queries from the music domain using the script. ``` bash run_sample_data.sh @@ -57,18 +57,18 @@ Here we showcase a RAG agent in GenAIExample repo. Please refer to the README in > **Please note**: This is an example. You can build your own agent systems using OPEA components, then expose your own systems as an endpoint for this benchmark. -To launch the agent in our AgentQnA example, open another terminal and build images and launch agent system there. +To launch the agent in our AgentQnA example, open another terminal and follow the instructions below. 1. Build images ``` export $WORKDIR= cd $WORKDIR git clone https://github.com/opea-project/GenAIExamples.git cd GenAIExamples/AgentQnA/tests/ -bash 1_build_images.sh +bash step1_build_images.sh ``` 2. Start retrieval tool ``` -bash 2_start_retrieval_tool.sh +bash step2_start_retrieval_tool.sh ``` 3. 
Ingest data into vector database and validate retrieval tool ``` @@ -86,7 +86,7 @@ python3 index_data.py --host_ip $host_ip --filedir ${WORKDIR}/datasets/crag_docs ``` # Go to the terminal where you launched the AgentQnA example cd $WORKDIR/GenAIExamples/AgentQnA/tests/ -bash 4_launch_and_validate_agent.sh +bash step4_launch_and_validate_agent.sh ``` ## Run CRAG benchmark diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py b/evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py index 8f95d497..e2a903d3 100644 --- a/evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py +++ b/evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py @@ -50,8 +50,8 @@ def grade_answers(args, test_case): scores = [] for case in test_case: metric.measure(case) - scores.append(metric.score["answer_correctness"]) - print(metric.score) + scores.append(metric.score["answer_correctness"][0]) + print(metric.score["answer_correctness"][0]) print("-" * 50) return scores @@ -79,13 +79,15 @@ def grade_answers(args, test_case): # print(test_case) scores = grade_answers(args, test_case) + print(scores) # save the scores if args.batch_grade: print("Aggregated answer correctness score: ", scores) else: data["answer_correctness"] = scores - print("Average answer correctness score: ", data["answer_correctness"].mean()) - output_file = args.filename.split(".")[0] + "_graded.csv" + output_file = args.filename.replace(".csv", "_graded.csv") data.to_csv(os.path.join(args.filedir, output_file), index=False) print("Scores saved to ", os.path.join(args.filedir, output_file)) + + print("Average answer correctness score: ", data["answer_correctness"].mean()) diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_grading.sh b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_grading.sh index d519e897..1b253901 100644 --- a/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_grading.sh +++ b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_grading.sh @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 FILEDIR=$WORKDIR/datasets/ragagent_eval/ -FILENAME=rag_llama3.1-70B-instruct_92queries.csv #crag_music_sampled_results.csv +FILENAME=crag_music_sampled_results.csv LLM_ENDPOINT=http://${host_ip}:8085 # change host_ip to the IP of LLM endpoint python3 grade_answers.py \ From 51bc9165e721b492b73ebdcaf6f370b35bad3869 Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Thu, 14 Nov 2024 15:58:02 -0800 Subject: [PATCH 04/14] add code for conventional rag --- .../run_benchmark/conventional_rag.py | 149 ++++++++++++++++++ .../crag_eval/run_benchmark/run_conv_rag.sh | 13 ++ 2 files changed, 162 insertions(+) create mode 100644 evals/evaluation/agent_eval/crag_eval/run_benchmark/conventional_rag.py create mode 100644 evals/evaluation/agent_eval/crag_eval/run_benchmark/run_conv_rag.sh diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/conventional_rag.py b/evals/evaluation/agent_eval/crag_eval/run_benchmark/conventional_rag.py new file mode 100644 index 00000000..96d718c9 --- /dev/null +++ b/evals/evaluation/agent_eval/crag_eval/run_benchmark/conventional_rag.py @@ -0,0 +1,149 @@ +import argparse +import json +import os +import pandas as pd +import requests + + +def get_test_dataset(args): + filepath = os.path.join(args.filedir, args.filename) + if filepath.endswith('.jsonl'): + df = pd.read_json(filepath, lines=True, convert_dates=False) + elif filepath.endswith('.csv'): + df = pd.read_csv(filepath) + else: + raise ValueError("Invalid 
file format") + return df + +def save_results(output_file, output_list): + with open(output_file, "w") as f: + for output in output_list: + f.write(json.dumps(output)) + f.write("\n") + +def save_as_csv(output): + df = pd.read_json(output, lines=True, convert_dates=False) + df.to_csv(output.replace(".jsonl", ".csv"), index=False) + print(f"Saved to {output.replace('.jsonl', '.csv')}") + + +def search_knowledge_base(query: str) -> str: + """Search the knowledge base for a specific query.""" + url = os.environ.get("RETRIEVAL_TOOL_URL") + print(url) + proxies = {"http": ""} + payload = { + "text": query, + } + response = requests.post(url, json=payload, proxies=proxies) + print(response) + if "documents" in response.json(): + docs = response.json()["documents"] + context = "" + for i, doc in enumerate(docs): + if i == 0: + context = doc + else: + context += "\n" + doc + # print(context) + return context + elif "text" in response.json(): + return response.json()["text"] + elif "reranked_docs" in response.json(): + docs = response.json()["reranked_docs"] + context = "" + for i, doc in enumerate(docs): + if i == 0: + context = doc["text"] + else: + context += "\n" + doc["text"] + # print(context) + return context + else: + return "Error parsing response from the knowledge base." + +PROMPT = """\ +### You are a helpful, respectful and honest assistant. +You are given a Question and the time when it was asked in the Pacific Time Zone (PT), referred to as "Query +Time". The query time is formatted as "mm/dd/yyyy, hh:mm:ss PT". +Please follow these guidelines when formulating your answer: +1. If the question contains a false premise or assumption, answer “invalid question”. +2. If you are uncertain or do not know the answer, respond with “I don’t know”. +3. Refer to the search results to form your answer. +5. Give concise, factual and relevant answers. 
+ +### Search results: {context} \n +### Question: {question} \n +### Query Time: {time} \n +### Answer: +""" + +def setup_chat_model(args): + from langchain_openai import ChatOpenAI + params = { + "temperature": args.temperature, + "max_tokens": args.max_new_tokens, + "top_p": args.top_p, + "streaming": False, + } + openai_endpoint = f"{args.llm_endpoint_url}/v1" + llm = ChatOpenAI( + openai_api_key="EMPTY", + openai_api_base=openai_endpoint, + model_name=args.model, + **params, + ) + return llm + +def generate_answer(llm, query, context, time): + prompt = PROMPT.format(context=context, question=query, time=time) + response = llm.invoke(prompt) + return response.content + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--filedir", type=str, default="./", help="test file directory") + parser.add_argument("--filename", type=str, default="query.csv", help="query_list_file") + parser.add_argument("--output", type=str, default="output.csv", help="query_list_file") + parser.add_argument("--llm_endpoint_url", type=str, default="http://localhost:8085", help="llm endpoint url") + parser.add_argument("--model", type=str, default="meta-llama/Meta-Llama-3.1-70B-Instruct", help="model name") + parser.add_argument("--temperature", type=float, default=0.01, help="temperature") + parser.add_argument("--max_new_tokens", type=int, default=8192, help="max_new_tokens") + parser.add_argument("--top_p", type=float, default=0.95, help="top_p") + args = parser.parse_args() + print(args) + + df = get_test_dataset(args) + df=df.head(3) + print(df.shape) + + llm = setup_chat_model(args) + + contexts = [] + output_list = [] + for _, row in df.iterrows(): + q = row["query"] + t = row["query_time"] + print("========== Query: ", q) + context = search_knowledge_base(q) + print("========== Context:\n", context) + answer = generate_answer(llm, q, context, t) + print("========== Answer:\n", answer) + contexts.append(context) + output_list.append( + { + "query": q, + "query_time": t, + "ref_answer": row["answer"], + "answer": answer, + "question_type": row["question_type"], + "static_or_dynamic": row["static_or_dynamic"], + } + ) + save_results(args.output, output_list) + + save_as_csv(args.output) + + + diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_conv_rag.sh b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_conv_rag.sh new file mode 100644 index 00000000..ea5e5002 --- /dev/null +++ b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_conv_rag.sh @@ -0,0 +1,13 @@ +MODEL="meta-llama/Meta-Llama-3.1-70B-Instruct" +LLMENDPOINT=http://${host_ip}:8085 + +FILEDIR=$WORKDIR/datasets/ragagent_eval/ +FILENAME=crag_qa_music.jsonl +OUTPUT=$WORKDIR/datasets/ragagent_eval/val_conv_rag_music_full.jsonl + +python3 benchmark.py \ +--model ${MODEL} \ +--llm_endpoint_url ${LLMENDPOINT} \ +--filedir ${FILEDIR} \ +--filename ${FILENAME} \ +--output ${OUTPUT} \ No newline at end of file From 7695ad31951ac74a07d94eb7f2f99b41d06d18c6 Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Thu, 14 Nov 2024 16:05:34 -0800 Subject: [PATCH 05/14] update code for conv rag --- evals/evaluation/agent_eval/crag_eval/docker/requirements.txt | 1 + .../agent_eval/crag_eval/run_benchmark/run_conv_rag.sh | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/evals/evaluation/agent_eval/crag_eval/docker/requirements.txt b/evals/evaluation/agent_eval/crag_eval/docker/requirements.txt index 85d99818..1e8beb72 100644 --- 
a/evals/evaluation/agent_eval/crag_eval/docker/requirements.txt +++ b/evals/evaluation/agent_eval/crag_eval/docker/requirements.txt @@ -3,6 +3,7 @@ evaluate jieba langchain-community langchain-huggingface +langchain-openai pandas ragas sentence_transformers diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_conv_rag.sh b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_conv_rag.sh index ea5e5002..634ea15d 100644 --- a/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_conv_rag.sh +++ b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_conv_rag.sh @@ -5,7 +5,9 @@ FILEDIR=$WORKDIR/datasets/ragagent_eval/ FILENAME=crag_qa_music.jsonl OUTPUT=$WORKDIR/datasets/ragagent_eval/val_conv_rag_music_full.jsonl -python3 benchmark.py \ +export RETRIEVAL_TOOL_URL="http://${host_ip}:8889/v1/retrievaltool" + +python3 conventional_rag.py \ --model ${MODEL} \ --llm_endpoint_url ${LLMENDPOINT} \ --filedir ${FILEDIR} \ From 970b9cf9cc065573be0f626bbaade8704c36409f Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Fri, 15 Nov 2024 11:47:11 -0800 Subject: [PATCH 06/14] add code to compare scores --- .../evaluation/agent_eval/crag_eval/README.md | 4 +- .../crag_eval/docker/build_image.sh | 1 + .../crag_eval/run_benchmark/compare_scores.py | 67 +++++++++++++++++++ 3 files changed, 70 insertions(+), 2 deletions(-) create mode 100644 evals/evaluation/agent_eval/crag_eval/run_benchmark/compare_scores.py diff --git a/evals/evaluation/agent_eval/crag_eval/README.md b/evals/evaluation/agent_eval/crag_eval/README.md index 96d8b9c8..f9c3f95a 100644 --- a/evals/evaluation/agent_eval/crag_eval/README.md +++ b/evals/evaluation/agent_eval/crag_eval/README.md @@ -57,7 +57,7 @@ Here we showcase a RAG agent in GenAIExample repo. Please refer to the README in > **Please note**: This is an example. You can build your own agent systems using OPEA components, then expose your own systems as an endpoint for this benchmark. -To launch the agent in our AgentQnA example, open another terminal and follow the instructions below. +To launch the agent in our AgentQnA example on Intel Gaudi accelerators, open another terminal and follow the instructions below. 1. Build images ``` export $WORKDIR= @@ -86,7 +86,7 @@ python3 index_data.py --host_ip $host_ip --filedir ${WORKDIR}/datasets/crag_docs ``` # Go to the terminal where you launched the AgentQnA example cd $WORKDIR/GenAIExamples/AgentQnA/tests/ -bash step4_launch_and_validate_agent.sh +bash step4_launch_and_validate_agent_gaudi.sh ``` ## Run CRAG benchmark diff --git a/evals/evaluation/agent_eval/crag_eval/docker/build_image.sh b/evals/evaluation/agent_eval/crag_eval/docker/build_image.sh index 14600925..33207c9e 100644 --- a/evals/evaluation/agent_eval/crag_eval/docker/build_image.sh +++ b/evals/evaluation/agent_eval/crag_eval/docker/build_image.sh @@ -4,6 +4,7 @@ dockerfile=Dockerfile docker build \ + --no-cache \ -f ${dockerfile} . 
\ -t crag-eval:v1.1 \ --network=host \ diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/compare_scores.py b/evals/evaluation/agent_eval/crag_eval/run_benchmark/compare_scores.py new file mode 100644 index 00000000..e7736b91 --- /dev/null +++ b/evals/evaluation/agent_eval/crag_eval/run_benchmark/compare_scores.py @@ -0,0 +1,67 @@ +import pandas as pd +from scipy.stats import spearmanr, pearsonr + + +def merge_and_get_stats(filedir, conv_rag, ragagent, reactagent, prefix=""): + conv_rag_df = pd.read_csv(filedir+conv_rag) + ragagent_df = pd.read_csv(filedir+ragagent) + reactagent_df = pd.read_csv(filedir+reactagent) + + conv_rag_df = conv_rag_df.rename(columns={"answer_correctness": "conv_rag_score"}) + ragagent_df = ragagent_df.rename(columns={"answer_correctness": "ragagent_score"}) + reactagent_df = reactagent_df.rename(columns={"answer_correctness": "reactagent_score"}) + merged_df = pd.merge(conv_rag_df, ragagent_df, on="query") + merged_df = pd.merge(merged_df, reactagent_df, on="query") + print(merged_df.shape) + merged_df.to_csv(filedir+prefix+"merged_scores.csv", index=False) + + # drop rows with nan + merged_df_dropped = merged_df.dropna() + # merged_df = merged_df.reset_index(drop=True) + print(merged_df_dropped.shape) + + # compare scores + print(merged_df_dropped.describe()) + merged_df_dropped.to_csv(filedir+prefix+"merged_scores_nadropped.csv", index=False) + return merged_df, merged_df_dropped + + + +#RAGAS scores +print("===============RAGAS scores==================") +filedir="/localdisk/minminho/dataset/rag_eval/" +conv_rag="rag_llama3.1-70B-instruct_92queries_graded.csv" +ragagent="ragagent_chatopenai_tgi_llama3.1-70B-instruct_92queries_graded.csv" +reactagent="react_v3parser_v3prompt_tgi_chatopenai_llama3.1-70B-instruct_92queries_graded.csv" +merged_df, merged_df_dropped = merge_and_get_stats(filedir, conv_rag, ragagent, reactagent) + +# human scores +print("===============Human scores==================") +human_scores = "human_scores_92queries.csv" +human_scores_df = pd.read_csv(filedir+human_scores) +print(human_scores_df.describe()) + +human_scores_df_dropped = human_scores_df.loc[human_scores_df["query"].isin(merged_df_dropped["query"])] +print(human_scores_df_dropped.describe()) +human_scores_df_dropped.to_csv(filedir+"human_scores_dropped.csv", index=False) + +# calculate spearman correlation +print("===============Spearman correlation==================") +print(spearmanr(merged_df_dropped["conv_rag_score"], human_scores_df_dropped["conv_rag"])) +print(spearmanr(merged_df_dropped["ragagent_score"], human_scores_df_dropped["ragagent"])) +print(spearmanr(merged_df_dropped["reactagent_score"], human_scores_df_dropped["reactagent"])) + +# concat conv_rag, ragagent, reactagent scores in merged_df_dropped +ragas_scores = pd.concat([merged_df_dropped["conv_rag_score"], merged_df_dropped["ragagent_score"], merged_df_dropped["reactagent_score"]], axis=0) +human_scores = pd.concat([human_scores_df_dropped["conv_rag"], human_scores_df_dropped["ragagent"], human_scores_df_dropped["reactagent"]], axis=0) +print(spearmanr(ragas_scores, human_scores)) + +# pearson correlation +print("===============Pearson correlation==================") +print(pearsonr(merged_df_dropped["conv_rag_score"], human_scores_df_dropped["conv_rag"])) +print(pearsonr(merged_df_dropped["ragagent_score"], human_scores_df_dropped["ragagent"])) +print(pearsonr(merged_df_dropped["reactagent_score"], human_scores_df_dropped["reactagent"])) +print(pearsonr(ragas_scores, human_scores)) + + + From 
889e72666f523090e8c761df314b429ce68aabbd Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Fri, 15 Nov 2024 14:24:34 -0800 Subject: [PATCH 07/14] update readme --- .../evaluation/agent_eval/crag_eval/README.md | 39 +++++++++- .../crag_eval/run_benchmark/compare_scores.py | 77 ++++++++++--------- .../run_benchmark/run_compare_scores.sh | 12 +++ 3 files changed, 92 insertions(+), 36 deletions(-) create mode 100644 evals/evaluation/agent_eval/crag_eval/run_benchmark/run_compare_scores.sh diff --git a/evals/evaluation/agent_eval/crag_eval/README.md b/evals/evaluation/agent_eval/crag_eval/README.md index f9c3f95a..e041ab6e 100644 --- a/evals/evaluation/agent_eval/crag_eval/README.md +++ b/evals/evaluation/agent_eval/crag_eval/README.md @@ -53,7 +53,7 @@ bash run_sample_data.sh ``` ## Launch agent QnA system -Here we showcase a RAG agent in GenAIExample repo. Please refer to the README in the [AgentQnA example](https://github.com/opea-project/GenAIExamples/tree/main/AgentQnA/README.md) for more details. +Here we showcase an agent system in OPEA GenAIExamples repo. Please refer to the README in the [AgentQnA example](https://github.com/opea-project/GenAIExamples/tree/main/AgentQnA/README.md) for more details. > **Please note**: This is an example. You can build your own agent systems using OPEA components, then expose your own systems as an endpoint for this benchmark. @@ -88,6 +88,7 @@ python3 index_data.py --host_ip $host_ip --filedir ${WORKDIR}/datasets/crag_docs cd $WORKDIR/GenAIExamples/AgentQnA/tests/ bash step4_launch_and_validate_agent_gaudi.sh ``` +Note: There are two agents in the agent system: a RAG agent (as the worker agent) and a ReAct agent (as the supervisor agent). For CRAG benchmark, we will use the RAG agent. ## Run CRAG benchmark Once you have your agent system up and running, the next step is to generate answers with agent. Change the variables in the script below and run the script. By default, it will run a sampled set of queries in music domain. @@ -123,3 +124,39 @@ python3 test_llm_endpoint.py cd $WORKDIR/GenAIEval/evals/evaluation/agent_eval/crag_eval/run_benchmark/ bash run_grading.sh ``` + +### Validation of LLM-as-judge +We validated RAGAS answer correctness as the metric to evaluate agents. We sampled 92 queries from the 374 music domain questions and conducted human evaluations on the conventional RAG answers, single RAG agent answers and hierachical ReAct agent answers of the 92 queries. We ran our experiments on Intel Gaudi2 accelerators. We used `meta-llama/Meta-Llama-3-70B-Instruct` as the LLM judge. + +We followed the criteria in the [CRAG paper](https://arxiv.org/pdf/2406.04744) to get human scores: +1. score 1 if the answer matches the golden answer or semantically similar. +2. score 0 if the asnwer misses information, or is "I don't know", “I’m sorry I can’t find ...”, a system error such as recursion limit is hit, or a request from the system to clarify the original question. +3. score -1 if the answer contains incorrect information. + +Please refer to [RAGAS source code](https://github.com/explodinggradients/ragas/blob/main/src/ragas/metrics/_answer_correctness.py) for the implementation of its `answer correctness` score. + +|Setup |Mean Human score|Mean RAGAS `answer_correctness` score| +|----------------|-----------|------------------------------| +|Conventional RAG|0.05 |0.37| +|Single RAG agent|0.18 |0.43| +|Hierachical ReAct agent|0.22|0.54| + +We can see that the human scores and the RAGAS `answer_correctness` scores follow the same trend. 
Therefore, we went on to use RAGAS `answer_correctness` scores produced by `meta-llama/Meta-Llama-3-70B-Instruct` as the LLM judge for the evaluation of OPEA agents on the full CRAG music domain dataset. + +We have made available our scripts to calculate the mean scores. Refer to the `run_compare_scores.sh` script in the `run_benchmark` folder. + + +## Benchmark results for OPEA RAG Agent +We have evaluated the RAG agent (`rag_agent_llama` strategy) in the OPEA AgentQnA example on CRAG music domain dataset (374 questions in total). We used `meta-llama/Meta-Llama-3-70B-Instruct` and we served the LLM with tgi-gaudi on 4 Intel Gaudi2 accelerator cards. Refer to the docker compose yaml files in the AgentQnA example for more details on the configurations. + +For the tests of conventional RAG, we used the script in the `run_benchmark` folder: `run_conv_rag.sh`. And we used the same LLM, serving configs and generation parameters as the RAG agent. + + +|Setup |Mean RAGAS `answer_correctness` score| +|----------------|------------------------------| +|Conventional RAG|0.42| +|Single RAG agent|0.43| +|Hierachical ReAct agent|To come soon...| + +Note: Currently OPEA agents do not support tool selection (i.e., only give a subset of tools to agent based on query), which we found can boost agent performance when the number of tools is large. We are in the process of enabling tool selection and will report the performance of Hierachical ReAct agent once tool selection is enabled. + diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/compare_scores.py b/evals/evaluation/agent_eval/crag_eval/run_benchmark/compare_scores.py index e7736b91..92b838ad 100644 --- a/evals/evaluation/agent_eval/crag_eval/run_benchmark/compare_scores.py +++ b/evals/evaluation/agent_eval/crag_eval/run_benchmark/compare_scores.py @@ -1,7 +1,17 @@ import pandas as pd from scipy.stats import spearmanr, pearsonr +import argparse +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--filedir", type=str, help="file directory") + parser.add_argument("--conv_rag", type=str, help="file with RAGAS scores for conventional RAG") + parser.add_argument("--ragagent", type=str, help="file with RAGAS scores for RAG agent") + parser.add_argument("--reactagent", type=str, help="file with RAGAS scores for React agent") + parser.add_argument("--human_scores_file", type=str, help="file with human scores for 3 setups") + return parser.parse_args() + def merge_and_get_stats(filedir, conv_rag, ragagent, reactagent, prefix=""): conv_rag_df = pd.read_csv(filedir+conv_rag) ragagent_df = pd.read_csv(filedir+ragagent) @@ -13,6 +23,7 @@ def merge_and_get_stats(filedir, conv_rag, ragagent, reactagent, prefix=""): merged_df = pd.merge(conv_rag_df, ragagent_df, on="query") merged_df = pd.merge(merged_df, reactagent_df, on="query") print(merged_df.shape) + print(merged_df.describe()) merged_df.to_csv(filedir+prefix+"merged_scores.csv", index=False) # drop rows with nan @@ -26,42 +37,38 @@ def merge_and_get_stats(filedir, conv_rag, ragagent, reactagent, prefix=""): return merged_df, merged_df_dropped +if __name__ == "__main__": + args = get_args() + filedir = args.filedir + conv_rag = args.conv_rag + ragagent = args.ragagent + reactagent = args.reactagent + human_scores_file = args.human_scores_file + + #RAGAS scores + print("===============RAGAS scores==================") + merged_df, merged_df_dropped = merge_and_get_stats(filedir, conv_rag, ragagent, reactagent) + + # human scores + print("===============Human 
scores==================") + human_scores_df = pd.read_csv(filedir+human_scores_file) + print(human_scores_df.describe()) + + human_scores_df_dropped = human_scores_df.loc[human_scores_df["query"].isin(merged_df_dropped["query"])] + print(human_scores_df_dropped.describe()) + human_scores_df_dropped.to_csv(filedir+"human_scores_dropped.csv", index=False) + + # concat conv_rag, ragagent, reactagent scores in merged_df_dropped + ragas_scores = pd.concat([merged_df_dropped["conv_rag_score"], merged_df_dropped["ragagent_score"], merged_df_dropped["reactagent_score"]], axis=0) + human_scores = pd.concat([human_scores_df_dropped["conv_rag"], human_scores_df_dropped["ragagent"], human_scores_df_dropped["reactagent"]], axis=0) + + # calculate spearman correlation + print("===============Spearman correlation==================") + print(spearmanr(ragas_scores, human_scores)) -#RAGAS scores -print("===============RAGAS scores==================") -filedir="/localdisk/minminho/dataset/rag_eval/" -conv_rag="rag_llama3.1-70B-instruct_92queries_graded.csv" -ragagent="ragagent_chatopenai_tgi_llama3.1-70B-instruct_92queries_graded.csv" -reactagent="react_v3parser_v3prompt_tgi_chatopenai_llama3.1-70B-instruct_92queries_graded.csv" -merged_df, merged_df_dropped = merge_and_get_stats(filedir, conv_rag, ragagent, reactagent) - -# human scores -print("===============Human scores==================") -human_scores = "human_scores_92queries.csv" -human_scores_df = pd.read_csv(filedir+human_scores) -print(human_scores_df.describe()) - -human_scores_df_dropped = human_scores_df.loc[human_scores_df["query"].isin(merged_df_dropped["query"])] -print(human_scores_df_dropped.describe()) -human_scores_df_dropped.to_csv(filedir+"human_scores_dropped.csv", index=False) - -# calculate spearman correlation -print("===============Spearman correlation==================") -print(spearmanr(merged_df_dropped["conv_rag_score"], human_scores_df_dropped["conv_rag"])) -print(spearmanr(merged_df_dropped["ragagent_score"], human_scores_df_dropped["ragagent"])) -print(spearmanr(merged_df_dropped["reactagent_score"], human_scores_df_dropped["reactagent"])) - -# concat conv_rag, ragagent, reactagent scores in merged_df_dropped -ragas_scores = pd.concat([merged_df_dropped["conv_rag_score"], merged_df_dropped["ragagent_score"], merged_df_dropped["reactagent_score"]], axis=0) -human_scores = pd.concat([human_scores_df_dropped["conv_rag"], human_scores_df_dropped["ragagent"], human_scores_df_dropped["reactagent"]], axis=0) -print(spearmanr(ragas_scores, human_scores)) - -# pearson correlation -print("===============Pearson correlation==================") -print(pearsonr(merged_df_dropped["conv_rag_score"], human_scores_df_dropped["conv_rag"])) -print(pearsonr(merged_df_dropped["ragagent_score"], human_scores_df_dropped["ragagent"])) -print(pearsonr(merged_df_dropped["reactagent_score"], human_scores_df_dropped["reactagent"])) -print(pearsonr(ragas_scores, human_scores)) + # pearson correlation + print("===============Pearson correlation==================") + print(pearsonr(ragas_scores, human_scores)) diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_compare_scores.sh b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_compare_scores.sh new file mode 100644 index 00000000..fdb9ae44 --- /dev/null +++ b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_compare_scores.sh @@ -0,0 +1,12 @@ +filedir=$WORKDIR/datasets/crag_results/ +conv_rag="conv_rag_graded.csv" # replace with your file name 
+ragagent="ragagent_graded.csv" # replace with your file name +reactagent="react_graded.csv" # replace with your file name +human_scores_file="human_scores.csv" # replace with your file name + +python3 compare_scores.py \ +--filedir $filedir \ +--conv_rag $conv_rag \ +--ragagent $ragagent \ +--reactagent $reactagent \ +--human_scores_file $human_scores_file \ No newline at end of file From 95f815cada10fe9900cd70a21d65819853eec114 Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Mon, 18 Nov 2024 16:38:16 -0800 Subject: [PATCH 08/14] update readme with benchmark results --- .../evaluation/agent_eval/crag_eval/README.md | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/evals/evaluation/agent_eval/crag_eval/README.md b/evals/evaluation/agent_eval/crag_eval/README.md index e041ab6e..e64c3b0b 100644 --- a/evals/evaluation/agent_eval/crag_eval/README.md +++ b/evals/evaluation/agent_eval/crag_eval/README.md @@ -126,14 +126,14 @@ bash run_grading.sh ``` ### Validation of LLM-as-judge -We validated RAGAS answer correctness as the metric to evaluate agents. We sampled 92 queries from the 374 music domain questions and conducted human evaluations on the conventional RAG answers, single RAG agent answers and hierachical ReAct agent answers of the 92 queries. We ran our experiments on Intel Gaudi2 accelerators. We used `meta-llama/Meta-Llama-3-70B-Instruct` as the LLM judge. +We validated RAGAS answer correctness as the metric to evaluate agents. We sampled 92 queries from the full music domain dataset (up to 5 questions per sub-category for all 32 sub-categories), and conducted human evaluations on the conventional RAG answers, the single RAG agent answers and the hierachical ReAct agent answers of the 92 queries. We followed the criteria in the [CRAG paper](https://arxiv.org/pdf/2406.04744) to get human scores: 1. score 1 if the answer matches the golden answer or semantically similar. 2. score 0 if the asnwer misses information, or is "I don't know", “I’m sorry I can’t find ...”, a system error such as recursion limit is hit, or a request from the system to clarify the original question. 3. score -1 if the answer contains incorrect information. -Please refer to [RAGAS source code](https://github.com/explodinggradients/ragas/blob/main/src/ragas/metrics/_answer_correctness.py) for the implementation of its `answer correctness` score. +On the other hand, RAGAS `answer_correctness` score is on a scale of 0-1 and is a weighted average of 1) an F1 score and 2) similarity between answer and golden answer. The F1 score is based on the number of statements in the answer supported or not supported by the golden answer, and the number of statements in the golden answer appeared or did not appear in the answer. Please refer to [RAGAS source code](https://github.com/explodinggradients/ragas/blob/main/src/ragas/metrics/_answer_correctness.py) for the implementation of its `answer_correctness` score. We ran RAGAS on Intel Gaudi2 accelerators. We used `meta-llama/Meta-Llama-3-70B-Instruct` as the LLM judge. |Setup |Mean Human score|Mean RAGAS `answer_correctness` score| |----------------|-----------|------------------------------| @@ -141,22 +141,28 @@ Please refer to [RAGAS source code](https://github.com/explodinggradients/ragas/ |Single RAG agent|0.18 |0.43| |Hierachical ReAct agent|0.22|0.54| -We can see that the human scores and the RAGAS `answer_correctness` scores follow the same trend. 
Therefore, we went on to use RAGAS `answer_correctness` scores produced by `meta-llama/Meta-Llama-3-70B-Instruct` as the LLM judge for the evaluation of OPEA agents on the full CRAG music domain dataset. +We can see that the human scores and the RAGAS `answer_correctness` scores follow the same trend, although the two scoring methods used different grading criteria and methods. Since LLM-as-judge is more scalable for larger datasets, we decided to use RAGAS `answer_correctness` scores (produced by `meta-llama/Meta-Llama-3-70B-Instruct` as the LLM judge) for the evaluation of OPEA agents on the full CRAG music domain dataset. -We have made available our scripts to calculate the mean scores. Refer to the `run_compare_scores.sh` script in the `run_benchmark` folder. +We have made available our scripts to calculate the mean RAGAS scores. Refer to the `run_compare_scores.sh` script in the `run_benchmark` folder. ## Benchmark results for OPEA RAG Agent -We have evaluated the RAG agent (`rag_agent_llama` strategy) in the OPEA AgentQnA example on CRAG music domain dataset (374 questions in total). We used `meta-llama/Meta-Llama-3-70B-Instruct` and we served the LLM with tgi-gaudi on 4 Intel Gaudi2 accelerator cards. Refer to the docker compose yaml files in the AgentQnA example for more details on the configurations. +We have evaluated the agents (`rag_agent_llama` strategy) in the OPEA AgentQnA example on CRAG music domain dataset (373 questions in total). We used `meta-llama/Meta-Llama-3-70B-Instruct` and we served the LLM with tgi-gaudi on 4 Intel Gaudi2 accelerator cards. Refer to the docker compose yaml files in the AgentQnA example for more details on the configurations. For the tests of conventional RAG, we used the script in the `run_benchmark` folder: `run_conv_rag.sh`. And we used the same LLM, serving configs and generation parameters as the RAG agent. +The Conventional RAG and Single RAG agent use the same retriever. The Hierarchical ReAct agent uses the Single RAG agent as its tool. + |Setup |Mean RAGAS `answer_correctness` score| |----------------|------------------------------| |Conventional RAG|0.42| |Single RAG agent|0.43| -|Hierachical ReAct agent|To come soon...| +|Hierachical ReAct agent|0.53| + +From the results, we can see that the single RAG agent performs better than conventional RAG, while the hierarchical ReAct agent has the highest `answer_correctness` score. The reasons for such performance improvements: +1. RAG agent rewrites query and checks the quality of retrieved documents before feeding the docs to generation. It can get docs that are more relevant to generate answers. It can also decompose complex questions into modular tasks and get related docs for each task and then aggregate info to come up with answers. +2. Hierarchical ReAct agent was supplied with APIs to get information from knowledge graphs, and thus can supplement info to the knowledge in the retrieval vector database. So it can answer questions where conventional RAG or Single RAG agent cannot due to the lack of relevant info in vector database. -Note: Currently OPEA agents do not support tool selection (i.e., only give a subset of tools to agent based on query), which we found can boost agent performance when the number of tools is large. We are in the process of enabling tool selection and will report the performance of Hierachical ReAct agent once tool selection is enabled. 
+Note: The performance result for the hierarchical ReAct agent is with tool selection, i.e., only give a subset of tools to agent based on query, which we found can boost agent performance when the number of tools is large. However, currently OPEA agents do not support tool selection yet. We are in the process of enabling tool selection. From 426dec49eb95d101d8f5418ed02d3d3dcc69fd92 Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Thu, 21 Nov 2024 21:12:18 +0000 Subject: [PATCH 09/14] run full music dataset --- evals/evaluation/agent_eval/crag_eval/docker/build_image.sh | 1 + .../agent_eval/crag_eval/run_benchmark/grade_answers.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/evals/evaluation/agent_eval/crag_eval/docker/build_image.sh b/evals/evaluation/agent_eval/crag_eval/docker/build_image.sh index 14600925..33207c9e 100644 --- a/evals/evaluation/agent_eval/crag_eval/docker/build_image.sh +++ b/evals/evaluation/agent_eval/crag_eval/docker/build_image.sh @@ -4,6 +4,7 @@ dockerfile=Dockerfile docker build \ + --no-cache \ -f ${dockerfile} . \ -t crag-eval:v1.1 \ --network=host \ diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py b/evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py index e2a903d3..6a1d400f 100644 --- a/evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py +++ b/evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py @@ -79,7 +79,7 @@ def grade_answers(args, test_case): # print(test_case) scores = grade_answers(args, test_case) - print(scores) + #print(scores) # save the scores if args.batch_grade: From 81797a4eebf8ac66c735ec875546d49ef98554b7 Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Mon, 2 Dec 2024 14:33:57 -0800 Subject: [PATCH 10/14] debug ragas code --- evals/metrics/ragas/ragas.py | 56 ++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index 2eed49c5..682a90c8 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -13,9 +13,17 @@ # import * is only allowed at module level according to python syntax try: - from ragas.metrics import * + # from ragas.metrics import * from ragas import evaluate - from ragas.metrics import __all__ + from ragas.metrics import ( + answer_correctness, + answer_relevancy, + answer_similarity, + context_precision, + context_recall, + context_utilization, + faithfulness, + ) except ModuleNotFoundError: raise ModuleNotFoundError("Please install ragas to use this metric. 
`pip install ragas`.") @@ -24,6 +32,25 @@ except ModuleNotFoundError: raise ModuleNotFoundError("Please install dataset") +VALIDATED_LIST = [ + "answer_correctness", + "answer_relevancy", + "answer_similarity", + "context_precision", + "context_recall", + "faithfulness", + "context_utilization", +] + +metrics_mapping = { + "answer_correctness": answer_correctness, + "answer_relevancy": answer_relevancy, + "answer_similarity": answer_similarity, + "context_precision": context_precision, + "context_recall": context_recall, + "faithfulness": faithfulness, + "context_utilization": context_utilization, + } def format_ragas_metric_name(name: str): @@ -45,21 +72,6 @@ def __init__( self.embeddings = embeddings self.metrics = metrics - ALL_METRICS=__all__ - - print("ALL METRICS: ", ALL_METRICS) - self.metric_names = list(set(ALL_METRICS)) - # Note - summarization score metric is not working with best open-source LLMs - # Note - which is why we are removing it from our offering at the moment. - self.metric_names.remove("summarization_score") - self.metric_instances = {} - for metric in self.metric_names: - try: - self.metric_instances[metric] = eval(metric) - except: - pass - - openai_key = os.getenv("OPENAI_API_KEY", None) if openai_key is not None: print("OPENAI_API_KEY is provided, ragas initializes the model by OpenAI.") @@ -78,9 +90,9 @@ def __init__( tmp_metrics = [] # check supported list for metric in self.metrics: - if metric not in self.metric_names: + if metric not in VALIDATED_LIST: raise ValueError( - "metric should be in supported list {}. ".format(self.metric_names) + "metric should be in supported list {}. ".format(VALIDATED_LIST) + "ClientResponseError raised with LangchainLLM " + "when context_precision, context_recall ran. " + "Here are the related issues described in ragas " @@ -88,12 +100,12 @@ def __init__( + "https://github.com/explodinggradients/ragas/issues/664." 
) else: - if metric == "AnswerRelevancy" and self.embeddings is None: + if metric == "answer_relevancy" and self.embeddings is None: raise ValueError("AnswerRelevancy metric need provide embeddings model.") - tmp_metrics.append(self.metric_instances[metric]) + tmp_metrics.append(metrics_mapping[metric]) self.metrics = tmp_metrics else: - self.metrics = list(self.metric_instances.values()) + self.metrics = [metrics_mapping[metric] for metric in VALIDATED_LIST] # Find necessary input fields using the given metrics _required_columns = set() From ab088e6dfc22494b1502b3019147bbbc54b5e6bb Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Mon, 2 Dec 2024 14:52:15 -0800 Subject: [PATCH 11/14] remove context utilization from ragas validated list --- .../agent_eval/crag_eval/run_benchmark/run_grading.sh | 2 +- evals/metrics/ragas/ragas.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_grading.sh b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_grading.sh index 1b253901..5431d39b 100644 --- a/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_grading.sh +++ b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_grading.sh @@ -1,7 +1,7 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -FILEDIR=$WORKDIR/datasets/ragagent_eval/ +FILEDIR=$WORKDIR/datasets/crag_results/ FILENAME=crag_music_sampled_results.csv LLM_ENDPOINT=http://${host_ip}:8085 # change host_ip to the IP of LLM endpoint diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index 682a90c8..df8f8f6e 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -21,7 +21,6 @@ answer_similarity, context_precision, context_recall, - context_utilization, faithfulness, ) except ModuleNotFoundError: @@ -39,7 +38,6 @@ "context_precision", "context_recall", "faithfulness", - "context_utilization", ] metrics_mapping = { @@ -49,7 +47,6 @@ "context_precision": context_precision, "context_recall": context_recall, "faithfulness": faithfulness, - "context_utilization": context_utilization, } From 858680387355ec02b63801e7217f26c32d88e025 Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Mon, 2 Dec 2024 15:30:40 -0800 Subject: [PATCH 12/14] update readme and clean up scripts --- .../evaluation/agent_eval/crag_eval/README.md | 31 ++++++++++++++----- .../run_benchmark/conventional_rag.py | 4 ++- .../crag_eval/run_benchmark/run_conv_rag.sh | 4 +-- .../run_benchmark/run_generate_answer.sh | 4 +-- .../crag_eval/run_benchmark/run_grading.sh | 2 +- 5 files changed, 32 insertions(+), 13 deletions(-) diff --git a/evals/evaluation/agent_eval/crag_eval/README.md b/evals/evaluation/agent_eval/crag_eval/README.md index e64c3b0b..293d3975 100644 --- a/evals/evaluation/agent_eval/crag_eval/README.md +++ b/evals/evaluation/agent_eval/crag_eval/README.md @@ -88,18 +88,19 @@ python3 index_data.py --host_ip $host_ip --filedir ${WORKDIR}/datasets/crag_docs cd $WORKDIR/GenAIExamples/AgentQnA/tests/ bash step4_launch_and_validate_agent_gaudi.sh ``` -Note: There are two agents in the agent system: a RAG agent (as the worker agent) and a ReAct agent (as the supervisor agent). For CRAG benchmark, we will use the RAG agent. +Note: There are two agents in the agent system: a RAG agent (as the worker agent) and a ReAct agent (as the supervisor agent). We can evaluate both agents - just need to specify the agent endpoint url in the scripts - see instructions below. 
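As a quick sanity check before running the benchmark scripts below, the agent endpoint can be queried directly. The sketch below assumes the OpenAI-style `/v1/chat/completions` route configured in `run_generate_answer.sh` and a standard `messages` payload; the port, query text, and payload shape are illustrative only and should be adjusted to match what `generate_answers.py` actually sends.

```python
# Hypothetical smoke test for the agent endpoint used by run_generate_answer.sh.
# Assumptions: the agent listens on ${host_ip}:9095 and accepts an OpenAI-style
# "messages" payload on /v1/chat/completions; adjust both to your deployment.
import os
import requests

host_ip = os.environ.get("host_ip", "localhost")
port = 9095  # illustrative port; use the port of your agent service
url = f"http://{host_ip}:{port}/v1/chat/completions"

payload = {
    "messages": [{"role": "user", "content": "How many Grammy awards has Taylor Swift won?"}],
    "stream": False,
}
resp = requests.post(url, json=payload, proxies={"http": ""}, timeout=300)
resp.raise_for_status()
print(resp.json())
```

If this returns a well-formed answer, the same URL can be plugged into `run_generate_answer.sh`.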
## Run CRAG benchmark -Once you have your agent system up and running, the next step is to generate answers with agent. Change the variables in the script below and run the script. By default, it will run a sampled set of queries in music domain. +Once you have your agent system up and running, the next step is to generate answers with agent. Change the variables in the script below and run the script. By default, it will run the entire set of queries in the music domain (in total 373 queries). You can choose to run other domains or just run a sampled subset of music domain. ``` # Come back to the interactive crag-eval docker container cd $WORKDIR/GenAIEval/evals/evaluation/agent_eval/crag_eval/run_benchmark +# Remember to specify the agent endpoint url in the script. bash run_generate_answer.sh ``` ## Use LLM-as-judge to grade the answers -1. Launch llm endpoint with HF TGI: in another terminal, run the command below. By default, `meta-llama/Meta-Llama-3-70B-Instruct` is used as the LLM judge. +1. Launch llm endpoint with HF TGI: in another terminal, run the command below. By default, `meta-llama/Meta-Llama-3.1-70B-Instruct` is used as the LLM judge. ``` cd llm_judge bash launch_llm_judge_endpoint.sh @@ -133,7 +134,7 @@ We followed the criteria in the [CRAG paper](https://arxiv.org/pdf/2406.04744) t 2. score 0 if the asnwer misses information, or is "I don't know", “I’m sorry I can’t find ...”, a system error such as recursion limit is hit, or a request from the system to clarify the original question. 3. score -1 if the answer contains incorrect information. -On the other hand, RAGAS `answer_correctness` score is on a scale of 0-1 and is a weighted average of 1) an F1 score and 2) similarity between answer and golden answer. The F1 score is based on the number of statements in the answer supported or not supported by the golden answer, and the number of statements in the golden answer appeared or did not appear in the answer. Please refer to [RAGAS source code](https://github.com/explodinggradients/ragas/blob/main/src/ragas/metrics/_answer_correctness.py) for the implementation of its `answer_correctness` score. We ran RAGAS on Intel Gaudi2 accelerators. We used `meta-llama/Meta-Llama-3-70B-Instruct` as the LLM judge. +On the other hand, RAGAS `answer_correctness` score is on a scale of 0-1 and is a weighted average of 1) an F1 score and 2) similarity between answer and golden answer. The F1 score is based on the number of statements in the answer supported or not supported by the golden answer, and the number of statements in the golden answer appeared or did not appear in the answer. Please refer to [RAGAS source code](https://github.com/explodinggradients/ragas/blob/main/src/ragas/metrics/_answer_correctness.py) for the implementation of its `answer_correctness` score. We ran RAGAS on Intel Gaudi2 accelerators. We used `meta-llama/Meta-Llama-3.1-70B-Instruct` as the LLM judge. |Setup |Mean Human score|Mean RAGAS `answer_correctness` score| |----------------|-----------|------------------------------| @@ -146,12 +147,12 @@ We can see that the human scores and the RAGAS `answer_correctness` scores follo We have made available our scripts to calculate the mean RAGAS scores. Refer to the `run_compare_scores.sh` script in the `run_benchmark` folder. -## Benchmark results for OPEA RAG Agent -We have evaluated the agents (`rag_agent_llama` strategy) in the OPEA AgentQnA example on CRAG music domain dataset (373 questions in total). 
We used `meta-llama/Meta-Llama-3-70B-Instruct` and we served the LLM with tgi-gaudi on 4 Intel Gaudi2 accelerator cards. Refer to the docker compose yaml files in the AgentQnA example for more details on the configurations. +## Benchmark results for OPEA RAG Agents +We have evaluated the agents (`rag_agent_llama` and `react_llama` strategies) in the OPEA AgentQnA example on CRAG music domain dataset (373 questions in total). We used `meta-llama/Meta-Llama-3.1-70B-Instruct` and we served the LLM with tgi-gaudi on 4 Intel Gaudi2 accelerator cards. Refer to the docker compose yaml files in the AgentQnA example for more details on the configurations. For the tests of conventional RAG, we used the script in the `run_benchmark` folder: `run_conv_rag.sh`. And we used the same LLM, serving configs and generation parameters as the RAG agent. -The Conventional RAG and Single RAG agent use the same retriever. The Hierarchical ReAct agent uses the Single RAG agent as its tool. +The Conventional RAG and Single RAG agent use the same retriever. The Hierarchical ReAct agent uses the Single RAG agent as its retrieval tool and also has access to CRAG APIs provided by Meta as part of the CRAG benchmark. |Setup |Mean RAGAS `answer_correctness` score| @@ -166,3 +167,19 @@ From the results, we can see that the single RAG agent performs better than conv Note: The performance result for the hierarchical ReAct agent is with tool selection, i.e., only give a subset of tools to agent based on query, which we found can boost agent performance when the number of tools is large. However, currently OPEA agents do not support tool selection yet. We are in the process of enabling tool selection. +### Comparison with GPT-4o-mini +Open-source LLM serving libraries (tgi and vllm) have limited capabilities in producing tool-call objects. Although vllm improved its tool-calling capabilities recently, parallel tool calling is still not well supported. Therefore, we had to write our own prompts and output parsers for the `rag_agent_llama` and `react_llama` strategies for using open-source LLMs served with open-source serving frameworks for OPEA agent microservices. + +Below we show the comparisons of `meta-llama/Meta-Llama-3.1-70B-Instruct` versus OpenAI's `gpt-4o-mini-2024-07-18` on 20 sampled queries from the CRAG music domain dataset. We used human evaluation criteria outlined above. The numbers are the average scores graged by human. The parathesis denotes the OPEA agent strategy used. + +|Setup|Llama3.1-70B-Instruct|gpt-4o-mini| +|-----|---------------------|-----------| +|Conventional RAG|0.15|0.05| +|Single RAG agent|0.45 (`rag_agent_llama`)|0.65 (`rag_agent`)| +|Hierarchical ReAct agent|0.55 (`react_llama`)|0.75 (`react_langgraph`)| + +From the comparisons on this small subset, we can see that OPEA agents using `meta-llama/Meta-Llama-3.1-70B-Instruct` with calibrated prompt templates and output parsers are only slightly behind `gpt-4o-mini-2024-07-18` with proprietary tool-calling capabilities. 
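For clarity on what the LLM-as-judge grading step described above does under the hood, the sketch below roughly mirrors the flow in the patched `evals/metrics/ragas/ragas.py`: wrap the TGI judge endpoint as a LangChain LLM, build a small dataset, and call `ragas.evaluate` with `answer_correctness`. The judge endpoint port follows `run_grading.sh`; the embedding model name and the `question`/`answer`/`ground_truth` column names are assumptions and may need adjusting to your ragas version.

```python
# Minimal sketch of RAGAS answer_correctness grading with an open-source judge,
# roughly mirroring evals/metrics/ragas/ragas.py.
# Assumptions: judge LLM served at ${host_ip}:8085 (as in run_grading.sh),
# a sentence-transformers embedding model, and question/answer/ground_truth
# column names; adjust these to your setup and ragas version.
import os

from datasets import Dataset
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
from ragas import evaluate
from ragas.metrics import answer_correctness

host_ip = os.environ.get("host_ip", "localhost")
judge_llm = HuggingFaceEndpoint(endpoint_url=f"http://{host_ip}:8085", timeout=600)
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")  # illustrative choice

data = {
    "question": ["How many studio albums has Taylor Swift released?"],
    "answer": ["Taylor Swift has released 11 studio albums."],
    "ground_truth": ["11"],
}
result = evaluate(
    Dataset.from_dict(data),
    metrics=[answer_correctness],
    llm=judge_llm,
    embeddings=embeddings,
)
print(result)  # e.g. {'answer_correctness': 0.xx}
```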
+ + + + diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/conventional_rag.py b/evals/evaluation/agent_eval/crag_eval/run_benchmark/conventional_rag.py index 96d718c9..c3fafab1 100644 --- a/evals/evaluation/agent_eval/crag_eval/run_benchmark/conventional_rag.py +++ b/evals/evaluation/agent_eval/crag_eval/run_benchmark/conventional_rag.py @@ -115,9 +115,11 @@ def generate_answer(llm, query, context, time): print(args) df = get_test_dataset(args) - df=df.head(3) print(df.shape) + if not os.path.exists(os.path.dirname(args.output)): + os.makedirs(os.path.dirname(args.output)) + llm = setup_chat_model(args) contexts = [] diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_conv_rag.sh b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_conv_rag.sh index 634ea15d..e29270c1 100644 --- a/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_conv_rag.sh +++ b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_conv_rag.sh @@ -1,9 +1,9 @@ MODEL="meta-llama/Meta-Llama-3.1-70B-Instruct" LLMENDPOINT=http://${host_ip}:8085 -FILEDIR=$WORKDIR/datasets/ragagent_eval/ +FILEDIR=$WORKDIR/datasets/crag_qas/ FILENAME=crag_qa_music.jsonl -OUTPUT=$WORKDIR/datasets/ragagent_eval/val_conv_rag_music_full.jsonl +OUTPUT=$WORKDIR/datasets/crag_results/conv_rag_music.jsonl export RETRIEVAL_TOOL_URL="http://${host_ip}:8889/v1/retrievaltool" diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_generate_answer.sh b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_generate_answer.sh index ee863bba..20e578a6 100644 --- a/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_generate_answer.sh +++ b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_generate_answer.sh @@ -7,8 +7,8 @@ endpoint=${port}/v1/chat/completions # change this to the endpoint of the agent URL="http://${host_ip}:${endpoint}" echo "AGENT ENDPOINT URL: ${URL}" -QUERYFILE=$WORKDIR/datasets/crag_qas/crag_qa_music_sampled.jsonl -OUTPUTFILE=$WORKDIR/datasets/crag_results/crag_music_sampled_results.jsonl +QUERYFILE=$WORKDIR/datasets/crag_qas/crag_qa_music.jsonl +OUTPUTFILE=$WORKDIR/datasets/crag_results/ragagent_crag_music_results.jsonl python3 generate_answers.py \ --endpoint_url ${URL} \ diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_grading.sh b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_grading.sh index 5431d39b..b9af1a18 100644 --- a/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_grading.sh +++ b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_grading.sh @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 FILEDIR=$WORKDIR/datasets/crag_results/ -FILENAME=crag_music_sampled_results.csv +FILENAME=ragagent_crag_music_results.csv LLM_ENDPOINT=http://${host_ip}:8085 # change host_ip to the IP of LLM endpoint python3 grade_answers.py \ From a1fe751d1cf043a6e3fc3ff5b83999264a4fe9f3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 2 Dec 2024 23:32:42 +0000 Subject: [PATCH 13/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../evaluation/agent_eval/crag_eval/README.md | 14 ++---- .../crag_eval/docker/requirements.txt | 2 +- .../crag_eval/run_benchmark/compare_scores.py | 46 +++++++++++++------ .../run_benchmark/conventional_rag.py | 21 ++++++--- .../crag_eval/run_benchmark/grade_answers.py | 4 +- .../run_benchmark/run_compare_scores.sh | 5 +- .../crag_eval/run_benchmark/run_conv_rag.sh | 5 +- 
evals/metrics/ragas/ragas.py | 32 ++++++-------
 8 files changed, 77 insertions(+), 52 deletions(-)

diff --git a/evals/evaluation/agent_eval/crag_eval/README.md b/evals/evaluation/agent_eval/crag_eval/README.md
index 293d3975..5c2c0e84 100644
--- a/evals/evaluation/agent_eval/crag_eval/README.md
+++ b/evals/evaluation/agent_eval/crag_eval/README.md
@@ -127,11 +127,11 @@ bash run_grading.sh
 ```
 ### Validation of LLM-as-judge
-We validated RAGAS answer correctness as the metric to evaluate agents. We sampled 92 queries from the full music domain dataset (up to 5 questions per sub-category for all 32 sub-categories), and conducted human evaluations on the conventional RAG answers, the single RAG agent answers and the hierachical ReAct agent answers of the 92 queries.
+We validated RAGAS answer correctness as the metric to evaluate agents. We sampled 92 queries from the full music domain dataset (up to 5 questions per sub-category for all 32 sub-categories), and conducted human evaluations on the conventional RAG answers, the single RAG agent answers and the hierarchical ReAct agent answers of the 92 queries.
 We followed the criteria in the [CRAG paper](https://arxiv.org/pdf/2406.04744) to get human scores:
 1. score 1 if the answer matches the golden answer or semantically similar.
-2. score 0 if the asnwer misses information, or is "I don't know", “I’m sorry I can’t find ...”, a system error such as recursion limit is hit, or a request from the system to clarify the original question.
+2. score 0 if the answer misses information, or is "I don't know", “I’m sorry I can’t find ...”, a system error such as recursion limit is hit, or a request from the system to clarify the original question.
 3. score -1 if the answer contains incorrect information.
 On the other hand, RAGAS `answer_correctness` score is on a scale of 0-1 and is a weighted average of 1) an F1 score and 2) similarity between answer and golden answer. The F1 score is based on the number of statements in the answer supported or not supported by the golden answer, and the number of statements in the golden answer appeared or did not appear in the answer. Please refer to [RAGAS source code](https://github.com/explodinggradients/ragas/blob/main/src/ragas/metrics/_answer_correctness.py) for the implementation of its `answer_correctness` score. We ran RAGAS on Intel Gaudi2 accelerators. We used `meta-llama/Meta-Llama-3.1-70B-Instruct` as the LLM judge.
@@ -140,7 +140,7 @@ On the other hand, RAGAS `answer_correctness` score is on a scale of 0-1 and is
 |----------------|-----------|------------------------------|
 |Conventional RAG|0.05 |0.37|
 |Single RAG agent|0.18 |0.43|
-|Hierachical ReAct agent|0.22|0.54|
+|Hierarchical ReAct agent|0.22|0.54|
 We can see that the human scores and the RAGAS `answer_correctness` scores follow the same trend, although the two scoring methods used different grading criteria and methods. Since LLM-as-judge is more scalable for larger datasets, we decided to use RAGAS `answer_correctness` scores (produced by `meta-llama/Meta-Llama-3-70B-Instruct` as the LLM judge) for the evaluation of OPEA agents on the full CRAG music domain dataset.
@@ -159,7 +159,7 @@ The Conventional RAG and Single RAG agent use the same retriever. The Hierarchic
 |----------------|------------------------------|
 |Conventional RAG|0.42|
 |Single RAG agent|0.43|
-|Hierachical ReAct agent|0.53|
+|Hierarchical ReAct agent|0.53|
 From the results, we can see that the single RAG agent performs better than conventional RAG, while the hierarchical ReAct agent has the highest `answer_correctness` score. The reasons for such performance improvements:
 1. RAG agent rewrites query and checks the quality of retrieved documents before feeding the docs to generation. It can get docs that are more relevant to generate answers. It can also decompose complex questions into modular tasks and get related docs for each task and then aggregate info to come up with answers.
@@ -170,7 +170,7 @@ Note: The performance result for the hierarchical ReAct agent is with tool selec
 ### Comparison with GPT-4o-mini
 Open-source LLM serving libraries (tgi and vllm) have limited capabilities in producing tool-call objects. Although vllm improved its tool-calling capabilities recently, parallel tool calling is still not well supported. Therefore, we had to write our own prompts and output parsers for the `rag_agent_llama` and `react_llama` strategies for using open-source LLMs served with open-source serving frameworks for OPEA agent microservices.
-Below we show the comparisons of `meta-llama/Meta-Llama-3.1-70B-Instruct` versus OpenAI's `gpt-4o-mini-2024-07-18` on 20 sampled queries from the CRAG music domain dataset. We used human evaluation criteria outlined above. The numbers are the average scores graged by human. The parathesis denotes the OPEA agent strategy used.
+Below we show the comparisons of `meta-llama/Meta-Llama-3.1-70B-Instruct` versus OpenAI's `gpt-4o-mini-2024-07-18` on 20 sampled queries from the CRAG music domain dataset. We used the human evaluation criteria outlined above. The numbers are the average scores graded by humans. The parenthesis denotes the OPEA agent strategy used.
 |Setup|Llama3.1-70B-Instruct|gpt-4o-mini|
 |-----|---------------------|-----------|
 |Conventional RAG|0.15|0.05|
 |Single RAG agent|0.45 (`rag_agent_llama`)|0.65 (`rag_agent`)|
 |Hierarchical ReAct agent|0.55 (`react_llama`)|0.75 (`react_langgraph`)|
 From the comparisons on this small subset, we can see that OPEA agents using `meta-llama/Meta-Llama-3.1-70B-Instruct` with calibrated prompt templates and output parsers are only slightly behind `gpt-4o-mini-2024-07-18` with proprietary tool-calling capabilities.
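To make the `answer_correctness` description in the validation section concrete: it blends a statement-level F1 with the answer/golden-answer similarity, by default weighted roughly 0.75/0.25 (the weights and the counts below are assumptions for illustration; see the linked RAGAS source for the exact implementation):

```python
# Toy illustration of the answer_correctness blend described above.
tp = 2             # statements in the answer supported by the golden answer
fp = 1             # statements in the answer not supported by the golden answer
fn = 1             # statements in the golden answer missing from the answer
similarity = 0.82  # embedding similarity between answer and golden answer

f1 = tp / (tp + 0.5 * (fp + fn)) if tp else 0.0
score = 0.75 * f1 + 0.25 * similarity
print(round(score, 2))  # ~0.7
```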
- - - - diff --git a/evals/evaluation/agent_eval/crag_eval/docker/requirements.txt b/evals/evaluation/agent_eval/crag_eval/docker/requirements.txt index 1e8beb72..a6d88f19 100644 --- a/evals/evaluation/agent_eval/crag_eval/docker/requirements.txt +++ b/evals/evaluation/agent_eval/crag_eval/docker/requirements.txt @@ -4,7 +4,7 @@ jieba langchain-community langchain-huggingface langchain-openai +nltk pandas ragas sentence_transformers -nltk diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/compare_scores.py b/evals/evaluation/agent_eval/crag_eval/run_benchmark/compare_scores.py index 92b838ad..b91568f1 100644 --- a/evals/evaluation/agent_eval/crag_eval/run_benchmark/compare_scores.py +++ b/evals/evaluation/agent_eval/crag_eval/run_benchmark/compare_scores.py @@ -1,7 +1,11 @@ -import pandas as pd -from scipy.stats import spearmanr, pearsonr +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import argparse +import pandas as pd +from scipy.stats import pearsonr, spearmanr + def get_args(): parser = argparse.ArgumentParser() @@ -12,10 +16,11 @@ def get_args(): parser.add_argument("--human_scores_file", type=str, help="file with human scores for 3 setups") return parser.parse_args() + def merge_and_get_stats(filedir, conv_rag, ragagent, reactagent, prefix=""): - conv_rag_df = pd.read_csv(filedir+conv_rag) - ragagent_df = pd.read_csv(filedir+ragagent) - reactagent_df = pd.read_csv(filedir+reactagent) + conv_rag_df = pd.read_csv(filedir + conv_rag) + ragagent_df = pd.read_csv(filedir + ragagent) + reactagent_df = pd.read_csv(filedir + reactagent) conv_rag_df = conv_rag_df.rename(columns={"answer_correctness": "conv_rag_score"}) ragagent_df = ragagent_df.rename(columns={"answer_correctness": "ragagent_score"}) @@ -24,7 +29,7 @@ def merge_and_get_stats(filedir, conv_rag, ragagent, reactagent, prefix=""): merged_df = pd.merge(merged_df, reactagent_df, on="query") print(merged_df.shape) print(merged_df.describe()) - merged_df.to_csv(filedir+prefix+"merged_scores.csv", index=False) + merged_df.to_csv(filedir + prefix + "merged_scores.csv", index=False) # drop rows with nan merged_df_dropped = merged_df.dropna() @@ -33,7 +38,7 @@ def merge_and_get_stats(filedir, conv_rag, ragagent, reactagent, prefix=""): # compare scores print(merged_df_dropped.describe()) - merged_df_dropped.to_csv(filedir+prefix+"merged_scores_nadropped.csv", index=False) + merged_df_dropped.to_csv(filedir + prefix + "merged_scores_nadropped.csv", index=False) return merged_df, merged_df_dropped @@ -45,22 +50,36 @@ def merge_and_get_stats(filedir, conv_rag, ragagent, reactagent, prefix=""): reactagent = args.reactagent human_scores_file = args.human_scores_file - #RAGAS scores + # RAGAS scores print("===============RAGAS scores==================") merged_df, merged_df_dropped = merge_and_get_stats(filedir, conv_rag, ragagent, reactagent) # human scores print("===============Human scores==================") - human_scores_df = pd.read_csv(filedir+human_scores_file) + human_scores_df = pd.read_csv(filedir + human_scores_file) print(human_scores_df.describe()) human_scores_df_dropped = human_scores_df.loc[human_scores_df["query"].isin(merged_df_dropped["query"])] print(human_scores_df_dropped.describe()) - human_scores_df_dropped.to_csv(filedir+"human_scores_dropped.csv", index=False) + human_scores_df_dropped.to_csv(filedir + "human_scores_dropped.csv", index=False) # concat conv_rag, ragagent, reactagent scores in merged_df_dropped - ragas_scores = 
pd.concat([merged_df_dropped["conv_rag_score"], merged_df_dropped["ragagent_score"], merged_df_dropped["reactagent_score"]], axis=0) - human_scores = pd.concat([human_scores_df_dropped["conv_rag"], human_scores_df_dropped["ragagent"], human_scores_df_dropped["reactagent"]], axis=0) + ragas_scores = pd.concat( + [ + merged_df_dropped["conv_rag_score"], + merged_df_dropped["ragagent_score"], + merged_df_dropped["reactagent_score"], + ], + axis=0, + ) + human_scores = pd.concat( + [ + human_scores_df_dropped["conv_rag"], + human_scores_df_dropped["ragagent"], + human_scores_df_dropped["reactagent"], + ], + axis=0, + ) # calculate spearman correlation print("===============Spearman correlation==================") @@ -69,6 +88,3 @@ def merge_and_get_stats(filedir, conv_rag, ragagent, reactagent, prefix=""): # pearson correlation print("===============Pearson correlation==================") print(pearsonr(ragas_scores, human_scores)) - - - diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/conventional_rag.py b/evals/evaluation/agent_eval/crag_eval/run_benchmark/conventional_rag.py index c3fafab1..f6ee133c 100644 --- a/evals/evaluation/agent_eval/crag_eval/run_benchmark/conventional_rag.py +++ b/evals/evaluation/agent_eval/crag_eval/run_benchmark/conventional_rag.py @@ -1,26 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import argparse import json import os + import pandas as pd import requests def get_test_dataset(args): filepath = os.path.join(args.filedir, args.filename) - if filepath.endswith('.jsonl'): + if filepath.endswith(".jsonl"): df = pd.read_json(filepath, lines=True, convert_dates=False) - elif filepath.endswith('.csv'): + elif filepath.endswith(".csv"): df = pd.read_csv(filepath) else: raise ValueError("Invalid file format") return df + def save_results(output_file, output_list): with open(output_file, "w") as f: for output in output_list: f.write(json.dumps(output)) f.write("\n") + def save_as_csv(output): df = pd.read_json(output, lines=True, convert_dates=False) df.to_csv(output.replace(".jsonl", ".csv"), index=False) @@ -62,6 +68,7 @@ def search_knowledge_base(query: str) -> str: else: return "Error parsing response from the knowledge base." + PROMPT = """\ ### You are a helpful, respectful and honest assistant. 
You are given a Question and the time when it was asked in the Pacific Time Zone (PT), referred to as "Query @@ -78,8 +85,10 @@ def search_knowledge_base(query: str) -> str: ### Answer: """ + def setup_chat_model(args): from langchain_openai import ChatOpenAI + params = { "temperature": args.temperature, "max_tokens": args.max_new_tokens, @@ -95,10 +104,11 @@ def setup_chat_model(args): ) return llm + def generate_answer(llm, query, context, time): prompt = PROMPT.format(context=context, question=query, time=time) response = llm.invoke(prompt) - return response.content + return response.content if __name__ == "__main__": @@ -130,7 +140,7 @@ def generate_answer(llm, query, context, time): print("========== Query: ", q) context = search_knowledge_base(q) print("========== Context:\n", context) - answer = generate_answer(llm, q, context, t) + answer = generate_answer(llm, q, context, t) print("========== Answer:\n", answer) contexts.append(context) output_list.append( @@ -146,6 +156,3 @@ def generate_answer(llm, query, context, time): save_results(args.output, output_list) save_as_csv(args.output) - - - diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py b/evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py index 6a1d400f..5c826b5f 100644 --- a/evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py +++ b/evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py @@ -79,14 +79,14 @@ def grade_answers(args, test_case): # print(test_case) scores = grade_answers(args, test_case) - #print(scores) + # print(scores) # save the scores if args.batch_grade: print("Aggregated answer correctness score: ", scores) else: data["answer_correctness"] = scores - output_file = args.filename.replace(".csv", "_graded.csv") + output_file = args.filename.replace(".csv", "_graded.csv") data.to_csv(os.path.join(args.filedir, output_file), index=False) print("Scores saved to ", os.path.join(args.filedir, output_file)) diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_compare_scores.sh b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_compare_scores.sh index fdb9ae44..38eea97d 100644 --- a/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_compare_scores.sh +++ b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_compare_scores.sh @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + filedir=$WORKDIR/datasets/crag_results/ conv_rag="conv_rag_graded.csv" # replace with your file name ragagent="ragagent_graded.csv" # replace with your file name @@ -9,4 +12,4 @@ python3 compare_scores.py \ --conv_rag $conv_rag \ --ragagent $ragagent \ --reactagent $reactagent \ ---human_scores_file $human_scores_file \ No newline at end of file +--human_scores_file $human_scores_file diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_conv_rag.sh b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_conv_rag.sh index e29270c1..b5a7766a 100644 --- a/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_conv_rag.sh +++ b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_conv_rag.sh @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + MODEL="meta-llama/Meta-Llama-3.1-70B-Instruct" LLMENDPOINT=http://${host_ip}:8085 @@ -12,4 +15,4 @@ python3 conventional_rag.py \ --llm_endpoint_url ${LLMENDPOINT} \ --filedir ${FILEDIR} \ --filename ${FILENAME} \ ---output ${OUTPUT} \ No newline at end of file +--output ${OUTPUT} diff --git 
a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index df8f8f6e..8b98b60b 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -16,13 +16,13 @@ # from ragas.metrics import * from ragas import evaluate from ragas.metrics import ( - answer_correctness, - answer_relevancy, - answer_similarity, - context_precision, - context_recall, - faithfulness, - ) + answer_correctness, + answer_relevancy, + answer_similarity, + context_precision, + context_recall, + faithfulness, + ) except ModuleNotFoundError: raise ModuleNotFoundError("Please install ragas to use this metric. `pip install ragas`.") @@ -41,13 +41,13 @@ ] metrics_mapping = { - "answer_correctness": answer_correctness, - "answer_relevancy": answer_relevancy, - "answer_similarity": answer_similarity, - "context_precision": context_precision, - "context_recall": context_recall, - "faithfulness": faithfulness, - } + "answer_correctness": answer_correctness, + "answer_relevancy": answer_relevancy, + "answer_similarity": answer_similarity, + "context_precision": context_precision, + "context_recall": context_recall, + "faithfulness": faithfulness, +} def format_ragas_metric_name(name: str): @@ -82,7 +82,7 @@ def __init__( else: print("Accepting user-initialized model as we could not detect OpenAI key or HuggingFace Endpoint URL.") self.chat_model = self.model - + if self.metrics is not None: tmp_metrics = [] # check supported list @@ -131,7 +131,7 @@ def __init__( async def a_measure(self, test_case: Dict): return self.measure(test_case) - def measure(self, test_case: Dict): + def measure(self, test_case: Dict): # get only necessary columns from test case data = {column: test_case[column] for column in self._required_columns} dataset = Dataset.from_dict(data) From 88fb81a779f905eff3a87b9ad6d0e32b60c488af Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Mon, 2 Dec 2024 15:37:37 -0800 Subject: [PATCH 14/14] update llm judge model --- .../agent_eval/crag_eval/docker/launch_eval_container.sh | 2 +- .../run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml | 2 +- .../run_benchmark/llm_judge/launch_llm_judge_endpoint.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/evals/evaluation/agent_eval/crag_eval/docker/launch_eval_container.sh b/evals/evaluation/agent_eval/crag_eval/docker/launch_eval_container.sh index 681e79e9..cf25502f 100644 --- a/evals/evaluation/agent_eval/crag_eval/docker/launch_eval_container.sh +++ b/evals/evaluation/agent_eval/crag_eval/docker/launch_eval_container.sh @@ -4,4 +4,4 @@ volume=$WORKDIR host_ip=$(hostname -I | awk '{print $1}') -docker run -it --name rag_eval -v $volume:/home/user/ -e WORKDIR=/home/user -e HF_HOME=/home/user/hf_cache -e host_ip=$host_ip -e http_proxy=$http_proxy -e https_proxy=$https_proxy crag-eval:v1.1 +docker run -it --name crag_eval -v $volume:/home/user/ -e WORKDIR=/home/user -e HF_HOME=/home/user/hf_cache -e host_ip=$host_ip -e http_proxy=$http_proxy -e https_proxy=$https_proxy crag-eval:v1.1 diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml b/evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml index 572011ef..1ba0962a 100644 --- a/evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml +++ b/evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml @@ -3,7 +3,7 @@ services: tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:latest + image: 
ghcr.io/huggingface/tgi-gaudi:2.0.5 container_name: tgi-server ports: - "8085:80" diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh b/evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh index 0cb08d8f..1a57cd56 100644 --- a/evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh +++ b/evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh @@ -1,7 +1,7 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -export LLM_MODEL_ID="meta-llama/Meta-Llama-3-70B-Instruct" +export LLM_MODEL_ID="meta-llama/Meta-Llama-3.1-70B-Instruct" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export HF_CACHE_DIR=${HF_CACHE_DIR} docker compose -f docker-compose-llm-judge-gaudi.yaml up -d
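Once `launch_llm_judge_endpoint.sh` has started the judge, a quick request against TGI's OpenAI-compatible chat route can confirm the endpoint is serving before running `run_grading.sh`; the host, port, and prompt below are illustrative:

```python
# Minimal liveness check for the TGI judge endpoint (illustrative values).
import os

import requests

host_ip = os.environ.get("host_ip", "localhost")
url = f"http://{host_ip}:8085/v1/chat/completions"
payload = {
    "model": "tgi",  # TGI's Messages API accepts a placeholder model name
    "messages": [{"role": "user", "content": "Reply with OK."}],
    "max_tokens": 16,
}
resp = requests.post(url, json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```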