Moved deep_eval to input for combine outputs
dividor committed Mar 28, 2024
1 parent bdad1fe commit e141d0c
Showing 3 changed files with 25 additions and 24 deletions.
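
In short: the deep_eval node previously ran downstream of process_output, taking the whole processed dict as its single input. This commit inverts that dependency — deep_eval now reads rweb_results, user_question, and actual_output directly from the upstream nodes, and process_output takes the resulting score and reason as two new inputs, so they land in the flow's combined output.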
25 changes: 10 additions & 15 deletions flows/reliefweb_chat/deep_eval.py
@@ -80,13 +80,19 @@ def get_model_name(self):
 # Deep eval, see https://github.com/confident-ai/deepeval
 @tool
 def test_case(
-    processed_output: dict, conn: AzureOpenAIConnection, deployment_name: str
-):
+    rweb_results: str,
+    user_question: str,
+    actual_output: str,
+    conn: AzureOpenAIConnection,
+    deployment_name: str,
+) -> dict:
     """
     An example function for evaluating a question using the deepeval library.
 
     Args:
-        processed_output (dict): The processed output containing the necessary data for evaluation.
+        rweb_results (str): The results from the ReliefWeb API.
+        user_question (str): The user question to evaluate.
+        actual_output (str): The actual output to evaluate against.
         conn (AzureOpenAIConnection): The AzureOpenAIConnection object for connecting to Azure services.
         deployment_name (str): The name of the deployment.
@@ -95,13 +101,6 @@ def test_case(
     """
     conn_dict = dict(conn)
 
-    rweb_results = processed_output["rweb_results"]
-    input = ""
-    for r in rweb_results:
-        input += r["title"] + " " + str(r["body"])
-
-    actual_output = processed_output["llm_summary_result_processed"]
-
     # Set up LLM connection
     custom_model = AzureChatOpenAI(
         openai_api_version=conn_dict["api_version"],
@@ -111,10 +110,6 @@ def test_case(
     )
     model = AzureOpenAI(model=custom_model)
 
-    user_question = processed_output["user_question"]
-    actual_output = (processed_output["llm_question_result"],)
-    rweb_results = str(processed_output["rweb_results"])
-
     print("user_question: ", user_question)
     print("actual_output: ", actual_output)
     print("rweb_results: ", rweb_results)
@@ -128,4 +123,4 @@ def test_case(
 
     metric.measure(test_case)
 
-    return {"deepeval_score": metric.score, "deepevalscore_reason": metric.reason}
+    return {"deep_eval_score": metric.score, "deep_eval_score_reason": metric.reason}
22 changes: 13 additions & 9 deletions flows/reliefweb_chat/flow.dag.yaml
@@ -534,6 +534,8 @@ nodes:
     llm_question_result: ${answer_question.output}
     rweb_query: ${create_rweb_query.output}
     content_safety_result: ${content_safety.output.suggested_action}
+    deep_eval_score: ${deep_eval.output.deep_eval_score}
+    deep_eval_score_reason: ${deep_eval.output.deep_eval_score_reason}
 - name: extract_references
   type: python
   source:
@@ -564,15 +566,6 @@ nodes:
   inputs:
     connection: azure_content_safety_connection
     text: ${inputs.question}
-- name: deep_eval
-  type: python
-  source:
-    type: code
-    path: deep_eval.py
-  inputs:
-    processed_output: ${process_output.output}
-    conn: azure_openai
-    deployment_name: gpt-35-turbo-16k
 - name: concatenate_scores
   type: python
   source:
@@ -608,6 +601,17 @@ nodes:
   inputs:
     connection: azure_content_safety_connection
     text: ${inputs.question}
+- name: deep_eval
+  type: python
+  source:
+    type: code
+    path: deep_eval.py
+  inputs:
+    rweb_results: ${get_rweb_results.output}
+    user_question: ${inputs.question}
+    actual_output: ${answer_question.output}
+    conn: azure_openai
+    deployment_name: gpt-35-turbo-16k
 node_variants:
   summarize:
     default_variant_id: variant_0
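
Two things happen in flow.dag.yaml: the deep_eval node's inputs switch from the single processed_output: ${process_output.output} to three raw upstream references, and process_output gains the two deep_eval_score inputs — reversing the direction of the dependency between the two nodes. The node definition also moves later in the file, but in a promptflow DAG the execution order follows the ${...} input references, not the order in which nodes appear in the YAML.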
2 changes: 2 additions & 0 deletions flows/reliefweb_chat/process_output.py
@@ -14,6 +14,8 @@ def process_output(
     refs: str,
     llm_question_result: str,
     content_safety_result: str,
+    deep_eval_score: float,
+    deep_eval_score_reason: str,
 ) -> dict:
 
     # TODO Hack for bug where running full output generates different output compared to just running this node.
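
The rest of process_output is collapsed above; a minimal sketch of what the extended signature enables — passing the evaluation fields through to the combined dict. The real parameter list is longer than the visible hunk, and this body is an assumption, not the repo's code:

    def process_output(
        refs: str,
        llm_question_result: str,
        content_safety_result: str,
        deep_eval_score: float,
        deep_eval_score_reason: str,
    ) -> dict:
        # Sketch only: fold the new evaluation fields into the combined output.
        return {
            "refs": refs,
            "llm_question_result": llm_question_result,
            "content_safety_result": content_safety_result,
            "deep_eval_score": deep_eval_score,
            "deep_eval_score_reason": deep_eval_score_reason,
        }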
