feat(autofix): Eval scores as a % fixed (#1416)
We now calculate % fixed and % correct root cause scores in evals. The root cause
highest score has also been renamed to just the root cause score. The new scores
appear stable and consistent with the previous run.

![CleanShot 2024-11-11 at 12 39 59@2x](https://github.com/user-attachments/assets/8c837b02-bb20-48ec-b682-74db4353b9eb)
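For context, here is a minimal sketch (not code from this commit; `aggregate_panel` and `judge` are hypothetical stand-ins for `score_one`/`score_root_causes` and the per-iteration scorers) of how a panel of judge results rolls up into the mean score and boolean verdict behind these percentages:

```python
# Minimal sketch of the panel aggregation, assuming each judge call returns a
# (score, verdict) pair as the per-iteration scorers do in the diff below.
from typing import Callable


def aggregate_panel(judge: Callable[[], tuple[float, bool]], n_panel: int) -> tuple[float, bool]:
    results = [judge() for _ in range(n_panel)]
    # Mean of the panel's float scores, rounded to two decimals as in score_one.
    mean_score = round(sum(score for score, _ in results) / n_panel, 2)
    # Majority rule: the item counts as fixed/correct if at least half of the panel says so.
    verdict = sum(1 for _, v in results if v) >= len(results) / 2
    return mean_score, verdict
```

The boolean verdicts are recorded in Langfuse as 1/0 scores (`is_fixed`, `rc_is_correct`), so averaging them across dataset items presumably yields the % fixed and % correct figures.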
jennmueng authored Nov 11, 2024
1 parent 36d930c commit 3505931
Showing 3 changed files with 80 additions and 110 deletions.
110 changes: 42 additions & 68 deletions src/seer/automation/autofix/evaluations.py
@@ -1,7 +1,6 @@
 import logging
 import textwrap
 from typing import Literal, TypedDict, cast
-from xml.etree import ElementTree as ET

 from langfuse.client import DatasetItemClient
 from langfuse.decorators import observe
@@ -28,11 +27,7 @@
 from seer.automation.autofix.steps.root_cause_step import RootCauseStep, RootCauseStepRequest
 from seer.automation.models import EventDetails, PromptXmlModel
 from seer.automation.pipeline import PIPELINE_SYNC_SIGNAL
-from seer.automation.utils import (
-    escape_multi_xml,
-    extract_text_inside_tags,
-    extract_xml_element_text,
-)
+from seer.automation.utils import extract_text_inside_tags

 logger = logging.getLogger(__name__)

@@ -246,7 +241,9 @@ def sync_run_evaluation_on_item(item: DatasetItemClient):


 @observe(name="Score fix")
-def score_fix_single_it(dataset_item: DatasetItemClient, predicted_diff: str, model: str) -> float:
+def score_fix_single_it(
+    dataset_item: DatasetItemClient, predicted_diff: str, model: str
+) -> tuple[float, bool]:
     if not dataset_item.expected_output:
         raise ValueError("Expected output is missing from dataset item")

@@ -282,7 +279,8 @@ def score_fix_single_it(dataset_item: DatasetItemClient, predicted_diff: str, mo
         - Consider that there are multiple ways to fix an issue
         Think step-by-step inside a <thoughts> tag before giving a score.
-        Return the score inside a <score> tag."""
+        Return the score inside a <score> tag.
+        Then, return your verdict of whether the predicted solution fixes the issue with a boolean value inside a <verdict> tag, such as <verdict>True</verdict> or <verdict>False</verdict>."""
     ).format(
         event_details=event_details.format_event(),
         expected_description=expected_output["solution_diff"]["description"],
@@ -293,43 +291,30 @@ def score_fix_single_it(dataset_item: DatasetItemClient, predicted_diff: str, mo
         model=OpenAiProvider.model(model),
         prompt=prompt,
     )

     if not response.message.content:
-        return 0
+        raise ValueError("No response content")

-    tree = ET.fromstring(f"<root>{escape_multi_xml(response.message.content, ['score'])}</root>")
-    score_str = extract_xml_element_text(tree, "score")
+    score_str = extract_text_inside_tags(response.message.content, "score")
     score = float(score_str) if score_str else 0

-    return score

+    verdict = extract_text_inside_tags(response.message.content, "verdict")
+    verdict_bool = (verdict or "False").lower() == "true"

-RootCauseScoreResult = TypedDict(
-    "RootCauseScoreResult",
-    {
-        "highest_score": float,
-        "position_score": float,
-        "mean_score": float,
-    },
-)
+    return score, verdict_bool


 @observe(name="Score root cause iteration")
 def score_root_cause_single_it(
     dataset_item: DatasetItemClient, causes: list[RootCauseAnalysisItem], model: str
-) -> list[float] | None:
+) -> tuple[float, bool]:
     if not dataset_item.expected_output:
         raise ValueError("Expected output is missing from dataset item")

     input_data: AutofixRequestDict = dataset_item.input
     expected_output: ExpectedOutputDict = dataset_item.expected_output
     root_cause_expected_str = expected_output.get("root_cause")
-    causes_xml = [RootCausePlanTaskPromptXml.from_root_cause(cause) for cause in causes]

-    solution_strs: list[str] = []
-    for i, cause in enumerate(causes_xml):
-        num = i + 1
-        solution_strs.append(f"<solution_{num}>{cause.to_prompt_str()}</solution_{num}>")
-    solutions_str = "\n".join(solution_strs)
+    cause_xml = RootCausePlanTaskPromptXml.from_root_cause(causes[0])

     request = AutofixRequest.model_validate(input_data["request"])

@@ -346,18 +331,19 @@ def score_root_cause_single_it(
         The model outputted the following possible root causes and solutions:
         <predicted_solutions>
-        {predicted_solutions}
+        {predicted_solution}
         </predicted_solutions>
         Score how well the predicted root cause and solution matches the expected root cause and solution with a float score from 0 to 1, where 1 means the solution fully fixes the issue and 0 means the solution does not fix the issue at all.
-        - The model will return multiple predicted root causes and solutions, ordered from most likely to least likely.
         Think step-by-step inside a <thoughts> tag before giving scores.
-        Score each solution inside a <score_{{n}}> tag, such as <score_1>0.5</score_1>, where n is the number of the solution."""
+        Score each solution inside a <score> tag, such as <score>0.5</score>.
+        Also, return your verdict of whether the predicted solution is the correct root cause of the issue with a boolean value inside a <verdict> tag, such as <verdict>True</verdict> or <verdict>False</verdict>."""
     ).format(
         event_details=event_details.format_event(),
         expected_output=root_cause_expected_str,
-        predicted_solutions=solutions_str,
+        predicted_solution=cause_xml.to_prompt_str(),
     )
     response = LlmClient().generate_text(
         model=OpenAiProvider.model(model),
@@ -366,53 +352,41 @@ def score_root_cause_single_it(
     if not response.message.content:
         raise ValueError("No response content")

-    scores: list[float] = []
-    for i in range(len(causes_xml)):
-        score_str = extract_text_inside_tags(response.message.content, f"score_{i + 1}")
-        score = float(score_str) if score_str else 0
-        scores.append(score)
+    score_str = extract_text_inside_tags(response.message.content, "score")
+    score = float(score_str) if score_str else 0

+    verdict_str = extract_text_inside_tags(response.message.content, "verdict")
+    verdict_bool = (verdict_str or "False").lower() == "true"

-    return scores
+    return score, verdict_bool


 @observe(name="Score one")
 def score_one(
     dataset_item: DatasetItemClient, predicted_diff_str: str, n_panel: int, model: str
-) -> float:
-    return round(
-        sum([score_fix_single_it(dataset_item, predicted_diff_str, model) for _ in range(n_panel)])
-        / n_panel,
-        2,
-    )
+) -> tuple[float, bool]:
+    results = [score_fix_single_it(dataset_item, predicted_diff_str, model) for _ in range(n_panel)]

+    mean_score = round(sum([result[0] for result in results]) / n_panel, 2)

+    # If at least half of the panel says the fix is correct, then the fix is correct.
+    verdict = sum(1 for result in results if result[1]) >= len(results) / 2

+    return mean_score, verdict


 @observe(name="Score root cause")
 def score_root_causes(
     dataset_item: DatasetItemClient, causes: list[RootCauseAnalysisItem], n_panel: int, model: str
-) -> RootCauseScoreResult:
-    all_results: list[list[float]] = []
-    i = 0
-    while i < n_panel:
-        result = score_root_cause_single_it(dataset_item, causes, model)
-        if result is None or len(result) != len(causes):
-            continue
-        else:
-            all_results.append(result)
-            i += 1

-    mean_scores = [round(sum(scores) / len(scores), 2) for scores in zip(*all_results)]

-    highest_score = max(mean_scores)
-    mean_score = sum(mean_scores) / len(mean_scores)

-    # Position score: 1.0 if the highest score is first, reduce the points the lower the score is, if it is at the last position, give 0.0
-    position_score = 1.0 - (mean_scores.index(highest_score) / len(mean_scores))

-    return {
-        "highest_score": highest_score,
-        "position_score": position_score,
-        "mean_score": mean_score,
-    }
+) -> tuple[float, bool]:
+    results = [score_root_cause_single_it(dataset_item, causes, model) for _ in range(n_panel)]

+    mean_score = round(sum([result[0] for result in results]) / len(results), 2)

+    # If at least half of the panel says the root cause is correct, then it is considered correct.
+    verdict = sum(1 for result in results if result[1]) >= len(results) / 2

+    return mean_score, verdict


 def make_score_name(model: str, n_panel: int, name: str) -> str:
24 changes: 19 additions & 5 deletions src/seer/automation/autofix/tasks.py
@@ -657,14 +657,21 @@ def run_autofix_evaluation_on_item(
         logger.exception(f"Error running evaluation: {e}")

     if diff:
-        score = score_one(
+        score, verdict = score_one(
             dataset_item,
             diff,
             n_panel=scoring_n_panel,
             model=scoring_model,
             langfuse_session_id=trace_id,
         )

+        langfuse.score(
+            trace_id=trace_id,
+            name=make_score_name(
+                model=scoring_model, n_panel=scoring_n_panel, name="is_fixed"
+            ),
+            value=1 if verdict else 0,
+        )
         langfuse.score(
             trace_id=trace_id,
             name=make_score_name(
@@ -689,7 +696,7 @@ def run_autofix_evaluation_on_item(
         )

     if causes:
-        root_cause_score = score_root_causes(
+        root_cause_score, root_cause_verdict = score_root_causes(
             dataset_item,
             causes,
             n_panel=scoring_n_panel,
@@ -701,19 +708,26 @@ def run_autofix_evaluation_on_item(
         # - `"rc_score"`: The mean panel score for the predicted root cause (previously the highest score across all returned root causes).
         # - `"rc_error_weighted_score"`: The same as the root cause score, but scored 0 if there is an error or no root cause returned. This is used to weight the score in the aggregated run result.

+        langfuse.score(
+            trace_id=trace_id,
+            name=make_score_name(
+                model=scoring_model, n_panel=scoring_n_panel, name="rc_is_correct"
+            ),
+            value=1 if root_cause_verdict else 0,
+        )
         langfuse.score(
             trace_id=trace_id,
             name=make_score_name(
                 model=scoring_model, n_panel=scoring_n_panel, name="rc_error_weighted_score"
             ),
-            value=root_cause_score.get("highest_score"),
+            value=root_cause_score,
         )
         langfuse.score(
             trace_id=trace_id,
             name=make_score_name(
-                model=scoring_model, n_panel=scoring_n_panel, name="rc_highest_score"
+                model=scoring_model, n_panel=scoring_n_panel, name="rc_score"
             ),
-            value=root_cause_score.get("highest_score"),
+            value=root_cause_score,
         )
     else:
         langfuse.score(