feat(autofix): Eval scores as a % fixed (#1416)
We now calculate % fixed and % correct root cause scores in evals. The root cause
highest score has also been renamed to just the root cause score. The new scores
appear stable and consistent with the previous run.

![CleanShot 2024-11-11 at 12 39 59@2x](https://github.com/user-attachments/assets/8c837b02-bb20-48ec-b682-74db4353b9eb)
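For context, here is a minimal sketch (not code from this commit; `aggregate_panel` and `judge` are hypothetical stand-ins for `score_one`/`score_root_causes` and the per-iteration scorers) of how a panel of judge results rolls up into the mean score and boolean verdict behind these percentages:

```python
# Minimal sketch of the panel aggregation, assuming each judge call returns a
# (score, verdict) pair as the per-iteration scorers do in the diff below.
from typing import Callable


def aggregate_panel(judge: Callable[[], tuple[float, bool]], n_panel: int) -> tuple[float, bool]:
    results = [judge() for _ in range(n_panel)]
    # Mean of the panel's float scores, rounded to two decimals as in score_one.
    mean_score = round(sum(score for score, _ in results) / n_panel, 2)
    # Majority rule: the item counts as fixed/correct if at least half of the panel says so.
    verdict = sum(1 for _, v in results if v) >= len(results) / 2
    return mean_score, verdict
```

The boolean verdicts are recorded in Langfuse as 1/0 scores (`is_fixed`, `rc_is_correct`), so averaging them across dataset items presumably yields the % fixed and % correct figures.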
jennmueng authored Nov 11, 2024
1 parent 36d930c commit 3505931
Showing 3 changed files with 80 additions and 110 deletions.
110 changes: 42 additions & 68 deletions src/seer/automation/autofix/evaluations.py
@@ -1,7 +1,6 @@
 import logging
 import textwrap
 from typing import Literal, TypedDict, cast
-from xml.etree import ElementTree as ET

 from langfuse.client import DatasetItemClient
 from langfuse.decorators import observe
@@ -28,11 +27,7 @@
 from seer.automation.autofix.steps.root_cause_step import RootCauseStep, RootCauseStepRequest
 from seer.automation.models import EventDetails, PromptXmlModel
 from seer.automation.pipeline import PIPELINE_SYNC_SIGNAL
-from seer.automation.utils import (
-    escape_multi_xml,
-    extract_text_inside_tags,
-    extract_xml_element_text,
-)
+from seer.automation.utils import extract_text_inside_tags

 logger = logging.getLogger(__name__)

@@ -246,7 +241,9 @@ def sync_run_evaluation_on_item(item: DatasetItemClient):


 @observe(name="Score fix")
-def score_fix_single_it(dataset_item: DatasetItemClient, predicted_diff: str, model: str) -> float:
+def score_fix_single_it(
+    dataset_item: DatasetItemClient, predicted_diff: str, model: str
+) -> tuple[float, bool]:
     if not dataset_item.expected_output:
         raise ValueError("Expected output is missing from dataset item")

@@ -282,7 +279,8 @@ def score_fix_single_it(dataset_item: DatasetItemClient, predicted_diff: str, mo
         - Consider that there are multiple ways to fix an issue
         Think step-by-step inside a <thoughts> tag before giving a score.
-        Return the score inside a <score> tag."""
+        Return the score inside a <score> tag.
+        Then, return your verdict of whether the predicted solution fixes the issue with a boolean value inside a <verdict> tag, such as <verdict>True</verdict> or <verdict>False</verdict>."""
     ).format(
         event_details=event_details.format_event(),
         expected_description=expected_output["solution_diff"]["description"],
@@ -293,43 +291,30 @@ def score_fix_single_it(dataset_item: DatasetItemClient, predicted_diff: str, mo
         model=OpenAiProvider.model(model),
         prompt=prompt,
     )

     if not response.message.content:
-        return 0
+        raise ValueError("No response content")

-    tree = ET.fromstring(f"<root>{escape_multi_xml(response.message.content, ['score'])}</root>")
-    score_str = extract_xml_element_text(tree, "score")
+    score_str = extract_text_inside_tags(response.message.content, "score")
     score = float(score_str) if score_str else 0

-    return score

+    verdict = extract_text_inside_tags(response.message.content, "verdict")
+    verdict_bool = (verdict or "False").lower() == "true"

-RootCauseScoreResult = TypedDict(
-    "RootCauseScoreResult",
-    {
-        "highest_score": float,
-        "position_score": float,
-        "mean_score": float,
-    },
-)
+    return score, verdict_bool


 @observe(name="Score root cause iteration")
 def score_root_cause_single_it(
     dataset_item: DatasetItemClient, causes: list[RootCauseAnalysisItem], model: str
-) -> list[float] | None:
+) -> tuple[float, bool]:
     if not dataset_item.expected_output:
         raise ValueError("Expected output is missing from dataset item")

     input_data: AutofixRequestDict = dataset_item.input
     expected_output: ExpectedOutputDict = dataset_item.expected_output
     root_cause_expected_str = expected_output.get("root_cause")
-    causes_xml = [RootCausePlanTaskPromptXml.from_root_cause(cause) for cause in causes]

-    solution_strs: list[str] = []
-    for i, cause in enumerate(causes_xml):
-        num = i + 1
-        solution_strs.append(f"<solution_{num}>{cause.to_prompt_str()}</solution_{num}>")
-    solutions_str = "\n".join(solution_strs)
+    cause_xml = RootCausePlanTaskPromptXml.from_root_cause(causes[0])

     request = AutofixRequest.model_validate(input_data["request"])

@@ -346,18 +331,19 @@ def score_root_cause_single_it(
         The model outputted the following possible root causes and solutions:
         <predicted_solutions>
-        {predicted_solutions}
+        {predicted_solution}
         </predicted_solutions>
         Score how well the predicted root cause and solution matches the expected root cause and solution with a float score from 0 to 1, where 1 means the solution fully fixes the issue and 0 means the solution does not fix the issue at all.
-        - The model will return multiple predicted root causes and solutions, ordered from most likely to least likely.
         Think step-by-step inside a <thoughts> tag before giving scores.
-        Score each solution inside a <score_{{n}}> tag, such as <score_1>0.5</score_1>, where n is the number of the solution."""
+        Score each solution inside a <score> tag, such as <score>0.5</score>.
+        Also, return your verdict of whether the predicted solution is the correct root cause of the issue with a boolean value inside a <verdict> tag, such as <verdict>True</verdict> or <verdict>False</verdict>."""
     ).format(
         event_details=event_details.format_event(),
         expected_output=root_cause_expected_str,
-        predicted_solutions=solutions_str,
+        predicted_solution=cause_xml.to_prompt_str(),
     )
     response = LlmClient().generate_text(
         model=OpenAiProvider.model(model),
@@ -366,53 +352,41 @@ def score_root_cause_single_it(
     if not response.message.content:
         raise ValueError("No response content")

-    scores: list[float] = []
-    for i in range(len(causes_xml)):
-        score_str = extract_text_inside_tags(response.message.content, f"score_{i + 1}")
-        score = float(score_str) if score_str else 0
-        scores.append(score)
+    score_str = extract_text_inside_tags(response.message.content, "score")
+    score = float(score_str) if score_str else 0

+    verdict_str = extract_text_inside_tags(response.message.content, "verdict")
+    verdict_bool = (verdict_str or "False").lower() == "true"

-    return scores
+    return score, verdict_bool


 @observe(name="Score one")
 def score_one(
     dataset_item: DatasetItemClient, predicted_diff_str: str, n_panel: int, model: str
-) -> float:
-    return round(
-        sum([score_fix_single_it(dataset_item, predicted_diff_str, model) for _ in range(n_panel)])
-        / n_panel,
-        2,
-    )
+) -> tuple[float, bool]:
+    results = [score_fix_single_it(dataset_item, predicted_diff_str, model) for _ in range(n_panel)]

+    mean_score = round(sum([result[0] for result in results]) / n_panel, 2)

+    # If at least half of the panel says the fix is correct, then the fix is correct.
+    verdict = sum(1 for result in results if result[1]) >= len(results) / 2

+    return mean_score, verdict


 @observe(name="Score root cause")
 def score_root_causes(
     dataset_item: DatasetItemClient, causes: list[RootCauseAnalysisItem], n_panel: int, model: str
-) -> RootCauseScoreResult:
-    all_results: list[list[float]] = []
-    i = 0
-    while i < n_panel:
-        result = score_root_cause_single_it(dataset_item, causes, model)
-        if result is None or len(result) != len(causes):
-            continue
-        else:
-            all_results.append(result)
-            i += 1

-    mean_scores = [round(sum(scores) / len(scores), 2) for scores in zip(*all_results)]

-    highest_score = max(mean_scores)
-    mean_score = sum(mean_scores) / len(mean_scores)

-    # Position score: 1.0 if the highest score is first, reduce the points the lower the score is, if it is at the last position, give 0.0
-    position_score = 1.0 - (mean_scores.index(highest_score) / len(mean_scores))

-    return {
-        "highest_score": highest_score,
-        "position_score": position_score,
-        "mean_score": mean_score,
-    }
+) -> tuple[float, bool]:
+    results = [score_root_cause_single_it(dataset_item, causes, model) for _ in range(n_panel)]

+    mean_score = round(sum([result[0] for result in results]) / len(results), 2)

+    # If at least half of the panel says the root cause is correct, then it is considered correct.
+    verdict = sum(1 for result in results if result[1]) >= len(results) / 2

+    return mean_score, verdict


 def make_score_name(model: str, n_panel: int, name: str) -> str:
24 changes: 19 additions & 5 deletions src/seer/automation/autofix/tasks.py
@@ -657,14 +657,21 @@ def run_autofix_evaluation_on_item(
         logger.exception(f"Error running evaluation: {e}")

     if diff:
-        score = score_one(
+        score, verdict = score_one(
             dataset_item,
             diff,
             n_panel=scoring_n_panel,
             model=scoring_model,
             langfuse_session_id=trace_id,
         )

+        langfuse.score(
+            trace_id=trace_id,
+            name=make_score_name(
+                model=scoring_model, n_panel=scoring_n_panel, name="is_fixed"
+            ),
+            value=1 if verdict else 0,
+        )
         langfuse.score(
             trace_id=trace_id,
             name=make_score_name(
@@ -689,7 +696,7 @@ def run_autofix_evaluation_on_item(
         )

     if causes:
-        root_cause_score = score_root_causes(
+        root_cause_score, root_cause_verdict = score_root_causes(
             dataset_item,
             causes,
             n_panel=scoring_n_panel,
@@ -701,19 +708,26 @@ def run_autofix_evaluation_on_item(
         # - `"rc_score"`: The mean panel score for the predicted root cause (previously the highest score across all returned root causes).
         # - `"rc_error_weighted_score"`: The same as the root cause score, but scored 0 if there is an error or no root cause returned. This is used to weight the score in the aggregated run result.

+        langfuse.score(
+            trace_id=trace_id,
+            name=make_score_name(
+                model=scoring_model, n_panel=scoring_n_panel, name="rc_is_correct"
+            ),
+            value=1 if root_cause_verdict else 0,
+        )
         langfuse.score(
             trace_id=trace_id,
             name=make_score_name(
                 model=scoring_model, n_panel=scoring_n_panel, name="rc_error_weighted_score"
             ),
-            value=root_cause_score.get("highest_score"),
+            value=root_cause_score,
         )
         langfuse.score(
             trace_id=trace_id,
             name=make_score_name(
-                model=scoring_model, n_panel=scoring_n_panel, name="rc_highest_score"
+                model=scoring_model, n_panel=scoring_n_panel, name="rc_score"
             ),
-            value=root_cause_score.get("highest_score"),
+            value=root_cause_score,
         )
     else:
         langfuse.score(