Update Evals #1176

Merged · 1 commit · Sep 29, 2024
File renamed without changes.
File renamed without changes.
@@ -1,15 +1,15 @@
 from typing import Optional

-from phi.assistant import Assistant
+from phi.agent import Agent
 from phi.eval import Eval, EvalResult
-from phi.llm.openai import OpenAIChat
+from phi.model.openai import OpenAIChat
 from phi.tools.calculator import Calculator


 def multiply_and_exponentiate():
     evaluation = Eval(
-        assistant=Assistant(
-            llm=OpenAIChat(model="gpt-4o-mini"),
+        agent=Agent(
+            model=OpenAIChat(id="gpt-4o-mini"),
             tools=[Calculator(add=True, multiply=True, exponentiate=True)],
         ),
         question="What is 10*5 then to the power of 2? do it step by step",
@@ -23,8 +23,8 @@ def multiply_and_exponentiate():

 def factorial():
     evaluation = Eval(
-        assistant=Assistant(
-            llm=OpenAIChat(model="gpt-4o-mini"),
+        agent=Agent(
+            model=OpenAIChat(id="gpt-4o-mini"),
             tools=[Calculator(factorial=True)],
         ),
         question="What is 10!?",
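For orientation, a completed version of the updated example might look like the sketch below. The Eval fields and the run() signature are taken from phi/eval/eval.py later in this diff; the ideal_answer value and the final print are illustrative assumptions, since the remainder of the example is collapsed above.

from phi.agent import Agent
from phi.eval import Eval
from phi.model.openai import OpenAIChat
from phi.tools.calculator import Calculator


def multiply_and_exponentiate():
    # Eval asks the agent the question, then scores the answer against
    # ideal_answer with the default accuracy evaluator.
    evaluation = Eval(
        agent=Agent(
            model=OpenAIChat(id="gpt-4o-mini"),
            tools=[Calculator(add=True, multiply=True, exponentiate=True)],
        ),
        question="What is 10*5 then to the power of 2? do it step by step",
        ideal_answer="2500",  # (10 * 5) ** 2 = 2500; illustrative value
    )
    result = evaluation.run()  # Optional[EvalResult]
    print(result)


multiply_and_exponentiate()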
4 changes: 2 additions & 2 deletions phi/agent/agent.py
@@ -1159,7 +1159,7 @@ def _run(
                 "functions": functions,
                 "metrics": self.model.metrics if self.model else None,
             }
-            self.log_agent_run(run_id=self.run_response.run_id, run_data=run_data)
+            self.log_agent_run(run_id=self.run_id, run_data=run_data)

         logger.debug(f"*********** Agent Run End: {self.run_response.run_id} ***********")
         if stream_intermediate_steps:
@@ -1537,7 +1537,7 @@ async def _arun(
                 "functions": functions,
                 "metrics": self.model.metrics if self.model else None,
             }
-            self.log_agent_run(run_id=self.run_response.run_id, run_data=run_data)
+            self.log_agent_run(run_id=self.run_id, run_data=run_data)

         logger.debug(f"*********** Async Agent Run End: {self.run_response.run_id} ***********")
         if stream_intermediate_steps:
67 changes: 32 additions & 35 deletions phi/eval/eval.py
@@ -4,13 +4,13 @@

 from pydantic import BaseModel, ConfigDict, field_validator, Field

-from phi.assistant import Assistant
+from phi.agent import Agent, RunResponse
 from phi.utils.log import logger, set_log_level_to_debug
 from phi.utils.timer import Timer


 class AccuracyResult(BaseModel):
-    score: int = Field(..., description="Accuracy Score between 1 and 10 assigned to the AI Assistant's answer.")
+    score: int = Field(..., description="Accuracy Score between 1 and 10 assigned to the Agent's answer.")
     reason: str = Field(..., description="Detailed reasoning for the accuracy score.")


@@ -20,12 +20,12 @@ class EvalResult(BaseModel):


 class Eval(BaseModel):
-    # Name of the evaluation
+    # Evaluation name
     name: Optional[str] = None
-    # UUID of the evaluation (autogenerated if not set)
-    eval_id: Optional[str] = Field(None, validate_default=True)
-    # Assistant to evaluate
-    assistant: Optional[Assistant] = None
+    # Evaluation UUID (autogenerated if not set)
+    eval_id: str = Field(default_factory=lambda: str(uuid4()))
+    # Agent to evaluate
+    agent: Optional[Agent] = None

     # Question to evaluate
     question: str
@@ -35,7 +35,7 @@ class EvalResult(BaseModel):
     # Result of the evaluation
     result: Optional[EvalResult] = None

-    accuracy_evaluator: Optional[Assistant] = None
+    accuracy_evaluator: Optional[Agent] = None
     accuracy_guidelines: Optional[List[str]] = None
     accuracy_result: Optional[AccuracyResult] = None

@@ -54,29 +54,25 @@ def set_log_level(cls, v: bool) -> bool:
             logger.debug("Debug logs enabled")
         return v

-    @field_validator("eval_id", mode="before")
-    def set_eval_id(cls, v: Optional[str]) -> str:
-        return v if v is not None else str(uuid4())
-
-    def get_accuracy_evaluator(self) -> Assistant:
+    def get_accuracy_evaluator(self) -> Agent:
         if self.accuracy_evaluator is not None:
             return self.accuracy_evaluator

         try:
-            from phi.llm.openai import OpenAIChat
+            from phi.model.openai import OpenAIChat
         except ImportError:
             raise ImportError("`openai` is required for the default evaluator.")

         accuracy_guidelines = ""
         if self.accuracy_guidelines is not None and len(self.accuracy_guidelines) > 0:
-            accuracy_guidelines = "\nThe AI Assistant's answer must follow these guidelines:\n"
+            accuracy_guidelines = "\nThe AI Agent's answer must follow these guidelines:\n"
             accuracy_guidelines += "\n- ".join(self.accuracy_guidelines)

-        self.accuracy_evaluator = Assistant(
-            llm=OpenAIChat(model="gpt-4o-mini"),
+        return Agent(
+            model=OpenAIChat(id="gpt-4o-mini"),
             description=f"""\
-You are an evaluator tasked with comparing an AI Assistant's answer to an ideal answer for a given question.
-You will assess the similarity and accuracy of the Assistant's answer and provide a score on a scale of 1 to 10, where 10 means the answers match exactly.
+You are an evaluator tasked with comparing an AI Agent's answer to an ideal answer for a given question.
+You will assess the similarity and accuracy of the Agent's answer and provide a score on a scale of 1 to 10, where 10 means the answers match exactly.

 Here is the question:
 <question>
@@ -88,7 +84,7 @@ def get_accuracy_evaluator(self) -> Assistant:
 {self.ideal_answer}
 </ideal_answer>

-Compare the Assistant's answer to the ideal answer. Consider the following aspects:
+Compare the Agent's answer to the ideal answer. Consider the following aspects:
 - Accuracy of information
 - Completeness of the answer
 - Relevance to the question
@@ -98,44 +94,43 @@ def get_accuracy_evaluator(self) -> Assistant:

 Provide your reasoning for the comparison, highlighting similarities and differences between the two answers.
 Make sure to follow the guidelines and be as objective as possible in your evaluation. Mention the guidelines you followed in your reasoning.
-Be specific about what the Assistant's answer includes or lacks compared to the ideal answer.
+Be specific about what the Agent's answer includes or lacks compared to the ideal answer.

 Based on your comparison, assign a score from 1 to 10, where:
-1 = The answers are completely different or the Assistant's answer is entirely incorrect
-5 = The Assistant's answer captures some key points but misses others or contains some inaccuracies
-10 = The Assistant's answer matches the ideal answer exactly in content and presentation
+1 = The answers are completely different or the Agent's answer is entirely incorrect
+5 = The Agent's answer captures some key points but misses others or contains some inaccuracies
+10 = The Agent's answer matches the ideal answer exactly in content and presentation

 Only use whole numbers for the score (no decimals).
 """,
-            output_model=AccuracyResult,
+            response_model=AccuracyResult,
         )
-        return self.accuracy_evaluator

     def run(self, answer: Optional[Union[str, Callable]] = None) -> Optional[EvalResult]:
         logger.debug(f"*********** Evaluation Start: {self.eval_id} ***********")

-        answer_to_evaluate = None
+        answer_to_evaluate: Optional[RunResponse] = None
         if answer is None:
-            if self.assistant is not None:
-                logger.debug("Getting answer from assistant")
-                answer_to_evaluate: str = self.assistant.run(self.question, stream=False)  # type: ignore
+            if self.agent is not None:
+                logger.debug("Getting answer from agent")
+                answer_to_evaluate = self.agent.run(self.question)
             if self.answer is not None:
-                answer_to_evaluate = self.answer
+                answer_to_evaluate = RunResponse(content=self.answer)
         else:
             try:
                 if callable(answer):
                     logger.debug("Getting answer from callable")
-                    answer_to_evaluate = answer()
+                    answer_to_evaluate = RunResponse(content=answer())
                 else:
-                    answer_to_evaluate = answer
+                    answer_to_evaluate = RunResponse(content=answer)
             except Exception as e:
                 logger.error(f"Failed to get answer: {e}")
                 raise

         if answer_to_evaluate is None:
             raise ValueError("No Answer to evaluate.")
-        else:
-            self.answer = answer_to_evaluate
+        self.answer = answer_to_evaluate.content

         logger.debug("************************ Evaluating ************************")
         logger.debug(f"Question: {self.question}")
@@ -146,7 +141,9 @@ def run(self, answer: Optional[Union[str, Callable]] = None) -> Optional[EvalResult]:
         logger.debug("Evaluating accuracy...")
         accuracy_evaluator = self.get_accuracy_evaluator()
         try:
-            self.accuracy_result: AccuracyResult = accuracy_evaluator.run(answer_to_evaluate, stream=False)  # type: ignore
+            self.accuracy_result: AccuracyResult = accuracy_evaluator.run(
+                answer_to_evaluate.content, stream=False
+            ).content
         except Exception as e:
             logger.error(f"Failed to evaluate accuracy: {e}")
             return None
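A usage note on the reworked run(): an Agent is no longer required at all, since a plain string or a callable answer is now wrapped in a RunResponse before scoring. A minimal sketch under that reading of the diff; the field names come from references in eval.py (ideal_answer, accuracy_guidelines), while the question, answers, and guideline text are illustrative.

from phi.eval import Eval

evaluation = Eval(
    question="What is 10!?",
    ideal_answer="3628800",
    accuracy_guidelines=["State the final numeric value explicitly."],
)

# Score a precomputed answer; Eval wraps it as RunResponse(content=answer).
result = evaluation.run(answer="10! = 3628800")

# Or defer to a callable; its return value is wrapped the same way.
result = evaluation.run(answer=lambda: "3628800")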
2 changes: 1 addition & 1 deletion phi/run/response.py
@@ -28,7 +28,7 @@ class RunResponse(BaseModel):
     messages: Optional[List[Message]] = None
     metrics: Optional[Dict[str, Any]] = None
     model: Optional[str] = None
-    run_id: str
+    run_id: Optional[str] = None
     tools: Optional[List[Dict[str, Any]]] = None
     created_at: int = Field(default_factory=lambda: int(time()))
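This loosened run_id is what lets the pieces above fit together: Eval builds bare RunResponse objects for string and callable answers, and the Agent now passes its own run_id to log_agent_run rather than reading it off the response. A small illustration, assuming only what this diff shows:

from phi.run.response import RunResponse

# A response created outside an agent run (as Eval does for str/callable answers)
# no longer needs a run identifier.
resp = RunResponse(content="3628800")
print(resp.run_id)  # None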