Update Evals #1176

Merged · 1 commit · Sep 29, 2024
File renamed without changes.
File renamed without changes.
@@ -1,15 +1,15 @@
 from typing import Optional

-from phi.assistant import Assistant
+from phi.agent import Agent
 from phi.eval import Eval, EvalResult
-from phi.llm.openai import OpenAIChat
+from phi.model.openai import OpenAIChat
 from phi.tools.calculator import Calculator


 def multiply_and_exponentiate():
     evaluation = Eval(
-        assistant=Assistant(
-            llm=OpenAIChat(model="gpt-4o-mini"),
+        agent=Agent(
+            model=OpenAIChat(id="gpt-4o-mini"),
             tools=[Calculator(add=True, multiply=True, exponentiate=True)],
         ),
         question="What is 10*5 then to the power of 2? do it step by step",
@@ -23,8 +23,8 @@ def multiply_and_exponentiate():

 def factorial():
     evaluation = Eval(
-        assistant=Assistant(
-            llm=OpenAIChat(model="gpt-4o-mini"),
+        agent=Agent(
+            model=OpenAIChat(id="gpt-4o-mini"),
             tools=[Calculator(factorial=True)],
         ),
         question="What is 10!?",
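For orientation, a completed version of the updated example might look like the sketch below. The Eval fields and the run() signature are taken from phi/eval/eval.py later in this diff; the ideal_answer value and the final print are illustrative assumptions, since the remainder of the example is collapsed above.

from phi.agent import Agent
from phi.eval import Eval
from phi.model.openai import OpenAIChat
from phi.tools.calculator import Calculator


def multiply_and_exponentiate():
    # Eval asks the agent the question, then scores the answer against
    # ideal_answer with the default accuracy evaluator.
    evaluation = Eval(
        agent=Agent(
            model=OpenAIChat(id="gpt-4o-mini"),
            tools=[Calculator(add=True, multiply=True, exponentiate=True)],
        ),
        question="What is 10*5 then to the power of 2? do it step by step",
        ideal_answer="2500",  # (10 * 5) ** 2 = 2500; illustrative value
    )
    result = evaluation.run()  # Optional[EvalResult]
    print(result)


multiply_and_exponentiate()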
4 changes: 2 additions & 2 deletions phi/agent/agent.py
@@ -1159,7 +1159,7 @@ def _run(
                 "functions": functions,
                 "metrics": self.model.metrics if self.model else None,
             }
-            self.log_agent_run(run_id=self.run_response.run_id, run_data=run_data)
+            self.log_agent_run(run_id=self.run_id, run_data=run_data)

         logger.debug(f"*********** Agent Run End: {self.run_response.run_id} ***********")
         if stream_intermediate_steps:
@@ -1537,7 +1537,7 @@ async def _arun(
                 "functions": functions,
                 "metrics": self.model.metrics if self.model else None,
             }
-            self.log_agent_run(run_id=self.run_response.run_id, run_data=run_data)
+            self.log_agent_run(run_id=self.run_id, run_data=run_data)

         logger.debug(f"*********** Async Agent Run End: {self.run_response.run_id} ***********")
         if stream_intermediate_steps:
67 changes: 32 additions & 35 deletions phi/eval/eval.py
@@ -4,13 +4,13 @@

 from pydantic import BaseModel, ConfigDict, field_validator, Field

-from phi.assistant import Assistant
+from phi.agent import Agent, RunResponse
 from phi.utils.log import logger, set_log_level_to_debug
 from phi.utils.timer import Timer


 class AccuracyResult(BaseModel):
-    score: int = Field(..., description="Accuracy Score between 1 and 10 assigned to the AI Assistant's answer.")
+    score: int = Field(..., description="Accuracy Score between 1 and 10 assigned to the Agent's answer.")
     reason: str = Field(..., description="Detailed reasoning for the accuracy score.")


@@ -20,12 +20,12 @@ class EvalResult(BaseModel):


 class Eval(BaseModel):
-    # Name of the evaluation
+    # Evaluation name
     name: Optional[str] = None
-    # UUID of the evaluation (autogenerated if not set)
-    eval_id: Optional[str] = Field(None, validate_default=True)
-    # Assistant to evaluate
-    assistant: Optional[Assistant] = None
+    # Evaluation UUID (autogenerated if not set)
+    eval_id: str = Field(default_factory=lambda: str(uuid4()))
+    # Agent to evaluate
+    agent: Optional[Agent] = None

     # Question to evaluate
     question: str
@@ -35,7 +35,7 @@ class EvalResult(BaseModel):
     # Result of the evaluation
     result: Optional[EvalResult] = None

-    accuracy_evaluator: Optional[Assistant] = None
+    accuracy_evaluator: Optional[Agent] = None
     accuracy_guidelines: Optional[List[str]] = None
     accuracy_result: Optional[AccuracyResult] = None

@@ -54,29 +54,25 @@ def set_log_level(cls, v: bool) -> bool:
             logger.debug("Debug logs enabled")
         return v

-    @field_validator("eval_id", mode="before")
-    def set_eval_id(cls, v: Optional[str]) -> str:
-        return v if v is not None else str(uuid4())
-
-    def get_accuracy_evaluator(self) -> Assistant:
+    def get_accuracy_evaluator(self) -> Agent:
         if self.accuracy_evaluator is not None:
             return self.accuracy_evaluator

         try:
-            from phi.llm.openai import OpenAIChat
+            from phi.model.openai import OpenAIChat
         except ImportError:
             raise ImportError("`openai` is required for the default evaluator.")

         accuracy_guidelines = ""
         if self.accuracy_guidelines is not None and len(self.accuracy_guidelines) > 0:
-            accuracy_guidelines = "\nThe AI Assistant's answer must follow these guidelines:\n"
+            accuracy_guidelines = "\nThe AI Agent's answer must follow these guidelines:\n"
             accuracy_guidelines += "\n- ".join(self.accuracy_guidelines)

-        self.accuracy_evaluator = Assistant(
-            llm=OpenAIChat(model="gpt-4o-mini"),
+        return Agent(
+            model=OpenAIChat(id="gpt-4o-mini"),
             description=f"""\
-You are an evaluator tasked with comparing an AI Assistant's answer to an ideal answer for a given question.
-You will assess the similarity and accuracy of the Assistant's answer and provide a score on a scale of 1 to 10, where 10 means the answers match exactly.
+You are an evaluator tasked with comparing an AI Agent's answer to an ideal answer for a given question.
+You will assess the similarity and accuracy of the Agent's answer and provide a score on a scale of 1 to 10, where 10 means the answers match exactly.

 Here is the question:
 <question>
@@ -88,7 +84,7 @@ def get_accuracy_evaluator(self) -> Assistant:
 {self.ideal_answer}
 </ideal_answer>

-Compare the Assistant's answer to the ideal answer. Consider the following aspects:
+Compare the Agent's answer to the ideal answer. Consider the following aspects:
 - Accuracy of information
 - Completeness of the answer
 - Relevance to the question
@@ -98,44 +94,43 @@ def get_accuracy_evaluator(self) -> Assistant:

 Provide your reasoning for the comparison, highlighting similarities and differences between the two answers.
 Make sure to follow the guidelines and be as objective as possible in your evaluation. Mention the guidelines you followed in your reasoning.
-Be specific about what the Assistant's answer includes or lacks compared to the ideal answer.
+Be specific about what the Agent's answer includes or lacks compared to the ideal answer.

 Based on your comparison, assign a score from 1 to 10, where:
-1 = The answers are completely different or the Assistant's answer is entirely incorrect
-5 = The Assistant's answer captures some key points but misses others or contains some inaccuracies
-10 = The Assistant's answer matches the ideal answer exactly in content and presentation
+1 = The answers are completely different or the Agent's answer is entirely incorrect
+5 = The Agent's answer captures some key points but misses others or contains some inaccuracies
+10 = The Agent's answer matches the ideal answer exactly in content and presentation

 Only use whole numbers for the score (no decimals).
 """,
-            output_model=AccuracyResult,
+            response_model=AccuracyResult,
         )
-        return self.accuracy_evaluator

     def run(self, answer: Optional[Union[str, Callable]] = None) -> Optional[EvalResult]:
         logger.debug(f"*********** Evaluation Start: {self.eval_id} ***********")

-        answer_to_evaluate = None
+        answer_to_evaluate: Optional[RunResponse] = None
         if answer is None:
-            if self.assistant is not None:
-                logger.debug("Getting answer from assistant")
-                answer_to_evaluate: str = self.assistant.run(self.question, stream=False)  # type: ignore
+            if self.agent is not None:
+                logger.debug("Getting answer from agent")
+                answer_to_evaluate = self.agent.run(self.question)
             if self.answer is not None:
-                answer_to_evaluate = self.answer
+                answer_to_evaluate = RunResponse(content=self.answer)
         else:
             try:
                 if callable(answer):
                     logger.debug("Getting answer from callable")
-                    answer_to_evaluate = answer()
+                    answer_to_evaluate = RunResponse(content=answer())
                 else:
-                    answer_to_evaluate = answer
+                    answer_to_evaluate = RunResponse(content=answer)
             except Exception as e:
                 logger.error(f"Failed to get answer: {e}")
                 raise

         if answer_to_evaluate is None:
             raise ValueError("No Answer to evaluate.")
-        else:
-            self.answer = answer_to_evaluate
+        self.answer = answer_to_evaluate.content

         logger.debug("************************ Evaluating ************************")
         logger.debug(f"Question: {self.question}")
@@ -146,7 +141,9 @@ def run(self, answer: Optional[Union[str, Callable]] = None) -> Optional[EvalResult]:
         logger.debug("Evaluating accuracy...")
         accuracy_evaluator = self.get_accuracy_evaluator()
         try:
-            self.accuracy_result: AccuracyResult = accuracy_evaluator.run(answer_to_evaluate, stream=False)  # type: ignore
+            self.accuracy_result: AccuracyResult = accuracy_evaluator.run(
+                answer_to_evaluate.content, stream=False
+            ).content
         except Exception as e:
             logger.error(f"Failed to evaluate accuracy: {e}")
             return None
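A usage note on the reworked run(): an Agent is no longer required at all, since a plain string or a callable answer is now wrapped in a RunResponse before scoring. A minimal sketch under that reading of the diff; the field names come from references in eval.py (ideal_answer, accuracy_guidelines), while the question, answers, and guideline text are illustrative.

from phi.eval import Eval

evaluation = Eval(
    question="What is 10!?",
    ideal_answer="3628800",
    accuracy_guidelines=["State the final numeric value explicitly."],
)

# Score a precomputed answer; Eval wraps it as RunResponse(content=answer).
result = evaluation.run(answer="10! = 3628800")

# Or defer to a callable; its return value is wrapped the same way.
result = evaluation.run(answer=lambda: "3628800")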
2 changes: 1 addition & 1 deletion phi/run/response.py
@@ -28,7 +28,7 @@ class RunResponse(BaseModel):
     messages: Optional[List[Message]] = None
     metrics: Optional[Dict[str, Any]] = None
     model: Optional[str] = None
-    run_id: str
+    run_id: Optional[str] = None
     tools: Optional[List[Dict[str, Any]]] = None
     created_at: int = Field(default_factory=lambda: int(time()))
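This loosened run_id is what lets the pieces above fit together: Eval builds bare RunResponse objects for string and callable answers, and the Agent now passes its own run_id to log_agent_run rather than reading it off the response. A small illustration, assuming only what this diff shows:

from phi.run.response import RunResponse

# A response created outside an agent run (as Eval does for str/callable answers)
# no longer needs a run identifier.
resp = RunResponse(content="3628800")
print(resp.run_id)  # None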