Thinking Eval working

A-F-V · Feb 23, 2024 · d980ada · d980ada
1 parent 320c3ba
commit d980ada
Show file tree

Hide file tree

Showing 9 changed files with 226 additions and 28 deletions.
diff --git a/.gitignore b/.gitignore
@@ -159,4 +159,7 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 
-demo/
+demo/
+
+.streamlit/
+default.sqlite
diff --git a/cli.py b/cli.py
@@ -43,7 +43,7 @@ def run(repo: str):
     code_path = "edit_distance/edit_distance.py"
     query = f"In the file {code_path}, there is  function called `lowest_...`. Edit the function by using better names for the variables. Do not rename the function"
 
-    context = ProjectContext(folder_path=repo)
+    context = ProjectContext(folder_path=repo, eval_project_id="demo_eval")
     agent = RefactoringAgent()
     click.echo(state_to_str(agent.run(query, context)))
 

diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,7 @@
 
+trulens_eval
+litellm
+boto3
 # For linting
 pylint
 
@@ -12,7 +15,7 @@ pytest
 sphinx
 
 # For Type Checking
-mypy
+#mypy
 
 #########################
 
@@ -32,3 +35,4 @@ diff_match_patch
 # LSP
 jedi
 
+# Evaluation (trulens_eval is listed at the top of this file)
diff --git a/src/agent.py b/src/agent.py
@@ -1,6 +1,3 @@
-from langchain import hub
-from langchain.agents import AgentExecutor, create_openai_functions_agent
-from langchain_openai import ChatOpenAI
 from langgraph.graph import StateGraph, END
 from src.actions.code_inspection import create_code_loader
 from src.actions.code_manipulation import create_apply_change
@@ -13,6 +10,10 @@
 from .actions.basic_actions import create_logging_action
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnableConfig
+from trulens_eval import Tru
+from langchain import hub
+from langchain.agents import AgentExecutor, create_openai_functions_agent
+from langchain_openai import ChatOpenAI
 
 
 class RefactoringAgent:
@@ -52,7 +53,11 @@ def _setup_agent_graph(self):
         # self.graph.add_node('')
         self.app = self.graph.compile()
 
+        # print the graph
+        # TODO
+
     def run(self, inp: str, context: ProjectContext) -> RefactoringAgentState:
+        tru = Tru()
         state: RefactoringAgentState = {
             "goal": inp,
             "project_context": context,
@@ -63,5 +68,7 @@ def run(self, inp: str, context: ProjectContext) -> RefactoringAgentState:
             "code_blocks": [],
             "thoughts": [],
         }
-        config = RunnableConfig(recursion_limit=20)
-        return RefactoringAgentState(**self.app.invoke(state, config=config))
+        config = RunnableConfig(recursion_limit=30)
+        result = RefactoringAgentState(**self.app.invoke(state, config=config))
+        # tru.stop_dashboard()
+        return result
diff --git a/src/common/definitions.py b/src/common/definitions.py
@@ -12,6 +12,7 @@ class ProjectContext(BaseModel):
     """A project context."""
 
     folder_path: str = Field(description="The folder path of the project")
+    eval_project_id: str = Field(description="The project id")
 
 
 ###########################################

diff --git a/src/evaluation/feedback_functions.py b/src/evaluation/feedback_functions.py
@@ -0,0 +1,129 @@
+from typing import Optional
+from trulens_eval import Feedback, Select
+from trulens_eval import Tru
+from trulens_eval import TruChain
+from trulens_eval.feedback import OpenAI as fOpenAI
+import numpy as np
+
+from src.planning.state import RefactoringAgentState, record_to_str
+
+tru = Tru()
+
+# These are to be used by the LLMController where the second query is the one that is used
+
+# Score a feedback function returns when its input cannot be rated.
+# It is negative, whereas the prompts in this module ask for non-negative
+# scores, so it cannot collide with a genuine rating.
+sentinel = -1.0
+
+
+def create_sentinel_aggregator(agg):
+    """Wrap ``agg`` so that sentinel scores are left out of the aggregation.
+
+    Args:
+        agg: Callable reducing a list of scores to a single value
+            (for example ``np.mean``).
+
+    Returns:
+        A callable that takes an iterable of scores, drops every score equal
+        to ``sentinel`` and returns ``agg`` applied to the remaining list.
+    """
+
+    def aggregator(values):
+        # Compare by value, not identity: a score equal to the sentinel may
+        # arrive as a different float object (e.g. a NumPy scalar or a value
+        # that was copied), and ``is not`` would then fail to filter it.
+        kept = [v for v in values if v != sentinel]
+        return agg(kept)
+
+    return aggregator
+
+
+def create_tool_relevance_feedback(state):
+    """Build a feedback rating how relevant a tool call is to ``state``.
+
+    Args:
+        state: The current agent state; it is interpolated into the prompt
+            as text.
+
+    Returns:
+        A ``Feedback`` bound to the app output whose scores are averaged
+        with ``np.mean`` after sentinel scores have been removed.
+    """
+
+    def tool_relevance(output) -> float:
+        # Only outputs shaped like a tool call can be rated; anything else is
+        # reported as the sentinel so the aggregator skips it.
+        if (
+            not isinstance(output, dict)
+            or "tool" not in output
+            or "tool_input" not in output
+        ):
+            return sentinel
+        tool_id = output["tool"]
+        tool_input = output["tool_input"]
+        # Create the provider only once we know a request will be sent.
+        provider = fOpenAI()
+        reply = (
+            provider.endpoint.client.chat.completions.create(
+                model="gpt-3.5-turbo",  # Use better model?
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "How relevant was the selection of TOOL with TOOL_INPUT in addressing the current task in STATE? Reply with a number between 0 and 10.",
+                    },
+                    {
+                        "role": "user",
+                        "content": f"TOOL: {tool_id}; TOOL_INPUT: {tool_input}; STATE: {state}",
+                    },
+                ],
+            )
+            .choices[0]
+            .message.content
+        )
+        try:
+            return float(reply)
+        except (TypeError, ValueError):
+            # The model replied with something other than a bare number (or
+            # with no content at all): treat the call as unrated instead of
+            # failing the whole feedback run.
+            return sentinel
+
+    f_tool_relevance = (
+        Feedback(tool_relevance)
+        .on_output()
+        .aggregate(create_sentinel_aggregator(np.mean))
+    )
+    return f_tool_relevance
+
+
+def create_short_thought_feedback():
+    """Return a feedback whose score is the character count of the output."""
+
+    def short_thought(thought: str) -> float:
+        # The score is the raw length, converted to float.
+        n_chars = len(thought)
+        return float(n_chars)
+
+    length_feedback = Feedback(short_thought)
+    return length_feedback.on_output()
+
+
+def _past_thoughts_and_actions(state: RefactoringAgentState) -> list:
+    """Pair every thought except the latest with the action recorded for it.
+
+    ``zip`` stops at the shorter sequence, so a history that has fewer
+    entries than there are earlier thoughts no longer raises ``IndexError``.
+    """
+    earlier_thoughts = state["thoughts"][:-1]
+    return [
+        f"#T{i}: {thought}\n#A{i}: {record_to_str(record)}"
+        for i, (thought, record) in enumerate(
+            zip(earlier_thoughts, state["history"])
+        )
+    ]
+
+
+def _ask_llm_for_score(system_prompt: str, user_prompt: str) -> float:
+    """Send the two prompts to the chat model and parse its reply as a float.
+
+    Raises:
+        ValueError: If the reply is not a bare number.
+        TypeError: If the reply has no content.
+    """
+    provider = fOpenAI()
+    reply = (
+        provider.endpoint.client.chat.completions.create(
+            model="gpt-3.5-turbo",  # Use better model?
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_prompt},
+            ],
+        )
+        .choices[0]
+        .message.content
+    )
+    return float(reply)
+
+
+def create_evolving_thought_feedback(state: RefactoringAgentState):
+    """Build a feedback rating how much a new thought advances the goal.
+
+    Args:
+        state: Agent state providing ``thoughts``, ``history`` and ``goal``.
+            It is read when the feedback is evaluated, not when it is built.
+
+    Returns:
+        A ``Feedback`` bound to the app output; the prompt asks the model
+        for a number between 0 and 100.
+    """
+
+    def evolving_thought(thought: str) -> float:
+        past_thoughts_actions = _past_thoughts_and_actions(state)
+        return _ask_llm_for_score(
+            "Given PAST_THOUGHTS_AND_ACTIONS, how much has the NEXT_THOUGHT added to solving the ULTIMATE_GOAL? Give a number between 0 to 100 where 100 means it has added a lot. Reply only with a number, for example: '80'",
+            f"### PAST_THOUGHTS_AND_ACTIONS ###\n {past_thoughts_actions}\n\n\n ### NEXT_THOUGHT ###\n {thought}\n\n\n ### ULTIMATE_GOAL ###\n {state['goal']}",
+        )
+
+    return Feedback(evolving_thought).on_output()
+
+
+def create_repeating_work_feedback(state: RefactoringAgentState):
+    """Build a feedback rating how much a new thought repeats earlier work.
+
+    Args:
+        state: Agent state providing ``thoughts`` and ``history``. It is
+            read when the feedback is evaluated, not when it is built.
+
+    Returns:
+        A ``Feedback`` bound to the app output; the prompt asks the model
+        for a number between 0 and 100, where 100 is a complete repeat.
+    """
+
+    def repeated_work(thought: str) -> float:
+        past_thoughts_actions = _past_thoughts_and_actions(state)
+        return _ask_llm_for_score(
+            "Given PAST_THOUGHTS_AND_ACTIONS, how much is the NEXT_THOUGHT suggesting we repeat work already completed? Give a number between 0 to 100 where 100 means it is suggesting a complete repeat of work already completed. Reply only with a number, for example: '80'",
+            f"### PAST_THOUGHTS_AND_ACTIONS ###\n {past_thoughts_actions}\n\n\n ### NEXT_THOUGHT ###\n {thought}",
+        )
+
+    return Feedback(repeated_work).on_output()
diff --git a/src/execution.py b/src/execution.py
@@ -2,6 +2,7 @@
 from typing import Dict, List
 from langchain_openai import ChatOpenAI
 from langchain_core.utils.function_calling import convert_to_openai_function
+from trulens_eval import FeedbackMode, TruChain
 
 from langchain.prompts import (
     PromptTemplate,
@@ -12,6 +13,7 @@
 )
 from langchain.output_parsers.openai_tools import JsonOutputToolsParser
 from langchain_core.output_parsers import JsonOutputParser
+from langchain.agents import initialize_agent, AgentType
 from langchain import hub
 from langchain.agents import AgentExecutor, create_openai_tools_agent
 from src.actions.action import Action
@@ -28,6 +30,8 @@
     state_to_str,
 )
 from src.utilities.formatting import format_list
+from .evaluation.feedback_functions import *
+from trulens_eval.app import App
 
 
 class ActionDispatcher:
@@ -142,6 +146,7 @@ def __init__(
         current_task: str,
         verbose=True,
         additional_instructions=default_instructions,
+        eval_factory=None,
         record_history=True,
     ):
         self.actions = actions
@@ -150,6 +155,8 @@ def __init__(
         self.verbose = verbose
         self.additional_instructions = additional_instructions
         self.record_history = record_history
+        self.eval_factory = eval_factory
+
         self.create_prompt()
         # self.chain = self.prompt_template | self.llm | self.parser
 
@@ -208,28 +215,44 @@ def run(self, state: RefactoringAgentState):
             return self.run_with_tools(state)
 
     def run_without_tools(self, state):
-        output = self.llm.invoke(self.format_context_prompt(state))
+        """Query the LLM directly, with no tools, and return its reply.
+
+        When ``self.eval_factory`` is set, the call is made inside a
+        ``TruChain`` recorder whose feedbacks come from
+        ``self.eval_factory(state)`` and whose app id is the project
+        context's ``eval_project_id``; otherwise the LLM is invoked as is.
+
+        Returns:
+            The unchanged ``state`` and the ``content`` of the LLM output.
+        """
+        # NOTE(review): this replaces ``self.llm`` with a hard-coded model on
+        # every call, discarding whatever LLM was assigned before - confirm
+        # that this is intended.
+        self.llm = ChatOpenAI(model="gpt-4-1106-preview")
+        if self.eval_factory is not None:
+
+            tru_llm = TruChain(
+                self.llm,
+                app_id=state["project_context"].eval_project_id,
+                feedbacks=self.eval_factory(state),
+            )
+            # The invocation happens inside the recorder context so that it
+            # is the call being recorded.
+            with tru_llm:
+                output = self.llm.invoke(self.format_context_prompt(state))
+        else:
+            output = self.llm.invoke(self.format_context_prompt(state))
         return state, output.content
 
     def run_with_tools(self, state):
         tools = self.get_openai_tools(state)
 
         # Construct the OpenAI Tools agent
         agent = create_openai_tools_agent(self.llm, tools, self.agent_prompt)
-        agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=self.verbose)
-        # Decide what to do
-        if self.verbose:
-            # print("Action List:")
-            # print("\n".join([str(action) for action in self.actions]))
-            # print("----")
-            pass
+        agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False)
+
+        #  tru_agent = TruChain(
+        #      agent,
+        #      app_id=state["project_context"].eval_project_id,
+        #      feedbacks=[
+        #          # create_tool_relevance_feedback(state)
+        #      ],
+        #      # feedback_mode=FeedbackMode.DEFERRED,
+        #  )
+        # print(tru_agent.app.middle[1])
         output = ""
         try:
             try:
                 result = agent_executor.invoke(
                     {"input": self.format_context_prompt(state)}
                 )
                 output = result["output"]
+
             except Exception as e:
                 raise FeedbackMessage(FailureReason.ACTION_FAILED, str(e))
         except FeedbackMessage as f: