From d980ada6176d60ee88488a0714d76b1d090a8aa2 Mon Sep 17 00:00:00 2001
From: A-F-V <faracea2000@gmail.com>
Date: Fri, 23 Feb 2024 13:30:26 +0000
Subject: [PATCH] Thinking Eval working

---
 .gitignore                           |   5 +-
 cli.py                               |   2 +-
 requirements.txt                     |   6 +-
 src/agent.py                         |  17 ++--
 src/common/definitions.py            |   1 +
 src/evaluation/feedback_functions.py | 129 +++++++++++++++++++++++++++
 src/execution.py                     |  39 ++++++--
 src/planning/planner.py              |  45 +++++++---
 test.json                            |  10 +++
 9 files changed, 226 insertions(+), 28 deletions(-)
 create mode 100644 src/evaluation/feedback_functions.py
 create mode 100644 test.json

diff --git a/.gitignore b/.gitignore
index c8983f8..6afc1cf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -159,4 +159,7 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 
-demo/
\ No newline at end of file
+demo/
+
+.streamlit/
+default.sqlite
\ No newline at end of file
diff --git a/cli.py b/cli.py
index 5a73d50..bedc5f0 100644
--- a/cli.py
+++ b/cli.py
@@ -43,7 +43,7 @@ def run(repo: str):
     code_path = "edit_distance/edit_distance.py"
     query = f"In the file {code_path}, there is  function called `lowest_...`. Edit the function by using better names for the variables. Do not rename the function"
 
-    context = ProjectContext(folder_path=repo)
+    context = ProjectContext(folder_path=repo, eval_project_id="demo_eval")
     agent = RefactoringAgent()
     click.echo(state_to_str(agent.run(query, context)))
 
diff --git a/requirements.txt b/requirements.txt
index f654cf4..04bbc93 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,7 @@
 
+trulens_eval
+litellm
+boto3
 # For linting
 pylint
 
@@ -12,7 +15,7 @@ pytest
 sphinx
 
 # For Type Checking
-mypy
+#mypy
 
 #########################
 
@@ -32,3 +35,4 @@ diff_match_patch
 # LSP
 jedi
 
+# Evaluation
\ No newline at end of file
diff --git a/src/agent.py b/src/agent.py
index 66035de..80e3a38 100644
--- a/src/agent.py
+++ b/src/agent.py
@@ -1,6 +1,3 @@
-from langchain import hub
-from langchain.agents import AgentExecutor, create_openai_functions_agent
-from langchain_openai import ChatOpenAI
 from langgraph.graph import StateGraph, END
 from src.actions.code_inspection import create_code_loader
 from src.actions.code_manipulation import create_apply_change
@@ -13,6 +10,10 @@
 from .actions.basic_actions import create_logging_action
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnableConfig
+from trulens_eval import Tru
+from langchain import hub
+from langchain.agents import AgentExecutor, create_openai_functions_agent
+from langchain_openai import ChatOpenAI
 
 
 class RefactoringAgent:
@@ -52,7 +53,11 @@ def _setup_agent_graph(self):
         # self.graph.add_node('')
         self.app = self.graph.compile()
 
+        # print the graph
+        # TODO
+
     def run(self, inp: str, context: ProjectContext) -> RefactoringAgentState:
+        tru = Tru()
         state: RefactoringAgentState = {
             "goal": inp,
             "project_context": context,
@@ -63,5 +68,7 @@ def run(self, inp: str, context: ProjectContext) -> RefactoringAgentState:
             "code_blocks": [],
             "thoughts": [],
         }
-        config = RunnableConfig(recursion_limit=20)
-        return RefactoringAgentState(**self.app.invoke(state, config=config))
+        config = RunnableConfig(recursion_limit=30)
+        result = RefactoringAgentState(**self.app.invoke(state, config=config))
+        # tru.stop_dashboard()
+        return result
diff --git a/src/common/definitions.py b/src/common/definitions.py
index 7b9e957..712b55f 100644
--- a/src/common/definitions.py
+++ b/src/common/definitions.py
@@ -12,6 +12,7 @@ class ProjectContext(BaseModel):
     """A project context."""
 
     folder_path: str = Field(description="The folder path of the project")
+    eval_project_id: str = Field(description="The project id")
 
 
 ###########################################
diff --git a/src/evaluation/feedback_functions.py b/src/evaluation/feedback_functions.py
new file mode 100644
index 0000000..e11a51c
--- /dev/null
+++ b/src/evaluation/feedback_functions.py
@@ -0,0 +1,129 @@
+from typing import Optional
+from trulens_eval import Feedback, Select
+from trulens_eval import Tru
+from trulens_eval import TruChain
+from trulens_eval.feedback import OpenAI as fOpenAI
+import numpy as np
+
+from src.planning.state import RefactoringAgentState, record_to_str
+
+tru = Tru()
+
+# These are to be used by the LLMController where the second query is the one that is used
+
+sentinel = -1.0
+
+
+def create_sentinel_aggregator(agg):  # wraps `agg` so sentinel scores are ignored
+    def aggregator(values):
+        # Drop sentinel ("no score") entries; compare by value, not float identity
+        values = [v for v in values if v != sentinel]
+        return agg(values)
+
+    return aggregator
+
+
+def create_tool_relevance_feedback(state):  # builds a Feedback scoring a tool call against `state`
+    def tool_relevance(output) -> float:
+        provider = fOpenAI()
+        # return sentinel unless output is a dict carrying both "tool" and "tool_input"
+        if (
+            not isinstance(output, dict)
+            or "tool" not in output
+            or "tool_input" not in output
+        ):
+            return sentinel
+        tool_id = output["tool"]
+        tool_input = output["tool_input"]
+        res = float(  # float() raises ValueError if the reply is not a bare number
+            provider.endpoint.client.chat.completions.create(
+                model="gpt-3.5-turbo",  # TODO: consider a stronger grading model
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "How relevant was the selection of TOOL with TOOL_INPUT in addressing the current task in STATE? Reply with a number between 0 and 10.",
+                    },
+                    {
+                        "role": "user",
+                        "content": f"TOOL: {tool_id}; TOOL_INPUT: {tool_input}; STATE: {state}",
+                    },
+                ],
+            )
+            .choices[0]
+            .message.content
+        )
+        return res
+
+    f_tool_relevance = (
+        Feedback(tool_relevance)
+        .on_output()
+        .aggregate(create_sentinel_aggregator(np.mean))  # mean over the non-sentinel scores
+    )
+    return f_tool_relevance
+
+
+def create_short_thought_feedback():  # builds a Feedback that scores the output by its length
+    def short_thought(thought: str) -> float:
+        return float(len(thought))  # raw character count, not normalised
+
+    return Feedback(short_thought).on_output()
+
+
+def create_evolving_thought_feedback(state: RefactoringAgentState):  # Feedback: how much a new thought adds towards state["goal"]
+    def evolving_thought(thought: str):
+        provider = fOpenAI()
+        past_thoughts_actions = []
+        for i in range(len(state["thoughts"]) - 1):  # every thought except the last one
+            past_thoughts_actions.append(  # NOTE(review): assumes history[i] exists for each thought i - confirm
+                f"#T{i}: {state['thoughts'][i]}\n#A{i}: {record_to_str(state['history'][i])}"
+            )
+        res = float(  # float() raises ValueError if the reply is not a bare number
+            provider.endpoint.client.chat.completions.create(
+                model="gpt-3.5-turbo",  # TODO: consider a stronger grading model
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "Given PAST_THOUGHTS_AND_ACTIONS, how much has the NEXT_THOUGHT added to solving the ULTIMATE_GOAL? Give a number between 0 to 100 where 100 means it has added a lot. Reply only with a number, for example: '80'",
+                    },
+                    {
+                        "role": "user",
+                        "content": f"### PAST_THOUGHTS_AND_ACTIONS ###\n {past_thoughts_actions}\n\n\n ### NEXT_THOUGHT ###\n {thought}\n\n\n ### ULTIMATE_GOAL ###\n {state['goal']}",
+                    },
+                ],
+            )
+            .choices[0]
+            .message.content
+        )
+        return res  # the prompt asks for a 0-100 score
+
+    return Feedback(evolving_thought).on_output()
+
+
+def create_repeating_work_feedback(state: RefactoringAgentState):  # Feedback: how much a new thought repeats earlier work
+    def repeated_work(thought: str):
+        provider = fOpenAI()
+        past_thoughts_actions = []
+        for i in range(len(state["thoughts"]) - 1):  # every thought except the last one
+            past_thoughts_actions.append(  # NOTE(review): assumes history[i] exists for each thought i - confirm
+                f"#T{i}: {state['thoughts'][i]}\n#A{i}: {record_to_str(state['history'][i])}"
+            )
+        res = float(  # float() raises ValueError if the reply is not a bare number
+            provider.endpoint.client.chat.completions.create(
+                model="gpt-3.5-turbo",  # TODO: consider a stronger grading model
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "Given PAST_THOUGHTS_AND_ACTIONS, how much is the NEXT_THOUGHT suggesting we repeat work already completed? Give a number between 0 to 100 where 100 means it is suggesting a complete repeat of work already completed. Reply only with a number, for example: '80'",
+                    },
+                    {
+                        "role": "user",
+                        "content": f"### PAST_THOUGHTS_AND_ACTIONS ###\n {past_thoughts_actions}\n\n\n ### NEXT_THOUGHT ###\n {thought}",
+                    },
+                ],
+            )
+            .choices[0]
+            .message.content
+        )
+        return res  # the prompt asks for a 0-100 score; higher means more repetition
+
+    return Feedback(repeated_work).on_output()
diff --git a/src/execution.py b/src/execution.py
index f8d39c4..8e5da98 100644
--- a/src/execution.py
+++ b/src/execution.py
@@ -2,6 +2,7 @@
 from typing import Dict, List
 from langchain_openai import ChatOpenAI
 from langchain_core.utils.function_calling import convert_to_openai_function
+from trulens_eval import FeedbackMode, TruChain
 
 from langchain.prompts import (
     PromptTemplate,
@@ -12,6 +13,7 @@
 )
 from langchain.output_parsers.openai_tools import JsonOutputToolsParser
 from langchain_core.output_parsers import JsonOutputParser
+from langchain.agents import initialize_agent, AgentType
 from langchain import hub
 from langchain.agents import AgentExecutor, create_openai_tools_agent
 from src.actions.action import Action
@@ -28,6 +30,8 @@
     state_to_str,
 )
 from src.utilities.formatting import format_list
+from .evaluation.feedback_functions import *
+from trulens_eval.app import App
 
 
 class ActionDispatcher:
@@ -142,6 +146,7 @@ def __init__(
         current_task: str,
         verbose=True,
         additional_instructions=default_instructions,
+        eval_factory=None,
         record_history=True,
     ):
         self.actions = actions
@@ -150,6 +155,8 @@ def __init__(
         self.verbose = verbose
         self.additional_instructions = additional_instructions
         self.record_history = record_history
+        self.eval_factory = eval_factory
+
         self.create_prompt()
         # self.chain = self.prompt_template | self.llm | self.parser
 
@@ -208,7 +215,18 @@ def run(self, state: RefactoringAgentState):
             return self.run_with_tools(state)
 
     def run_without_tools(self, state):
-        output = self.llm.invoke(self.format_context_prompt(state))
+        self.llm = ChatOpenAI(model="gpt-4-1106-preview")  # NOTE(review): rebinds self.llm on every call with a hard-coded model
+        if self.eval_factory is not None:  # evaluation only runs when a factory was supplied
+
+            tru_llm = TruChain(
+                self.llm,
+                app_id=state["project_context"].eval_project_id,
+                feedbacks=self.eval_factory(state),  # feedbacks are rebuilt per call from the current state
+            )
+            with tru_llm:  # presumably so TruLens records the invoke below - confirm
+                output = self.llm.invoke(self.format_context_prompt(state))
+        else:
+            output = self.llm.invoke(self.format_context_prompt(state))
         return state, output.content
 
     def run_with_tools(self, state):
@@ -216,13 +234,17 @@ def run_with_tools(self, state):
 
         # Construct the OpenAI Tools agent
         agent = create_openai_tools_agent(self.llm, tools, self.agent_prompt)
-        agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=self.verbose)
-        # Decide what to do
-        if self.verbose:
-            # print("Action List:")
-            # print("\n".join([str(action) for action in self.actions]))
-            # print("----")
-            pass
+        agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False)
+
+        #  tru_agent = TruChain(
+        #      agent,
+        #      app_id=state["project_context"].eval_project_id,
+        #      feedbacks=[
+        #          # create_tool_relevance_feedback(state)
+        #      ],
+        #      # feedback_mode=FeedbackMode.DEFERRED,
+        #  )
+        # print(tru_agent.app.middle[1])
         output = ""
         try:
             try:
@@ -230,6 +252,7 @@ def run_with_tools(self, state):
                     {"input": self.format_context_prompt(state)}
                 )
                 output = result["output"]
+
             except Exception as e:
                 raise FeedbackMessage(FailureReason.ACTION_FAILED, str(e))
         except FeedbackMessage as f:
diff --git a/src/planning/planner.py b/src/planning/planner.py
index ccd782d..f2d52b1 100644
--- a/src/planning/planner.py
+++ b/src/planning/planner.py
@@ -3,6 +3,11 @@
 from src.actions.action import Action, FeedbackMessage
 from src.actions.basic_actions import create_logging_action
 from src.common.definitions import FailureReason
+from src.evaluation.feedback_functions import (
+    create_evolving_thought_feedback,
+    create_repeating_work_feedback,
+    create_short_thought_feedback,
+)
 from src.execution import ActionDispatcher, LLMController
 from src.planning.plan_actions import (
     create_action_adder_for_plan,
@@ -38,14 +43,14 @@ def __call__(self, state: RefactoringAgentState):
         return self.controller(state)
 
 
+class NewThought(BaseModel):  # argument schema for the thought action; module-level so Thinker.__call__ can build it
+    thought: str = Field(description="The thought to add to the thoughts list")
+
+
 class Thinker:
     def __init__(self):
 
         def create_thought():
-            class NewThought(BaseModel):
-                thought: str = Field(
-                    description="The thought to add to the thoughts list"
-                )
 
             def thought(state: RefactoringAgentState, args: NewThought):
                 state["thoughts"].append(args.thought)
@@ -60,17 +65,29 @@ def thought(state: RefactoringAgentState, args: NewThought):
             return action
 
         task = """Reflect on the current state and write a brief thought to help your future self."""
-        additional_instructions = """Use this as a way to plan your next steps, reflect on what went well and how you can improve. Be incredibly brief (1-2 sentences). 
-        Call the add_thought function to add a thought to the thoughts list. Say 'Done' after you have added your thought."""
+        additional_instructions = """Use this as a way to plan your next steps, reflect on what went well and how you can improve. Be incredibly brief (1-2 sentences). This message will be saved in the thoughts section. Do not prefix your answer."""
+
+        def eval_think_factory(state: RefactoringAgentState):
+            return [
+                create_evolving_thought_feedback(state),
+                create_short_thought_feedback(),
+                create_repeating_work_feedback(state),
+            ]
+
+        self.add_thought = create_thought()
         self.controller = LLMController(
-            [create_thought()],
+            [],
             task,
             additional_instructions=additional_instructions,
             record_history=False,
+            eval_factory=eval_think_factory,
         )
 
     def __call__(self, state: RefactoringAgentState):
-        return self.controller(state)
+        state, result = self.controller.run(state)  # controller was built with no actions; result is its returned output
+        args = NewThought(thought=str(result))  # wrap the reply text as the action's argument
+        self.add_thought.execute(state, args)  # presumably appends to state["thoughts"] via the thought action - confirm
+        return state
 
 
 class NextStep(Enum):
@@ -92,11 +109,13 @@ class ShouldContinue:
     next_node: str
 
     def __init__(self) -> None:
-        should_continue_action = self._create_should_continue_action()
+        self.should_continue_action = self._create_should_continue_action()
         task = """Decide whether to think & execute again or finish. """
-        additional_instructions = """Call the `should_continue` function with a true boolean to continue thinking & executing, and false to finish. Say 'Done' after you have called `should_continue`. Call `should_continue` only once."""
+        additional_instructions = (
+            """Return 'true' to think and execute again, or 'false' to finish."""
+        )
         self.controller = LLMController(
-            [should_continue_action],
+            [],
             task,
             additional_instructions=additional_instructions,
             record_history=False,
@@ -122,7 +141,9 @@ def should_continue(state: RefactoringAgentState, args: NextStepInput):
 
     def __call__(self, state: RefactoringAgentState):
         state, decision = self.controller.run(state)
-        return self.next_node
+        # bool() of any non-empty string is True, so match the literal reply instead
+        cont = str(decision).strip().strip("'\"`").lower().startswith("true")
+        return "think" if cont else "finish"
 
 
 class LLMExecutor:
diff --git a/test.json b/test.json
new file mode 100644
index 0000000..f4b3f0a
--- /dev/null
+++ b/test.json
@@ -0,0 +1,10 @@
+{'name': None,
+ 'first': RunnableAssign(mapper=
+ {
+  agent_scratchpad: RunnableLambda(lambda x: format_to_openai_tool_messages(x['intermediate_steps']))
+}),
+ 'middle': [
+  ChatPromptTemplate(input_variables=['agent_scratchpad', 'input'], input_types={'chat_history': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]], 'agent_scratchpad': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]]}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are a helpful assistant')), MessagesPlaceholder(variable_name='chat_history', optional=True), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='{input}')), MessagesPlaceholder(variable_name='agent_scratchpad')]),
+  RunnableBinding(bound=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x000001B5C2A23950>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x000001B5C2A3C250>, model_name='gpt-4-1106-preview', openai_api_key=SecretStr('**********'), openai_proxy=''), kwargs={'tools': [{'type': 'function', 'function': {'name': 'add_thought', 'description': 'Add a thought to the thoughts list.', 'parameters': {'type': 'object', 'properties': {'thought': {'description': 'The thought to add to the thoughts list', 'type': 'string'}}, 'required': ['thought']}}}]})]
+ , 
+ 'last': OpenAIToolsAgentOutputParser()}
\ No newline at end of file