From d980ada6176d60ee88488a0714d76b1d090a8aa2 Mon Sep 17 00:00:00 2001 From: A-F-V Date: Fri, 23 Feb 2024 13:30:26 +0000 Subject: [PATCH] Thinking Eval working --- .gitignore | 5 +- cli.py | 2 +- requirements.txt | 6 +- src/agent.py | 17 ++-- src/common/definitions.py | 1 + src/evaluation/feedback_functions.py | 129 +++++++++++++++++++++++++++ src/execution.py | 39 ++++++-- src/planning/planner.py | 45 +++++++--- test.json | 10 +++ 9 files changed, 226 insertions(+), 28 deletions(-) create mode 100644 src/evaluation/feedback_functions.py create mode 100644 test.json diff --git a/.gitignore b/.gitignore index c8983f8..6afc1cf 100644 --- a/.gitignore +++ b/.gitignore @@ -159,4 +159,7 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ -demo/ \ No newline at end of file +demo/ + +.streamlit/ +default.sqlite \ No newline at end of file diff --git a/cli.py b/cli.py index 5a73d50..bedc5f0 100644 --- a/cli.py +++ b/cli.py @@ -43,7 +43,7 @@ def run(repo: str): code_path = "edit_distance/edit_distance.py" query = f"In the file {code_path}, there is function called `lowest_...`. Edit the function by using better names for the variables. 
Do not rename the function" - context = ProjectContext(folder_path=repo) + context = ProjectContext(folder_path=repo, eval_project_id="demo_eval") agent = RefactoringAgent() click.echo(state_to_str(agent.run(query, context))) diff --git a/requirements.txt b/requirements.txt index f654cf4..04bbc93 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,7 @@ +trulens_eval +litellm +boto3 # For linting pylint @@ -12,7 +15,7 @@ pytest sphinx # For Type Checking -mypy +#mypy ######################### @@ -32,3 +35,4 @@ diff_match_patch # LSP jedi +# Evaluation \ No newline at end of file diff --git a/src/agent.py b/src/agent.py index 66035de..80e3a38 100644 --- a/src/agent.py +++ b/src/agent.py @@ -1,6 +1,3 @@ -from langchain import hub -from langchain.agents import AgentExecutor, create_openai_functions_agent -from langchain_openai import ChatOpenAI from langgraph.graph import StateGraph, END from src.actions.code_inspection import create_code_loader from src.actions.code_manipulation import create_apply_change @@ -13,6 +10,10 @@ from .actions.basic_actions import create_logging_action from langchain_core.output_parsers import StrOutputParser from langchain_core.runnables import RunnableConfig +from trulens_eval import Tru +from langchain import hub +from langchain.agents import AgentExecutor, create_openai_functions_agent +from langchain_openai import ChatOpenAI class RefactoringAgent: @@ -52,7 +53,11 @@ def _setup_agent_graph(self): # self.graph.add_node('') self.app = self.graph.compile() + # print the graph + # TODO + def run(self, inp: str, context: ProjectContext) -> RefactoringAgentState: + tru = Tru() state: RefactoringAgentState = { "goal": inp, "project_context": context, @@ -63,5 +68,7 @@ def run(self, inp: str, context: ProjectContext) -> RefactoringAgentState: "code_blocks": [], "thoughts": [], } - config = RunnableConfig(recursion_limit=20) - return RefactoringAgentState(**self.app.invoke(state, config=config)) + config = 
# Score a feedback function returns when it cannot evaluate its input.
sentinel = -1.0


def create_sentinel_aggregator(agg):
    """Wrap ``agg`` so sentinel scores are removed before it runs.

    Args:
        agg: Callable taking a list of scores and returning their aggregate
            (for example ``numpy.mean`` or ``sum``).

    Returns:
        A callable with the same calling convention as ``agg``. Every value
        equal to ``sentinel`` is dropped and the remainder is passed to ``agg``.
        If nothing remains, ``agg`` receives an empty list and its own
        empty-input behaviour applies.
    """

    def aggregator(values):
        # Compare by value, not identity: ``is not`` only recognises the very
        # same float object, so an equal -1.0 that was copied, recomputed or
        # deserialised would slip through and skew the aggregate.
        kept = [value for value in values if value != sentinel]
        return agg(kept)

    return aggregator
def _format_past_steps(state):
    """Render earlier thoughts of ``state`` with the action recorded for each.

    The last entry of ``state["thoughts"]`` is left out.
    NOTE(review): presumably because it is the thought being scored -- confirm
    against the point at which the feedback is evaluated.

    A thought with no history record at the same index is still listed rather
    than raising ``IndexError``.

    Returns:
        One string with a ``#T<i>`` / ``#A<i>`` pair per step, newline-separated.
    """
    earlier_thoughts = state["thoughts"][:-1]
    history = state["history"]
    steps = []
    for index, past_thought in enumerate(earlier_thoughts):
        if index < len(history):
            action = record_to_str(history[index])
        else:
            action = "<no recorded action>"
        steps.append(f"#T{index}: {past_thought}\n#A{index}: {action}")
    # Join into text: interpolating the list itself would put its Python repr
    # (brackets, quotes, escaped newlines) into the prompt.
    return "\n".join(steps)


def _request_score(system_prompt, user_prompt):
    """Send one system/user prompt pair to the model and parse its numeric reply.

    Returns:
        The score as a float.

    Raises:
        ValueError: If the reply contains no number.
    """
    import re

    provider = fOpenAI()
    completion = provider.endpoint.client.chat.completions.create(
        model="gpt-3.5-turbo",  # Use better model?
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    reply = completion.choices[0].message.content
    try:
        return float(reply)
    except (TypeError, ValueError):
        # Not a bare number (e.g. "'80'" or "Score: 80"); fall back to the
        # first number in the text.
        found = re.search(r"[-+]?\d+(?:\.\d+)?", reply) if isinstance(reply, str) else None
    if found is None:
        raise ValueError(f"model reply contains no numeric score: {reply!r}")
    return float(found.group())


def create_short_thought_feedback():
    """Build a feedback that scores an output by its length in characters."""

    def short_thought(thought: str) -> float:
        return float(len(thought))

    return Feedback(short_thought).on_output()


def create_evolving_thought_feedback(state: RefactoringAgentState):
    """Build a feedback asking the model how much a new thought advances the goal.

    Args:
        state: Agent state; its ``thoughts``, ``history`` and ``goal`` entries
            are read each time the feedback is evaluated.

    Returns:
        A ``Feedback`` applied to the output, scored by the model's reply
        (the prompt asks for a number from 0 to 100).
    """

    def evolving_thought(thought: str) -> float:
        past_thoughts_actions = _format_past_steps(state)
        return _request_score(
            "Given PAST_THOUGHTS_AND_ACTIONS, how much has the NEXT_THOUGHT added to solving the ULTIMATE_GOAL? Give a number between 0 to 100 where 100 means it has added a lot. Reply only with a number, for example: '80'",
            f"### PAST_THOUGHTS_AND_ACTIONS ###\n {past_thoughts_actions}\n\n\n ### NEXT_THOUGHT ###\n {thought}\n\n\n ### ULTIMATE_GOAL ###\n {state['goal']}",
        )

    return Feedback(evolving_thought).on_output()


def create_repeating_work_feedback(state: RefactoringAgentState):
    """Build a feedback asking the model how much a new thought repeats past work.

    Args:
        state: Agent state; its ``thoughts`` and ``history`` entries are read
            each time the feedback is evaluated.

    Returns:
        A ``Feedback`` applied to the output, scored by the model's reply
        (the prompt asks for a number from 0 to 100).
    """

    def repeated_work(thought: str) -> float:
        past_thoughts_actions = _format_past_steps(state)
        return _request_score(
            "Given PAST_THOUGHTS_AND_ACTIONS, how much is the NEXT_THOUGHT suggesting we repeat work already completed? Give a number between 0 to 100 where 100 means it is suggesting a complete repeat of work already completed. Reply only with a number, for example: '80'",
            f"### PAST_THOUGHTS_AND_ACTIONS ###\n {past_thoughts_actions}\n\n\n ### NEXT_THOUGHT ###\n {thought}",
        )

    return Feedback(repeated_work).on_output()
def run_without_tools(self, state):
    """Invoke the chat model directly on the formatted context prompt.

    When ``self.eval_factory`` is set, the call runs inside a ``TruChain``
    built with the feedbacks the factory returns for ``state``; otherwise it
    runs unrecorded.

    Args:
        state: Current agent state; ``state["project_context"].eval_project_id``
            is used as the ``app_id`` when recording.

    Returns:
        A ``(state, content)`` tuple where ``content`` is the ``content``
        attribute of the model's reply.
    """
    from contextlib import nullcontext

    # NOTE(review): the model is re-created with a hard-coded name on every
    # call, replacing whatever self.llm held before -- confirm this is intended.
    self.llm = ChatOpenAI(model="gpt-4-1106-preview")

    if self.eval_factory is None:
        # No evaluation requested: a no-op context keeps a single invoke path.
        recorder = nullcontext()
    else:
        recorder = TruChain(
            self.llm,
            app_id=state["project_context"].eval_project_id,
            feedbacks=self.eval_factory(state),
        )

    with recorder:
        output = self.llm.invoke(self.format_context_prompt(state))
    return state, output.content
create_short_thought_feedback, +) from src.execution import ActionDispatcher, LLMController from src.planning.plan_actions import ( create_action_adder_for_plan, @@ -38,14 +43,14 @@ def __call__(self, state: RefactoringAgentState): return self.controller(state) +class NewThought(BaseModel): + thought: str = Field(description="The thought to add to the thoughts list") + + class Thinker: def __init__(self): def create_thought(): - class NewThought(BaseModel): - thought: str = Field( - description="The thought to add to the thoughts list" - ) def thought(state: RefactoringAgentState, args: NewThought): state["thoughts"].append(args.thought) @@ -60,17 +65,29 @@ def thought(state: RefactoringAgentState, args: NewThought): return action task = """Reflect on the current state and write a brief thought to help your future self.""" - additional_instructions = """Use this as a way to plan your next steps, reflect on what went well and how you can improve. Be incredibly brief (1-2 sentences). - Call the add_thought function to add a thought to the thoughts list. Say 'Done' after you have added your thought.""" + additional_instructions = """Use this as a way to plan your next steps, reflect on what went well and how you can improve. Be incredibly brief (1-2 sentences). This message will be saved in the thoughts section. 
Do not prefix your answer.""" + + def eval_think_factory(state: RefactoringAgentState): + return [ + create_evolving_thought_feedback(state), + create_short_thought_feedback(), + create_repeating_work_feedback(state), + ] + + self.add_thought = create_thought() self.controller = LLMController( - [create_thought()], + [], task, additional_instructions=additional_instructions, record_history=False, + eval_factory=eval_think_factory, ) def __call__(self, state: RefactoringAgentState): - return self.controller(state) + state, result = self.controller.run(state) + args = NewThought(thought=str(result)) + self.add_thought.execute(state, args) + return state class NextStep(Enum): @@ -92,11 +109,13 @@ class ShouldContinue: next_node: str def __init__(self) -> None: - should_continue_action = self._create_should_continue_action() + self.should_continue_action = self._create_should_continue_action() task = """Decide whether to think & execute again or finish. """ - additional_instructions = """Call the `should_continue` function with a true boolean to continue thinking & executing, and false to finish. Say 'Done' after you have called `should_continue`. 
def __call__(self, state: "RefactoringAgentState") -> str:
    """Ask the controller whether to loop again and return the next node name.

    Args:
        state: Current agent state, passed to ``self.controller.run``.

    Returns:
        ``"think"`` when the controller's reply says ``true``, otherwise
        ``"finish"``. A reply containing neither ``true`` nor ``false`` also
        yields ``"finish"``, so an unparseable answer cannot keep the loop
        running.
    """
    import re

    state, decision = self.controller.run(state)
    # The reply is text, and bool() of any non-empty string is True (even
    # "false"), so the first true/false word in it is what decides.
    verdict = re.search(r"\b(true|false)\b", str(decision).lower())
    if verdict is not None and verdict.group(1) == "true":
        return "think"
    return "finish"
async_client=, model_name='gpt-4-1106-preview', openai_api_key=SecretStr('**********'), openai_proxy=''), kwargs={'tools': [{'type': 'function', 'function': {'name': 'add_thought', 'description': 'Add a thought to the thoughts list.', 'parameters': {'type': 'object', 'properties': {'thought': {'description': 'The thought to add to the thoughts list', 'type': 'string'}}, 'required': ['thought']}}}]})] + , + 'last': OpenAIToolsAgentOutputParser()} \ No newline at end of file