Skip to content

Commit

Permalink
Thinking Eval working
Browse files Browse the repository at this point in the history
  • Loading branch information
A-F-V committed Feb 23, 2024
1 parent 320c3ba commit d980ada
Show file tree
Hide file tree
Showing 9 changed files with 226 additions and 28 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -159,4 +159,7 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

demo/
demo/

.streamlit/
default.sqlite
2 changes: 1 addition & 1 deletion cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def run(repo: str):
code_path = "edit_distance/edit_distance.py"
query = f"In the file {code_path}, there is function called `lowest_...`. Edit the function by using better names for the variables. Do not rename the function"

context = ProjectContext(folder_path=repo)
context = ProjectContext(folder_path=repo, eval_project_id="demo_eval")
agent = RefactoringAgent()
click.echo(state_to_str(agent.run(query, context)))

Expand Down
6 changes: 5 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@

trulens_eval
litellm
boto3
# For linting
pylint

Expand All @@ -12,7 +15,7 @@ pytest
sphinx

# For Type Checking
mypy
#mypy

#########################

Expand All @@ -32,3 +35,4 @@ diff_match_patch
# LSP
jedi

# Evaluation
17 changes: 12 additions & 5 deletions src/agent.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
from langchain import hub
from langchain.agents import AgentExecutor, create_openai_functions_agent
from langchain_openai import ChatOpenAI
from langgraph.graph import StateGraph, END
from src.actions.code_inspection import create_code_loader
from src.actions.code_manipulation import create_apply_change
Expand All @@ -13,6 +10,10 @@
from .actions.basic_actions import create_logging_action
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableConfig
from trulens_eval import Tru
from langchain import hub
from langchain.agents import AgentExecutor, create_openai_functions_agent
from langchain_openai import ChatOpenAI


class RefactoringAgent:
Expand Down Expand Up @@ -52,7 +53,11 @@ def _setup_agent_graph(self):
# self.graph.add_node('')
self.app = self.graph.compile()

# print the graph
# TODO

def run(self, inp: str, context: ProjectContext) -> RefactoringAgentState:
tru = Tru()
state: RefactoringAgentState = {
"goal": inp,
"project_context": context,
Expand All @@ -63,5 +68,7 @@ def run(self, inp: str, context: ProjectContext) -> RefactoringAgentState:
"code_blocks": [],
"thoughts": [],
}
config = RunnableConfig(recursion_limit=20)
return RefactoringAgentState(**self.app.invoke(state, config=config))
config = RunnableConfig(recursion_limit=30)
result = RefactoringAgentState(**self.app.invoke(state, config=config))
# tru.stop_dashboard()
return result
1 change: 1 addition & 0 deletions src/common/definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ class ProjectContext(BaseModel):
"""A project context."""

folder_path: str = Field(description="The folder path of the project")
eval_project_id: str = Field(description="The project id")


###########################################
Expand Down
129 changes: 129 additions & 0 deletions src/evaluation/feedback_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
from typing import Optional
from trulens_eval import Feedback, Select
from trulens_eval import Tru
from trulens_eval import TruChain
from trulens_eval.feedback import OpenAI as fOpenAI
import numpy as np

from src.planning.state import RefactoringAgentState, record_to_str

tru = Tru()

# These are to be used by the LLMController where the second query is the one that is used

# Score a feedback function returns to mean "not applicable to this output".
# create_sentinel_aggregator() drops these scores before aggregating.
sentinel = -1.0


def create_sentinel_aggregator(agg):
    """Wrap ``agg`` so that sentinel scores are dropped before aggregation.

    Args:
        agg: Callable that takes a list of scores and returns their
            aggregate (this module passes ``np.mean``).

    Returns:
        A callable that takes an iterable of scores, removes every entry
        equal to ``sentinel`` and returns ``agg`` applied to the remaining
        scores. If every score is the sentinel, ``agg`` receives an empty
        list.
    """

    def aggregator(values):
        # Compare by value, not identity: a score that was copied,
        # deserialized or converted (e.g. a numpy scalar) is a different
        # object from the module-level constant but is still a sentinel.
        kept = [v for v in values if v != sentinel]
        return agg(kept)

    return aggregator


def create_tool_relevance_feedback(state):
    """Build a feedback that asks an LLM how relevant a tool call was.

    Args:
        state: The current agent state. It is interpolated into the prompt
            with its string representation at the time the feedback runs.

    Returns:
        A ``Feedback`` bound to the app output whose per-call scores are
        averaged with ``np.mean`` after sentinel scores are removed.
    """

    def tool_relevance(output) -> float:
        # Only outputs shaped like a tool invocation can be rated; anything
        # else yields the sentinel so the aggregator ignores it.
        if (
            not isinstance(output, dict)
            or "tool" not in output
            or "tool_input" not in output
        ):
            return sentinel

        tool_id = output["tool"]
        tool_input = output["tool_input"]

        # Create the provider only once we know a request will be made, so
        # skipped outputs do not pay for client construction.
        provider = fOpenAI()
        response = provider.endpoint.client.chat.completions.create(
            model="gpt-3.5-turbo",  # Use better model?
            messages=[
                {
                    "role": "system",
                    "content": "How relevant was the selection of TOOL with TOOL_INPUT in addressing the current task in STATE? Reply with a number between 0 and 10.",
                },
                {
                    "role": "user",
                    "content": f"TOOL: {tool_id}; TOOL_INPUT: {tool_input}; STATE: {state}",
                },
            ],
        )
        # float() raises ValueError if the reply is not a bare number.
        return float(response.choices[0].message.content)

    f_tool_relevance = (
        Feedback(tool_relevance)
        .on_output()
        .aggregate(create_sentinel_aggregator(np.mean))
    )
    return f_tool_relevance


def create_short_thought_feedback():
    """Build a feedback that scores an output by its length.

    Returns:
        A ``Feedback`` wrapping ``short_thought``, bound to the app output
        via ``on_output()``.
    """

    def short_thought(thought: str) -> float:
        # The score is simply len(thought), converted to float.
        length = len(thought)
        return float(length)

    feedback = Feedback(short_thought)
    return feedback.on_output()


def create_evolving_thought_feedback(state: RefactoringAgentState):
    """Build a feedback rating how much a new thought advances the goal.

    Args:
        state: Agent state; its ``thoughts``, ``history`` and ``goal``
            entries are read each time the feedback runs.

    Returns:
        A ``Feedback`` wrapping ``evolving_thought``, bound to the app
        output via ``on_output()``.
    """

    def evolving_thought(thought: str) -> float:
        # Pair every thought except the last one with the history record at
        # the same index. history is indexed directly, so a history shorter
        # than that raises IndexError rather than being silently truncated.
        past_thoughts_actions = [
            f"#T{i}: {past_thought}\n#A{i}: {record_to_str(state['history'][i])}"
            for i, past_thought in enumerate(state["thoughts"][:-1])
        ]
        # Join into plain text: interpolating the list itself would put its
        # Python repr (brackets, quotes, escaped newlines) into the prompt.
        past_steps = "\n".join(past_thoughts_actions)

        provider = fOpenAI()
        response = provider.endpoint.client.chat.completions.create(
            model="gpt-3.5-turbo",  # Use better model?
            messages=[
                {
                    "role": "system",
                    "content": "Given PAST_THOUGHTS_AND_ACTIONS, how much has the NEXT_THOUGHT added to solving the ULTIMATE_GOAL? Give a number between 0 to 100 where 100 means it has added a lot. Reply only with a number, for example: '80'",
                },
                {
                    "role": "user",
                    "content": f"### PAST_THOUGHTS_AND_ACTIONS ###\n {past_steps}\n\n\n ### NEXT_THOUGHT ###\n {thought}\n\n\n ### ULTIMATE_GOAL ###\n {state['goal']}",
                },
            ],
        )
        # float() raises ValueError if the reply is not a bare number.
        return float(response.choices[0].message.content)

    return Feedback(evolving_thought).on_output()


def create_repeating_work_feedback(state: RefactoringAgentState):
    """Build a feedback rating how much a new thought repeats earlier work.

    Args:
        state: Agent state; its ``thoughts`` and ``history`` entries are
            read each time the feedback runs.

    Returns:
        A ``Feedback`` wrapping ``repeated_work``, bound to the app output
        via ``on_output()``.
    """

    def repeated_work(thought: str) -> float:
        # Pair every thought except the last one with the history record at
        # the same index. history is indexed directly, so a history shorter
        # than that raises IndexError rather than being silently truncated.
        past_thoughts_actions = [
            f"#T{i}: {past_thought}\n#A{i}: {record_to_str(state['history'][i])}"
            for i, past_thought in enumerate(state["thoughts"][:-1])
        ]
        # Join into plain text: interpolating the list itself would put its
        # Python repr (brackets, quotes, escaped newlines) into the prompt.
        past_steps = "\n".join(past_thoughts_actions)

        provider = fOpenAI()
        response = provider.endpoint.client.chat.completions.create(
            model="gpt-3.5-turbo",  # Use better model?
            messages=[
                {
                    "role": "system",
                    "content": "Given PAST_THOUGHTS_AND_ACTIONS, how much is the NEXT_THOUGHT suggesting we repeat work already completed? Give a number between 0 to 100 where 100 means it is suggesting a complete repeat of work already completed. Reply only with a number, for example: '80'",
                },
                {
                    "role": "user",
                    "content": f"### PAST_THOUGHTS_AND_ACTIONS ###\n {past_steps}\n\n\n ### NEXT_THOUGHT ###\n {thought}",
                },
            ],
        )
        # float() raises ValueError if the reply is not a bare number.
        return float(response.choices[0].message.content)

    return Feedback(repeated_work).on_output()
39 changes: 31 additions & 8 deletions src/execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Dict, List
from langchain_openai import ChatOpenAI
from langchain_core.utils.function_calling import convert_to_openai_function
from trulens_eval import FeedbackMode, TruChain

from langchain.prompts import (
PromptTemplate,
Expand All @@ -12,6 +13,7 @@
)
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from langchain_core.output_parsers import JsonOutputParser
from langchain.agents import initialize_agent, AgentType
from langchain import hub
from langchain.agents import AgentExecutor, create_openai_tools_agent
from src.actions.action import Action
Expand All @@ -28,6 +30,8 @@
state_to_str,
)
from src.utilities.formatting import format_list
from .evaluation.feedback_functions import *
from trulens_eval.app import App


class ActionDispatcher:
Expand Down Expand Up @@ -142,6 +146,7 @@ def __init__(
current_task: str,
verbose=True,
additional_instructions=default_instructions,
eval_factory=None,
record_history=True,
):
self.actions = actions
Expand All @@ -150,6 +155,8 @@ def __init__(
self.verbose = verbose
self.additional_instructions = additional_instructions
self.record_history = record_history
self.eval_factory = eval_factory

self.create_prompt()
# self.chain = self.prompt_template | self.llm | self.parser

Expand Down Expand Up @@ -208,28 +215,44 @@ def run(self, state: RefactoringAgentState):
return self.run_with_tools(state)

def run_without_tools(self, state):
output = self.llm.invoke(self.format_context_prompt(state))
self.llm = ChatOpenAI(model="gpt-4-1106-preview")
if self.eval_factory is not None:

tru_llm = TruChain(
self.llm,
app_id=state["project_context"].eval_project_id,
feedbacks=self.eval_factory(state),
)
with tru_llm:
output = self.llm.invoke(self.format_context_prompt(state))
else:
output = self.llm.invoke(self.format_context_prompt(state))
return state, output.content

def run_with_tools(self, state):
tools = self.get_openai_tools(state)

# Construct the OpenAI Tools agent
agent = create_openai_tools_agent(self.llm, tools, self.agent_prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=self.verbose)
# Decide what to do
if self.verbose:
# print("Action List:")
# print("\n".join([str(action) for action in self.actions]))
# print("----")
pass
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False)

# tru_agent = TruChain(
# agent,
# app_id=state["project_context"].eval_project_id,
# feedbacks=[
# # create_tool_relevance_feedback(state)
# ],
# # feedback_mode=FeedbackMode.DEFERRED,
# )
# print(tru_agent.app.middle[1])
output = ""
try:
try:
result = agent_executor.invoke(
{"input": self.format_context_prompt(state)}
)
output = result["output"]

except Exception as e:
raise FeedbackMessage(FailureReason.ACTION_FAILED, str(e))
except FeedbackMessage as f:
Expand Down
Loading

0 comments on commit d980ada

Please sign in to comment.