Skip to content

Commit

Permalink
Agent that thinks more thoroughly about question and considers possib…
Browse files Browse the repository at this point in the history
…le outcomes (#47)

* initial notebook

* Added agent that asks subsequential questions

* Agent working locally

* Updated poetry.lock

* Added benchmark to agent_subquestions

* Finishing touches

* Fixing model gpt-4-turbo-preview

* Adding benchmark.py EVO-style

* Updating dependencies

* Benchmark generated for subsequential agent (5 markets)

* (WIP) Implemented PR comments, not yet final

* Executed benchmark for 50 markets

* Implemented final PR comments

* Removed unnecessary notebooks

* Updated lock file with new crewai version

* Fixed mypy

* Fixed isort
  • Loading branch information
gabrielfior authored Apr 11, 2024
1 parent de5093e commit efc1ef7
Show file tree
Hide file tree
Showing 11 changed files with 1,139 additions and 53 deletions.
674 changes: 626 additions & 48 deletions poetry.lock

Large diffs are not rendered by default.

118 changes: 118 additions & 0 deletions prediction_market_agent/agents/crewai_subsequential_agent/benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import typing as t
from datetime import datetime, timedelta

import typer
from loguru import logger
from prediction_market_agent_tooling.benchmark.agents import (
AbstractBenchmarkedAgent,
FixedAgent,
RandomAgent,
)
from prediction_market_agent_tooling.benchmark.benchmark import Benchmarker
from prediction_market_agent_tooling.benchmark.utils import (
OutcomePrediction,
Prediction,
)
from prediction_market_agent_tooling.gtypes import Probability
from prediction_market_agent_tooling.markets.agent_market import (
AgentMarket,
FilterBy,
SortBy,
)
from prediction_market_agent_tooling.markets.markets import (
MarketType,
get_binary_markets,
)
from prediction_market_agent_tooling.tools.utils import utcnow

from prediction_market_agent.agents.crewai_subsequential_agent.crewai_agent_subquestions import (
CrewAIAgentSubquestions,
)


def build_binary_agent_market_from_question(question: str) -> AgentMarket:
return AgentMarket(
id=question,
url=question,
close_time=utcnow() + timedelta(days=1),
volume=None,
question=question,
p_yes=Probability(0.5),
created_time=datetime(2024, 1, 1),
resolution=None,
outcomes=["YES", "NO"],
)


class CrewAIAgentSubquestionsBenchmark(AbstractBenchmarkedAgent):
def __init__(
self,
max_workers: int,
agent_name: str,
) -> None:
self.agent = CrewAIAgentSubquestions()
super().__init__(agent_name=agent_name, max_workers=max_workers)

def predict(self, market_question: str) -> Prediction:
result = self.agent.answer_binary_market(market_question)
return Prediction(
outcome_prediction=OutcomePrediction(
p_yes=result.p_yes, confidence=result.confidence, info_utility=None
)
)


def main(
n: int = 50,
output: str = "./benchmark_report_50markets.md",
reference: MarketType = MarketType.MANIFOLD,
filter: FilterBy = FilterBy.OPEN,
sort: SortBy = SortBy.NONE,
max_workers: int = 1,
cache_path: t.Optional[str] = "predictions_cache.json",
only_cached: bool = False,
) -> None:
"""
Polymarket usually contains higher quality questions,
but on Manifold, additionally to filtering by MarketFilter.resolved, you can sort by MarketSort.newest.
"""
markets = get_binary_markets(n, reference, filter_by=filter, sort_by=sort)
markets_deduplicated = list(({m.question: m for m in markets}.values()))
if len(markets) != len(markets_deduplicated):
logger.debug(
f"Warning: Deduplicated markets from {len(markets)} to {len(markets_deduplicated)}."
)

logger.debug(f"Found {len(markets_deduplicated)} markets.")

benchmarker = Benchmarker(
markets=markets_deduplicated,
agents=[
CrewAIAgentSubquestionsBenchmark(
agent_name="subsequential-questions-crewai",
max_workers=max_workers,
),
RandomAgent(agent_name="random", max_workers=max_workers),
FixedAgent(
fixed_answer=False, agent_name="fixed-no", max_workers=max_workers
),
FixedAgent(
fixed_answer=True, agent_name="fixed-yes", max_workers=max_workers
),
],
cache_path=cache_path,
only_cached=only_cached,
)

benchmarker.run_agents(
enable_timing=False
) # Caching of search etc. can distort timings
md = benchmarker.generate_markdown_report()

with open(output, "w") as f:
logger.info(f"Writing benchmark report to: {output}")
f.write(md)


if __name__ == "__main__":
typer.run(main)
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
import typing as t

from crewai import Agent, Crew, Process, Task
from langchain_core.language_models import BaseChatModel
from langchain_openai import ChatOpenAI
from loguru import logger
from pydantic import BaseModel

from prediction_market_agent.agents.crewai_subsequential_agent.prompts import (
CREATE_OUTCOMES_FROM_SCENARIO_OUTPUT,
CREATE_OUTCOMES_FROM_SCENARIO_PROMPT,
FINAL_DECISION_PROMPT,
PROBABILITY_CLASS_OUTPUT,
PROBABILITY_FOR_ONE_OUTCOME_PROMPT,
RESEARCH_OUTCOME_OUTPUT,
RESEARCH_OUTCOME_PROMPT,
)
from prediction_market_agent.tools.crewai_tools import TavilyDevTool
from prediction_market_agent.utils import APIKeys

tavily_search = TavilyDevTool()


class Outcomes(BaseModel):
outcomes: list[str]


class ProbabilityOutput(BaseModel):
decision: str
p_yes: float
p_no: float
confidence: float


class CrewAIAgentSubquestions:
def __init__(self) -> None:
llm = self._build_llm()
self.researcher = Agent(
role="Research Analyst",
goal="Research and report on some future event, giving high quality and nuanced analysis",
backstory="You are a senior research analyst who is adept at researching and reporting on future events.",
verbose=True,
allow_delegation=False,
tools=[tavily_search],
llm=llm,
)

self.predictor = Agent(
role="Professional Gambler",
goal="Predict, based on some research you are presented with, whether or not a given event will occur",
backstory="You are a professional gambler who is adept at predicting and betting on the outcomes of future events.",
verbose=True,
allow_delegation=False,
llm=llm,
)

def _build_llm(self) -> BaseChatModel:
keys = APIKeys()
llm = ChatOpenAI(
model="gpt-3.5-turbo-0125",
api_key=keys.openai_api_key.get_secret_value(),
)
return llm

def split_research_into_outcomes(self, question: str) -> Outcomes:
create_outcomes_task = Task(
description=CREATE_OUTCOMES_FROM_SCENARIO_PROMPT,
expected_output=CREATE_OUTCOMES_FROM_SCENARIO_OUTPUT,
output_json=Outcomes,
agent=self.researcher,
)

report_crew = Crew(
agents=[self.researcher],
tasks=[create_outcomes_task],
)
result = report_crew.kickoff(inputs={"scenario": question})
return Outcomes.model_validate_json(result)

def build_tasks_for_outcome(self, input_dict: dict[str, t.Any] = {}) -> list[Task]:
task_research_one_outcome = Task(
description=RESEARCH_OUTCOME_PROMPT.format(**input_dict),
agent=self.researcher,
expected_output=RESEARCH_OUTCOME_OUTPUT,
async_execution=True,
)
task_create_probability_for_one_outcome = Task(
description=PROBABILITY_FOR_ONE_OUTCOME_PROMPT,
expected_output=PROBABILITY_CLASS_OUTPUT,
agent=self.predictor,
output_json=ProbabilityOutput,
async_execution=False,
context=[task_research_one_outcome],
)

return [task_research_one_outcome, task_create_probability_for_one_outcome]

def generate_prediction_for_one_outcome(self, sentence: str) -> ProbabilityOutput:
task_research_one_outcome = Task(
description=RESEARCH_OUTCOME_PROMPT,
agent=self.researcher,
expected_output=RESEARCH_OUTCOME_OUTPUT,
)
task_create_probability_for_one_outcome = Task(
description=PROBABILITY_FOR_ONE_OUTCOME_PROMPT,
expected_output=PROBABILITY_CLASS_OUTPUT,
agent=self.predictor,
output_json=ProbabilityOutput,
context=[task_research_one_outcome],
)
crew = Crew(
agents=[self.researcher, self.predictor],
tasks=[task_research_one_outcome, task_create_probability_for_one_outcome],
verbose=2,
process=Process.sequential,
)

result = crew.kickoff(inputs={"sentence": sentence})
return ProbabilityOutput.model_validate_json(result)

def generate_final_decision(
self, outcomes_with_probabilities: list[t.Tuple[str, ProbabilityOutput]]
) -> ProbabilityOutput:
task_final_decision = Task(
description=FINAL_DECISION_PROMPT,
agent=self.predictor,
expected_output=PROBABILITY_CLASS_OUTPUT,
output_json=ProbabilityOutput,
)

crew = Crew(
agents=[self.predictor],
tasks=[task_final_decision],
verbose=2,
)

crew.kickoff(
inputs={
"outcomes_with_probabilities": [
(i[0], i[1].dict()) for i in outcomes_with_probabilities
],
"number_of_outcomes": len(outcomes_with_probabilities),
"outcome_to_assess": outcomes_with_probabilities[0][0],
}
)
return ProbabilityOutput.model_validate_json(
task_final_decision.output.raw_output
)

def answer_binary_market(self, question: str) -> ProbabilityOutput:
outcomes = self.split_research_into_outcomes(question)
logger.debug("outcomes ", outcomes)

outcomes_with_probs = []
task_map = {}
for outcome in outcomes.outcomes:
tasks_for_outcome = self.build_tasks_for_outcome(
input_dict={"sentence": outcome}
)
task_map[outcome] = tasks_for_outcome

# flatten nested list
all_tasks = sum(task_map.values(), [])
crew = Crew(
agents=[self.researcher, self.predictor],
tasks=all_tasks,
verbose=2,
process=Process.sequential,
)

# crew.kickoff doesn't finish all async tasks when done.
crew.kickoff()

# We parse individual task results to build outcomes_with_probs
for outcome, tasks in task_map.items():
try:
prediction_result = ProbabilityOutput.model_validate_json(
tasks[1].output.raw_output
)
except Exception as e:
logger.error("Could not parse result as ProbabilityOutput ", e)
prediction_result = ProbabilityOutput(
p_yes=0.5, p_no=0.5, confidence=0, decision=""
)

outcomes_with_probs.append((outcome, prediction_result))

final_answer = self.generate_final_decision(outcomes_with_probs)
return final_answer
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import random
import typing as t

from prediction_market_agent_tooling.deploy.agent import DeployableAgent
from prediction_market_agent_tooling.markets.agent_market import AgentMarket
from prediction_market_agent_tooling.markets.markets import MarketType
from prediction_market_agent_tooling.tools.utils import should_not_happen

from prediction_market_agent.agents.crewai_subsequential_agent.crewai_agent_subquestions import (
CrewAIAgentSubquestions,
)
from prediction_market_agent.agents.utils import market_is_saturated


class DeployableThinkThoroughlyAgent(DeployableAgent):
# For cheaper credits at this experimental stage
def __init__(self) -> None:
super().__init__()

def pick_markets(self, markets: t.Sequence[AgentMarket]) -> t.Sequence[AgentMarket]:
# We simply pick 5 random markets to bet on
picked_markets: list[AgentMarket] = []
markets = list(markets)
random.shuffle(markets)
for market in markets:
# Assume very high probability markets are already known, and have
# been correctly bet on, and therefore the value of betting on them
# is low.
if not market_is_saturated(market=market):
picked_markets.append(market)
if len(picked_markets) == 5:
break

return picked_markets

def answer_binary_market(self, market: AgentMarket) -> bool:
# The answer has already been determined in `pick_markets` so we just
# return it here.
result = CrewAIAgentSubquestions().answer_binary_market(market.question)
return (
True
if result.decision == "y"
else False
if result.decision == "n"
else should_not_happen()
)


if __name__ == "__main__":
agent = DeployableThinkThoroughlyAgent()
agent.deploy_local(
market_type=MarketType.OMEN, sleep_time=540, timeout=180, place_bet=False
)
Loading

0 comments on commit efc1ef7

Please sign in to comment.