Agent that thinks more thoroughly about question and considers possible outcomes (#47)

* initial notebook
* Added agent that asks subsequential questions
* Agent working locally
* Updated poetry.lock
* Added benchmark to agent_subquestions
* Finishing touches
* Fixing model gpt-4-turbo-preview
* Adding benchmark.py EVO-style
* Updating dependencies
* Benchmark generated for subsequential agent (5 markets)
* (WIP) Implemented PR comments, not yet final
* Executed benchmark for 50 markets
* Implemented final PR comments
* Removed unnecessary notebooks
* Updated lock file with new crewai version
* Fixed mypy
* Fixed isort
1 parent de5093e · commit efc1ef7 · 11 changed files with 1,139 additions and 53 deletions.
prediction_market_agent/agents/crewai_subsequential_agent/benchmark.py (118 additions & 0 deletions)
```python
import typing as t
from datetime import datetime, timedelta

import typer
from loguru import logger
from prediction_market_agent_tooling.benchmark.agents import (
    AbstractBenchmarkedAgent,
    FixedAgent,
    RandomAgent,
)
from prediction_market_agent_tooling.benchmark.benchmark import Benchmarker
from prediction_market_agent_tooling.benchmark.utils import (
    OutcomePrediction,
    Prediction,
)
from prediction_market_agent_tooling.gtypes import Probability
from prediction_market_agent_tooling.markets.agent_market import (
    AgentMarket,
    FilterBy,
    SortBy,
)
from prediction_market_agent_tooling.markets.markets import (
    MarketType,
    get_binary_markets,
)
from prediction_market_agent_tooling.tools.utils import utcnow

from prediction_market_agent.agents.crewai_subsequential_agent.crewai_agent_subquestions import (
    CrewAIAgentSubquestions,
)


def build_binary_agent_market_from_question(question: str) -> AgentMarket:
    return AgentMarket(
        id=question,
        url=question,
        close_time=utcnow() + timedelta(days=1),
        volume=None,
        question=question,
        p_yes=Probability(0.5),
        created_time=datetime(2024, 1, 1),
        resolution=None,
        outcomes=["YES", "NO"],
    )


class CrewAIAgentSubquestionsBenchmark(AbstractBenchmarkedAgent):
    def __init__(
        self,
        max_workers: int,
        agent_name: str,
    ) -> None:
        self.agent = CrewAIAgentSubquestions()
        super().__init__(agent_name=agent_name, max_workers=max_workers)

    def predict(self, market_question: str) -> Prediction:
        result = self.agent.answer_binary_market(market_question)
        return Prediction(
            outcome_prediction=OutcomePrediction(
                p_yes=result.p_yes, confidence=result.confidence, info_utility=None
            )
        )


def main(
    n: int = 50,
    output: str = "./benchmark_report_50markets.md",
    reference: MarketType = MarketType.MANIFOLD,
    filter: FilterBy = FilterBy.OPEN,
    sort: SortBy = SortBy.NONE,
    max_workers: int = 1,
    cache_path: t.Optional[str] = "predictions_cache.json",
    only_cached: bool = False,
) -> None:
    """
    Polymarket usually contains higher quality questions,
    but on Manifold, in addition to filtering by MarketFilter.resolved,
    you can sort by MarketSort.newest.
    """
    markets = get_binary_markets(n, reference, filter_by=filter, sort_by=sort)
    markets_deduplicated = list(({m.question: m for m in markets}.values()))
    if len(markets) != len(markets_deduplicated):
        logger.debug(
            f"Warning: Deduplicated markets from {len(markets)} to {len(markets_deduplicated)}."
        )

    logger.debug(f"Found {len(markets_deduplicated)} markets.")

    benchmarker = Benchmarker(
        markets=markets_deduplicated,
        agents=[
            CrewAIAgentSubquestionsBenchmark(
                agent_name="subsequential-questions-crewai",
                max_workers=max_workers,
            ),
            RandomAgent(agent_name="random", max_workers=max_workers),
            FixedAgent(
                fixed_answer=False, agent_name="fixed-no", max_workers=max_workers
            ),
            FixedAgent(
                fixed_answer=True, agent_name="fixed-yes", max_workers=max_workers
            ),
        ],
        cache_path=cache_path,
        only_cached=only_cached,
    )

    benchmarker.run_agents(
        enable_timing=False
    )  # Caching of search etc. can distort timings.
    md = benchmarker.generate_markdown_report()

    with open(output, "w") as f:
        logger.info(f"Writing benchmark report to: {output}")
        f.write(md)


if __name__ == "__main__":
    typer.run(main)
```
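For a quick local run, here is a minimal sketch of calling the benchmark entry point directly rather than through the Typer CLI; the parameter values below (5 markets, no cache) are illustrative, not defaults from this commit:

```python
# Illustrative usage sketch: invoke the benchmark entry point directly.
# Equivalent to running the module via Typer with the same options.
from prediction_market_agent.agents.crewai_subsequential_agent.benchmark import main
from prediction_market_agent_tooling.markets.agent_market import FilterBy, SortBy
from prediction_market_agent_tooling.markets.markets import MarketType

main(
    n=5,  # small sample for a smoke test; the commit's benchmark used 50
    output="./benchmark_report_5markets.md",
    reference=MarketType.MANIFOLD,
    filter=FilterBy.OPEN,
    sort=SortBy.NONE,
    max_workers=1,
    cache_path=None,  # skip the predictions cache for a fresh run
)
```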
prediction_market_agent/agents/crewai_subsequential_agent/crewai_agent_subquestions.py (189 additions & 0 deletions)
```python
import typing as t

from crewai import Agent, Crew, Process, Task
from langchain_core.language_models import BaseChatModel
from langchain_openai import ChatOpenAI
from loguru import logger
from pydantic import BaseModel

from prediction_market_agent.agents.crewai_subsequential_agent.prompts import (
    CREATE_OUTCOMES_FROM_SCENARIO_OUTPUT,
    CREATE_OUTCOMES_FROM_SCENARIO_PROMPT,
    FINAL_DECISION_PROMPT,
    PROBABILITY_CLASS_OUTPUT,
    PROBABILITY_FOR_ONE_OUTCOME_PROMPT,
    RESEARCH_OUTCOME_OUTPUT,
    RESEARCH_OUTCOME_PROMPT,
)
from prediction_market_agent.tools.crewai_tools import TavilyDevTool
from prediction_market_agent.utils import APIKeys

tavily_search = TavilyDevTool()


class Outcomes(BaseModel):
    outcomes: list[str]


class ProbabilityOutput(BaseModel):
    decision: str
    p_yes: float
    p_no: float
    confidence: float


class CrewAIAgentSubquestions:
    def __init__(self) -> None:
        llm = self._build_llm()
        self.researcher = Agent(
            role="Research Analyst",
            goal="Research and report on some future event, giving high quality and nuanced analysis",
            backstory="You are a senior research analyst who is adept at researching and reporting on future events.",
            verbose=True,
            allow_delegation=False,
            tools=[tavily_search],
            llm=llm,
        )

        self.predictor = Agent(
            role="Professional Gambler",
            goal="Predict, based on some research you are presented with, whether or not a given event will occur",
            backstory="You are a professional gambler who is adept at predicting and betting on the outcomes of future events.",
            verbose=True,
            allow_delegation=False,
            llm=llm,
        )

    def _build_llm(self) -> BaseChatModel:
        keys = APIKeys()
        llm = ChatOpenAI(
            model="gpt-3.5-turbo-0125",
            api_key=keys.openai_api_key.get_secret_value(),
        )
        return llm

    def split_research_into_outcomes(self, question: str) -> Outcomes:
        create_outcomes_task = Task(
            description=CREATE_OUTCOMES_FROM_SCENARIO_PROMPT,
            expected_output=CREATE_OUTCOMES_FROM_SCENARIO_OUTPUT,
            output_json=Outcomes,
            agent=self.researcher,
        )

        report_crew = Crew(
            agents=[self.researcher],
            tasks=[create_outcomes_task],
        )
        result = report_crew.kickoff(inputs={"scenario": question})
        return Outcomes.model_validate_json(result)

    def build_tasks_for_outcome(self, input_dict: dict[str, t.Any]) -> list[Task]:
        task_research_one_outcome = Task(
            description=RESEARCH_OUTCOME_PROMPT.format(**input_dict),
            agent=self.researcher,
            expected_output=RESEARCH_OUTCOME_OUTPUT,
            async_execution=True,
        )
        task_create_probability_for_one_outcome = Task(
            description=PROBABILITY_FOR_ONE_OUTCOME_PROMPT,
            expected_output=PROBABILITY_CLASS_OUTPUT,
            agent=self.predictor,
            output_json=ProbabilityOutput,
            async_execution=False,
            context=[task_research_one_outcome],
        )

        return [task_research_one_outcome, task_create_probability_for_one_outcome]

    def generate_prediction_for_one_outcome(self, sentence: str) -> ProbabilityOutput:
        task_research_one_outcome = Task(
            description=RESEARCH_OUTCOME_PROMPT,
            agent=self.researcher,
            expected_output=RESEARCH_OUTCOME_OUTPUT,
        )
        task_create_probability_for_one_outcome = Task(
            description=PROBABILITY_FOR_ONE_OUTCOME_PROMPT,
            expected_output=PROBABILITY_CLASS_OUTPUT,
            agent=self.predictor,
            output_json=ProbabilityOutput,
            context=[task_research_one_outcome],
        )
        crew = Crew(
            agents=[self.researcher, self.predictor],
            tasks=[task_research_one_outcome, task_create_probability_for_one_outcome],
            verbose=2,
            process=Process.sequential,
        )

        result = crew.kickoff(inputs={"sentence": sentence})
        return ProbabilityOutput.model_validate_json(result)

    def generate_final_decision(
        self, outcomes_with_probabilities: list[t.Tuple[str, ProbabilityOutput]]
    ) -> ProbabilityOutput:
        task_final_decision = Task(
            description=FINAL_DECISION_PROMPT,
            agent=self.predictor,
            expected_output=PROBABILITY_CLASS_OUTPUT,
            output_json=ProbabilityOutput,
        )

        crew = Crew(
            agents=[self.predictor],
            tasks=[task_final_decision],
            verbose=2,
        )

        crew.kickoff(
            inputs={
                "outcomes_with_probabilities": [
                    (i[0], i[1].model_dump()) for i in outcomes_with_probabilities
                ],
                "number_of_outcomes": len(outcomes_with_probabilities),
                "outcome_to_assess": outcomes_with_probabilities[0][0],
            }
        )
        return ProbabilityOutput.model_validate_json(
            task_final_decision.output.raw_output
        )

    def answer_binary_market(self, question: str) -> ProbabilityOutput:
        outcomes = self.split_research_into_outcomes(question)
        logger.debug(f"Generated outcomes: {outcomes}")

        outcomes_with_probs = []
        task_map = {}
        for outcome in outcomes.outcomes:
            tasks_for_outcome = self.build_tasks_for_outcome(
                input_dict={"sentence": outcome}
            )
            task_map[outcome] = tasks_for_outcome

        # Flatten the nested list of tasks.
        all_tasks = sum(task_map.values(), [])
        crew = Crew(
            agents=[self.researcher, self.predictor],
            tasks=all_tasks,
            verbose=2,
            process=Process.sequential,
        )

        # Note: crew.kickoff doesn't finish all async tasks when done.
        crew.kickoff()

        # Parse the individual task results to build outcomes_with_probs.
        for outcome, tasks in task_map.items():
            try:
                prediction_result = ProbabilityOutput.model_validate_json(
                    tasks[1].output.raw_output
                )
            except Exception as e:
                logger.error(f"Could not parse result as ProbabilityOutput: {e}")
                prediction_result = ProbabilityOutput(
                    p_yes=0.5, p_no=0.5, confidence=0, decision=""
                )

            outcomes_with_probs.append((outcome, prediction_result))

        final_answer = self.generate_final_decision(outcomes_with_probs)
        return final_answer
```
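End to end, the class above turns one binary question into several outcome scenarios, researches and scores each, then aggregates them into a final decision. A minimal usage sketch, assuming OpenAI and Tavily credentials are resolvable through `APIKeys`; the question is illustrative:

```python
# Illustrative usage sketch; assumes OpenAI and Tavily credentials
# are available via APIKeys.
from prediction_market_agent.agents.crewai_subsequential_agent.crewai_agent_subquestions import (
    CrewAIAgentSubquestions,
)

agent = CrewAIAgentSubquestions()
# Internally: split_research_into_outcomes -> build_tasks_for_outcome
# per outcome -> one sequential Crew run -> generate_final_decision.
answer = agent.answer_binary_market(
    "Will country X win more than 10 gold medals at the next Olympics?"
)
print(answer.decision, answer.p_yes, answer.confidence)
```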
prediction_market_agent/agents/crewai_subsequential_agent/deploy.py (53 additions & 0 deletions)
```python
import random
import typing as t

from prediction_market_agent_tooling.deploy.agent import DeployableAgent
from prediction_market_agent_tooling.markets.agent_market import AgentMarket
from prediction_market_agent_tooling.markets.markets import MarketType
from prediction_market_agent_tooling.tools.utils import should_not_happen

from prediction_market_agent.agents.crewai_subsequential_agent.crewai_agent_subquestions import (
    CrewAIAgentSubquestions,
)
from prediction_market_agent.agents.utils import market_is_saturated


class DeployableThinkThoroughlyAgent(DeployableAgent):
    # For cheaper credits at this experimental stage
    def __init__(self) -> None:
        super().__init__()

    def pick_markets(self, markets: t.Sequence[AgentMarket]) -> t.Sequence[AgentMarket]:
        # Pick up to 5 random, non-saturated markets to bet on.
        picked_markets: list[AgentMarket] = []
        markets = list(markets)
        random.shuffle(markets)
        for market in markets:
            # Assume very high probability markets are already known, and have
            # been correctly bet on, and therefore the value of betting on them
            # is low.
            if not market_is_saturated(market=market):
                picked_markets.append(market)
                if len(picked_markets) == 5:
                    break

        return picked_markets

    def answer_binary_market(self, market: AgentMarket) -> bool:
        # Run the subquestions agent on the market question and map its
        # "y"/"n" decision to a boolean answer.
        result = CrewAIAgentSubquestions().answer_binary_market(market.question)
        return (
            True
            if result.decision == "y"
            else False
            if result.decision == "n"
            else should_not_happen()
        )


if __name__ == "__main__":
    agent = DeployableThinkThoroughlyAgent()
    agent.deploy_local(
        market_type=MarketType.OMEN, sleep_time=540, timeout=180, place_bet=False
    )
```
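`market_is_saturated` is imported from `prediction_market_agent.agents.utils` and is not part of this diff. Going only by the comment in `pick_markets`, a plausible sketch is a symmetric threshold on the market's current `p_yes`; the cut-off value here is an assumption:

```python
# Hypothetical sketch of the imported helper; the real implementation
# lives in prediction_market_agent/agents/utils.py and may differ.
from prediction_market_agent_tooling.markets.agent_market import AgentMarket

SATURATION_THRESHOLD = 0.95  # assumed cut-off, not taken from this commit


def market_is_saturated(market: AgentMarket) -> bool:
    # Treat a market as "saturated" when the crowd probability is already
    # near 0 or 1, so a new bet adds little expected value.
    return (
        market.p_yes > SATURATION_THRESHOLD
        or market.p_yes < 1 - SATURATION_THRESHOLD
    )
```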