Benchmark generated for subsequential agent (5 markets)
gabrielfior committed Apr 9, 2024
1 parent 0aeb3c2 commit f1cfadc
Showing 7 changed files with 696 additions and 74 deletions.
583 changes: 582 additions & 1 deletion poetry.lock

Large diffs are not rendered by default.

@@ -4,23 +4,37 @@
import typer
from dotenv import load_dotenv
from prediction_market_agent_tooling.benchmark.agents import (
AbstractBenchmarkedAgent, RandomAgent, FixedAgent,
AbstractBenchmarkedAgent,
FixedAgent,
RandomAgent,
)
from prediction_market_agent_tooling.benchmark.benchmark import Benchmarker
from prediction_market_agent_tooling.benchmark.utils import (
OutcomePrediction,
Prediction,
)
from prediction_market_agent_tooling.gtypes import Probability
from prediction_market_agent_tooling.markets.agent_market import SortBy, FilterBy, AgentMarket
from prediction_market_agent_tooling.markets.markets import get_binary_markets, MarketType
from prediction_market_agent_tooling.markets.agent_market import (
AgentMarket,
FilterBy,
SortBy,
)
from prediction_market_agent_tooling.markets.markets import (
MarketType,
get_binary_markets,
)

from prediction_market_agent.agents.crewai_subsequential_agent.crewai_agent_subquestions import CrewAIAgentSubquestions
from prediction_market_agent.agents.crewai_subsequential_agent.crewai_agent_subquestions import (
CrewAIAgentSubquestions,
)


def build_binary_agent_market_from_question(question: str) -> AgentMarket:
return AgentMarket(
id=question,
url=question,
close_time=None,
volume=None,
question=question,
p_yes=Probability(0.5),
created_time=datetime(2024, 1, 1),
@@ -31,11 +45,11 @@ def build_binary_agent_market_from_question(question: str) -> AgentMarket:

class CrewAIAgentSubquestionsBenchmark(AbstractBenchmarkedAgent):
def __init__(
self,
agent_name: str,
max_workers: int,
model: str,
max_tries: int,
self,
agent_name: str,
max_workers: int,
model: str,
max_tries: int,
) -> None:
self.model = model
self.max_tries = max_tries
@@ -44,28 +58,29 @@ def __init__(

def predict(self, market_question: str) -> Prediction:
result = self.agent.answer_binary_market(market_question)
return Prediction(outcome_prediction=OutcomePrediction(
p_yes=result.p_yes,
confidence=result.confidence))
return Prediction(
outcome_prediction=OutcomePrediction(
p_yes=result.p_yes, confidence=result.confidence, info_utility=None
)
)


def main(
n: int = 10,
output: str = "./benchmark_report.md",
reference: MarketType = MarketType.MANIFOLD,
filter: FilterBy = FilterBy.OPEN,
sort: SortBy = SortBy.NONE,
max_workers: int = 1,
cache_path: t.Optional[str] = "predictions_cache.json",
only_cached: bool = False,
n: int = 5,
output: str = "./benchmark_report.md",
reference: MarketType = MarketType.MANIFOLD,
filter: FilterBy = FilterBy.OPEN,
sort: SortBy = SortBy.NONE,
max_workers: int = 1,
cache_path: t.Optional[str] = "predictions_cache.json",
only_cached: bool = False,
) -> None:
"""
Polymarket usually contains higher quality questions,
but on Manifold, in addition to filtering by MarketFilter.resolved, you can sort by MarketSort.newest.
"""
load_dotenv()
markets = get_binary_markets(n, reference, filter_by=filter, sort_by=sort)
markets = markets[:1]
markets_deduplicated = list(({m.question: m for m in markets}.values()))
if len(markets) != len(markets_deduplicated):
print(
@@ -77,8 +92,12 @@ def main(
benchmarker = Benchmarker(
markets=markets_deduplicated,
agents=[
CrewAIAgentSubquestionsBenchmark("subsequential-questions-crewai", max_workers=max_workers, max_tries=3,
model="gpt-3.5-turbo-0125"),
CrewAIAgentSubquestionsBenchmark(
"subsequential-questions-crewai",
max_workers=max_workers,
max_tries=1,
model="gpt-3.5-turbo-0125",
),
RandomAgent(agent_name="random", max_workers=max_workers),
FixedAgent(
fixed_answer=False, agent_name="fixed-no", max_workers=max_workers
Expand All @@ -91,7 +110,9 @@ def main(
only_cached=only_cached,
)

benchmarker.run_agents(enable_timing=False) # Caching of search etc. can distort timings
benchmarker.run_agents(
enable_timing=False
) # Caching of search etc. can distort timings
md = benchmarker.generate_markdown_report()

with open(output, "w") as f:
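Note: these hunks don't show how main is invoked. A minimal sketch of the usual typer wiring (an assumption — this block is not part of the diff):

    if __name__ == "__main__":
        typer.run(main)  # typer is already imported at the top of the file

With that in place, the new five-market default could be run as e.g. `python benchmark.py --n 5 --max-workers 1` (script name hypothetical).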
@@ -1,12 +1,12 @@
import typing as t

from crewai import Agent, Task, Process, Crew
from crewai import Agent, Crew, Process, Task
from pydantic import BaseModel

from prediction_market_agent.agents.crewai_subsequential_agent.prompts import *
from prediction_market_agent.tools.crewai_tools import TavilyDevTool

#search_tool = SerperDevTool()
# search_tool = SerperDevTool()
tavily_search = TavilyDevTool()


@@ -53,23 +53,23 @@ def split_research_into_outcomes(self, question: str) -> Outcomes:
agents=[self.researcher],
tasks=[create_outcomes_task],
)
result = report_crew.kickoff(inputs={'scenario': question})
result = report_crew.kickoff(inputs={"scenario": question})
return Outcomes.model_validate_json(result)

def build_tasks_for_outcome(self, input_dict: dict[str, t.Any] = {}) -> list[Task]:
task_research_one_outcome = Task(
description=RESEARCH_OUTCOME_PROMPT.format(**input_dict),
agent=self.researcher,
expected_output=RESEARCH_OUTCOME_OUTPUT,
async_execution=True
async_execution=True,
)
task_create_probability_for_one_outcome = Task(
description=PROBABILITY_FOR_ONE_OUTCOME_PROMPT,
expected_output=PROBABILITY_CLASS_OUTPUT,
agent=self.predictor,
output_json=ProbabilityOutput,
async_execution=False,
context=[task_research_one_outcome]
context=[task_research_one_outcome],
)

return [task_research_one_outcome, task_create_probability_for_one_outcome]
@@ -85,19 +85,21 @@ def generate_prediction_for_one_outcome(self, sentence: str) -> ProbabilityOutpu
expected_output=PROBABILITY_CLASS_OUTPUT,
agent=self.predictor,
output_json=ProbabilityOutput,
context=[task_research_one_outcome]
context=[task_research_one_outcome],
)
crew = Crew(
agents=[self.researcher, self.predictor],
tasks=[task_research_one_outcome, task_create_probability_for_one_outcome],
verbose=2,
process=Process.sequential
process=Process.sequential,
)

result = crew.kickoff(inputs={'sentence': sentence})
result = crew.kickoff(inputs={"sentence": sentence})
return ProbabilityOutput.model_validate_json(result)

def generate_final_decision(self, outcomes_with_probabilities: list[t.Tuple[str, ProbabilityOutput]]) -> ProbabilityOutput:
def generate_final_decision(
self, outcomes_with_probabilities: list[t.Tuple[str, ProbabilityOutput]]
) -> ProbabilityOutput:
task_final_decision = Task(
description=FINAL_DECISION_PROMPT,
agent=self.predictor,
@@ -111,21 +113,29 @@ def generate_final_decision(self, outcomes_with_probabilities: list[t.Tuple[str,
verbose=2,
)

crew.kickoff(inputs={'outcomes_with_probabilities':
[(i[0], i[1].dict()) for i in outcomes_with_probabilities],
'number_of_outcomes': len(outcomes_with_probabilities),
'outcome_to_assess': outcomes_with_probabilities[0][0]})
return ProbabilityOutput.model_validate_json(task_final_decision.output.raw_output)
crew.kickoff(
inputs={
"outcomes_with_probabilities": [
(i[0], i[1].dict()) for i in outcomes_with_probabilities
],
"number_of_outcomes": len(outcomes_with_probabilities),
"outcome_to_assess": outcomes_with_probabilities[0][0],
}
)
return ProbabilityOutput.model_validate_json(
task_final_decision.output.raw_output
)

def answer_binary_market(self, question: str) -> ProbabilityOutput:

outcomes = self.split_research_into_outcomes(question)
print ("outcomes ", outcomes)
print("outcomes ", outcomes)

outcomes_with_probs = []
task_map = {}
for outcome in outcomes.outcomes:
tasks_for_outcome = self.build_tasks_for_outcome(input_dict={"sentence": outcome})
tasks_for_outcome = self.build_tasks_for_outcome(
input_dict={"sentence": outcome}
)
task_map[outcome] = tasks_for_outcome

# flatten nested list
@@ -134,7 +144,7 @@ def answer_binary_market(self, question: str) -> ProbabilityOutput:
agents=[self.researcher, self.predictor],
tasks=all_tasks,
verbose=2,
process=Process.sequential
process=Process.sequential,
)

# crew.kickoff doesn't finish all async tasks when done.
@@ -143,10 +153,14 @@ def answer_binary_market(self, question: str) -> ProbabilityOutput:
# We parse individual task results to build outcomes_with_probs
for outcome, tasks in task_map.items():
try:
prediction_result = ProbabilityOutput.model_validate_json(tasks[1].output.raw_output)
prediction_result = ProbabilityOutput.model_validate_json(
tasks[1].output.raw_output
)
except Exception as e:
print("Could not parse result as ProbabilityOutput ", e)
prediction_result = ProbabilityOutput(p_yes=0.5, p_no=0.5, confidence=0, decision="")
prediction_result = ProbabilityOutput(
p_yes=0.5, p_no=0.5, confidence=0, decision=""
)

outcomes_with_probs.append((outcome, prediction_result))

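Note: the Outcomes and ProbabilityOutput models used throughout this file are defined outside the hunks shown. From their usage here (outcomes.outcomes, model_validate_json, and the p_yes/p_no/confidence/decision fallback above), they are presumably pydantic models along the lines of this sketch; the field types are assumptions:

    from pydantic import BaseModel


    class Outcomes(BaseModel):
        # Mutually exclusive outcomes parsed from the researcher's
        # bullet-point list (see CREATE_OUTCOMES_FROM_SCENARIO_OUTPUT).
        outcomes: list[str]


    class ProbabilityOutput(BaseModel):
        # JSON contract described in the prompts: decision is "y" or "n",
        # p_yes + p_no should sum to 1, confidence is in [0, 1].
        decision: str
        p_yes: float
        p_no: float
        confidence: float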
@@ -1,14 +1,16 @@
import os
from decimal import Decimal
import random
from decimal import Decimal

from langchain_openai import OpenAI
from prediction_market_agent_tooling.deploy.agent import DeployableAgent
from prediction_market_agent_tooling.markets.agent_market import AgentMarket
from prediction_market_agent_tooling.markets.data_models import BetAmount, Currency
from prediction_market_agent_tooling.markets.markets import MarketType

from prediction_market_agent.agents.crewai_subsequential_agent.crewai_agent_subquestions import CrewAIAgentSubquestions
from prediction_market_agent.agents.crewai_subsequential_agent.crewai_agent_subquestions import (
CrewAIAgentSubquestions,
)
from prediction_market_agent.agents.known_outcome_agent.known_outcome_agent import (
Result,
)
@@ -40,10 +42,10 @@ def pick_markets(self, markets: list[AgentMarket]) -> list[AgentMarket]:
def answer_binary_market(self, market: AgentMarket) -> bool:
# The answer has already been determined in `pick_markets` so we just
# return it here.
os.environ["OPENAI_MODEL_NAME"]="gpt-4-turbo-preview"
os.environ["OPENAI_MODEL_NAME"] = "gpt-4-turbo-preview"
agent = CrewAIAgentSubquestions()
result = agent.answer_binary_market(market)
return result
result = agent.answer_binary_market(market.question)
return True if result.decision == "y" else False

def calculate_bet_amount(self, answer: bool, market: AgentMarket) -> BetAmount:
if market.currency == Currency.xDai:
@@ -54,10 +56,9 @@ def calculate_bet_amount(self, answer: bool, market: AgentMarket) -> BetAmount:

if __name__ == "__main__":
agent = DeployableThinkThoroughlyAgent()
agent.deploy_local(market_type=MarketType.OMEN,
sleep_time=540,
timeout=180,
place_bet=False)
agent.deploy_local(
market_type=MarketType.OMEN, sleep_time=540, timeout=180, place_bet=False
)
# agent.deploy_gcp(
# repository=f"git+{get_current_git_url()}@{get_current_git_commit_sha()}",
# market_type=MarketType.OMEN,
@@ -22,12 +22,12 @@
{scenario}
"""

CREATE_OUTCOMES_FROM_SCENARIO_OUTPUT = '''
CREATE_OUTCOMES_FROM_SCENARIO_OUTPUT = """
A list containing multiple bullet points. Each bullet point should start with '-'.
Each bullet point should contain a possible outcome resulting from the
provided SCENARIO. The produced outcomes should be mutually exclusive, i.e. only one of them should be true whereas
the remaining ones should be false.
'''
"""

PROBABILITY_FOR_ONE_OUTCOME_PROMPT = """
Your task is to determine the probability of a prediction market affirmation being answered 'Yes' or 'No'.
@@ -93,9 +93,8 @@
- "confidence": Indicating the confidence in the estimated probabilities you provided ranging from 0 (lowest confidence) to
1 (maximum confidence). Confidence can be calculated based on the quality and quantity of data used for the estimation.
A valid JSON string as output could look like the example below:
Example output: {"decision": "y","p_yes": 0.1, "p_no": 0.9, "confidence": 0.4}
Do not use escape quotes and line breaks. Do not output any reasoning, only the JSON object.
Do not surround the output object with escape quotes, line breaks nor '''.
Do not output any reasoning, only the JSON object.
Ensure p_yes + p_no equals 1.
"""
34 changes: 20 additions & 14 deletions prediction_market_agent/tools/crewai_tools.py
@@ -1,30 +1,36 @@
import os
from typing import Type, Any
from typing import Any, Type

from crewai_tools.tools.base_tool import BaseTool
from langchain_community.utilities.tavily_search import TavilySearchAPIWrapper
from pydantic.v1 import BaseModel, Field
from pydantic.v1.types import SecretStr


class TavilyDevToolSchema(BaseModel):
"""Input for TXTSearchTool."""
search_query: str = Field(..., description="Mandatory search query you want to use to search the internet")
"""Input for TXTSearchTool."""

search_query: str = Field(
..., description="Mandatory search query you want to use to search the internet"
)


class TavilyDevTool(BaseTool):
name: str = "Search the internet"
# From Langchain's Tavily integration
description: str = """"A search engine optimized for comprehensive, accurate, \
name: str = "Search the internet"
# From Langchain's Tavily integration
description: str = """"A search engine optimized for comprehensive, accurate, \
and trusted results. Useful for when you need to answer questions \
about current events or about recent information. \
Input should be a search query. \
If the user is asking about something that you don't know about, \
you should probably use this tool to see if that can provide any information."""
args_schema: Type[BaseModel] = TavilyDevToolSchema

def _run(
self,
search_query: str,
**kwargs: Any,
) -> Any:
args_schema: Type[BaseModel] = TavilyDevToolSchema

return TavilySearchAPIWrapper(tavily_api_key = os.environ['TAVILY_API_KEY']).results(query=search_query)
def _run(
self,
search_query: str,
**kwargs: Any,
) -> Any:
return TavilySearchAPIWrapper(
tavily_api_key=SecretStr(os.environ["TAVILY_API_KEY"])
).results(query=search_query)
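
A quick usage sketch of the reworked tool — the query is made up, and TAVILY_API_KEY must be set in the environment, since _run reads it directly:

    import os

    from prediction_market_agent.tools.crewai_tools import TavilyDevTool

    assert "TAVILY_API_KEY" in os.environ  # read by _run below
    tool = TavilyDevTool()
    results = tool._run(search_query="What important events happened today?")
    print(results)  # list of result dicts returned by Tavily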
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -21,7 +21,7 @@ google-search-results = "*"
pytest = "*"
llama-index = "~0.9.0"
duckduckgo-search = "*"
crewai = {extras = ["tools"], version = ">=0.22.5 <0."}
crewai = {extras = ["tools"], version = ">=0.22.5"}
# metagpt = "*" # Commented out because requires super old version of langchain, and conflicts with crewai.
replicate = "*"
typer = "^0.9.0"
