Benchmark generated for subsequential agent (5 markets)
gabrielfior committed Apr 9, 2024
1 parent 0aeb3c2 commit f1cfadc
Showing 7 changed files with 696 additions and 74 deletions.
583 changes: 582 additions & 1 deletion poetry.lock

Large diffs are not rendered by default.

@@ -4,23 +4,37 @@
import typer
from dotenv import load_dotenv
from prediction_market_agent_tooling.benchmark.agents import (
AbstractBenchmarkedAgent, RandomAgent, FixedAgent,
AbstractBenchmarkedAgent,
FixedAgent,
RandomAgent,
)
from prediction_market_agent_tooling.benchmark.benchmark import Benchmarker
from prediction_market_agent_tooling.benchmark.utils import (
OutcomePrediction,
Prediction,
)
from prediction_market_agent_tooling.gtypes import Probability
from prediction_market_agent_tooling.markets.agent_market import SortBy, FilterBy, AgentMarket
from prediction_market_agent_tooling.markets.markets import get_binary_markets, MarketType
from prediction_market_agent_tooling.markets.agent_market import (
AgentMarket,
FilterBy,
SortBy,
)
from prediction_market_agent_tooling.markets.markets import (
MarketType,
get_binary_markets,
)

from prediction_market_agent.agents.crewai_subsequential_agent.crewai_agent_subquestions import CrewAIAgentSubquestions
from prediction_market_agent.agents.crewai_subsequential_agent.crewai_agent_subquestions import (
CrewAIAgentSubquestions,
)


def build_binary_agent_market_from_question(question: str) -> AgentMarket:
return AgentMarket(
id=question,
url=question,
close_time=None,
volume=None,
question=question,
p_yes=Probability(0.5),
created_time=datetime(2024, 1, 1),
@@ -31,11 +45,11 @@ def build_binary_agent_market_from_question(question: str) -> AgentMarket:

class CrewAIAgentSubquestionsBenchmark(AbstractBenchmarkedAgent):
def __init__(
self,
agent_name: str,
max_workers: int,
model: str,
max_tries: int,
self,
agent_name: str,
max_workers: int,
model: str,
max_tries: int,
) -> None:
self.model = model
self.max_tries = max_tries
@@ -44,28 +58,29 @@ def __init__(

def predict(self, market_question: str) -> Prediction:
result = self.agent.answer_binary_market(market_question)
return Prediction(outcome_prediction=OutcomePrediction(
p_yes=result.p_yes,
confidence=result.confidence))
return Prediction(
outcome_prediction=OutcomePrediction(
p_yes=result.p_yes, confidence=result.confidence, info_utility=None
)
)


def main(
n: int = 10,
output: str = "./benchmark_report.md",
reference: MarketType = MarketType.MANIFOLD,
filter: FilterBy = FilterBy.OPEN,
sort: SortBy = SortBy.NONE,
max_workers: int = 1,
cache_path: t.Optional[str] = "predictions_cache.json",
only_cached: bool = False,
n: int = 5,
output: str = "./benchmark_report.md",
reference: MarketType = MarketType.MANIFOLD,
filter: FilterBy = FilterBy.OPEN,
sort: SortBy = SortBy.NONE,
max_workers: int = 1,
cache_path: t.Optional[str] = "predictions_cache.json",
only_cached: bool = False,
) -> None:
"""
Polymarket usually contains higher quality questions,
but on Manifold, in addition to filtering by MarketFilter.resolved, you can sort by MarketSort.newest.
"""
load_dotenv()
markets = get_binary_markets(n, reference, filter_by=filter, sort_by=sort)
markets = markets[:1]
markets_deduplicated = list(({m.question: m for m in markets}.values()))
if len(markets) != len(markets_deduplicated):
print(
@@ -77,8 +92,12 @@ def main(
benchmarker = Benchmarker(
markets=markets_deduplicated,
agents=[
CrewAIAgentSubquestionsBenchmark("subsequential-questions-crewai", max_workers=max_workers, max_tries=3,
model="gpt-3.5-turbo-0125"),
CrewAIAgentSubquestionsBenchmark(
"subsequential-questions-crewai",
max_workers=max_workers,
max_tries=1,
model="gpt-3.5-turbo-0125",
),
RandomAgent(agent_name="random", max_workers=max_workers),
FixedAgent(
fixed_answer=False, agent_name="fixed-no", max_workers=max_workers
Expand All @@ -91,7 +110,9 @@ def main(
only_cached=only_cached,
)

benchmarker.run_agents(enable_timing=False) # Caching of search etc. can distort timings
benchmarker.run_agents(
enable_timing=False
) # Caching of search etc. can distort timings
md = benchmarker.generate_markdown_report()

with open(output, "w") as f:
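Note: these hunks don't show how main is invoked. A minimal sketch of the usual typer wiring (an assumption — this block is not part of the diff):

    if __name__ == "__main__":
        typer.run(main)  # typer is already imported at the top of the file

With that in place, the new five-market default could be run as e.g. `python benchmark.py --n 5 --max-workers 1` (script name hypothetical).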
@@ -1,12 +1,12 @@
import typing as t

from crewai import Agent, Task, Process, Crew
from crewai import Agent, Crew, Process, Task
from pydantic import BaseModel

from prediction_market_agent.agents.crewai_subsequential_agent.prompts import *
from prediction_market_agent.tools.crewai_tools import TavilyDevTool

#search_tool = SerperDevTool()
# search_tool = SerperDevTool()
tavily_search = TavilyDevTool()


@@ -53,23 +53,23 @@ def split_research_into_outcomes(self, question: str) -> Outcomes:
agents=[self.researcher],
tasks=[create_outcomes_task],
)
result = report_crew.kickoff(inputs={'scenario': question})
result = report_crew.kickoff(inputs={"scenario": question})
return Outcomes.model_validate_json(result)

def build_tasks_for_outcome(self, input_dict: dict[str, t.Any] = {}) -> list[Task]:
task_research_one_outcome = Task(
description=RESEARCH_OUTCOME_PROMPT.format(**input_dict),
agent=self.researcher,
expected_output=RESEARCH_OUTCOME_OUTPUT,
async_execution=True
async_execution=True,
)
task_create_probability_for_one_outcome = Task(
description=PROBABILITY_FOR_ONE_OUTCOME_PROMPT,
expected_output=PROBABILITY_CLASS_OUTPUT,
agent=self.predictor,
output_json=ProbabilityOutput,
async_execution=False,
context=[task_research_one_outcome]
context=[task_research_one_outcome],
)

return [task_research_one_outcome, task_create_probability_for_one_outcome]
@@ -85,19 +85,21 @@ def generate_prediction_for_one_outcome(self, sentence: str) -> ProbabilityOutpu
expected_output=PROBABILITY_CLASS_OUTPUT,
agent=self.predictor,
output_json=ProbabilityOutput,
context=[task_research_one_outcome]
context=[task_research_one_outcome],
)
crew = Crew(
agents=[self.researcher, self.predictor],
tasks=[task_research_one_outcome, task_create_probability_for_one_outcome],
verbose=2,
process=Process.sequential
process=Process.sequential,
)

result = crew.kickoff(inputs={'sentence': sentence})
result = crew.kickoff(inputs={"sentence": sentence})
return ProbabilityOutput.model_validate_json(result)

def generate_final_decision(self, outcomes_with_probabilities: list[t.Tuple[str, ProbabilityOutput]]) -> ProbabilityOutput:
def generate_final_decision(
self, outcomes_with_probabilities: list[t.Tuple[str, ProbabilityOutput]]
) -> ProbabilityOutput:
task_final_decision = Task(
description=FINAL_DECISION_PROMPT,
agent=self.predictor,
@@ -111,21 +113,29 @@ def generate_final_decision(self, outcomes_with_probabilities: list[t.Tuple[str,
verbose=2,
)

crew.kickoff(inputs={'outcomes_with_probabilities':
[(i[0], i[1].dict()) for i in outcomes_with_probabilities],
'number_of_outcomes': len(outcomes_with_probabilities),
'outcome_to_assess': outcomes_with_probabilities[0][0]})
return ProbabilityOutput.model_validate_json(task_final_decision.output.raw_output)
crew.kickoff(
inputs={
"outcomes_with_probabilities": [
(i[0], i[1].dict()) for i in outcomes_with_probabilities
],
"number_of_outcomes": len(outcomes_with_probabilities),
"outcome_to_assess": outcomes_with_probabilities[0][0],
}
)
return ProbabilityOutput.model_validate_json(
task_final_decision.output.raw_output
)

def answer_binary_market(self, question: str) -> ProbabilityOutput:

outcomes = self.split_research_into_outcomes(question)
print ("outcomes ", outcomes)
print("outcomes ", outcomes)

outcomes_with_probs = []
task_map = {}
for outcome in outcomes.outcomes:
tasks_for_outcome = self.build_tasks_for_outcome(input_dict={"sentence": outcome})
tasks_for_outcome = self.build_tasks_for_outcome(
input_dict={"sentence": outcome}
)
task_map[outcome] = tasks_for_outcome

# flatten nested list
@@ -134,7 +144,7 @@ def answer_binary_market(self, question: str) -> ProbabilityOutput:
agents=[self.researcher, self.predictor],
tasks=all_tasks,
verbose=2,
process=Process.sequential
process=Process.sequential,
)

# crew.kickoff doesn't finish all async tasks when done.
@@ -143,10 +153,14 @@ def answer_binary_market(self, question: str) -> ProbabilityOutput:
# We parse individual task results to build outcomes_with_probs
for outcome, tasks in task_map.items():
try:
prediction_result = ProbabilityOutput.model_validate_json(tasks[1].output.raw_output)
prediction_result = ProbabilityOutput.model_validate_json(
tasks[1].output.raw_output
)
except Exception as e:
print("Could not parse result as ProbabilityOutput ", e)
prediction_result = ProbabilityOutput(p_yes=0.5, p_no=0.5, confidence=0, decision="")
prediction_result = ProbabilityOutput(
p_yes=0.5, p_no=0.5, confidence=0, decision=""
)

outcomes_with_probs.append((outcome, prediction_result))

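Note: the Outcomes and ProbabilityOutput models used throughout this file are defined outside the hunks shown. From their usage here (outcomes.outcomes, model_validate_json, and the p_yes/p_no/confidence/decision fallback above), they are presumably pydantic models along the lines of this sketch; the field types are assumptions:

    from pydantic import BaseModel


    class Outcomes(BaseModel):
        # Mutually exclusive outcomes parsed from the researcher's
        # bullet-point list (see CREATE_OUTCOMES_FROM_SCENARIO_OUTPUT).
        outcomes: list[str]


    class ProbabilityOutput(BaseModel):
        # JSON contract described in the prompts: decision is "y" or "n",
        # p_yes + p_no should sum to 1, confidence is in [0, 1].
        decision: str
        p_yes: float
        p_no: float
        confidence: float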
@@ -1,14 +1,16 @@
import os
from decimal import Decimal
import random
from decimal import Decimal

from langchain_openai import OpenAI
from prediction_market_agent_tooling.deploy.agent import DeployableAgent
from prediction_market_agent_tooling.markets.agent_market import AgentMarket
from prediction_market_agent_tooling.markets.data_models import BetAmount, Currency
from prediction_market_agent_tooling.markets.markets import MarketType

from prediction_market_agent.agents.crewai_subsequential_agent.crewai_agent_subquestions import CrewAIAgentSubquestions
from prediction_market_agent.agents.crewai_subsequential_agent.crewai_agent_subquestions import (
CrewAIAgentSubquestions,
)
from prediction_market_agent.agents.known_outcome_agent.known_outcome_agent import (
Result,
)
@@ -40,10 +42,10 @@ def pick_markets(self, markets: list[AgentMarket]) -> list[AgentMarket]:
def answer_binary_market(self, market: AgentMarket) -> bool:
# The answer has already been determined in `pick_markets` so we just
# return it here.
os.environ["OPENAI_MODEL_NAME"]="gpt-4-turbo-preview"
os.environ["OPENAI_MODEL_NAME"] = "gpt-4-turbo-preview"
agent = CrewAIAgentSubquestions()
result = agent.answer_binary_market(market)
return result
result = agent.answer_binary_market(market.question)
return True if result.decision == "y" else False

def calculate_bet_amount(self, answer: bool, market: AgentMarket) -> BetAmount:
if market.currency == Currency.xDai:
@@ -54,10 +56,9 @@ def calculate_bet_amount(self, answer: bool, market: AgentMarket) -> BetAmount:

if __name__ == "__main__":
agent = DeployableThinkThoroughlyAgent()
agent.deploy_local(market_type=MarketType.OMEN,
sleep_time=540,
timeout=180,
place_bet=False)
agent.deploy_local(
market_type=MarketType.OMEN, sleep_time=540, timeout=180, place_bet=False
)
# agent.deploy_gcp(
# repository=f"git+{get_current_git_url()}@{get_current_git_commit_sha()}",
# market_type=MarketType.OMEN,
@@ -22,12 +22,12 @@
{scenario}
"""

CREATE_OUTCOMES_FROM_SCENARIO_OUTPUT = '''
CREATE_OUTCOMES_FROM_SCENARIO_OUTPUT = """
A list containing multiple bullet points. Each bullet point should start with '-'.
Each bullet point should contain a possible outcome resulting from the
provided SCENARIO. The produced outcomes should be mutually exclusive, i.e. only one of them should be true whereas
the remaining ones should be false.
'''
"""

PROBABILITY_FOR_ONE_OUTCOME_PROMPT = """
Your task is to determine the probability of a prediction market affirmation being answered 'Yes' or 'No'.
@@ -93,9 +93,8 @@
- "confidence": Indicating the confidence in the estimated probabilities you provided ranging from 0 (lowest confidence) to
1 (maximum confidence). Confidence can be calculated based on the quality and quantity of data used for the estimation.
A valid JSON string as output could look like the example below:
Example output: {"decision": "y","p_yes": 0.1, "p_no": 0.9, "confidence": 0.4}
Do not use escape quotes and line breaks. Do not output any reasoning, only the JSON object.
Do not surround the output object with escape quotes, line breaks nor '''.
Do not output any reasoning, only the JSON object.
Ensure p_yes + p_no equals 1.
"""
34 changes: 20 additions & 14 deletions prediction_market_agent/tools/crewai_tools.py
@@ -1,30 +1,36 @@
import os
from typing import Type, Any
from typing import Any, Type

from crewai_tools.tools.base_tool import BaseTool
from langchain_community.utilities.tavily_search import TavilySearchAPIWrapper
from pydantic.v1 import BaseModel, Field
from pydantic.v1.types import SecretStr


class TavilyDevToolSchema(BaseModel):
"""Input for TXTSearchTool."""
search_query: str = Field(..., description="Mandatory search query you want to use to search the internet")
"""Input for TXTSearchTool."""

search_query: str = Field(
..., description="Mandatory search query you want to use to search the internet"
)


class TavilyDevTool(BaseTool):
name: str = "Search the internet"
# From Langchain's Tavily integration
description: str = """"A search engine optimized for comprehensive, accurate, \
name: str = "Search the internet"
# From Langchain's Tavily integration
description: str = """"A search engine optimized for comprehensive, accurate, \
and trusted results. Useful for when you need to answer questions \
about current events or about recent information. \
Input should be a search query. \
If the user is asking about something that you don't know about, \
you should probably use this tool to see if that can provide any information."""
args_schema: Type[BaseModel] = TavilyDevToolSchema

def _run(
self,
search_query: str,
**kwargs: Any,
) -> Any:
args_schema: Type[BaseModel] = TavilyDevToolSchema

return TavilySearchAPIWrapper(tavily_api_key = os.environ['TAVILY_API_KEY']).results(query=search_query)
def _run(
self,
search_query: str,
**kwargs: Any,
) -> Any:
return TavilySearchAPIWrapper(
tavily_api_key=SecretStr(os.environ["TAVILY_API_KEY"])
).results(query=search_query)
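
A quick usage sketch of the reworked tool — the query is made up, and TAVILY_API_KEY must be set in the environment, since _run reads it directly:

    import os

    from prediction_market_agent.tools.crewai_tools import TavilyDevTool

    assert "TAVILY_API_KEY" in os.environ  # read by _run below
    tool = TavilyDevTool()
    results = tool._run(search_query="What important events happened today?")
    print(results)  # list of result dicts returned by Tavily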
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -21,7 +21,7 @@ google-search-results = "*"
pytest = "*"
llama-index = "~0.9.0"
duckduckgo-search = "*"
crewai = {extras = ["tools"], version = ">=0.22.5 <0."}
crewai = {extras = ["tools"], version = ">=0.22.5"}
# metagpt = "*" # Commented out because requires super old version of langchain, and conflicts with crewai.
replicate = "*"
typer = "^0.9.0"
