Agent that thinks more thoroughly about question and considers possible outcomes (#47)

* initial notebook
* Added agent that asks subsequential questions
* Agent working locally
* Updated poetry.lock
* Added benchmark to agent_subquestions
* Finishing touches
* Fixing model gpt-4-turbo-preview
* Adding benchmark.py EVO-style
* Updating dependencies
* Benchmark generated for subsequential agent (5 markets)
* (WIP) Implemented PR comments, not yet final
* Executed benchmark for 50 markets
* Implemented final PR comments
* Removed unnecessary notebooks
* Updated lock file with new crewai version
* Fixed mypy
* Fixed isort
1 parent de5093e · commit efc1ef7 · 11 changed files with 1,139 additions and 53 deletions.
prediction_market_agent/agents/crewai_subsequential_agent/benchmark.py (118 additions & 0 deletions)
```python
import typing as t
from datetime import datetime, timedelta

import typer
from loguru import logger
from prediction_market_agent_tooling.benchmark.agents import (
    AbstractBenchmarkedAgent,
    FixedAgent,
    RandomAgent,
)
from prediction_market_agent_tooling.benchmark.benchmark import Benchmarker
from prediction_market_agent_tooling.benchmark.utils import (
    OutcomePrediction,
    Prediction,
)
from prediction_market_agent_tooling.gtypes import Probability
from prediction_market_agent_tooling.markets.agent_market import (
    AgentMarket,
    FilterBy,
    SortBy,
)
from prediction_market_agent_tooling.markets.markets import (
    MarketType,
    get_binary_markets,
)
from prediction_market_agent_tooling.tools.utils import utcnow

from prediction_market_agent.agents.crewai_subsequential_agent.crewai_agent_subquestions import (
    CrewAIAgentSubquestions,
)


def build_binary_agent_market_from_question(question: str) -> AgentMarket:
    return AgentMarket(
        id=question,
        url=question,
        close_time=utcnow() + timedelta(days=1),
        volume=None,
        question=question,
        p_yes=Probability(0.5),
        created_time=datetime(2024, 1, 1),
        resolution=None,
        outcomes=["YES", "NO"],
    )


class CrewAIAgentSubquestionsBenchmark(AbstractBenchmarkedAgent):
    def __init__(
        self,
        max_workers: int,
        agent_name: str,
    ) -> None:
        self.agent = CrewAIAgentSubquestions()
        super().__init__(agent_name=agent_name, max_workers=max_workers)

    def predict(self, market_question: str) -> Prediction:
        result = self.agent.answer_binary_market(market_question)
        return Prediction(
            outcome_prediction=OutcomePrediction(
                p_yes=result.p_yes, confidence=result.confidence, info_utility=None
            )
        )


def main(
    n: int = 50,
    output: str = "./benchmark_report_50markets.md",
    reference: MarketType = MarketType.MANIFOLD,
    filter: FilterBy = FilterBy.OPEN,
    sort: SortBy = SortBy.NONE,
    max_workers: int = 1,
    cache_path: t.Optional[str] = "predictions_cache.json",
    only_cached: bool = False,
) -> None:
    """
    Polymarket usually contains higher quality questions,
    but on Manifold, in addition to filtering by MarketFilter.resolved,
    you can sort by MarketSort.newest.
    """
    markets = get_binary_markets(n, reference, filter_by=filter, sort_by=sort)
    markets_deduplicated = list(({m.question: m for m in markets}.values()))
    if len(markets) != len(markets_deduplicated):
        logger.debug(
            f"Warning: Deduplicated markets from {len(markets)} to {len(markets_deduplicated)}."
        )

    logger.debug(f"Found {len(markets_deduplicated)} markets.")

    benchmarker = Benchmarker(
        markets=markets_deduplicated,
        agents=[
            CrewAIAgentSubquestionsBenchmark(
                agent_name="subsequential-questions-crewai",
                max_workers=max_workers,
            ),
            RandomAgent(agent_name="random", max_workers=max_workers),
            FixedAgent(
                fixed_answer=False, agent_name="fixed-no", max_workers=max_workers
            ),
            FixedAgent(
                fixed_answer=True, agent_name="fixed-yes", max_workers=max_workers
            ),
        ],
        cache_path=cache_path,
        only_cached=only_cached,
    )

    benchmarker.run_agents(
        enable_timing=False
    )  # Caching of search etc. can distort timings.
    md = benchmarker.generate_markdown_report()

    with open(output, "w") as f:
        logger.info(f"Writing benchmark report to: {output}")
        f.write(md)


if __name__ == "__main__":
    typer.run(main)
```
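For a quick local run, here is a minimal sketch of calling the benchmark entry point directly rather than through the Typer CLI; the parameter values below (5 markets, no cache) are illustrative, not defaults from this commit:

```python
# Illustrative usage sketch: invoke the benchmark entry point directly.
# Equivalent to running the module via Typer with the same options.
from prediction_market_agent.agents.crewai_subsequential_agent.benchmark import main
from prediction_market_agent_tooling.markets.agent_market import FilterBy, SortBy
from prediction_market_agent_tooling.markets.markets import MarketType

main(
    n=5,  # small sample for a smoke test; the commit's benchmark used 50
    output="./benchmark_report_5markets.md",
    reference=MarketType.MANIFOLD,
    filter=FilterBy.OPEN,
    sort=SortBy.NONE,
    max_workers=1,
    cache_path=None,  # skip the predictions cache for a fresh run
)
```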
prediction_market_agent/agents/crewai_subsequential_agent/crewai_agent_subquestions.py (189 additions & 0 deletions)
```python
import typing as t

from crewai import Agent, Crew, Process, Task
from langchain_core.language_models import BaseChatModel
from langchain_openai import ChatOpenAI
from loguru import logger
from pydantic import BaseModel

from prediction_market_agent.agents.crewai_subsequential_agent.prompts import (
    CREATE_OUTCOMES_FROM_SCENARIO_OUTPUT,
    CREATE_OUTCOMES_FROM_SCENARIO_PROMPT,
    FINAL_DECISION_PROMPT,
    PROBABILITY_CLASS_OUTPUT,
    PROBABILITY_FOR_ONE_OUTCOME_PROMPT,
    RESEARCH_OUTCOME_OUTPUT,
    RESEARCH_OUTCOME_PROMPT,
)
from prediction_market_agent.tools.crewai_tools import TavilyDevTool
from prediction_market_agent.utils import APIKeys

tavily_search = TavilyDevTool()


class Outcomes(BaseModel):
    outcomes: list[str]


class ProbabilityOutput(BaseModel):
    decision: str
    p_yes: float
    p_no: float
    confidence: float


class CrewAIAgentSubquestions:
    def __init__(self) -> None:
        llm = self._build_llm()
        self.researcher = Agent(
            role="Research Analyst",
            goal="Research and report on some future event, giving high quality and nuanced analysis",
            backstory="You are a senior research analyst who is adept at researching and reporting on future events.",
            verbose=True,
            allow_delegation=False,
            tools=[tavily_search],
            llm=llm,
        )

        self.predictor = Agent(
            role="Professional Gambler",
            goal="Predict, based on some research you are presented with, whether or not a given event will occur",
            backstory="You are a professional gambler who is adept at predicting and betting on the outcomes of future events.",
            verbose=True,
            allow_delegation=False,
            llm=llm,
        )

    def _build_llm(self) -> BaseChatModel:
        keys = APIKeys()
        llm = ChatOpenAI(
            model="gpt-3.5-turbo-0125",
            api_key=keys.openai_api_key.get_secret_value(),
        )
        return llm

    def split_research_into_outcomes(self, question: str) -> Outcomes:
        create_outcomes_task = Task(
            description=CREATE_OUTCOMES_FROM_SCENARIO_PROMPT,
            expected_output=CREATE_OUTCOMES_FROM_SCENARIO_OUTPUT,
            output_json=Outcomes,
            agent=self.researcher,
        )

        report_crew = Crew(
            agents=[self.researcher],
            tasks=[create_outcomes_task],
        )
        result = report_crew.kickoff(inputs={"scenario": question})
        return Outcomes.model_validate_json(result)

    def build_tasks_for_outcome(self, input_dict: dict[str, t.Any]) -> list[Task]:
        task_research_one_outcome = Task(
            description=RESEARCH_OUTCOME_PROMPT.format(**input_dict),
            agent=self.researcher,
            expected_output=RESEARCH_OUTCOME_OUTPUT,
            async_execution=True,
        )
        task_create_probability_for_one_outcome = Task(
            description=PROBABILITY_FOR_ONE_OUTCOME_PROMPT,
            expected_output=PROBABILITY_CLASS_OUTPUT,
            agent=self.predictor,
            output_json=ProbabilityOutput,
            async_execution=False,
            context=[task_research_one_outcome],
        )

        return [task_research_one_outcome, task_create_probability_for_one_outcome]

    def generate_prediction_for_one_outcome(self, sentence: str) -> ProbabilityOutput:
        task_research_one_outcome = Task(
            description=RESEARCH_OUTCOME_PROMPT,
            agent=self.researcher,
            expected_output=RESEARCH_OUTCOME_OUTPUT,
        )
        task_create_probability_for_one_outcome = Task(
            description=PROBABILITY_FOR_ONE_OUTCOME_PROMPT,
            expected_output=PROBABILITY_CLASS_OUTPUT,
            agent=self.predictor,
            output_json=ProbabilityOutput,
            context=[task_research_one_outcome],
        )
        crew = Crew(
            agents=[self.researcher, self.predictor],
            tasks=[task_research_one_outcome, task_create_probability_for_one_outcome],
            verbose=2,
            process=Process.sequential,
        )

        result = crew.kickoff(inputs={"sentence": sentence})
        return ProbabilityOutput.model_validate_json(result)

    def generate_final_decision(
        self, outcomes_with_probabilities: list[t.Tuple[str, ProbabilityOutput]]
    ) -> ProbabilityOutput:
        task_final_decision = Task(
            description=FINAL_DECISION_PROMPT,
            agent=self.predictor,
            expected_output=PROBABILITY_CLASS_OUTPUT,
            output_json=ProbabilityOutput,
        )

        crew = Crew(
            agents=[self.predictor],
            tasks=[task_final_decision],
            verbose=2,
        )

        crew.kickoff(
            inputs={
                "outcomes_with_probabilities": [
                    (i[0], i[1].model_dump()) for i in outcomes_with_probabilities
                ],
                "number_of_outcomes": len(outcomes_with_probabilities),
                "outcome_to_assess": outcomes_with_probabilities[0][0],
            }
        )
        return ProbabilityOutput.model_validate_json(
            task_final_decision.output.raw_output
        )

    def answer_binary_market(self, question: str) -> ProbabilityOutput:
        outcomes = self.split_research_into_outcomes(question)
        logger.debug(f"Generated outcomes: {outcomes}")

        outcomes_with_probs = []
        task_map = {}
        for outcome in outcomes.outcomes:
            tasks_for_outcome = self.build_tasks_for_outcome(
                input_dict={"sentence": outcome}
            )
            task_map[outcome] = tasks_for_outcome

        # Flatten the nested list of tasks.
        all_tasks = sum(task_map.values(), [])
        crew = Crew(
            agents=[self.researcher, self.predictor],
            tasks=all_tasks,
            verbose=2,
            process=Process.sequential,
        )

        # Note: crew.kickoff doesn't finish all async tasks when done.
        crew.kickoff()

        # Parse the individual task results to build outcomes_with_probs.
        for outcome, tasks in task_map.items():
            try:
                prediction_result = ProbabilityOutput.model_validate_json(
                    tasks[1].output.raw_output
                )
            except Exception as e:
                logger.error(f"Could not parse result as ProbabilityOutput: {e}")
                prediction_result = ProbabilityOutput(
                    p_yes=0.5, p_no=0.5, confidence=0, decision=""
                )

            outcomes_with_probs.append((outcome, prediction_result))

        final_answer = self.generate_final_decision(outcomes_with_probs)
        return final_answer
```
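End to end, the class above turns one binary question into several outcome scenarios, researches and scores each, then aggregates them into a final decision. A minimal usage sketch, assuming OpenAI and Tavily credentials are resolvable through `APIKeys`; the question is illustrative:

```python
# Illustrative usage sketch; assumes OpenAI and Tavily credentials
# are available via APIKeys.
from prediction_market_agent.agents.crewai_subsequential_agent.crewai_agent_subquestions import (
    CrewAIAgentSubquestions,
)

agent = CrewAIAgentSubquestions()
# Internally: split_research_into_outcomes -> build_tasks_for_outcome
# per outcome -> one sequential Crew run -> generate_final_decision.
answer = agent.answer_binary_market(
    "Will country X win more than 10 gold medals at the next Olympics?"
)
print(answer.decision, answer.p_yes, answer.confidence)
```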
prediction_market_agent/agents/crewai_subsequential_agent/deploy.py (53 additions & 0 deletions)
```python
import random
import typing as t

from prediction_market_agent_tooling.deploy.agent import DeployableAgent
from prediction_market_agent_tooling.markets.agent_market import AgentMarket
from prediction_market_agent_tooling.markets.markets import MarketType
from prediction_market_agent_tooling.tools.utils import should_not_happen

from prediction_market_agent.agents.crewai_subsequential_agent.crewai_agent_subquestions import (
    CrewAIAgentSubquestions,
)
from prediction_market_agent.agents.utils import market_is_saturated


class DeployableThinkThoroughlyAgent(DeployableAgent):
    # For cheaper credits at this experimental stage
    def __init__(self) -> None:
        super().__init__()

    def pick_markets(self, markets: t.Sequence[AgentMarket]) -> t.Sequence[AgentMarket]:
        # Pick up to 5 random, non-saturated markets to bet on.
        picked_markets: list[AgentMarket] = []
        markets = list(markets)
        random.shuffle(markets)
        for market in markets:
            # Assume very high probability markets are already known, and have
            # been correctly bet on, and therefore the value of betting on them
            # is low.
            if not market_is_saturated(market=market):
                picked_markets.append(market)
                if len(picked_markets) == 5:
                    break

        return picked_markets

    def answer_binary_market(self, market: AgentMarket) -> bool:
        # Run the subquestions agent on the market question and map its
        # "y"/"n" decision to a boolean answer.
        result = CrewAIAgentSubquestions().answer_binary_market(market.question)
        return (
            True
            if result.decision == "y"
            else False
            if result.decision == "n"
            else should_not_happen()
        )


if __name__ == "__main__":
    agent = DeployableThinkThoroughlyAgent()
    agent.deploy_local(
        market_type=MarketType.OMEN, sleep_time=540, timeout=180, place_bet=False
    )
```
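`market_is_saturated` is imported from `prediction_market_agent.agents.utils` and is not part of this diff. Going only by the comment in `pick_markets`, a plausible sketch is a symmetric threshold on the market's current `p_yes`; the cut-off value here is an assumption:

```python
# Hypothetical sketch of the imported helper; the real implementation
# lives in prediction_market_agent/agents/utils.py and may differ.
from prediction_market_agent_tooling.markets.agent_market import AgentMarket

SATURATION_THRESHOLD = 0.95  # assumed cut-off, not taken from this commit


def market_is_saturated(market: AgentMarket) -> bool:
    # Treat a market as "saturated" when the crowd probability is already
    # near 0 or 1, so a new bet adds little expected value.
    return (
        market.p_yes > SATURATION_THRESHOLD
        or market.p_yes < 1 - SATURATION_THRESHOLD
    )
```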