Skip to content

Commit

Permalink
Merge pull request #2 from evangriffiths/evan/more-metrics
Browse files Browse the repository at this point in the history
Add more default metrics to the benchmarker
  • Loading branch information
evangriffiths authored Feb 1, 2024
2 parents 80b784e + f6f97c0 commit b9a5e99
Show file tree
Hide file tree
Showing 2 changed files with 118 additions and 8 deletions.
57 changes: 52 additions & 5 deletions evo_researcher/benchmark/benchmark.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import argparse
import concurrent.futures
import json
import numpy as np
import os
import pandas as pd
import time
Expand All @@ -15,10 +15,11 @@
)
from evo_researcher.benchmark.utils import (
Market,
MarketSource,
Prediction,
PredictionsCache,
get_llm_api_call_cost,
get_manifold_markets,
get_markets,
)


Expand Down Expand Up @@ -47,10 +48,20 @@ def __init__(
predefined_metric_fns = {
"MSE for `p_yes`": self._compute_mse,
"Mean confidence": self._compute_mean_confidence,
"% within +-0.05": lambda predictions, markets: self._compute_percentage_within_range(
predictions, markets, tolerance=0.05
),
"% within +-0.1": lambda predictions, markets: self._compute_percentage_within_range(
predictions, markets, tolerance=0.1
),
"% within +-0.2": lambda predictions, markets: self._compute_percentage_within_range(
predictions, markets, tolerance=0.2
),
"% correct outcome": self._compute_correct_outcome_percentage,
"confidence/p_yes error correlation": self._compute_confidence_p_yes_error_correlation,
"Mean info_utility": self._compute_mean_info_utility,
"Mean cost ($)": self._compute_mean_cost,
"Mean time (s)": self._compute_mean_time,
# TODO add 'normalized' mse to take into account confidence?
}
self.metric_fns.update(predefined_metric_fns)

Expand Down Expand Up @@ -132,6 +143,36 @@ def _compute_mean_info_utility(
)
return mean_info_utility

def _compute_percentage_within_range(
    self,
    predictions: t.List[Prediction],
    markets: t.List[Market],
    tolerance: float = 0.05,
) -> float:
    """Return the percentage of predictions whose `p_yes` lies within
    `tolerance` of the corresponding market's `p_yes`.

    Args:
        predictions: Agent predictions, aligned index-wise with `markets`.
        markets: Reference markets supplying the ground-truth `p_yes`.
        tolerance: Maximum absolute `p_yes` error still counted as "within range".

    Returns:
        A percentage in [0, 100]. Returns 0.0 for empty input instead of
        raising ZeroDivisionError (the original divided by len(predictions)
        unconditionally).
    """
    if not predictions:
        return 0.0
    within_range_count = sum(
        1
        for p, m in zip(predictions, markets)
        if abs(p.p_yes - m.p_yes) <= tolerance
    )
    return (100 * within_range_count) / len(predictions)

def _compute_correct_outcome_percentage(
    self, predictions: t.List[Prediction], markets: t.List[Market]
) -> float:
    """Return the percentage of predictions that land on the same side of
    0.5 as the reference market.

    NOTE(review): a prediction or market with `p_yes` exactly 0.5 is counted
    as incorrect by both branches — preserved from the original logic.

    Returns:
        A percentage in [0, 100]. Returns 0.0 for empty input instead of
        raising ZeroDivisionError (the original divided by len(predictions)
        unconditionally).
    """
    if not predictions:
        return 0.0
    correct_outcome_count = sum(
        1
        for p, m in zip(predictions, markets)
        if (p.p_yes > 0.5 and m.p_yes > 0.5) or (p.p_yes < 0.5 and m.p_yes < 0.5)
    )
    return (100 * correct_outcome_count) / len(predictions)

def _compute_confidence_p_yes_error_correlation(
    self, predictions: t.List[Prediction], markets: t.List[Market]
):
    """Pearson correlation between each prediction's confidence and its
    absolute `p_yes` error against the reference market.

    A negative value suggests the agent is well calibrated: higher
    confidence coincides with lower error.
    """
    errors = []
    confidences = []
    for prediction, market in zip(predictions, markets):
        errors.append(abs(prediction.p_yes - market.p_yes))
        confidences.append(prediction.confidence)
    # corrcoef returns the 2x2 correlation matrix; the off-diagonal entry
    # is the correlation between the two series.
    return np.corrcoef(confidences, errors)[0, 1]

def _compute_mean_cost(
self, predictions: t.List[Prediction], markets: t.List[Market]
):
Expand Down Expand Up @@ -180,7 +221,7 @@ def get_markets_summary(self) -> t.Dict[str, t.List[str]]:
markets_summary[f"{model_type} p_yes"] = [
p.p_yes for p in self.predictions[model_type].values()
]
markets_summary["manifold p_yes"] = [m.p_yes for m in self.markets]
markets_summary[f"reference p_yes"] = [m.p_yes for m in self.markets]
return markets_summary

def generate_markdown_report(self):
Expand All @@ -200,10 +241,16 @@ def generate_markdown_report(self):
type=str,
default="./benchmark_report.md",
)
args.add_argument(
"--reference",
type=str,
choices=[ms.value for ms in MarketSource],
default="manifold",
)
args = args.parse_args()

benchmarker = Benchmarker(
markets=get_manifold_markets(number=3),
markets=get_markets(number=3, source=MarketSource(args.reference)),
agents=[
OlasAgent(model="gpt-3.5-turbo"), # TODO use same models!
EvoAgent(model="gpt-4-1106-preview"),
Expand Down
69 changes: 66 additions & 3 deletions evo_researcher/benchmark/utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,18 @@
from dotenv import load_dotenv
from enum import Enum
import os
import requests
import typing as t
from pydantic import BaseModel


class MarketSource(Enum):
    """Prediction-market platforms the benchmarker can pull markets from.

    The string values double as the CLI choices for the `--reference`
    argument.
    """

    MANIFOLD = "manifold"
    POLYMARKET = "polymarket"


class Market(BaseModel):
source: MarketSource
question: str
url: str
p_yes: float
Expand Down Expand Up @@ -53,19 +61,23 @@ def load(cls, markets: t.List[Market], path: str):
}


def get_manifold_markets(number: int = 100) -> t.List[Market]:
def get_manifold_markets(
number: int = 100, excluded_questions: t.List[str] = []
) -> t.List[Market]:
url = "https://api.manifold.markets/v0/search-markets"
params = {
"term": "",
"sort": "liquidity",
"filter": "open",
"limit": f"{number}",
"limit": f"{number + len(excluded_questions)}",
"contractType": "BINARY", # TODO support CATEGORICAL markets
}
response = requests.get(url, params=params)

response.raise_for_status()
markets_json = response.json()
for m in markets_json:
m["source"] = MarketSource.MANIFOLD

# Map JSON fields to Market fields
fields_map = {
Expand All @@ -78,10 +90,61 @@ def _map_fields(old: dict, mapping: dict) -> dict:

markets = [Market.parse_obj(_map_fields(m, fields_map)) for m in markets_json]
markets = [m for m in markets if not m.is_resolved]
assert len(markets) == number

# Filter out markets with excluded questions
markets = [m for m in markets if m.question not in excluded_questions]

return markets[:number]


def get_polymarket_markets(
    number: int = 100, excluded_questions: t.Optional[t.List[str]] = None
) -> t.List[Market]:
    """Fetch open binary markets from Polymarket.

    Args:
        number: Maximum number of markets to return (the API caps at 100).
        excluded_questions: Market questions to filter out of the results.

    Returns:
        At most `number` open binary markets.

    Raises:
        ValueError: If more than 100 markets are requested.
        requests.HTTPError: If the API call fails.
    """
    # Avoid the mutable-default-argument pitfall; [] default shared no state
    # here but is an accident waiting to happen.
    excluded_questions = excluded_questions if excluded_questions is not None else []
    if number > 100:
        raise ValueError("Polymarket API only returns 100 markets at a time")

    # Over-fetch so exclusions can be filtered out and `number` markets
    # (usually) remain. NOTE(review): with exclusions the effective limit can
    # exceed the API's cap of 100 — confirm the API's behaviour in that case.
    api_uri = f"https://strapi-matic.poly.market/markets?_limit={number + len(excluded_questions)}&active=true&closed=false"
    response = requests.get(api_uri)
    response.raise_for_status()  # fail loudly rather than parsing an error body
    markets: t.List[Market] = []
    for m_json in response.json():
        # Skip non-binary markets. Unfortunately no way to filter in the API call
        if m_json["outcomes"] != ["Yes", "No"]:
            continue

        if m_json["question"] in excluded_questions:
            print(f"Skipping market with 'excluded question': {m_json['question']}")
            continue

        markets.append(
            Market(
                question=m_json["question"],
                url=f"https://polymarket.com/event/{m_json['slug']}",
                p_yes=m_json["outcomePrices"][0],
                volume=m_json["volume"],
                is_resolved=False,
                source=MarketSource.POLYMARKET,
            )
        )
    # Truncate: over-fetching could otherwise return more than `number`
    # markets (get_manifold_markets slices the same way).
    return markets[:number]


def get_markets(
    number: int,
    source: MarketSource,
    excluded_questions: t.Optional[t.List[str]] = None,
) -> t.List[Market]:
    """Dispatch market fetching to the implementation for `source`.

    Args:
        number: Number of markets to fetch.
        source: Platform to pull markets from.
        excluded_questions: Market questions to filter out of the results.

    Returns:
        Markets from the requested platform.

    Raises:
        ValueError: If `source` is not a recognised MarketSource.
    """
    # Avoid the mutable-default-argument pitfall of `= []`.
    excluded_questions = excluded_questions if excluded_questions is not None else []
    if source == MarketSource.MANIFOLD:
        return get_manifold_markets(
            number=number, excluded_questions=excluded_questions
        )
    if source == MarketSource.POLYMARKET:
        return get_polymarket_markets(
            number=number, excluded_questions=excluded_questions
        )
    raise ValueError(f"Unknown market source: {source}")


def get_llm_api_call_cost(model: str, prompt_tokens: int, completion_tokens) -> float:
"""
In older versions of langchain, the cost calculation doesn't work for
Expand Down

0 comments on commit b9a5e99

Please sign in to comment.