Skip to content

Commit

Permalink
Merge pull request #2 from evangriffiths/evan/more-metrics
Browse files Browse the repository at this point in the history
Add more default metrics to the benchmarker
  • Loading branch information
evangriffiths authored Feb 1, 2024
2 parents 80b784e + f6f97c0 commit b9a5e99
Show file tree
Hide file tree
Showing 2 changed files with 118 additions and 8 deletions.
57 changes: 52 additions & 5 deletions evo_researcher/benchmark/benchmark.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import argparse
import concurrent.futures
import json
import numpy as np
import os
import pandas as pd
import time
Expand All @@ -15,10 +15,11 @@
)
from evo_researcher.benchmark.utils import (
Market,
MarketSource,
Prediction,
PredictionsCache,
get_llm_api_call_cost,
get_manifold_markets,
get_markets,
)


Expand Down Expand Up @@ -47,10 +48,20 @@ def __init__(
predefined_metric_fns = {
"MSE for `p_yes`": self._compute_mse,
"Mean confidence": self._compute_mean_confidence,
"% within +-0.05": lambda predictions, markets: self._compute_percentage_within_range(
predictions, markets, tolerance=0.05
),
"% within +-0.1": lambda predictions, markets: self._compute_percentage_within_range(
predictions, markets, tolerance=0.1
),
"% within +-0.2": lambda predictions, markets: self._compute_percentage_within_range(
predictions, markets, tolerance=0.2
),
"% correct outcome": self._compute_correct_outcome_percentage,
"confidence/p_yes error correlation": self._compute_confidence_p_yes_error_correlation,
"Mean info_utility": self._compute_mean_info_utility,
"Mean cost ($)": self._compute_mean_cost,
"Mean time (s)": self._compute_mean_time,
# TODO add 'normalized' mse to take into account confidence?
}
self.metric_fns.update(predefined_metric_fns)

Expand Down Expand Up @@ -132,6 +143,36 @@ def _compute_mean_info_utility(
)
return mean_info_utility

def _compute_percentage_within_range(
    self,
    predictions: t.List[Prediction],
    markets: t.List[Market],
    tolerance: float = 0.05,
) -> float:
    """Return the percentage of predictions whose `p_yes` lies within
    `tolerance` of the corresponding market's `p_yes`.

    Args:
        predictions: Agent predictions, aligned index-wise with `markets`.
        markets: Reference markets supplying the ground-truth `p_yes`.
        tolerance: Maximum absolute `p_yes` error still counted as "within range".

    Returns:
        A percentage in [0, 100]. Returns 0.0 for empty input instead of
        raising ZeroDivisionError (the original divided by len(predictions)
        unconditionally).
    """
    if not predictions:
        return 0.0
    within_range_count = sum(
        1
        for p, m in zip(predictions, markets)
        if abs(p.p_yes - m.p_yes) <= tolerance
    )
    return (100 * within_range_count) / len(predictions)

def _compute_correct_outcome_percentage(
    self, predictions: t.List[Prediction], markets: t.List[Market]
) -> float:
    """Return the percentage of predictions that land on the same side of
    0.5 as the reference market.

    NOTE(review): a prediction or market with `p_yes` exactly 0.5 is counted
    as incorrect by both branches — preserved from the original logic.

    Returns:
        A percentage in [0, 100]. Returns 0.0 for empty input instead of
        raising ZeroDivisionError (the original divided by len(predictions)
        unconditionally).
    """
    if not predictions:
        return 0.0
    correct_outcome_count = sum(
        1
        for p, m in zip(predictions, markets)
        if (p.p_yes > 0.5 and m.p_yes > 0.5) or (p.p_yes < 0.5 and m.p_yes < 0.5)
    )
    return (100 * correct_outcome_count) / len(predictions)

def _compute_confidence_p_yes_error_correlation(
    self, predictions: t.List[Prediction], markets: t.List[Market]
):
    """Pearson correlation between each prediction's confidence and its
    absolute `p_yes` error against the reference market.

    A negative value suggests the agent is well calibrated: higher
    confidence coincides with lower error.
    """
    errors = []
    confidences = []
    for prediction, market in zip(predictions, markets):
        errors.append(abs(prediction.p_yes - market.p_yes))
        confidences.append(prediction.confidence)
    # corrcoef returns the 2x2 correlation matrix; the off-diagonal entry
    # is the correlation between the two series.
    return np.corrcoef(confidences, errors)[0, 1]

def _compute_mean_cost(
self, predictions: t.List[Prediction], markets: t.List[Market]
):
Expand Down Expand Up @@ -180,7 +221,7 @@ def get_markets_summary(self) -> t.Dict[str, t.List[str]]:
markets_summary[f"{model_type} p_yes"] = [
p.p_yes for p in self.predictions[model_type].values()
]
markets_summary["manifold p_yes"] = [m.p_yes for m in self.markets]
markets_summary[f"reference p_yes"] = [m.p_yes for m in self.markets]
return markets_summary

def generate_markdown_report(self):
Expand All @@ -200,10 +241,16 @@ def generate_markdown_report(self):
type=str,
default="./benchmark_report.md",
)
args.add_argument(
"--reference",
type=str,
choices=[ms.value for ms in MarketSource],
default="manifold",
)
args = args.parse_args()

benchmarker = Benchmarker(
markets=get_manifold_markets(number=3),
markets=get_markets(number=3, source=MarketSource(args.reference)),
agents=[
OlasAgent(model="gpt-3.5-turbo"), # TODO use same models!
EvoAgent(model="gpt-4-1106-preview"),
Expand Down
69 changes: 66 additions & 3 deletions evo_researcher/benchmark/utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,18 @@
from dotenv import load_dotenv
from enum import Enum
import os
import requests
import typing as t
from pydantic import BaseModel


class MarketSource(Enum):
    """Prediction-market platforms the benchmarker can pull markets from.

    The string values double as the CLI choices for the `--reference`
    argument.
    """

    MANIFOLD = "manifold"
    POLYMARKET = "polymarket"


class Market(BaseModel):
source: MarketSource
question: str
url: str
p_yes: float
Expand Down Expand Up @@ -53,19 +61,23 @@ def load(cls, markets: t.List[Market], path: str):
}


def get_manifold_markets(number: int = 100) -> t.List[Market]:
def get_manifold_markets(
number: int = 100, excluded_questions: t.List[str] = []
) -> t.List[Market]:
url = "https://api.manifold.markets/v0/search-markets"
params = {
"term": "",
"sort": "liquidity",
"filter": "open",
"limit": f"{number}",
"limit": f"{number + len(excluded_questions)}",
"contractType": "BINARY", # TODO support CATEGORICAL markets
}
response = requests.get(url, params=params)

response.raise_for_status()
markets_json = response.json()
for m in markets_json:
m["source"] = MarketSource.MANIFOLD

# Map JSON fields to Market fields
fields_map = {
Expand All @@ -78,10 +90,61 @@ def _map_fields(old: dict, mapping: dict) -> dict:

markets = [Market.parse_obj(_map_fields(m, fields_map)) for m in markets_json]
markets = [m for m in markets if not m.is_resolved]
assert len(markets) == number

# Filter out markets with excluded questions
markets = [m for m in markets if m.question not in excluded_questions]

return markets[:number]


def get_polymarket_markets(
    number: int = 100, excluded_questions: t.Optional[t.List[str]] = None
) -> t.List[Market]:
    """Fetch open binary markets from Polymarket.

    Args:
        number: Maximum number of markets to return (the API caps at 100).
        excluded_questions: Market questions to filter out of the results.

    Returns:
        At most `number` open binary markets.

    Raises:
        ValueError: If more than 100 markets are requested.
        requests.HTTPError: If the API call fails.
    """
    # Avoid the mutable-default-argument pitfall; [] default shared no state
    # here but is an accident waiting to happen.
    excluded_questions = excluded_questions if excluded_questions is not None else []
    if number > 100:
        raise ValueError("Polymarket API only returns 100 markets at a time")

    # Over-fetch so exclusions can be filtered out and `number` markets
    # (usually) remain. NOTE(review): with exclusions the effective limit can
    # exceed the API's cap of 100 — confirm the API's behaviour in that case.
    api_uri = f"https://strapi-matic.poly.market/markets?_limit={number + len(excluded_questions)}&active=true&closed=false"
    response = requests.get(api_uri)
    response.raise_for_status()  # fail loudly rather than parsing an error body
    markets: t.List[Market] = []
    for m_json in response.json():
        # Skip non-binary markets. Unfortunately no way to filter in the API call
        if m_json["outcomes"] != ["Yes", "No"]:
            continue

        if m_json["question"] in excluded_questions:
            print(f"Skipping market with 'excluded question': {m_json['question']}")
            continue

        markets.append(
            Market(
                question=m_json["question"],
                url=f"https://polymarket.com/event/{m_json['slug']}",
                p_yes=m_json["outcomePrices"][0],
                volume=m_json["volume"],
                is_resolved=False,
                source=MarketSource.POLYMARKET,
            )
        )
    # Truncate: over-fetching could otherwise return more than `number`
    # markets (get_manifold_markets slices the same way).
    return markets[:number]


def get_markets(
    number: int,
    source: MarketSource,
    excluded_questions: t.Optional[t.List[str]] = None,
) -> t.List[Market]:
    """Dispatch market fetching to the implementation for `source`.

    Args:
        number: Number of markets to fetch.
        source: Platform to pull markets from.
        excluded_questions: Market questions to filter out of the results.

    Returns:
        Markets from the requested platform.

    Raises:
        ValueError: If `source` is not a recognised MarketSource.
    """
    # Avoid the mutable-default-argument pitfall of `= []`.
    excluded_questions = excluded_questions if excluded_questions is not None else []
    if source == MarketSource.MANIFOLD:
        return get_manifold_markets(
            number=number, excluded_questions=excluded_questions
        )
    if source == MarketSource.POLYMARKET:
        return get_polymarket_markets(
            number=number, excluded_questions=excluded_questions
        )
    raise ValueError(f"Unknown market source: {source}")


def get_llm_api_call_cost(model: str, prompt_tokens: int, completion_tokens) -> float:
"""
In older versions of langchain, the cost calculation doesn't work for
Expand Down

0 comments on commit b9a5e99

Please sign in to comment.