Use inference endpoints as judge #237

Closed · wants to merge 2 commits
20 changes: 12 additions & 8 deletions src/lighteval/metrics/llm_as_judge.py
@@ -31,19 +31,21 @@
from lighteval.utils import NO_OPENAI_ERROR_MSG, is_openai_available


class JudgeOpenAI:
class JudgeEndpoint:
"""
A class representing a judge for evaluating answers using the OpenAI API.
A class representing a judge for evaluating answers using the OpenAI API or the Inference Endpoints API.

Args:
model (str): The name of the OpenAI model to use.
model (str): The name of the model to use.
url (str): The endpoint URL to use (None for OpenAI, the Inference Endpoints URL otherwise)
seed (int): The seed value for generating random responses.
temperature (float): The temperature value for controlling the randomness of the responses.
templates_path (str): The path to the JSON file containing the templates for prompts.
api_key (str): The API key to use to create/connect to the endpoint

Attributes:
client: An instance of the OpenAI client.
model (str): The name of the OpenAI model.
client: An instance of the endpoint client.
model (str): The name of the endpoint model.
seed (int): The seed value, passed to the API when generating responses.
temperature (float): The temperature value, passed to the API when generating responses.
templates (dict): A dictionary containing the templates for prompts.
@@ -63,15 +65,17 @@ class JudgeOpenAI:
def __init__(
self,
model: str,
url: str,
seed: int,
temperature: float,
templates_path: str,
openai_api_key: str,
api_key: str,
multi_turn: bool = False,
):
self.client = None # loaded lazily
self.openai_api_key = openai_api_key
self.api_key = api_key
self.model = model
self.url = url  # None for OpenAI, set to the Inference Endpoints URL otherwise
self.seed = seed
self.temperature = temperature
self.multi_turn = multi_turn
@@ -118,7 +122,7 @@ def evaluate_answer(

from openai import OpenAI

self.client = OpenAI(api_key=self.openai_api_key)
self.client = OpenAI(base_url=self.url, api_key=self.api_key)

prompts = [
self.__get_prompts_single_turn(
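For context, a minimal sketch (not part of the diff) of the client construction this change relies on: the same `openai` client serves both backends, with `base_url=None` for OpenAI and the Inference Endpoints URL otherwise. The model id and URL are taken from this PR; the helper name and the prompt are illustrative.

```python
# Sketch only: the OpenAI client pointed at an OpenAI-compatible endpoint.
import os
from typing import Optional

from openai import OpenAI


def build_judge_client(url: Optional[str], api_key: str) -> OpenAI:
    # url=None keeps the default OpenAI base URL; any other value points the
    # client at an OpenAI-compatible endpoint such as the HF serverless router.
    return OpenAI(base_url=url, api_key=api_key)


# Inference Endpoints judge (assumes HF_TOKEN is set in the environment).
hf_client = build_judge_client(
    "https://api-inference.huggingface.co/v1/", os.environ["HF_TOKEN"]
)
response = hf_client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-405B-Instruct-FP8",
    messages=[{"role": "user", "content": "Rate the answer from 1 to 10: ..."}],
    seed=42,
    temperature=0.0,
)
print(response.choices[0].message.content)
```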
17 changes: 16 additions & 1 deletion src/lighteval/metrics/metrics.py
@@ -228,7 +228,7 @@ class Metrics(Enum):
corpus_level_fn=np.mean,
higher_is_better=True,
)
llm_judge_multi_turn_openai = SampleLevelMetricGrouping(
llm_judge_multi_turn_gpt3p5 = SampleLevelMetricGrouping(
metric_name=["single_turn", "multi_turn"],
higher_is_better=True,
category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN,
@@ -243,6 +243,21 @@ class Metrics(Enum):
"multi_turn": np.mean,
},
)
llm_judge_multi_turn_llama3_405 = SampleLevelMetricGrouping(
metric_name=["single_turn", "multi_turn"],
higher_is_better=True,
category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN,
use_case=MetricUseCase.SUMMARIZATION,
sample_level_fn=JudgeLLM(
judge_model_name="meta-llama/Meta-Llama-3.1-405B-Instruct-FP8",
template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
multi_turn=True,
).compute,
corpus_level_fn={
"single_turn": np.mean,
"multi_turn": np.mean,
},
)
llm_judge_openai = SampleLevelMetricGrouping(
metric_name=["judge_score"],
higher_is_better=True,
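By analogy, any judge served behind an OpenAI-compatible endpoint can be registered the same way as the Llama 3.1 405B metric above. A hedged sketch of such an enum member, assuming the same module-level imports as metrics.py; the Mixtral model id is a hypothetical example, not something this PR adds:

```python
# Sketch only: an extra Metrics enum member mirroring llm_judge_multi_turn_llama3_405.
# Assumes metrics.py's existing imports (os, np, SampleLevelMetricGrouping,
# MetricCategory, MetricUseCase, JudgeLLM). The judge model id is hypothetical.
llm_judge_multi_turn_mixtral = SampleLevelMetricGrouping(
    metric_name=["single_turn", "multi_turn"],
    higher_is_better=True,
    category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN,
    use_case=MetricUseCase.SUMMARIZATION,
    sample_level_fn=JudgeLLM(
        judge_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",  # hypothetical example
        template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
        multi_turn=True,
    ).compute,
    corpus_level_fn={
        "single_turn": np.mean,
        "multi_turn": np.mean,
    },
)
```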
19 changes: 11 additions & 8 deletions src/lighteval/metrics/metrics_sample.py
@@ -40,7 +40,7 @@
from lighteval.metrics.imports.bert_scorer import BERTScorer
from lighteval.metrics.imports.data_stats_metric import DataStatsMetric
from lighteval.metrics.imports.summac import SummaCZS
from lighteval.metrics.llm_as_judge import JudgeOpenAI
from lighteval.metrics.llm_as_judge import JudgeEndpoint
from lighteval.metrics.normalizations import remove_braces, remove_braces_and_strip
from lighteval.tasks.requests import Doc
from lighteval.utils import as_list
@@ -622,21 +622,24 @@ def edit_similarity(self, s1, s2):


class JudgeLLM:
available_models = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"]
available_models_openai = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"]

def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool = False):
if judge_model_name not in self.available_models:
raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric")
if judge_model_name in self.available_models_openai:
API_KEY = os.getenv("OPENAI_API_KEY")
url = None
else:
API_KEY = os.getenv("HF_TOKEN")
url = "https://api-inference.huggingface.co/v1/"

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
self.multi_turn = multi_turn

self.judge = JudgeOpenAI(
self.judge = JudgeEndpoint(
model=judge_model_name,
url=url,
seed=42,
temperature=0.0,
templates_path=template_path,
openai_api_key=OPENAI_API_KEY,
api_key=API_KEY,
multi_turn=multi_turn,
)

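The routing rule in JudgeLLM.__init__ above can be read as a small standalone helper. A minimal sketch (the function name is illustrative; the model list, URL, and environment variables come from the diff):

```python
# Sketch of the backend selection performed in JudgeLLM.__init__ above:
# OpenAI model names use OPENAI_API_KEY with the default base URL (url=None),
# anything else is routed to the HF serverless endpoint with HF_TOKEN.
import os
from typing import Optional, Tuple

OPENAI_JUDGES = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"]


def resolve_judge_backend(judge_model_name: str) -> Tuple[Optional[str], Optional[str]]:
    """Return (url, api_key); url is None for OpenAI, the HF router otherwise."""
    if judge_model_name in OPENAI_JUDGES:
        return None, os.getenv("OPENAI_API_KEY")
    return "https://api-inference.huggingface.co/v1/", os.getenv("HF_TOKEN")


# The Llama 3.1 405B judge added in this PR resolves to the HF endpoint:
print(resolve_judge_backend("meta-llama/Meta-Llama-3.1-405B-Instruct-FP8"))
```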
2 changes: 1 addition & 1 deletion src/lighteval/tasks/extended/mt_bench/main.py
@@ -55,7 +55,7 @@ def mt_bench_prompt(line, task_name: str = None):
evaluation_splits=["train"],
few_shots_split="",
few_shots_select="random",
metric=["llm_judge_multi_turn_openai"],
metric=["llm_judge_multi_turn_gpt3p5", "llm_judge_multi_turn_llama3_405"],
generation_size=1024,
stop_sequence=[],
)
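Since mt_bench now runs both judge metrics, both backends need credentials at evaluation time. A small pre-flight check, a sketch rather than part of the PR:

```python
# Sketch: verify both judge backends are usable before launching mt_bench,
# since the metric list now includes an OpenAI judge and an HF-hosted judge.
import os

missing = [name for name in ("OPENAI_API_KEY", "HF_TOKEN") if not os.getenv(name)]
if missing:
    raise RuntimeError(f"mt_bench judge metrics require these env vars: {missing}")
```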