Add mt-bench (#75)
What this PR does:
- Uses custom metrics and tasks to add LLM-as-a-judge evaluation
- Adds multi-turn generation
- Adds the mt-bench metric

This implementation uses the mt-bench prompts from [InflectionAI](https://github.com/InflectionAI/Inflection-Benchmarks). The code is inspired by the original mt-bench implementation, with notable differences:
- mt-bench uses a custom-made chat templating system; we use the tokenizer's chat template.
- mt-bench uses an old version of the OpenAI API; we use the newest one, with much simpler chat prompt formatting logic (see the sketch after this list). More judge models can easily be added.
- We do not vary the temperature depending on the sample being evaluated. All samples are generated with `do_sample=False` and a temperature of `0.0`.
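
For reference, here is a minimal sketch of what a judge request looks like with the current OpenAI client. The judge model name, system instruction, and prompt wording are illustrative placeholders, not the exact code added in this PR:

```python
# Illustrative only: model name, prompt and parsing are placeholders,
# not the exact implementation added in this PR.
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment


def judge_answer(question: str, answer: str, judge_model: str = "gpt-4") -> str:
    """Ask the judge model to rate a single answer and return its raw verdict."""
    response = client.chat.completions.create(
        model=judge_model,
        messages=[
            {"role": "system", "content": "You are an impartial judge. Rate the answer from 1 to 10."},
            {"role": "user", "content": f"[Question]\n{question}\n\n[Answer]\n{answer}"},
        ],
        temperature=0.0,
    )
    return response.choices[0].message.content
```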
NathanHB authored Mar 29, 2024
1 parent bbe3b5f commit af24080
Showing 16 changed files with 716 additions and 29 deletions.
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -88,7 +88,8 @@ quality = ["ruff==v0.2.2","pre-commit"]
tests = ["pytest==7.4.0"]
dev = ["lighteval[accelerate,quality,tests]"]
extended_tasks = [
"langdetect", #ifeval
"langdetect", # ifeval
"openai", # mt-bench
]

[project.urls]
18 changes: 17 additions & 1 deletion src/lighteval/evaluator.py
@@ -88,6 +88,8 @@ def evaluate( # noqa: C901
full_resps = lm.greedy_until_with_logits(requests, override_bs=override_bs)
elif request_type == RequestType.LOGLIKELIHOOD_ROLLING:
full_resps = lm.loglikelihood_rolling(requests, override_bs=override_bs)
elif request_type == RequestType.GREEDY_UNTIL_MULTI_TURN:
full_resps = lm.greedy_until_multi_turn(requests, override_bs=override_bs)
else:
raise NotImplementedError(f"Request type {request_type} not supported")

@@ -115,8 +117,22 @@ def evaluate( # noqa: C901
# using a deep copy here because process results pops from the model responses
metrics = task.process_results(doc, copy.deepcopy(model_responses))

# Remove the user_prompt and judgement from the metrics when using an llm-as-judge metric
if "user_prompt" in metrics:
user_prompt = metrics["user_prompt"]
del metrics["user_prompt"]
else:
user_prompt = None
if "judgement" in metrics:
judgement = metrics["judgement"]
del metrics["judgement"]
else:
judgement = None

evaluation_tracker.metrics_logger.log(task_example_id.task_name, metrics)
evaluation_tracker.details_logger.log(task_example_id.task_name, task, doc, model_responses, metrics)
evaluation_tracker.details_logger.log(
task_example_id.task_name, task, doc, model_responses, metrics, (user_prompt, judgement)
)

return evaluation_tracker

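The popping above assumes that an LLM-as-judge metric returns the judge prompt and judgement alongside its scores. A hypothetical per-sample output illustrating that contract (only the `user_prompt` and `judgement` key names come from the diff; the score key and values are invented):

```python
# Hypothetical per-sample output of an LLM-as-judge metric's compute() call.
# Only the "user_prompt" and "judgement" key names are taken from the code above;
# the score key and the values are invented for illustration.
metrics = {
    "judge_score": 8.0,
    "user_prompt": "[Question]\nWrite a short poem about the sea.\n[Answer]\n...",
    "judgement": "The answer is relevant and well written. Rating: [[8]]",
}

# Equivalent to the if/else blocks in evaluate(): strip the judge artifacts so only
# numeric scores reach the metrics logger, and forward them to the details logger.
user_prompt = metrics.pop("user_prompt", None)
judgement = metrics.pop("judgement", None)
```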
42 changes: 41 additions & 1 deletion src/lighteval/few_shot_manager.py
@@ -27,7 +27,7 @@
from itertools import cycle
from typing import TYPE_CHECKING, Optional

from transformers import AutoTokenizer
from transformers import AutoTokenizer, PreTrainedTokenizer

from lighteval.logging.hierarchical_logger import hlog_warn
from lighteval.tasks.requests import Doc
@@ -219,6 +219,46 @@ def get_examples(
)
return instruction + labeled_examples + example

def create_multi_turn_contexts(
self, doc: Doc, use_chat_template: bool, system_prompt: Optional[str], tokenizer: PreTrainedTokenizer
) -> tuple[list[str], int]:
"""Creates one context per turn for a multi-turn task.
Multi-turn tasks require chat templating.
Args:
doc (Doc): Formatted document.
use_chat_template (bool): Whether to use the chat template. Must be True for multi-turn tasks.
system_prompt (Optional[str]): The system prompt to use, if any.
tokenizer (PreTrainedTokenizer): The tokenizer used to apply the chat template.
Raises:
ValueError: If use_chat_template is False.
Returns:
tuple[list[str], int]: the contexts for every turn, and 0 (no few-shot examples are added).
"""
if not use_chat_template:
raise ValueError("You need to use the chat template to create multi turn contexts")

role_content_list = []
if system_prompt is not None:
role_content_list.append({"role": "system", "content": system_prompt})

for i in doc.specific["multi_turn_queries"]:
role_content_list.append({"role": "user", "content": i})
role_content_list.append({"role": "assistant", "content": "{model_response}"})
role_content_list.pop(-1)

contexts = []
offset = 2 if system_prompt is not None else 1
for i in range(0, len(role_content_list), offset + 1):
c = tokenizer.apply_chat_template(
role_content_list[: i + offset], add_generation_prompt=True, tokenize=False, add_special_tokens=False
)
contexts.append(c)

return contexts, 0

def fewshot_context(
self,
task: "LightevalTask",
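For intuition, here is a minimal sketch of the context layout `create_multi_turn_contexts` produces for a two-turn sample. The tokenizer and questions are arbitrary placeholders; the `{model_response}` placeholder is later replaced by the model's first answer:

```python
# Illustrative sketch of the per-turn contexts for a two-turn sample.
# The tokenizer and questions are arbitrary; any tokenizer with a chat template works.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

turns = ["Write a short poem about the sea.", "Now rewrite it as a haiku."]

# Same structure the method assembles: user turn, assistant placeholder, user turn.
conversation = [
    {"role": "user", "content": turns[0]},
    {"role": "assistant", "content": "{model_response}"},
    {"role": "user", "content": turns[1]},
]

# Context for turn 1: only the first user message, ready for generation.
ctx_turn_1 = tokenizer.apply_chat_template(conversation[:1], add_generation_prompt=True, tokenize=False)

# Context for turn 2: both user messages plus the placeholder, which is replaced
# by the model's actual first answer before the second generation.
ctx_turn_2 = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
```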
24 changes: 21 additions & 3 deletions src/lighteval/logging/info_loggers.py
@@ -24,7 +24,7 @@
import os
import time
from dataclasses import asdict, dataclass, field
from typing import Union
from typing import Optional, Union

import git
import numpy as np
Expand Down Expand Up @@ -205,6 +205,9 @@ class Detail:
choices: list = field(default_factory=list)
gold_index: list = field(default_factory=list)
metrics: dict = field(default_factory=dict)
judement_prompt: str = None
judgement: str = None
specifics: dict = field(default_factory=dict)

@dataclass
class CompiledDetail:
@@ -302,7 +305,15 @@ class CompiledHash:
compiled_details: dict[str, CompiledDetail] = collections.defaultdict(CompiledDetail)
compiled_details_over_all_tasks: CompiledDetailOverAllTasks = CompiledDetailOverAllTasks()

def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[ModelReturn], metrics: dict) -> None:
def log(
self,
task_name: str,
task: LightevalTask,
doc: Doc,
outputs: list[ModelReturn],
metrics: dict,
llm_as_prompt_judgement: Optional[tuple[str, str]] = None,
) -> None:
"""Stores the relevant information for one sample of one task to the total list of samples stored in the DetailsLogger.
Args:
@@ -311,6 +322,8 @@ def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[Model
doc (Doc): Current sample that we want to store.
outputs (list[ModelReturn]): Model outputs for the current sample
metrics (dict): Model scores for said sample on the current task's metrics.
llm_as_prompt_judgement (Optional[tuple[str, str]]): Tuple containing the
prompt passed to the judge and the judgement for the current sample, when using an llm-as-judge metric.
"""
detail = self.Detail()
detail.example = doc.query
@@ -354,6 +367,11 @@ def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[Model
detail.choices = doc.choices
detail.gold_index = as_list(doc.gold_index)
pred_saved = True
if task.has_metric_category[MetricCategory.GENERATIVE_MULTI_TURN]:
pred_saved = True
detail.judement_prompt = llm_as_prompt_judgement[0]
detail.judgement = llm_as_prompt_judgement[1]
detail.specifics = doc.specific
if not pred_saved:
raise NotImplementedError(
"No metric prediction saved."
@@ -364,7 +382,7 @@ def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[Model

hash = self.Hash()
hash.example = xxhash.xxh64(doc.query).hexdigest()
hash.full_prompt = xxhash.xxh64(doc.ctx).hexdigest()
hash.full_prompt = xxhash.xxh64(str(doc.ctx)).hexdigest()
hash.input_tokens = xxhash.xxh64(str([o.input_tokens for o in outputs])).hexdigest()
hash.cont_tokens = xxhash.xxh64(str([o.generated_tokens for o in outputs])).hexdigest()
self.hashes[task_name].append(hash)
11 changes: 11 additions & 0 deletions src/lighteval/metrics/__init__.py
@@ -146,3 +146,14 @@ def apply_multichoice_metric_one_token(results: list[ModelReturn], formatted_doc
)

return results, outputs


def apply_generative_multi_turn_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[str]):
outputs = {}
predictions = results.pop(0).result

for metric in metrics:
if Metrics[metric].value.category == MetricCategory.GENERATIVE_MULTI_TURN:
outputs.update(Metrics[metric].value.compute(predictions=predictions, formatted_doc=formatted_doc))

return results, outputs
1 change: 1 addition & 0 deletions src/lighteval/metrics/utils.py
@@ -28,6 +28,7 @@ class MetricCategory(Enum):
TARGET_PERPLEXITY = auto()
PERPLEXITY = auto()
GENERATIVE = auto()
GENERATIVE_MULTI_TURN = auto()
GENERATIVE_LOGPROB = auto()
MULTICHOICE = auto()
MULTICHOICE_ONE_TOKEN = auto()
14 changes: 13 additions & 1 deletion src/lighteval/models/abstract_model.py
@@ -27,8 +27,14 @@
from transformers import BatchEncoding

from lighteval.models.model_config import EnvConfig
from lighteval.models.model_output import GenerateReturn, LoglikelihoodReturn, LoglikelihoodSingleTokenReturn
from lighteval.models.model_output import (
GenerateMultiTurnReturn,
GenerateReturn,
LoglikelihoodReturn,
LoglikelihoodSingleTokenReturn,
)
from lighteval.tasks.requests import (
GreedyUntilMultiTurnRequest,
GreedyUntilRequest,
GreedyUntilWithLogitsRequest,
LoglikelihoodRequest,
@@ -102,6 +108,12 @@ def greedy_until_with_logits(
returns_logits=True,
)

def greedy_until_multi_turn( # noqa: C901
self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None
) -> GenerateMultiTurnReturn:
"""Generates responses using a greedy decoding strategy until certain ending conditions are met."""
return NotImplemented

@abstractmethod
def greedy_until(
self,
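The abstract hook above only declares the interface; a concrete backend is expected to run the turns sequentially, substituting earlier answers into the `{model_response}` placeholders produced by `create_multi_turn_contexts`. A rough sketch of that loop, with `generate_one` standing in for whatever single-prompt greedy generation the backend already provides (not the actual model implementation):

```python
# Rough sketch of the multi-turn greedy loop; `generate_one` stands in for the
# backend's existing single-prompt greedy generation (do_sample=False, temperature 0.0).
from typing import Callable


def greedy_until_multi_turn_sketch(contexts: list[str], generate_one: Callable[[str], str]) -> list[str]:
    responses: list[str] = []
    for context in contexts:
        # Fill the "{model_response}" placeholders with the answers from earlier turns.
        prompt = context
        for previous_answer in responses:
            prompt = prompt.replace("{model_response}", previous_answer, 1)
        responses.append(generate_one(prompt))
    return responses
```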
