Add mt-bench (#75)
What this PR does:
- Uses custom metrics and tasks to add LLM-as-a-judge evaluation
- Adds multi-turn generation
- Adds the mt-bench metric

This implementation uses the mt-bench prompts from [InflectionAI](https://github.com/InflectionAI/Inflection-Benchmarks). The code is inspired by the original mt-bench implementation, with notable differences:
- mt-bench uses a custom-made chat templating system; we use the tokenizer's chat template.
- mt-bench uses an old version of the OpenAI API; we use the newest one, with much simpler chat prompt formatting logic (see the sketch after this list). More judge models can easily be added.
- We do not vary the temperature depending on the sample being evaluated. All samples are generated with `do_sample=False` and a temperature of `0.0`.
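
For reference, here is a minimal sketch of what a judge request looks like with the current OpenAI client. The judge model name, system instruction, and prompt wording are illustrative placeholders, not the exact code added in this PR:

```python
# Illustrative only: model name, prompt and parsing are placeholders,
# not the exact implementation added in this PR.
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment


def judge_answer(question: str, answer: str, judge_model: str = "gpt-4") -> str:
    """Ask the judge model to rate a single answer and return its raw verdict."""
    response = client.chat.completions.create(
        model=judge_model,
        messages=[
            {"role": "system", "content": "You are an impartial judge. Rate the answer from 1 to 10."},
            {"role": "user", "content": f"[Question]\n{question}\n\n[Answer]\n{answer}"},
        ],
        temperature=0.0,
    )
    return response.choices[0].message.content
```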
NathanHB authored Mar 29, 2024
1 parent bbe3b5f commit af24080
Showing 16 changed files with 716 additions and 29 deletions.
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -88,7 +88,8 @@ quality = ["ruff==v0.2.2","pre-commit"]
tests = ["pytest==7.4.0"]
dev = ["lighteval[accelerate,quality,tests]"]
extended_tasks = [
"langdetect", #ifeval
"langdetect", # ifeval
"openai", # mt-bench
]

[project.urls]
18 changes: 17 additions & 1 deletion src/lighteval/evaluator.py
@@ -88,6 +88,8 @@ def evaluate( # noqa: C901
full_resps = lm.greedy_until_with_logits(requests, override_bs=override_bs)
elif request_type == RequestType.LOGLIKELIHOOD_ROLLING:
full_resps = lm.loglikelihood_rolling(requests, override_bs=override_bs)
elif request_type == RequestType.GREEDY_UNTIL_MULTI_TURN:
full_resps = lm.greedy_until_multi_turn(requests, override_bs=override_bs)
else:
raise NotImplementedError(f"Request type {request_type} not supported")

@@ -115,8 +117,22 @@ def evaluate( # noqa: C901
# using a deep copy here because process results pops from the model responses
metrics = task.process_results(doc, copy.deepcopy(model_responses))

# Remove the user_prompt and judgement from the metrics when using an llm-as-judge metric
if "user_prompt" in metrics:
user_prompt = metrics["user_prompt"]
del metrics["user_prompt"]
else:
user_prompt = None
if "judgement" in metrics:
judgement = metrics["judgement"]
del metrics["judgement"]
else:
judgement = None

evaluation_tracker.metrics_logger.log(task_example_id.task_name, metrics)
evaluation_tracker.details_logger.log(task_example_id.task_name, task, doc, model_responses, metrics)
evaluation_tracker.details_logger.log(
task_example_id.task_name, task, doc, model_responses, metrics, (user_prompt, judgement)
)

return evaluation_tracker

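The popping above assumes that an LLM-as-judge metric returns the judge prompt and judgement alongside its scores. A hypothetical per-sample output illustrating that contract (only the `user_prompt` and `judgement` key names come from the diff; the score key and values are invented):

```python
# Hypothetical per-sample output of an LLM-as-judge metric's compute() call.
# Only the "user_prompt" and "judgement" key names are taken from the code above;
# the score key and the values are invented for illustration.
metrics = {
    "judge_score": 8.0,
    "user_prompt": "[Question]\nWrite a short poem about the sea.\n[Answer]\n...",
    "judgement": "The answer is relevant and well written. Rating: [[8]]",
}

# Equivalent to the if/else blocks in evaluate(): strip the judge artifacts so only
# numeric scores reach the metrics logger, and forward them to the details logger.
user_prompt = metrics.pop("user_prompt", None)
judgement = metrics.pop("judgement", None)
```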
42 changes: 41 additions & 1 deletion src/lighteval/few_shot_manager.py
@@ -27,7 +27,7 @@
from itertools import cycle
from typing import TYPE_CHECKING, Optional

from transformers import AutoTokenizer
from transformers import AutoTokenizer, PreTrainedTokenizer

from lighteval.logging.hierarchical_logger import hlog_warn
from lighteval.tasks.requests import Doc
@@ -219,6 +219,46 @@ def get_examples(
)
return instruction + labeled_examples + example

def create_multi_turn_contexts(
self, doc: Doc, use_chat_template: bool, system_prompt: Optional[str], tokenizer: PreTrainedTokenizer
) -> tuple[list[str], int]:
"""Creates one context per turn for a multi-turn task.
Multi-turn tasks require chat templating.
Args:
doc (Doc): Formatted document.
use_chat_template (bool): Whether to use the chat template. Must be True for multi-turn tasks.
system_prompt (Optional[str]): The system prompt to use, if any.
tokenizer (PreTrainedTokenizer): The tokenizer used to apply the chat template.
Raises:
ValueError: If use_chat_template is False.
Returns:
tuple[list[str], int]: the contexts for every turn, and 0 (no few-shot examples are added).
"""
if not use_chat_template:
raise ValueError("You need to use the chat template to create multi turn contexts")

role_content_list = []
if system_prompt is not None:
role_content_list.append({"role": "system", "content": system_prompt})

for i in doc.specific["multi_turn_queries"]:
role_content_list.append({"role": "user", "content": i})
role_content_list.append({"role": "assistant", "content": "{model_response}"})
role_content_list.pop(-1)

contexts = []
offset = 2 if system_prompt is not None else 1
for i in range(0, len(role_content_list), offset + 1):
c = tokenizer.apply_chat_template(
role_content_list[: i + offset], add_generation_prompt=True, tokenize=False, add_special_tokens=False
)
contexts.append(c)

return contexts, 0

def fewshot_context(
self,
task: "LightevalTask",
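For intuition, here is a minimal sketch of the context layout `create_multi_turn_contexts` produces for a two-turn sample. The tokenizer and questions are arbitrary placeholders; the `{model_response}` placeholder is later replaced by the model's first answer:

```python
# Illustrative sketch of the per-turn contexts for a two-turn sample.
# The tokenizer and questions are arbitrary; any tokenizer with a chat template works.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

turns = ["Write a short poem about the sea.", "Now rewrite it as a haiku."]

# Same structure the method assembles: user turn, assistant placeholder, user turn.
conversation = [
    {"role": "user", "content": turns[0]},
    {"role": "assistant", "content": "{model_response}"},
    {"role": "user", "content": turns[1]},
]

# Context for turn 1: only the first user message, ready for generation.
ctx_turn_1 = tokenizer.apply_chat_template(conversation[:1], add_generation_prompt=True, tokenize=False)

# Context for turn 2: both user messages plus the placeholder, which is replaced
# by the model's actual first answer before the second generation.
ctx_turn_2 = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
```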
24 changes: 21 additions & 3 deletions src/lighteval/logging/info_loggers.py
@@ -24,7 +24,7 @@
import os
import time
from dataclasses import asdict, dataclass, field
from typing import Union
from typing import Optional, Union

import git
import numpy as np
Expand Down Expand Up @@ -205,6 +205,9 @@ class Detail:
choices: list = field(default_factory=list)
gold_index: list = field(default_factory=list)
metrics: dict = field(default_factory=dict)
judement_prompt: str = None
judgement: str = None
specifics: dict = field(default_factory=dict)

@dataclass
class CompiledDetail:
@@ -302,7 +305,15 @@ class CompiledHash:
compiled_details: dict[str, CompiledDetail] = collections.defaultdict(CompiledDetail)
compiled_details_over_all_tasks: CompiledDetailOverAllTasks = CompiledDetailOverAllTasks()

def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[ModelReturn], metrics: dict) -> None:
def log(
self,
task_name: str,
task: LightevalTask,
doc: Doc,
outputs: list[ModelReturn],
metrics: dict,
llm_as_prompt_judgement: Optional[tuple[str, str]] = None,
) -> None:
"""Stores the relevant information for one sample of one task to the total list of samples stored in the DetailsLogger.
Args:
@@ -311,6 +322,8 @@ def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[Model
doc (Doc): Current sample that we want to store.
outputs (list[ModelReturn]): Model outputs for the current sample
metrics (dict): Model scores for said sample on the current task's metrics.
llm_as_prompt_judgement (Optional[tuple[str, str]]): Tuple containing the
prompt passed to the judge and the judgement for the current sample, when using an llm-as-judge metric.
"""
detail = self.Detail()
detail.example = doc.query
@@ -354,6 +367,11 @@ def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[Model
detail.choices = doc.choices
detail.gold_index = as_list(doc.gold_index)
pred_saved = True
if task.has_metric_category[MetricCategory.GENERATIVE_MULTI_TURN]:
pred_saved = True
detail.judement_prompt = llm_as_prompt_judgement[0]
detail.judgement = llm_as_prompt_judgement[1]
detail.specifics = doc.specific
if not pred_saved:
raise NotImplementedError(
"No metric prediction saved."
@@ -364,7 +382,7 @@ def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[Model

hash = self.Hash()
hash.example = xxhash.xxh64(doc.query).hexdigest()
hash.full_prompt = xxhash.xxh64(doc.ctx).hexdigest()
hash.full_prompt = xxhash.xxh64(str(doc.ctx)).hexdigest()
hash.input_tokens = xxhash.xxh64(str([o.input_tokens for o in outputs])).hexdigest()
hash.cont_tokens = xxhash.xxh64(str([o.generated_tokens for o in outputs])).hexdigest()
self.hashes[task_name].append(hash)
11 changes: 11 additions & 0 deletions src/lighteval/metrics/__init__.py
@@ -146,3 +146,14 @@ def apply_multichoice_metric_one_token(results: list[ModelReturn], formatted_doc
)

return results, outputs


def apply_generative_multi_turn_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[str]):
outputs = {}
predictions = results.pop(0).result

for metric in metrics:
if Metrics[metric].value.category == MetricCategory.GENERATIVE_MULTI_TURN:
outputs.update(Metrics[metric].value.compute(predictions=predictions, formatted_doc=formatted_doc))

return results, outputs
1 change: 1 addition & 0 deletions src/lighteval/metrics/utils.py
@@ -28,6 +28,7 @@ class MetricCategory(Enum):
TARGET_PERPLEXITY = auto()
PERPLEXITY = auto()
GENERATIVE = auto()
GENERATIVE_MULTI_TURN = auto()
GENERATIVE_LOGPROB = auto()
MULTICHOICE = auto()
MULTICHOICE_ONE_TOKEN = auto()
14 changes: 13 additions & 1 deletion src/lighteval/models/abstract_model.py
@@ -27,8 +27,14 @@
from transformers import BatchEncoding

from lighteval.models.model_config import EnvConfig
from lighteval.models.model_output import GenerateReturn, LoglikelihoodReturn, LoglikelihoodSingleTokenReturn
from lighteval.models.model_output import (
GenerateMultiTurnReturn,
GenerateReturn,
LoglikelihoodReturn,
LoglikelihoodSingleTokenReturn,
)
from lighteval.tasks.requests import (
GreedyUntilMultiTurnRequest,
GreedyUntilRequest,
GreedyUntilWithLogitsRequest,
LoglikelihoodRequest,
@@ -102,6 +108,12 @@ def greedy_until_with_logits(
returns_logits=True,
)

def greedy_until_multi_turn( # noqa: C901
self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None
) -> GenerateMultiTurnReturn:
"""Generates responses using a greedy decoding strategy until certain ending conditions are met."""
return NotImplemented

@abstractmethod
def greedy_until(
self,
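The abstract hook above only declares the interface; a concrete backend is expected to run the turns sequentially, substituting earlier answers into the `{model_response}` placeholders produced by `create_multi_turn_contexts`. A rough sketch of that loop, with `generate_one` standing in for whatever single-prompt greedy generation the backend already provides (not the actual model implementation):

```python
# Rough sketch of the multi-turn greedy loop; `generate_one` stands in for the
# backend's existing single-prompt greedy generation (do_sample=False, temperature 0.0).
from typing import Callable


def greedy_until_multi_turn_sketch(contexts: list[str], generate_one: Callable[[str], str]) -> list[str]:
    responses: list[str] = []
    for context in contexts:
        # Fill the "{model_response}" placeholders with the answers from earlier turns.
        prompt = context
        for previous_answer in responses:
            prompt = prompt.replace("{model_response}", previous_answer, 1)
        responses.append(generate_one(prompt))
    return responses
```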
