Merged
51 commits
89e7fda  init ifeval, now need to add loading custom metric system (clefourrier, Feb 20, 2024)
96aa81b  Merge branch 'main' into clem_customizable_metrics (clefourrier, Feb 23, 2024)
2fdceb8  custom metrics working! need to update the readme (clefourrier, Feb 23, 2024)
0e30b21  update doc (clefourrier, Feb 23, 2024)
1ba178f  fix eos token + eval script (clefourrier, Feb 23, 2024)
6233af7  init (Feb 28, 2024)
5cc9c2c  remove ifeval (Feb 28, 2024)
b9045e1  revert README (Feb 28, 2024)
ff79480  revert README (Feb 28, 2024)
a234bf6  better context management (Feb 28, 2024)
1357c10  working state (NathanHB, Mar 6, 2024)
bb5cca2  fix (NathanHB, Mar 6, 2024)
6b74a68  Merge branch 'nathan_fix_push_details' into nathan-add-mt-bench (NathanHB, Mar 6, 2024)
f548902  continue (NathanHB, Mar 9, 2024)
2e2b15d  continue (NathanHB, Mar 11, 2024)
339f1f6  commit (NathanHB, Mar 20, 2024)
aba90b3  Merge remote-tracking branch 'origin/main' into nathan-add-mt-bench (NathanHB, Mar 20, 2024)
5bc5b98  Update README.md (NathanHB, Mar 20, 2024)
cd1300d  commit (NathanHB, Mar 20, 2024)
1fd755e  commit (NathanHB, Mar 20, 2024)
4b00eb7  commit (NathanHB, Mar 20, 2024)
4903755  commit (NathanHB, Mar 20, 2024)
9ff0707  commit (NathanHB, Mar 20, 2024)
ff177a1  commit (NathanHB, Mar 20, 2024)
9794b7c  commit (NathanHB, Mar 20, 2024)
6268ff6  commit (NathanHB, Mar 20, 2024)
31eaab1  commit (NathanHB, Mar 20, 2024)
c80ef8c  commit (NathanHB, Mar 21, 2024)
c296b63  Revert "commit" (NathanHB, Mar 21, 2024)
804f41a  commit (NathanHB, Mar 21, 2024)
48b0fee  remove model adapter (NathanHB, Mar 21, 2024)
e5b6ea8  commit (NathanHB, Mar 21, 2024)
0dcdb1e  update readme (NathanHB, Mar 21, 2024)
703741b  commti (NathanHB, Mar 21, 2024)
6e8026f  commit (NathanHB, Mar 22, 2024)
588fb2f  format (NathanHB, Mar 22, 2024)
8cb4894  format (NathanHB, Mar 22, 2024)
c08a8f6  commit (NathanHB, Mar 25, 2024)
64ceee5  fixes for review (NathanHB, Mar 27, 2024)
46d7dd8  make style (NathanHB, Mar 27, 2024)
e2f7fa8  fix (NathanHB, Mar 27, 2024)
3260147  revert generate_response in base model (NathanHB, Mar 27, 2024)
323188a  Merge remote-tracking branch 'origin/main' into nathan-add-mt-bench (NathanHB, Mar 27, 2024)
33eb252  merge (NathanHB, Mar 27, 2024)
b2e5895  fix tests (NathanHB, Mar 27, 2024)
c42e65d  fix format (NathanHB, Mar 27, 2024)
aa6c6f8  commit (NathanHB, Mar 29, 2024)
bb4b133  make style (NathanHB, Mar 29, 2024)
2d3a04c  fix from review (NathanHB, Mar 29, 2024)
0819ac7  fix (NathanHB, Mar 29, 2024)
b2bf514  Merge branch 'main' into nathan-add-mt-bench (NathanHB, Mar 29, 2024)
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -88,7 +88,8 @@ quality = ["ruff==v0.2.2","pre-commit"]
tests = ["pytest==7.4.0"]
dev = ["lighteval[accelerate,quality,tests]"]
extended_tasks = [
"langdetect", #ifeval
"langdetect", # ifeval
"openai", # mt-bench
]

[project.urls]
18 changes: 17 additions & 1 deletion src/lighteval/evaluator.py
@@ -88,6 +88,8 @@ def evaluate( # noqa: C901
full_resps = lm.greedy_until_with_logits(requests, override_bs=override_bs)
elif request_type == RequestType.LOGLIKELIHOOD_ROLLING:
full_resps = lm.loglikelihood_rolling(requests, override_bs=override_bs)
elif request_type == RequestType.GREEDY_UNTIL_MULTI_TURN:
full_resps = lm.greedy_until_multi_turn(requests, override_bs=override_bs)
else:
raise NotImplementedError(f"Request type {request_type} not supported")

@@ -115,8 +117,22 @@ def evaluate( # noqa: C901
# using a deep copy here because process results pops from the model responses
metrics = task.process_results(doc, copy.deepcopy(model_responses))

# Remove the user_prompt from the metrics in case of llm-as-judge metric
if "user_prompt" in metrics:
user_prompt = metrics["user_prompt"]
del metrics["user_prompt"]
else:
user_prompt = None
if "judgement" in metrics:
judgement = metrics["judgement"]
del metrics["judgement"]
else:
judgement = None

evaluation_tracker.metrics_logger.log(task_example_id.task_name, metrics)
evaluation_tracker.details_logger.log(task_example_id.task_name, task, doc, model_responses, metrics)
evaluation_tracker.details_logger.log(
task_example_id.task_name, task, doc, model_responses, metrics, (user_prompt, judgement)
)

return evaluation_tracker

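Note on the popping logic above: the two `if`/`del` blocks behave like `dict.pop` with a default. A minimal sketch of the expected flow, assuming a judge-style metric whose `process_results` returns `user_prompt` and `judgement` alongside its score (all key values below are made up for illustration):

```python
# Hypothetical process_results output for an llm-as-judge metric.
metrics = {
    "judge_score": 8.0,
    "user_prompt": "You are a judge. Rate the following answer ...",
    "judgement": "The answer is helpful and accurate. Rating: [[8]]",
}

# Equivalent to the if/del blocks in the hunk above.
user_prompt = metrics.pop("user_prompt", None)
judgement = metrics.pop("judgement", None)

# Only the scores reach metrics_logger.log ...
assert metrics == {"judge_score": 8.0}
# ... while the judge prompt and verdict are forwarded to details_logger.log.
details_extra = (user_prompt, judgement)
```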
42 changes: 41 additions & 1 deletion src/lighteval/few_shot_manager.py
@@ -27,7 +27,7 @@
from itertools import cycle
from typing import TYPE_CHECKING, Optional

from transformers import AutoTokenizer
from transformers import AutoTokenizer, PreTrainedTokenizer

from lighteval.logging.hierarchical_logger import hlog_warn
from lighteval.tasks.requests import Doc
@@ -219,6 +219,46 @@ def get_examples(
)
return instruction + labeled_examples + example

def create_multi_turn_contexts(
self, doc: Doc, use_chat_template: bool, system_prompt: Optional[str], tokenizer: PreTrainedTokenizer
) -> list[str]:
"""Creates N contexts (depending on the number of turn) for a tasks.
Multi turn tasks need use chat templating.

Args:
doc (Doc): Formatted document.
use_chat_template (bool): Whether or not to use the chat template. Will fail if False.
system_prompt (Optional[str]): The system prompt to use
tokenizer (PreTrainedTokenizer): The tokenizer used for the chat template

Raises:
ValueError: If use_chat_template is set to false.

Returns:
list[str]: contexts for every turn
"""
if not use_chat_template:
raise ValueError("You need to use the chat template to create multi turn contexts")

role_content_list = []
if system_prompt is not None:
role_content_list.append({"role": "system", "content": system_prompt})

for i in doc.specific["multi_turn_queries"]:
role_content_list.append({"role": "user", "content": i})
role_content_list.append({"role": "assistant", "content": "{model_response}"})
role_content_list.pop(-1)

contexts = []
offset = 2 if system_prompt is not None else 1
for i in range(0, len(role_content_list), offset + 1):
c = tokenizer.apply_chat_template(
role_content_list[: i + offset], add_generation_prompt=True, tokenize=False, add_special_tokens=False
)
contexts.append(c)

return contexts, 0

def fewshot_context(
self,
task: "LightevalTask",
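To make the turn-splitting in `create_multi_turn_contexts` concrete, here is a small usage sketch of the same chat-template pattern for a two-turn document, written outside lighteval; it assumes a tokenizer that ships a chat template, and the model name, queries, and `{model_response}` placeholder handling are illustrative only:

```python
# Minimal sketch: build one context per turn with a chat template.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

queries = ["Write a short poem about the sea.", "Now turn it into a haiku."]
system_prompt = None  # if set, it would be prepended to every turn

messages = []
if system_prompt is not None:
    messages.append({"role": "system", "content": system_prompt})

# Turn 1: only the first user message, with the generation prompt appended.
turn_1_context = tokenizer.apply_chat_template(
    messages + [{"role": "user", "content": queries[0]}],
    add_generation_prompt=True,
    tokenize=False,
)

# Turn 2: the first exchange (assistant slot kept as a placeholder that the
# model's first answer will later replace) plus the second user message.
turn_2_context = tokenizer.apply_chat_template(
    messages
    + [
        {"role": "user", "content": queries[0]},
        {"role": "assistant", "content": "{model_response}"},
        {"role": "user", "content": queries[1]},
    ],
    add_generation_prompt=True,
    tokenize=False,
)

print(turn_1_context)
print(turn_2_context)
```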
24 changes: 21 additions & 3 deletions src/lighteval/logging/info_loggers.py
@@ -24,7 +24,7 @@
import os
import time
from dataclasses import asdict, dataclass, field
from typing import Union
from typing import Optional, Union

import git
import numpy as np
@@ -205,6 +205,9 @@ class Detail:
choices: list = field(default_factory=list)
gold_index: list = field(default_factory=list)
metrics: dict = field(default_factory=dict)
judement_prompt: str = None
judgement: str = None
specifics: dict = field(default_factory=dict)

@dataclass
class CompiledDetail:
@@ -302,7 +305,15 @@ class CompiledHash:
compiled_details: dict[str, CompiledDetail] = collections.defaultdict(CompiledDetail)
compiled_details_over_all_tasks: CompiledDetailOverAllTasks = CompiledDetailOverAllTasks()

def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[ModelReturn], metrics: dict) -> None:
def log(
self,
task_name: str,
task: LightevalTask,
doc: Doc,
outputs: list[ModelReturn],
metrics: dict,
llm_as_prompt_judgement: Optional[tuple[str, str]] = None,
) -> None:
"""Stores the relevant information for one sample of one task to the total list of samples stored in the DetailsLogger.

Args:
@@ -311,6 +322,8 @@ def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[Model
doc (Doc): Current sample that we want to store.
outputs (list[ModelReturn]): Model outputs for the current sample
metrics (_type_): Model scores for said sample on the current task's metrics.
llm_as_prompt_judgement (tuple[str, str]): Tuple containing the
prompt passed to the judge and the judgement for the current sample when using llm-as-judge metric.
"""
detail = self.Detail()
detail.example = doc.query
@@ -354,6 +367,11 @@ def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[Model
detail.choices = doc.choices
detail.gold_index = as_list(doc.gold_index)
pred_saved = True
if task.has_metric_category[MetricCategory.GENERATIVE_MULTI_TURN]:
pred_saved = True
detail.judement_prompt = llm_as_prompt_judgement[0]
detail.judgement = llm_as_prompt_judgement[1]
detail.specifics = doc.specific
if not pred_saved:
raise NotImplementedError(
"No metric prediction saved."
@@ -364,7 +382,7 @@ def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[Model

hash = self.Hash()
hash.example = xxhash.xxh64(doc.query).hexdigest()
hash.full_prompt = xxhash.xxh64(doc.ctx).hexdigest()
hash.full_prompt = xxhash.xxh64(str(doc.ctx)).hexdigest()
hash.input_tokens = xxhash.xxh64(str([o.input_tokens for o in outputs])).hexdigest()
hash.cont_tokens = xxhash.xxh64(str([o.generated_tokens for o in outputs])).hexdigest()
self.hashes[task_name].append(hash)
11 changes: 11 additions & 0 deletions src/lighteval/metrics/__init__.py
@@ -146,3 +146,14 @@ def apply_multichoice_metric_one_token(results: list[ModelReturn], formatted_doc
)

return results, outputs


def apply_generative_multi_turn_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[str]):
outputs = {}
predictions = results.pop(0).result

for metric in metrics:
if Metrics[metric].value.category == MetricCategory.GENERATIVE_MULTI_TURN:
outputs.update(Metrics[metric].value.compute(predictions=predictions, formatted_doc=formatted_doc))

return results, outputs
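
For reference, a metric dispatched through `apply_generative_multi_turn_metric` receives the per-turn predictions popped above plus the formatted doc. A hedged sketch of what such a metric's `compute` could look like; the scoring logic and the `judge_score` key are illustrative and not the actual MT-Bench judge, while `user_prompt`/`judgement` match the keys consumed in `evaluator.py`:

```python
# Illustrative multi-turn, llm-as-judge style compute function: the judging
# step is stubbed out so the input/output contract stays visible.
def compute(predictions: list[str], formatted_doc, **kwargs) -> dict:
    # `predictions` holds one model answer per turn, in order.
    questions = formatted_doc.specific["multi_turn_queries"]

    # Prompt that would be sent to the judge model (stub: plain concatenation).
    user_prompt = "\n\n".join(
        f"Q{i + 1}: {q}\nA{i + 1}: {a}"
        for i, (q, a) in enumerate(zip(questions, predictions))
    )
    judgement = "Stubbed judge answer. Rating: [[8]]"  # would come from the judge model
    score = 8.0                                        # would be parsed from `judgement`

    # `user_prompt` and `judgement` are popped in evaluator.py and logged as
    # details; the remaining keys are treated as metric scores.
    return {"judge_score": score, "user_prompt": user_prompt, "judgement": judgement}
```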
1 change: 1 addition & 0 deletions src/lighteval/metrics/utils.py
@@ -28,6 +28,7 @@ class MetricCategory(Enum):
TARGET_PERPLEXITY = auto()
PERPLEXITY = auto()
GENERATIVE = auto()
GENERATIVE_MULTI_TURN = auto()
GENERATIVE_LOGPROB = auto()
MULTICHOICE = auto()
MULTICHOICE_ONE_TOKEN = auto()
14 changes: 13 additions & 1 deletion src/lighteval/models/abstract_model.py
@@ -27,8 +27,14 @@
from transformers import BatchEncoding

from lighteval.models.model_config import EnvConfig
from lighteval.models.model_output import GenerateReturn, LoglikelihoodReturn, LoglikelihoodSingleTokenReturn
from lighteval.models.model_output import (
GenerateMultiTurnReturn,
GenerateReturn,
LoglikelihoodReturn,
LoglikelihoodSingleTokenReturn,
)
from lighteval.tasks.requests import (
GreedyUntilMultiTurnRequest,
GreedyUntilRequest,
GreedyUntilWithLogitsRequest,
LoglikelihoodRequest,
@@ -102,6 +108,12 @@ def greedy_until_with_logits(
returns_logits=True,
)

def greedy_until_multi_turn( # noqa: C901
self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None
) -> GenerateMultiTurnReturn:
"""Generates responses using a greedy decoding strategy until certain ending conditions are met."""
return NotImplemented

@abstractmethod
def greedy_until(
self,