From 89e7fda5de650e273e0d132b8a393f47020728a3 Mon Sep 17 00:00:00 2001
From: "clementine@huggingface.co"
Date: Tue, 20 Feb 2024 14:12:23 +0000
Subject: [PATCH 01/45] init ifeval, now need to add loading custom metric system

---
 .../ifeval/ifeval.py                |  120 ++
 .../ifeval/instructions.py          | 1530 +++++++++++++++
 .../ifeval/instructions_registry.py |  167 ++
 .../ifeval/instructions_utils.py    | 1684 +++++++++++++++++
 4 files changed, 3501 insertions(+)
 create mode 100644 tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py
 create mode 100644 tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions.py
 create mode 100644 tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_registry.py
 create mode 100644 tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_utils.py

diff --git a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py b/tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py
new file mode 100644
index 000000000..129368c1e
--- /dev/null
+++ b/tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py
@@ -0,0 +1,120 @@
+import instructions_registry
+import numpy as np
+
+from lighteval.metrics.utils import (
+    MetricCategory,
+    MetricUseCase,
+    SampleLevelMetricGrouping,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+
+
+# We create the task config
+ifeval = LightevalTaskConfig(
+    name="ifeval",
+    prompt_function="ifeval_prompt",
+    hf_repo="wis-k/instruction-following-eval",
+    hf_subset="train",
+    metric=["ifeval_metric"],
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split="train",
+    few_shots_select="random_sampling",
+    generation_size=1280,  # to check
+    stop_sequence=["\n"],  # to check
+)
+
+
+def ifeval_prompt(line, task_name: str = None):
+    return Doc(
+        task_name=task_name,
+        query=line["prompt"],
+        choices=[],  # very specific task where there are no precise outputs but instead we test if the format obeys rules
+        gold_index=-1,
+        instruction="",
+        specific={"instruction_id_list": line["instruction_id_list"], "kwargs": line["kwargs"]},
+    )
+
+
+submetric_names = [
+    "prompt_level_strict_acc",
+    "inst_level_strict_acc",
+    "prompt_level_loose_acc",
+    "inst_level_loose_acc",
+]
+
+
+def ifeval_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> float:
+    response = predictions[0]
+
+    # Strict instructions
+    instruction_list = formatted_doc.specific["instruction_id_list"]
+    kwargs = formatted_doc.specific["kwargs"]
+    prompt = formatted_doc.query
+
+    # Loose instructions
+    r = response.split("\n")
+    response_remove_first = "\n".join(r[1:]).strip()
+    response_remove_last = "\n".join(r[:-1]).strip()
+    response_remove_both = "\n".join(r[1:-1]).strip()
+    revised_response = response.replace("*", "")
+    revised_response_remove_first = response_remove_first.replace("*", "")
+    revised_response_remove_last = response_remove_last.replace("*", "")
+    revised_response_remove_both = response_remove_both.replace("*", "")
+    all_responses = [
+        response,
+        revised_response,
+        response_remove_first,
+        response_remove_last,
+        response_remove_both,
+        revised_response_remove_first,
+        revised_response_remove_last,
+        revised_response_remove_both,
+    ]
+
+    is_following_list_strict = []
+    is_following_list_loose = []
+
+    for index, instruction_id in enumerate(instruction_list):
+        instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+        instruction = instruction_cls(instruction_id)
+
+        # Remove None values from kwargs to
avoid unexpected keyword argument errors in build_description method. + kwargs = {k: v for k, v in kwargs[index].items() if v} + instruction.build_description(**kwargs) + args = instruction.get_instruction_args() + if args and "prompt" in args: + instruction.build_description(prompt=prompt) + + # Strict + if response.strip() and instruction.check_following(response): + is_following_list_strict.append(True) + else: + is_following_list_strict.append(False) + + # Loose + is_following = False + for r in all_responses: + if r.strip() and instruction.check_following(r): + is_following = True + break + + is_following_list_loose.append(is_following) + + return { + "prompt_level_strict_acc": int(all(is_following_list_strict)), + "inst_level_strict_acc": np.mean(is_following_list_strict), + "prompt_level_loose_acc": int(all(is_following_list_loose)), + "inst_level_loose_acc": np.mean(is_following_list_loose), + } + + +ifeval_metrics = SampleLevelMetricGrouping( + metric=submetric_names, + higher_is_better={n: True for n in submetric_names}, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.ACCURACY, + sample_level_fn=ifeval_metric, + corpus_level_fn=np.mean, +) diff --git a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions.py b/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions.py new file mode 100644 index 000000000..7ec97f83b --- /dev/null +++ b/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions.py @@ -0,0 +1,1530 @@ +# Copyright 2023 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Library of instructions.""" +import collections +import json +import logging +import random +import re +import string +from typing import Dict, Optional, Sequence, Union + +import langdetect +from lm_eval.tasks.ifeval import instructions_util + + +logger = logging.getLogger(__name__) + +_InstructionArgsDtype = Optional[Dict[str, Union[int, str, Sequence[str]]]] + +_LANGUAGES = instructions_util.LANGUAGE_CODES + +# The relational operation for comparison. +_COMPARISON_RELATION = ("less than", "at least") + +# The maximum number of sentences. +_MAX_NUM_SENTENCES = 20 + +# The number of placeholders. +_NUM_PLACEHOLDERS = 4 + +# The number of bullet lists. +_NUM_BULLETS = 5 + +# The options of constrained response. +_CONSTRAINED_RESPONSE_OPTIONS = ( + "My answer is yes.", + "My answer is no.", + "My answer is maybe.", +) + +# The options of starter keywords. +_STARTER_OPTIONS = ( + "I would say", + "My answer is", + "I believe", + "In my opinion", + "I think", + "I reckon", + "I feel", + "From my perspective", + "As I see it", + "According to me", + "As far as I'm concerned", + "To my understanding", + "In my view", + "My take on it is", + "As per my perception", +) + +# The options of ending keywords. +# TODO(jeffreyzhou) add more ending options +_ENDING_OPTIONS = ("Any other questions?", "Is there anything else I can help with?") + +# The number of highlighted sections. +_NUM_HIGHLIGHTED_SECTIONS = 4 + +# The section spliter. 
+_SECTION_SPLITER = ("Section", "SECTION") + +# The number of sections. +_NUM_SECTIONS = 5 + +# The number of paragraphs. +_NUM_PARAGRAPHS = 5 + +# The postscript marker. +_POSTSCRIPT_MARKER = ("P.S.", "P.P.S") + +# The number of keywords. +_NUM_KEYWORDS = 2 + +# The occurrences of a single keyword. +_KEYWORD_FREQUENCY = 3 + +# The occurrences of a single letter. +_LETTER_FREQUENCY = 10 + +# The occurrences of words with all capital letters. +_ALL_CAPITAL_WORD_FREQUENCY = 20 + +# The number of words in the response. +_NUM_WORDS_LOWER_LIMIT = 100 +_NUM_WORDS_UPPER_LIMIT = 500 + + +class Instruction: + """An instruction template.""" + + def __init__(self, instruction_id): + self.id = instruction_id + + def build_description(self, **kwargs): + raise NotImplementedError("`build_description` not implemented.") + + def get_instruction_args(self): + raise NotImplementedError("`get_instruction_args` not implemented.") + + def get_instruction_args_keys(self): + raise NotImplementedError("`get_instruction_args_keys` not implemented.") + + def check_following(self, value): + raise NotImplementedError("`check_following` not implemented.") + + +class ResponseLanguageChecker(Instruction): + """Check the language of the entire response.""" + + def build_description(self, *, language=None): + """Build the instruction description. + + Args: + language: A string representing the expected language of the response. The + language has to comply to the 97 types defined in + `langid.py` (https://pypi.org/project/langid/1.1.5/), which follows + ISO 639-1 codes (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes); + for example, `en` for English, `zh` for Chinese, `fr` for French. + + Returns: + A string representing the instruction description. + """ + self._language = language + if self._language is None: + self._language = random.choice(list(_LANGUAGES.keys())) + # TODO(tianjianlu): opens the description generation to more choices. + self._description_pattern = ( + "Your ENTIRE response should be in {language} language, no other " + "language is allowed." + ) + return self._description_pattern.format(language=_LANGUAGES[self._language]) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"language": self._language} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["language"] + + def check_following(self, value): + """Check if the language of the entire response follows the instruction. + + Args: + value: A string representing the response. + + Returns: + True if the language of `value` follows instruction; otherwise False. + """ + assert isinstance(value, str) + + try: + return langdetect.detect(value) == self._language + except langdetect.LangDetectException as e: + # Count as instruction is followed. + logging.error("Unable to detect language for text %s due to %s", value, e) # refex: disable=pytotw.037 + return True + + +class NumberOfSentences(Instruction): + """Check the number of sentences.""" + + def build_description(self, *, num_sentences=None, relation=None): + """Build the instruction description. + + Args: + num_sentences: An integer specifying the number of sentences as a + threshold. + relation: A string in (`less than`, `at least`), defining the relational + operator for comparison. + Two relational comparisons are supported for now: + if 'less than', the actual number of sentences < the threshold; + if 'at least', the actual number of sentences >= the threshold. 
+ + Returns: + A string representing the instruction description. + """ + # The number of sentences as a threshold for comparison. + self._num_sentences_threshold = num_sentences + if self._num_sentences_threshold is None or self._num_sentences_threshold < 0: + self._num_sentences_threshold = random.randint(1, _MAX_NUM_SENTENCES) + + if relation is None: + self._comparison_relation = random.choice(_COMPARISON_RELATION) + elif relation not in _COMPARISON_RELATION: + raise ValueError( + "The supported relation for comparison must be in " f"{_COMPARISON_RELATION}, but {relation} is given." + ) + else: + self._comparison_relation = relation + + self._description_pattern = "Your response should contain {relation} {num_sentences} sentences." + return self._description_pattern.format( + relation=self._comparison_relation, + num_sentences=self._num_sentences_threshold, + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "num_sentences": self._num_sentences_threshold, + "relation": self._comparison_relation, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_sentences", "relation"] + + def check_following(self, value): + """Check if the number of sentences follows the instruction. + + Args: + value: A string representing the response. + + Returns: + True if the response follows the instruction. + + Raise: + ValueError if the string in `instruction_args` is not in + [`less_than`, `at_least`]. + """ + num_sentences = instructions_util.count_sentences(value) + if self._comparison_relation == _COMPARISON_RELATION[0]: + return num_sentences < self._num_sentences_threshold + elif self._comparison_relation == _COMPARISON_RELATION[1]: + return num_sentences >= self._num_sentences_threshold + + +class PlaceholderChecker(Instruction): + """Check the placeholders in template writing.""" + + def build_description(self, *, num_placeholders=None): + """Build the instruction description. + + Args: + num_placeholders: An integer denoting the minimum number of + placeholders required in the response. + + Returns: + A string representing the instruction description. + """ + self._num_placeholders = num_placeholders + if self._num_placeholders is None or self._num_placeholders < 0: + self._num_placeholders = random.randint(1, _NUM_PLACEHOLDERS) + self._description_pattern = ( + "The response must contain at least {num_placeholders} placeholders " + + "represented by square brackets, such as [address]." + ) + return self._description_pattern.format(num_placeholders=self._num_placeholders) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"num_placeholders": self._num_placeholders} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_placeholders"] + + def check_following(self, value): + """Check if the number of placeholders follows the instruction. + + Args: + value: A string representing the response. + + Returns: + True if the actual number of placeholders in the response is greater than + or equal to `num_placeholders`; otherwise, False. + """ + placeholders = re.findall(r"\[.*?\]", value) + num_placeholders = len(placeholders) + return num_placeholders >= self._num_placeholders + + +class BulletListChecker(Instruction): + """Checks the bullet list in the prompt.""" + + def build_description(self, *, num_bullets=None): + """Build the instruction description. 
+ + Args: + num_bullets: An integer specifying the exact number of bullet lists + that is required to appear in the response. + + Returns: + A string representing the instruction description. + """ + self._num_bullets = num_bullets + if self._num_bullets is None or self._num_bullets < 0: + self._num_bullets = random.randint(1, _NUM_BULLETS) + self._description_pattern = ( + "Your answer must contain exactly {num_bullets} bullet points. " + + "Use the markdown bullet points such as:\n" + + "* This is point 1. \n" + + "* This is point 2" + ) + return self._description_pattern.format(num_bullets=self._num_bullets) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"num_bullets": self._num_bullets} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_bullets"] + + def check_following(self, value): + r"""Check if the number of bullet lists meets the requirement. + + Args: + value: A string representing the response. The response is expected to + contain some bullet lists that start with `\*`. + + Returns: + True if the actual number of bullet lists in the response meets the + requirement. + """ + bullet_lists = re.findall(r"^\s*\*[^\*].*$", value, flags=re.MULTILINE) + bullet_lists_2 = re.findall(r"^\s*-.*$", value, flags=re.MULTILINE) + num_bullet_lists = len(bullet_lists) + len(bullet_lists_2) + return num_bullet_lists == self._num_bullets + + +class ConstrainedResponseChecker(Instruction): + """Checks the constrained response.""" + + def build_description(self): + """Build the instruction description.""" + # A sequence of string(s) representing the options of the expected response. + self._constrained_responses = _CONSTRAINED_RESPONSE_OPTIONS + self._description_pattern = "Answer with one of the following options: {response_options}" + return self._description_pattern.format(response_options=self._constrained_responses) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks if the response matches the constrained options. + + Args: + value: A string representing the response. + + Returns: + True if the actual response contains one of the options in the constrained + responses; otherwise False. + """ + value = value.strip() + for constrained_response in self._constrained_responses: + if constrained_response in value: + return True + return False + + +class ConstrainedStartChecker(Instruction): + """Checks the response start.""" + + def build_description(self, *, starter=None): + """Build the instruction description. + + Args: + starter: A string representing the keyward that the response should start + with. + + Returns: + A string representing the instruction description. 
+ """ + self._starter = starter.strip() if isinstance(starter, str) else starter + if self._starter is None: + self._starter = random.choice(_STARTER_OPTIONS) + self._description_pattern = ( + "During the conversation, when it is your turn, " + "please always start with {starter}" + ) + return self._description_pattern.format(starter=self._starter) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"starter": self._starter} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["starter"] + + def check_following(self, value): + """Checks if the response starts with the constrained keyword or phrase. + + Args: + value: A string representing the response. + + Returns: + True if the response starts with the given phrase or keyword that is + contained in `instruction_args`; otherwise, False. + """ + response_pattern = r"^\s*" + self._starter + r".*$" + response_with_constrained_start = re.search(response_pattern, value, flags=re.MULTILINE) + return True if response_with_constrained_start else False + + +class HighlightSectionChecker(Instruction): + """Checks the highlighted section.""" + + def build_description(self, *, num_highlights=None): + """Build the instruction description. + + Args: + num_highlights: An integer specifying the minimum number of highlighted + sections. + + Returns: + A string representing the instruction description. + """ + self._num_highlights = num_highlights + if self._num_highlights is None or self._num_highlights < 0: + self._num_highlights = random.randint(1, _NUM_HIGHLIGHTED_SECTIONS) + + self._description_pattern = ( + "Highlight at least {num_highlights} sections in your answer with " + + "markdown, i.e. *highlighted section*." + ) + + return self._description_pattern.format(num_highlights=self._num_highlights) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"num_highlights": self._num_highlights} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_highlights"] + + def check_following(self, value): + """Checks if the number of highlighted sections meets the requirement. + + Args: + value: a string repesenting the response. The response is expected to + contain highlighted sections in the format of *highlighted*. + + Returns: + True if the actual number of highlighted sections in the format of + *highlighed sections* meets the minimum requirement; otherwise False. + """ + num_highlights = 0 + highlights = re.findall(r"\*[^\n\*]*\*", value) + double_highlights = re.findall(r"\*\*[^\n\*]*\*\*", value) + for highlight in highlights: + if highlight.strip("*").strip(): + num_highlights += 1 + for highlight in double_highlights: + if highlight.removeprefix("**").removesuffix("**").strip(): + num_highlights += 1 + + return num_highlights >= self._num_highlights + + +class SectionChecker(Instruction): + """Checks the sections.""" + + def build_description(self, *, section_spliter=None, num_sections=None): + """Build the instruction description. + + Args: + section_spliter: A string represents the section spliter keyword that + marks a new section, i.e., `Section` or `SECTION`. + num_sections: An integer specifying the number of sections. + + Returns: + A string representing the instruction description. 
+ """ + self._section_spliter = section_spliter.strip() if isinstance(section_spliter, str) else section_spliter + if self._section_spliter is None: + self._section_spliter = random.choice(_SECTION_SPLITER) + + self._num_sections = num_sections + if self._num_sections is None or self._num_sections < 0: + self._num_sections = random.randint(1, _NUM_SECTIONS) + + self._description_pattern = ( + "Your response must have {num_sections} sections. Mark the beginning " + + "of each section with {section_spliter} X, such as:\n" + + "{section_spliter} 1\n" + + "[content of section 1]\n" + + "{section_spliter} 2\n" + + "[content of section 2]" + ) + + return self._description_pattern.format(num_sections=self._num_sections, section_spliter=self._section_spliter) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "section_spliter": self._section_spliter, + "num_sections": self._num_sections, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["section_spliter", "num_sections"] + + def check_following(self, value): + """Checks the response contains multiple sections. + + Args: + value: A string representing the response. The response is expected + to contain multiple sections (number of sections is greater than 1). + A new section starts with `Section 1`, where the number denotes the + section index. + + Returns: + True if the number of sections in the response is greater than or equal to + the minimum number of sections; otherwise, False. + """ + section_splitter_patten = r"\s?" + self._section_spliter + r"\s?\d+\s?" + sections = re.split(section_splitter_patten, value) + num_sections = len(sections) - 1 + return num_sections >= self._num_sections + + +class ParagraphChecker(Instruction): + """Checks the paragraphs.""" + + def build_description(self, *, num_paragraphs=None): + """Build the instruction description. + + Args: + num_paragraphs: An integer specifying the number of paragraphs. + + Returns: + A string representing the instruction description. + """ + self._num_paragraphs = num_paragraphs + if self._num_paragraphs is None or self._num_paragraphs < 0: + self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS) + + self._description_pattern = ( + "There should be {num_paragraphs} paragraphs. " + "Paragraphs are separated with the markdown divider: ***" + ) + + return self._description_pattern.format(num_paragraphs=self._num_paragraphs) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"num_paragraphs": self._num_paragraphs} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_paragraphs"] + + def check_following(self, value): + """Checks the response contains required number of paragraphs. + + Args: + value: A string representing the response. The response may contain + paragraphs that are separated by the markdown divider: `***`. + + Returns: + True if the actual number of paragraphs is the same as required; + otherwise, False. 
+ """ + paragraphs = re.split(r"\s?\*\*\*\s?", value) + num_paragraphs = len(paragraphs) + + for index, paragraph in enumerate(paragraphs): + if not paragraph.strip(): + if index == 0 or index == len(paragraphs) - 1: + num_paragraphs -= 1 + else: + return False + + return num_paragraphs == self._num_paragraphs + + +class PostscriptChecker(Instruction): + """Checks the postscript.""" + + def build_description(self, *, postscript_marker=None): + """Build the instruction description. + + Args: + postscript_marker: A string containing the keyword that marks the start + of the postscript section. + + Returns: + A string representing the instruction description. + """ + self._postscript_marker = ( + postscript_marker.strip() if isinstance(postscript_marker, str) else postscript_marker + ) + if self._postscript_marker is None: + self._postscript_marker = random.choice(_POSTSCRIPT_MARKER) + + self._description_pattern = ( + "At the end of your response, please explicitly add a postscript " + "starting with {postscript}" + ) + + return self._description_pattern.format(postscript=self._postscript_marker) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"postscript_marker": self._postscript_marker} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["postscript_marker"] + + def check_following(self, value): + """Checks if the response follows the postscript format. + + Args: + value: a string representing the response. The response is expected to + contain a postscript section. + + Returns: + True if the response contains a postscript section starting with + the keyword containing in the `instruction_args`; otherwise False. + """ + value = value.lower() + if self._postscript_marker == "P.P.S": + postscript_pattern = r"\s*p\.\s?p\.\s?s.*$" + elif self._postscript_marker == "P.S.": + postscript_pattern = r"\s*p\.\s?s\..*$" + else: + postscript_pattern = r"\s*" + self._postscript_marker.lower() + r".*$" + postscript = re.findall(postscript_pattern, value, flags=re.MULTILINE) + return True if postscript else False + + +class RephraseChecker(Instruction): + """Checks the repharse.""" + + def build_description(self, *, original_message): + """Build the instruction description. + + Args: + original_message: A string representing the original message. The + rephrased response should only change its words/sentences in between + its two asterisks, for example, *change me*. Both original and rephrased + messages should contain the changes in the form of *change me*. + + Returns: + A string representing the instruction description. + """ + if not self.is_change(original_message): + raise ValueError(f"Message {original_message} does not contain changes " "in the form of *change me*.") + + self._reference_without_change = original_message + self._description = ( + "Rephrasing: Your rephrased response should only" + + "change the words/sentences in between two asterisks" + + "such as *change me*." + ) + return self._description + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"original_message": self._reference_without_change} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["original_message"] + + def check_following(self, value): + r"""Checks if the rephrasing follows the instruction. 
+ + Args: + value: A string representing the response, which is expected to rephras + the string of `instruction_args`. + + Returns: + True if `value` and `instruction_args` only differ by the words/sentences + in between two asterisks such as *change me*; otherwise, False. + """ + + if not self.is_change(value): + raise ValueError(f"value {value} does not contain " "changes in the form of *change me*.") + + response_without_changes = self.strip_changes(value) + reference_without_changes = self.strip_changes(self._reference_without_change) + + return response_without_changes == reference_without_changes + + def is_change(self, response): + """Check if there is change in the response in the form of *change me*.""" + return re.search(r"\*.*\*", response) + + def strip_changes(self, response): + """Strips off the changes.""" + return re.sub(r"\*.*\*", "", response) + + +class KeywordChecker(Instruction): + """Check the exisitence of certain keywords.""" + + def build_description(self, *, keywords=None): + """Build the instruction description. + + Args: + keywords: A sequence of strings representing the keywords that are + expected in the response. + + Returns: + A string representing the instruction description. + """ + + if not keywords: + self._keywords = instructions_util.generate_keywords(num_keywords=_NUM_KEYWORDS) + else: + self._keywords = keywords + self._keywords = sorted(self._keywords) + + self._description_pattern = "Include keywords {keywords} in the response." + + return self._description_pattern.format(keywords=self._keywords) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"keywords": self._keywords} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["keywords"] + + def check_following(self, value): + """Check if the response contain the expected keywords.""" + for keyword in self._keywords: + if not re.search(keyword, value, flags=re.IGNORECASE): + return False + return True + + +class KeywordFrequencyChecker(Instruction): + """Check the keyword frequency.""" + + def build_description(self, *, keyword=None, frequency=None, relation=None): + """Build the instruction description. + + Args: + keyword: A string representing a keyword that is expected in the response. + frequency: An integer specifying the number of times `keyword` is expected + to appear in the response. + relation: A string in (`less than`, `at least`), defining the relational + operator for comparison. + Two relational comparisons are supported for now: + if 'less than', the actual number of occurrences < frequency; + if 'at least', the actual number of occurrences >= frequency. + + Returns: + A string representing the instruction description. + """ + if not keyword: + self._keyword = instructions_util.generate_keywords(num_keywords=1)[0] + else: + self._keyword = keyword.strip() + + self._frequency = frequency + if self._frequency is None or self._frequency < 0: + self._frequency = random.randint(1, _KEYWORD_FREQUENCY) + + if relation is None: + self._comparison_relation = random.choice(_COMPARISON_RELATION) + elif relation not in _COMPARISON_RELATION: + raise ValueError( + "The supported relation for comparison must be in " f"{_COMPARISON_RELATION}, but {relation} is given." + ) + else: + self._comparison_relation = relation + + self._description_pattern = ( + "In your response, the word {keyword} should appear {relation} " + "{frequency} times." 
+ ) + + return self._description_pattern.format( + keyword=self._keyword, + relation=self._comparison_relation, + frequency=self._frequency, + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "keyword": self._keyword, + "frequency": self._frequency, + "relation": self._comparison_relation, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["keyword", "frequency", "relation"] + + def check_following(self, value): + """Checks if the response contain the keyword with required frequency.""" + actual_occurrences = len(re.findall(self._keyword, value, flags=re.IGNORECASE)) + + if self._comparison_relation == _COMPARISON_RELATION[0]: + return actual_occurrences < self._frequency + elif self._comparison_relation == _COMPARISON_RELATION[1]: + return actual_occurrences >= self._frequency + + +class NumberOfWords(Instruction): + """Checks the number of words.""" + + def build_description(self, *, num_words=None, relation=None): + """Build the instruction description. + + Args: + num_words: An integer specifying the number of words contained in the + response. + relation: A string in (`less than`, `at least`), defining the relational + operator for comparison. + Two relational comparisons are supported for now: + if 'less than', the actual number of words < num_words; + if 'at least', the actual number of words >= num_words. + + Returns: + A string representing the instruction description. + """ + + self._num_words = num_words + if self._num_words is None or self._num_words < 0: + self._num_words = random.randint(_NUM_WORDS_LOWER_LIMIT, _NUM_WORDS_UPPER_LIMIT) + + if relation is None: + self._comparison_relation = random.choice(_COMPARISON_RELATION) + elif relation not in _COMPARISON_RELATION: + raise ValueError( + "The supported relation for comparison must be in " f"{_COMPARISON_RELATION}, but {relation} is given." + ) + else: + self._comparison_relation = relation + + self._description_pattern = "Answer with {relation} {num_words} words." + + return self._description_pattern.format(relation=self._comparison_relation, num_words=self._num_words) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"num_words": self._num_words, "relation": self._comparison_relation} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_words", "relation"] + + def check_following(self, value): + """Checks if the response contains the expected number of words.""" + num_words = instructions_util.count_words(value) + + if self._comparison_relation == _COMPARISON_RELATION[0]: + return num_words < self._num_words + elif self._comparison_relation == _COMPARISON_RELATION[1]: + return num_words >= self._num_words + + +class JsonFormat(Instruction): + """Check the Json format.""" + + def build_description(self): + self._description_pattern = ( + "Entire output should be wrapped in JSON format. You can use markdown" " ticks such as ```." 
+ ) + return self._description_pattern + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + value = ( + value.strip() + .removeprefix("```json") + .removeprefix("```Json") + .removeprefix("```JSON") + .removeprefix("```") + .removesuffix("```") + .strip() + ) + try: + json.loads(value) + except ValueError: + return False + return True + + +class ParagraphFirstWordCheck(Instruction): + """Check the paragraph and the first word of the nth paragraph.""" + + def build_description(self, num_paragraphs=None, nth_paragraph=None, first_word=None): + r"""Build the instruction description. + + Args: + num_paragraphs: An integer indicating the number of paragraphs expected + in the response. A paragraph is a subset of the string that is + expected to be separated by '\n\n'. + nth_paragraph: An integer indicating the paragraph number that we look at. + Note that n starts from 1. + first_word: A string that represent the first word of the bth paragraph. + + Returns: + A string representing the instruction description. + """ + self._num_paragraphs = num_paragraphs + if self._num_paragraphs is None or self._num_paragraphs < 0: + self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS) + + self._nth_paragraph = nth_paragraph + if self._nth_paragraph is None or self._nth_paragraph <= 0 or self._nth_paragraph > self._num_paragraphs: + self._nth_paragraph = random.randint(1, self._num_paragraphs + 1) + + self._first_word = first_word + if self._first_word is None: + self._first_word = instructions_util.generate_keywords(num_keywords=1)[0] + self._first_word = self._first_word.lower() + + self._description_pattern = ( + "There should be {num_paragraphs} paragraphs. " + + "Paragraphs and only paragraphs are separated with each other by two " + + "new lines as if it was '\\n\\n' in python. " + + "Paragraph {nth_paragraph} must start with word {first_word}." + ) + + return self._description_pattern.format( + num_paragraphs=self._num_paragraphs, + nth_paragraph=self._nth_paragraph, + first_word=self._first_word, + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "num_paragraphs": self._num_paragraphs, + "nth_paragraph": self._nth_paragraph, + "first_word": self._first_word, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_paragraphs", "nth_paragraph", "first_word"] + + def check_following(self, value): + """Checks for required number of paragraphs and correct first word. + + Args: + value: a string representing the response. The response may contain + paragraphs that are separated by two new lines and the first word of + the nth paragraph will have to match a specified word. + + Returns: + True if the number of paragraphs is the same as required and the first + word of the specified paragraph is the same as required. Otherwise, false. 
+ """ + + paragraphs = re.split(r"\n\n", value) + num_paragraphs = len(paragraphs) + + for paragraph in paragraphs: + if not paragraph.strip(): + num_paragraphs -= 1 + + # check that index doesn't go out of bounds + if self._nth_paragraph <= num_paragraphs: + paragraph = paragraphs[self._nth_paragraph - 1].strip() + if not paragraph: + return False + else: + return False + + first_word = "" + punctuation = {".", ",", "?", "!", "'", '"'} + + # get first word and remove punctuation + word = paragraph.split()[0].strip() + # TODO(jeffrey): make more complex? + word = word.lstrip("'") + word = word.lstrip('"') + + for letter in word: + if letter in punctuation: + break + first_word += letter.lower() + + return num_paragraphs == self._num_paragraphs and first_word == self._first_word + + +# TODO(jeffrey) add relation - at least/at most? +class KeySentenceChecker(Instruction): + """Check the existence of certain key sentences.""" + + def build_description(self, key_sentences=None, num_sentences=None): + """Build the instruction description. + + Args: + key_sentences: A sequences of strings representing the key sentences that + are expected in the response. + num_sentences: The number of key sentences that are expected to be seen in + the response. + + Returns: + A string representing the instruction description. + """ + + if not key_sentences: + # TODO(jeffrey) make a generate sentences function? wonderwords package + self._key_sentences = set("For now, this is fine.") + else: + self._key_sentences = key_sentences + + if not num_sentences: + self._num_sentences = random.randint(1, len(self._key_sentences)) + else: + self._num_sentences = num_sentences + + self._description_pattern = "Include {num_sentences} of the following sentences {key_sentences}" + + return self._description_pattern.format(num_sentences=self._num_sentences, key_sentences=self._key_sentences) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "num_sentences": self._num_sentences, + "key_sentences": list(self._key_sentences), + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_sentences", "key_sentences"] + + def check_following(self, value): + """Checks if the response contains the expected key sentences.""" + count = 0 + sentences = instructions_util.split_into_sentences(value) + for sentence in self._key_sentences: + if sentence in sentences: + count += 1 + + return count == self._num_sentences + + +class ForbiddenWords(Instruction): + """Checks that specified words are not used in response.""" + + def build_description(self, forbidden_words=None): + """Build the instruction description. + + Args: + forbidden_words: A sequences of strings respresenting words that are not + allowed in the response. + + Returns: + A string representing the instruction description. + """ + + if not forbidden_words: + self._forbidden_words = instructions_util.generate_keywords(num_keywords=_NUM_KEYWORDS) + else: + self._forbidden_words = list(set(forbidden_words)) + self._forbidden_words = sorted(self._forbidden_words) + self._description_pattern = "Do not include keywords {forbidden_words} in the response." 
+ + return self._description_pattern.format(forbidden_words=self._forbidden_words) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"forbidden_words": self._forbidden_words} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["forbidden_words"] + + def check_following(self, value): + """Check if the response does not contain the expected keywords.""" + for word in self._forbidden_words: + if re.search(r"\b" + word + r"\b", value, flags=re.IGNORECASE): + return False + return True + + +class RephraseParagraph(Instruction): + """Checks that the paragraph is rephrased.""" + + def build_description(self, *, original_paragraph, low, high): + """Builds the instruction description. + + Args: + original_paragraph: A string presenting the original paragraph. The + rephrases response should have betweeb low-high words in common. + low: An integer presenting the lower bound of similar words. + high: An integer representing the upper bound of similar words. + + Returns: + A string representing the instruction description. + """ + # TODO(jeffrey) make more encompassing + self._original_paragraph = original_paragraph + self._low = low + self._high = high + + self._description = ( + "Rephrase the following paragraph: " + + "{original_paragraph}\nYour response should have " + + "between {low} and {high} of the same words. " + + "Words are the same if and only if all of the " + + "letters, ignoring cases, are the same. For " + + "example, 'run' is the same as 'Run' but different " + + "to 'ran'." + ) + + return self._description.format(original_paragraph=original_paragraph, low=self._low, high=self._high) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "original_paragraph": self._original_paragraph, + "low": self._low, + "high": self._high, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["original_paragraph", "low", "high"] + + def check_following(self, value): + val_words = re.findall(r"\w+", value.lower()) + original_words = re.findall(r"\w+", self._original_paragraph.lower()) + similar_words = 0 + + dict_val = collections.Counter(val_words) + dict_original = collections.Counter(original_words) + + for word in dict_original: + similar_words += min(dict_original[word], dict_val[word]) + + return similar_words >= self._low and similar_words <= self._high + + +class TwoResponsesChecker(Instruction): + """Check that two responses were given.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = ( + "Give two different responses. Responses and only responses should" + " be separated by 6 asterisk symbols: ******." + ) + return self._description_pattern + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks if the response has two different answers. + + Args: + value: A string representing the response. + + Returns: + True if two responses are detected and false otherwise. 
+ """ + valid_responses = [] + responses = value.split("******") + for index, response in enumerate(responses): + if not response.strip(): + if index != 0 and index != len(responses) - 1: + return False + else: + valid_responses.append(response) + return len(valid_responses) == 2 and valid_responses[0].strip() != valid_responses[1].strip() + + +class RepeatPromptThenAnswer(Instruction): + """Checks that Prompt is first repeated then answered.""" + + def build_description(self, *, prompt_to_repeat=None): + """Build the instruction description. + + Args: + prompt_to_repeat: The prompt that is meant to be repeated. + + Returns: + A string representing the instruction description. + """ + if not prompt_to_repeat: + raise ValueError("prompt_to_repeat must be set.") + else: + self._prompt_to_repeat = prompt_to_repeat + self._description_pattern = ( + "First repeat the request word for word without change," + " then give your answer (1. do not say any words or characters" + " before repeating the request; 2. the request you need to repeat" + " does not include this sentence)" + ) + return self._description_pattern + + def get_instruction_args(self): + return {"prompt_to_repeat": self._prompt_to_repeat} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["prompt_to_repeat"] + + def check_following(self, value): + if value.strip().lower().startswith(self._prompt_to_repeat.strip().lower()): + return True + return False + + +class EndChecker(Instruction): + """Checks that the prompt ends with a given phrase.""" + + def build_description(self, *, end_phrase=None): + """Build the instruction description. + + Args: + end_phrase: A string representing the phrase the response should end with. + + Returns: + A string representing the instruction description. + """ + self._end_phrase = end_phrase.strip() if isinstance(end_phrase, str) else end_phrase + if self._end_phrase is None: + self._end_phrase = random.choice(_ENDING_OPTIONS) + self._description_pattern = ( + "Finish your response with this exact phrase {ender}. " "No other words should follow this phrase." + ) + return self._description_pattern.format(ender=self._end_phrase) + + def get_instruction_args(self): + return {"end_phrase": self._end_phrase} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["end_phrase"] + + def check_following(self, value): + """Checks if the response ends with the expected phrase.""" + value = value.strip().strip('"').lower() + self._end_phrase = self._end_phrase.strip().lower() + return value.endswith(self._end_phrase) + + +class TitleChecker(Instruction): + """Checks the response for a title.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = ( + "Your answer must contain a title, wrapped in double angular brackets," " such as <>." 
+ ) + return self._description_pattern + + def get_instruction_args(self): + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks if the response contains a title.""" + pattern = r"<<[^\n]+>>" + re_pattern = re.compile(pattern) + titles = re.findall(re_pattern, value) + + for title in titles: + if title.lstrip("<").rstrip(">").strip(): + return True + return False + + +class LetterFrequencyChecker(Instruction): + """Checks letter frequency.""" + + def build_description(self, *, letter=None, let_frequency=None, let_relation=None): + """Build the instruction description. + + Args: + letter: A string representing a letter that is expected in the response. + let_frequency: An integer specifying the number of times `keyword` is + expected to appear in the response. + let_relation: A string in (`less than`, `at least`), defining the + relational operator for comparison. Two relational comparisons are + supported for now; if 'less than', the actual number of + occurrences < frequency; if 'at least', the actual number of + occurrences >= frequency. + + Returns: + A string representing the instruction description. + """ + if not letter or len(letter) > 1 or ord(letter.lower()) < 97 or ord(letter.lower()) > 122: + self._letter = random.choice(list(string.ascii_letters)) + else: + self._letter = letter.strip() + self._letter = self._letter.lower() + + self._frequency = let_frequency + if self._frequency is None or self._frequency < 0: + self._frequency = random.randint(1, _LETTER_FREQUENCY) + + if let_relation is None: + self._comparison_relation = random.choice(_COMPARISON_RELATION) + elif let_relation not in _COMPARISON_RELATION: + raise ValueError( + "The supported relation for comparison must be in " + f"{_COMPARISON_RELATION}, but {let_relation} is given." + ) + else: + self._comparison_relation = let_relation + + self._description_pattern = ( + "In your response, the letter {letter} should appear {let_relation}" " {let_frequency} times." + ) + + return self._description_pattern.format( + letter=self._letter, + let_frequency=self._frequency, + let_relation=self._comparison_relation, + ) + + def get_instruction_args(self): + """Returns the keyword args of build description.""" + return { + "letter": self._letter, + "let_frequency": self._frequency, + "let_relation": self._comparison_relation, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["letter", "let_frequency", "let_relation"] + + def check_following(self, value): + """Checks that the response contains the letter at the right frequency.""" + value = value.lower() + letters = collections.Counter(value) + + if self._comparison_relation == _COMPARISON_RELATION[0]: + return letters[self._letter] < self._frequency + else: + return letters[self._letter] >= self._frequency + + +class CapitalLettersEnglishChecker(Instruction): + """Checks that the response is in english and is in all capital letters.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = "Your entire response should be in English, and in all capital letters." 
+ return self._description_pattern + + def get_instruction_args(self): + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks that the response is in English and in all capital letters.""" + assert isinstance(value, str) + + try: + return value.isupper() and langdetect.detect(value) == "en" + except langdetect.LangDetectException as e: + # Count as instruction is followed. + logging.error("Unable to detect language for text %s due to %s", value, e) # refex: disable=pytotw.037 + return True + + +class LowercaseLettersEnglishChecker(Instruction): + """Checks that the response is in english and is in all lowercase letters.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = ( + "Your entire response should be in English, and in all lowercase" + " letters. No capital letters are allowed." + ) + return self._description_pattern + + def get_instruction_args(self): + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks that the response is in English and in all lowercase letters.""" + assert isinstance(value, str) + + try: + return value.islower() and langdetect.detect(value) == "en" + except langdetect.LangDetectException as e: + # Count as instruction is followed. + logging.error("Unable to detect language for text %s due to %s", value, e) # refex: disable=pytotw.037 + return True + + +class CommaChecker(Instruction): + """Checks the response for no commas.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = "In your entire response, refrain from the use of any commas." + return self._description_pattern + + def get_instruction_args(self): + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks that the response does not contain commas.""" + return not re.search(r"\,", value) + + +class CapitalWordFrequencyChecker(Instruction): + """Checks frequency of words with all capital letters.""" + + def build_description( + self, + capital_frequency=None, + capital_relation=None, + ): + """Build the instruction description. + + Args: + capital_frequency: An integer that represents the number of words that + should be in all capital letters. + capital_relation: A string that is 'at least' or 'at most' that refers to + the frequency. + + Returns: + A string representing the instruction description. + """ + self._frequency = capital_frequency + if self._frequency is None: + self._frequency = random.randint(1, _ALL_CAPITAL_WORD_FREQUENCY) + + self._comparison_relation = capital_relation + if capital_relation is None: + self._comparison_relation = random.choice(_COMPARISON_RELATION) + elif capital_relation not in _COMPARISON_RELATION: + raise ValueError( + "The supported relation for comparison must be in " + f"{_COMPARISON_RELATION}, but {capital_relation} is given." + ) + + self._description_pattern = ( + "In your response, words with all capital letters should appear" " {relation} {frequency} times." 
+ ) + + return self._description_pattern.format(frequency=self._frequency, relation=self._comparison_relation) + + def get_instruction_args(self): + """Returns the keyword args of build description.""" + return { + "capital_frequency": self._frequency, + "capital_relation": self._comparison_relation, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["capital_frequency", "capital_relation"] + + def check_following(self, value): + """Checks the frequency of words with all capital letters.""" + # Hyphenated words will count as one word + words = instructions_util.nltk.word_tokenize(value) + capital_words = [word for word in words if word.isupper()] + + capital_words = len(capital_words) + + if self._comparison_relation == _COMPARISON_RELATION[0]: + return capital_words < self._frequency + else: + return capital_words >= self._frequency + + +class QuotationChecker(Instruction): + """Checks response is wrapped with double quotation marks.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = "Wrap your entire response with double quotation marks." + return self._description_pattern + + def get_instruction_args(self): + """Returns the keyword args of build description.""" + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks if the response is wrapped with double quotation marks.""" + value = value.strip() + return len(value) > 1 and value[0] == '"' and value[-1] == '"' diff --git a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_registry.py b/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_registry.py new file mode 100644 index 000000000..30a092c37 --- /dev/null +++ b/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_registry.py @@ -0,0 +1,167 @@ +# Copyright 2023 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Registry of all instructions.""" +from lm_eval.tasks.ifeval import instructions + + +_KEYWORD = "keywords:" + +_LANGUAGE = "language:" + +_LENGTH = "length_constraints:" + +_CONTENT = "detectable_content:" + +_FORMAT = "detectable_format:" + +_MULTITURN = "multi-turn:" + +_COMBINATION = "combination:" + +_STARTEND = "startend:" + +_CHANGE_CASES = "change_case:" + +_PUNCTUATION = "punctuation:" + +INSTRUCTION_DICT = { + _KEYWORD + "existence": instructions.KeywordChecker, + _KEYWORD + "frequency": instructions.KeywordFrequencyChecker, + # TODO(jeffreyzhou): make a proper set of sentences to choose from + # _KEYWORD + "key_sentences": instructions.KeySentenceChecker, + _KEYWORD + "forbidden_words": instructions.ForbiddenWords, + _KEYWORD + "letter_frequency": instructions.LetterFrequencyChecker, + _LANGUAGE + "response_language": instructions.ResponseLanguageChecker, + _LENGTH + "number_sentences": instructions.NumberOfSentences, + _LENGTH + "number_paragraphs": instructions.ParagraphChecker, + _LENGTH + "number_words": instructions.NumberOfWords, + _LENGTH + "nth_paragraph_first_word": instructions.ParagraphFirstWordCheck, + _CONTENT + "number_placeholders": instructions.PlaceholderChecker, + _CONTENT + "postscript": instructions.PostscriptChecker, + _FORMAT + "number_bullet_lists": instructions.BulletListChecker, + # TODO(jeffreyzhou): Pre-create paragraph or use prompt to replace + # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph, + _FORMAT + "constrained_response": instructions.ConstrainedResponseChecker, + _FORMAT + "number_highlighted_sections": (instructions.HighlightSectionChecker), + _FORMAT + "multiple_sections": instructions.SectionChecker, + # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message. + # _FORMAT + "rephrase": instructions.RephraseChecker, + _FORMAT + "json_format": instructions.JsonFormat, + _FORMAT + "title": instructions.TitleChecker, + # TODO(tianjianlu): Re-enable with specific prompts. 
+ # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker, + _COMBINATION + "two_responses": instructions.TwoResponsesChecker, + _COMBINATION + "repeat_prompt": instructions.RepeatPromptThenAnswer, + _STARTEND + "end_checker": instructions.EndChecker, + _CHANGE_CASES + "capital_word_frequency": instructions.CapitalWordFrequencyChecker, + _CHANGE_CASES + "english_capital": instructions.CapitalLettersEnglishChecker, + _CHANGE_CASES + "english_lowercase": instructions.LowercaseLettersEnglishChecker, + _PUNCTUATION + "no_comma": instructions.CommaChecker, + _STARTEND + "quotation": instructions.QuotationChecker, +} + +INSTRUCTION_CONFLICTS = { + _KEYWORD + "existence": {_KEYWORD + "existence"}, + _KEYWORD + "frequency": {_KEYWORD + "frequency"}, + # TODO(jeffreyzhou): make a proper set of sentences to choose from + # _KEYWORD + "key_sentences": instructions.KeySentenceChecker, + _KEYWORD + "forbidden_words": {_KEYWORD + "forbidden_words"}, + _KEYWORD + "letter_frequency": {_KEYWORD + "letter_frequency"}, + _LANGUAGE + "response_language": { + _LANGUAGE + "response_language", + _FORMAT + "multiple_sections", + _KEYWORD + "existence", + _KEYWORD + "frequency", + _KEYWORD + "forbidden_words", + _STARTEND + "end_checker", + _CHANGE_CASES + "english_capital", + _CHANGE_CASES + "english_lowercase", + }, + _LENGTH + "number_sentences": {_LENGTH + "number_sentences"}, + _LENGTH + "number_paragraphs": { + _LENGTH + "number_paragraphs", + _LENGTH + "nth_paragraph_first_word", + _LENGTH + "number_sentences", + _LENGTH + "nth_paragraph_first_word", + }, + _LENGTH + "number_words": {_LENGTH + "number_words"}, + _LENGTH + "nth_paragraph_first_word": { + _LENGTH + "nth_paragraph_first_word", + _LENGTH + "number_paragraphs", + }, + _CONTENT + "number_placeholders": {_CONTENT + "number_placeholders"}, + _CONTENT + "postscript": {_CONTENT + "postscript"}, + _FORMAT + "number_bullet_lists": {_FORMAT + "number_bullet_lists"}, + # TODO(jeffreyzhou): Pre-create paragraph or use prompt to replace + # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph, + _FORMAT + "constrained_response": set(INSTRUCTION_DICT.keys()), + _FORMAT + "number_highlighted_sections": {_FORMAT + "number_highlighted_sections"}, + _FORMAT + "multiple_sections": { + _FORMAT + "multiple_sections", + _LANGUAGE + "response_language", + _FORMAT + "number_highlighted_sections", + }, + # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message. + # _FORMAT + "rephrase": instructions.RephraseChecker, + _FORMAT + "json_format": set(INSTRUCTION_DICT.keys()).difference( + {_KEYWORD + "forbidden_words", _KEYWORD + "existence"} + ), + _FORMAT + "title": {_FORMAT + "title"}, + # TODO(tianjianlu): Re-enable with specific prompts. 
+ # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker, + _COMBINATION + "two_responses": set(INSTRUCTION_DICT.keys()).difference( + { + _KEYWORD + "forbidden_words", + _KEYWORD + "existence", + _LANGUAGE + "response_language", + _FORMAT + "title", + _PUNCTUATION + "no_comma", + } + ), + _COMBINATION + "repeat_prompt": set(INSTRUCTION_DICT.keys()).difference( + {_KEYWORD + "existence", _FORMAT + "title", _PUNCTUATION + "no_comma"} + ), + _STARTEND + "end_checker": {_STARTEND + "end_checker"}, + _CHANGE_CASES + "capital_word_frequency": { + _CHANGE_CASES + "capital_word_frequency", + _CHANGE_CASES + "english_lowercase", + _CHANGE_CASES + "english_capital", + }, + _CHANGE_CASES + "english_capital": {_CHANGE_CASES + "english_capital"}, + _CHANGE_CASES + "english_lowercase": { + _CHANGE_CASES + "english_lowercase", + _CHANGE_CASES + "english_capital", + }, + _PUNCTUATION + "no_comma": {_PUNCTUATION + "no_comma"}, + _STARTEND + "quotation": {_STARTEND + "quotation", _FORMAT + "title"}, +} + + +def conflict_make(conflicts): + """Makes sure if A conflicts with B, B will conflict with A. + + Args: + conflicts: Dictionary of potential conflicts where key is instruction id + and value is set of instruction ids that it conflicts with. + + Returns: + Revised version of the dictionary. All instructions conflict with + themselves. If A conflicts with B, B will conflict with A. + """ + for key in conflicts: + for k in conflicts[key]: + conflicts[k].add(key) + conflicts[key].add(key) + return conflicts diff --git a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_utils.py b/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_utils.py new file mode 100644 index 000000000..e631e770c --- /dev/null +++ b/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_utils.py @@ -0,0 +1,1684 @@ +# Copyright 2023 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
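
conflict_make above guarantees exactly two properties of INSTRUCTION_CONFLICTS: every instruction conflicts with itself, and the conflict relation is symmetric. A small self-contained check of that behaviour, using the same loop as conflict_make (the instruction ids are illustrative):

```python
# Self-contained sketch of conflict_make's post-conditions: reflexive and symmetric.
def conflict_make(conflicts):
    for key in conflicts:
        for k in conflicts[key]:
            conflicts[k].add(key)
        conflicts[key].add(key)
    return conflicts

toy = conflict_make({
    "detectable_format:title": {"startend:quotation"},
    "startend:quotation": set(),
})
assert toy["detectable_format:title"] == {"detectable_format:title", "startend:quotation"}
assert toy["startend:quotation"] == {"startend:quotation", "detectable_format:title"}
```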
+ +"""Utility library of instructions.""" + +import functools +import random +import re + +import immutabledict +import nltk + + +def download_nltk_resources(): + """Download 'punkt' if not already installed""" + try: + nltk.data.find("tokenizers/punkt") + except LookupError: + nltk.download("punkt") + + +download_nltk_resources() + +WORD_LIST = [ + "western", + "sentence", + "signal", + "dump", + "spot", + "opposite", + "bottom", + "potato", + "administration", + "working", + "welcome", + "morning", + "good", + "agency", + "primary", + "wish", + "responsibility", + "press", + "problem", + "president", + "steal", + "brush", + "read", + "type", + "beat", + "trainer", + "growth", + "lock", + "bone", + "case", + "equal", + "comfortable", + "region", + "replacement", + "performance", + "mate", + "walk", + "medicine", + "film", + "thing", + "rock", + "tap", + "total", + "competition", + "ease", + "south", + "establishment", + "gather", + "parking", + "world", + "plenty", + "breath", + "claim", + "alcohol", + "trade", + "dear", + "highlight", + "street", + "matter", + "decision", + "mess", + "agreement", + "studio", + "coach", + "assist", + "brain", + "wing", + "style", + "private", + "top", + "brown", + "leg", + "buy", + "procedure", + "method", + "speed", + "high", + "company", + "valuable", + "pie", + "analyst", + "session", + "pattern", + "district", + "pleasure", + "dinner", + "swimming", + "joke", + "order", + "plate", + "department", + "motor", + "cell", + "spend", + "cabinet", + "difference", + "power", + "examination", + "engine", + "horse", + "dimension", + "pay", + "toe", + "curve", + "literature", + "bother", + "fire", + "possibility", + "debate", + "activity", + "passage", + "hello", + "cycle", + "background", + "quiet", + "author", + "effect", + "actor", + "page", + "bicycle", + "error", + "throat", + "attack", + "character", + "phone", + "tea", + "increase", + "outcome", + "file", + "specific", + "inspector", + "internal", + "potential", + "staff", + "building", + "employer", + "shoe", + "hand", + "direction", + "garden", + "purchase", + "interview", + "study", + "recognition", + "member", + "spiritual", + "oven", + "sandwich", + "weird", + "passenger", + "particular", + "response", + "reaction", + "size", + "variation", + "a", + "cancel", + "candy", + "exit", + "guest", + "condition", + "fly", + "price", + "weakness", + "convert", + "hotel", + "great", + "mouth", + "mind", + "song", + "sugar", + "suspect", + "telephone", + "ear", + "roof", + "paint", + "refrigerator", + "organization", + "jury", + "reward", + "engineering", + "day", + "possession", + "crew", + "bar", + "road", + "description", + "celebration", + "score", + "mark", + "letter", + "shower", + "suggestion", + "sir", + "luck", + "national", + "progress", + "hall", + "stroke", + "theory", + "offer", + "story", + "tax", + "definition", + "history", + "ride", + "medium", + "opening", + "glass", + "elevator", + "stomach", + "question", + "ability", + "leading", + "village", + "computer", + "city", + "grand", + "confidence", + "candle", + "priest", + "recommendation", + "point", + "necessary", + "body", + "desk", + "secret", + "horror", + "noise", + "culture", + "warning", + "water", + "round", + "diet", + "flower", + "bus", + "tough", + "permission", + "week", + "prompt", + "connection", + "abuse", + "height", + "save", + "corner", + "border", + "stress", + "drive", + "stop", + "rip", + "meal", + "listen", + "confusion", + "girlfriend", + "living", + "relation", + "significance", + "plan", + "creative", + "atmosphere", + 
"blame", + "invite", + "housing", + "paper", + "drink", + "roll", + "silver", + "drunk", + "age", + "damage", + "smoke", + "environment", + "pack", + "savings", + "influence", + "tourist", + "rain", + "post", + "sign", + "grandmother", + "run", + "profit", + "push", + "clerk", + "final", + "wine", + "swim", + "pause", + "stuff", + "singer", + "funeral", + "average", + "source", + "scene", + "tradition", + "personal", + "snow", + "nobody", + "distance", + "sort", + "sensitive", + "animal", + "major", + "negotiation", + "click", + "mood", + "period", + "arrival", + "expression", + "holiday", + "repeat", + "dust", + "closet", + "gold", + "bad", + "sail", + "combination", + "clothes", + "emphasis", + "duty", + "black", + "step", + "school", + "jump", + "document", + "professional", + "lip", + "chemical", + "front", + "wake", + "while", + "inside", + "watch", + "row", + "subject", + "penalty", + "balance", + "possible", + "adult", + "aside", + "sample", + "appeal", + "wedding", + "depth", + "king", + "award", + "wife", + "blow", + "site", + "camp", + "music", + "safe", + "gift", + "fault", + "guess", + "act", + "shame", + "drama", + "capital", + "exam", + "stupid", + "record", + "sound", + "swing", + "novel", + "minimum", + "ratio", + "machine", + "shape", + "lead", + "operation", + "salary", + "cloud", + "affair", + "hit", + "chapter", + "stage", + "quantity", + "access", + "army", + "chain", + "traffic", + "kick", + "analysis", + "airport", + "time", + "vacation", + "philosophy", + "ball", + "chest", + "thanks", + "place", + "mountain", + "advertising", + "red", + "past", + "rent", + "return", + "tour", + "house", + "construction", + "net", + "native", + "war", + "figure", + "fee", + "spray", + "user", + "dirt", + "shot", + "task", + "stick", + "friend", + "software", + "promotion", + "interaction", + "surround", + "block", + "purpose", + "practice", + "conflict", + "routine", + "requirement", + "bonus", + "hole", + "state", + "junior", + "sweet", + "catch", + "tear", + "fold", + "wall", + "editor", + "life", + "position", + "pound", + "respect", + "bathroom", + "coat", + "script", + "job", + "teach", + "birth", + "view", + "resolve", + "theme", + "employee", + "doubt", + "market", + "education", + "serve", + "recover", + "tone", + "harm", + "miss", + "union", + "understanding", + "cow", + "river", + "association", + "concept", + "training", + "recipe", + "relationship", + "reserve", + "depression", + "proof", + "hair", + "revenue", + "independent", + "lift", + "assignment", + "temporary", + "amount", + "loss", + "edge", + "track", + "check", + "rope", + "estimate", + "pollution", + "stable", + "message", + "delivery", + "perspective", + "mirror", + "assistant", + "representative", + "witness", + "nature", + "judge", + "fruit", + "tip", + "devil", + "town", + "emergency", + "upper", + "drop", + "stay", + "human", + "neck", + "speaker", + "network", + "sing", + "resist", + "league", + "trip", + "signature", + "lawyer", + "importance", + "gas", + "choice", + "engineer", + "success", + "part", + "external", + "worker", + "simple", + "quarter", + "student", + "heart", + "pass", + "spite", + "shift", + "rough", + "lady", + "grass", + "community", + "garage", + "youth", + "standard", + "skirt", + "promise", + "blind", + "television", + "disease", + "commission", + "positive", + "energy", + "calm", + "presence", + "tune", + "basis", + "preference", + "head", + "common", + "cut", + "somewhere", + "presentation", + "current", + "thought", + "revolution", + "effort", + "master", + "implement", + 
"republic", + "floor", + "principle", + "stranger", + "shoulder", + "grade", + "button", + "tennis", + "police", + "collection", + "account", + "register", + "glove", + "divide", + "professor", + "chair", + "priority", + "combine", + "peace", + "extension", + "maybe", + "evening", + "frame", + "sister", + "wave", + "code", + "application", + "mouse", + "match", + "counter", + "bottle", + "half", + "cheek", + "resolution", + "back", + "knowledge", + "make", + "discussion", + "screw", + "length", + "accident", + "battle", + "dress", + "knee", + "log", + "package", + "it", + "turn", + "hearing", + "newspaper", + "layer", + "wealth", + "profile", + "imagination", + "answer", + "weekend", + "teacher", + "appearance", + "meet", + "bike", + "rise", + "belt", + "crash", + "bowl", + "equivalent", + "support", + "image", + "poem", + "risk", + "excitement", + "remote", + "secretary", + "public", + "produce", + "plane", + "display", + "money", + "sand", + "situation", + "punch", + "customer", + "title", + "shake", + "mortgage", + "option", + "number", + "pop", + "window", + "extent", + "nothing", + "experience", + "opinion", + "departure", + "dance", + "indication", + "boy", + "material", + "band", + "leader", + "sun", + "beautiful", + "muscle", + "farmer", + "variety", + "fat", + "handle", + "director", + "opportunity", + "calendar", + "outside", + "pace", + "bath", + "fish", + "consequence", + "put", + "owner", + "go", + "doctor", + "information", + "share", + "hurt", + "protection", + "career", + "finance", + "force", + "golf", + "garbage", + "aspect", + "kid", + "food", + "boot", + "milk", + "respond", + "objective", + "reality", + "raw", + "ring", + "mall", + "one", + "impact", + "area", + "news", + "international", + "series", + "impress", + "mother", + "shelter", + "strike", + "loan", + "month", + "seat", + "anything", + "entertainment", + "familiar", + "clue", + "year", + "glad", + "supermarket", + "natural", + "god", + "cost", + "conversation", + "tie", + "ruin", + "comfort", + "earth", + "storm", + "percentage", + "assistance", + "budget", + "strength", + "beginning", + "sleep", + "other", + "young", + "unit", + "fill", + "store", + "desire", + "hide", + "value", + "cup", + "maintenance", + "nurse", + "function", + "tower", + "role", + "class", + "camera", + "database", + "panic", + "nation", + "basket", + "ice", + "art", + "spirit", + "chart", + "exchange", + "feedback", + "statement", + "reputation", + "search", + "hunt", + "exercise", + "nasty", + "notice", + "male", + "yard", + "annual", + "collar", + "date", + "platform", + "plant", + "fortune", + "passion", + "friendship", + "spread", + "cancer", + "ticket", + "attitude", + "island", + "active", + "object", + "service", + "buyer", + "bite", + "card", + "face", + "steak", + "proposal", + "patient", + "heat", + "rule", + "resident", + "broad", + "politics", + "west", + "knife", + "expert", + "girl", + "design", + "salt", + "baseball", + "grab", + "inspection", + "cousin", + "couple", + "magazine", + "cook", + "dependent", + "security", + "chicken", + "version", + "currency", + "ladder", + "scheme", + "kitchen", + "employment", + "local", + "attention", + "manager", + "fact", + "cover", + "sad", + "guard", + "relative", + "county", + "rate", + "lunch", + "program", + "initiative", + "gear", + "bridge", + "breast", + "talk", + "dish", + "guarantee", + "beer", + "vehicle", + "reception", + "woman", + "substance", + "copy", + "lecture", + "advantage", + "park", + "cold", + "death", + "mix", + "hold", + "scale", + "tomorrow", + "blood", + 
"request", + "green", + "cookie", + "church", + "strip", + "forever", + "beyond", + "debt", + "tackle", + "wash", + "following", + "feel", + "maximum", + "sector", + "sea", + "property", + "economics", + "menu", + "bench", + "try", + "language", + "start", + "call", + "solid", + "address", + "income", + "foot", + "senior", + "honey", + "few", + "mixture", + "cash", + "grocery", + "link", + "map", + "form", + "factor", + "pot", + "model", + "writer", + "farm", + "winter", + "skill", + "anywhere", + "birthday", + "policy", + "release", + "husband", + "lab", + "hurry", + "mail", + "equipment", + "sink", + "pair", + "driver", + "consideration", + "leather", + "skin", + "blue", + "boat", + "sale", + "brick", + "two", + "feed", + "square", + "dot", + "rush", + "dream", + "location", + "afternoon", + "manufacturer", + "control", + "occasion", + "trouble", + "introduction", + "advice", + "bet", + "eat", + "kill", + "category", + "manner", + "office", + "estate", + "pride", + "awareness", + "slip", + "crack", + "client", + "nail", + "shoot", + "membership", + "soft", + "anybody", + "web", + "official", + "individual", + "pizza", + "interest", + "bag", + "spell", + "profession", + "queen", + "deal", + "resource", + "ship", + "guy", + "chocolate", + "joint", + "formal", + "upstairs", + "car", + "resort", + "abroad", + "dealer", + "associate", + "finger", + "surgery", + "comment", + "team", + "detail", + "crazy", + "path", + "tale", + "initial", + "arm", + "radio", + "demand", + "single", + "draw", + "yellow", + "contest", + "piece", + "quote", + "pull", + "commercial", + "shirt", + "contribution", + "cream", + "channel", + "suit", + "discipline", + "instruction", + "concert", + "speech", + "low", + "effective", + "hang", + "scratch", + "industry", + "breakfast", + "lay", + "join", + "metal", + "bedroom", + "minute", + "product", + "rest", + "temperature", + "many", + "give", + "argument", + "print", + "purple", + "laugh", + "health", + "credit", + "investment", + "sell", + "setting", + "lesson", + "egg", + "middle", + "marriage", + "level", + "evidence", + "phrase", + "love", + "self", + "benefit", + "guidance", + "affect", + "you", + "dad", + "anxiety", + "special", + "boyfriend", + "test", + "blank", + "payment", + "soup", + "obligation", + "reply", + "smile", + "deep", + "complaint", + "addition", + "review", + "box", + "towel", + "minor", + "fun", + "soil", + "issue", + "cigarette", + "internet", + "gain", + "tell", + "entry", + "spare", + "incident", + "family", + "refuse", + "branch", + "can", + "pen", + "grandfather", + "constant", + "tank", + "uncle", + "climate", + "ground", + "volume", + "communication", + "kind", + "poet", + "child", + "screen", + "mine", + "quit", + "gene", + "lack", + "charity", + "memory", + "tooth", + "fear", + "mention", + "marketing", + "reveal", + "reason", + "court", + "season", + "freedom", + "land", + "sport", + "audience", + "classroom", + "law", + "hook", + "win", + "carry", + "eye", + "smell", + "distribution", + "research", + "country", + "dare", + "hope", + "whereas", + "stretch", + "library", + "if", + "delay", + "college", + "plastic", + "book", + "present", + "use", + "worry", + "champion", + "goal", + "economy", + "march", + "election", + "reflection", + "midnight", + "slide", + "inflation", + "action", + "challenge", + "guitar", + "coast", + "apple", + "campaign", + "field", + "jacket", + "sense", + "way", + "visual", + "remove", + "weather", + "trash", + "cable", + "regret", + "buddy", + "beach", + "historian", + "courage", + "sympathy", + "truck", + 
"tension", + "permit", + "nose", + "bed", + "son", + "person", + "base", + "meat", + "usual", + "air", + "meeting", + "worth", + "game", + "independence", + "physical", + "brief", + "play", + "raise", + "board", + "she", + "key", + "writing", + "pick", + "command", + "party", + "yesterday", + "spring", + "candidate", + "physics", + "university", + "concern", + "development", + "change", + "string", + "target", + "instance", + "room", + "bitter", + "bird", + "football", + "normal", + "split", + "impression", + "wood", + "long", + "meaning", + "stock", + "cap", + "leadership", + "media", + "ambition", + "fishing", + "essay", + "salad", + "repair", + "today", + "designer", + "night", + "bank", + "drawing", + "inevitable", + "phase", + "vast", + "chip", + "anger", + "switch", + "cry", + "twist", + "personality", + "attempt", + "storage", + "being", + "preparation", + "bat", + "selection", + "white", + "technology", + "contract", + "side", + "section", + "station", + "till", + "structure", + "tongue", + "taste", + "truth", + "difficulty", + "group", + "limit", + "main", + "move", + "feeling", + "light", + "example", + "mission", + "might", + "wait", + "wheel", + "shop", + "host", + "classic", + "alternative", + "cause", + "agent", + "consist", + "table", + "airline", + "text", + "pool", + "craft", + "range", + "fuel", + "tool", + "partner", + "load", + "entrance", + "deposit", + "hate", + "article", + "video", + "summer", + "feature", + "extreme", + "mobile", + "hospital", + "flight", + "fall", + "pension", + "piano", + "fail", + "result", + "rub", + "gap", + "system", + "report", + "suck", + "ordinary", + "wind", + "nerve", + "ask", + "shine", + "note", + "line", + "mom", + "perception", + "brother", + "reference", + "bend", + "charge", + "treat", + "trick", + "term", + "homework", + "bake", + "bid", + "status", + "project", + "strategy", + "orange", + "let", + "enthusiasm", + "parent", + "concentrate", + "device", + "travel", + "poetry", + "business", + "society", + "kiss", + "end", + "vegetable", + "employ", + "schedule", + "hour", + "brave", + "focus", + "process", + "movie", + "illegal", + "general", + "coffee", + "ad", + "highway", + "chemistry", + "psychology", + "hire", + "bell", + "conference", + "relief", + "show", + "neat", + "funny", + "weight", + "quality", + "club", + "daughter", + "zone", + "touch", + "tonight", + "shock", + "burn", + "excuse", + "name", + "survey", + "landscape", + "advance", + "satisfaction", + "bread", + "disaster", + "item", + "hat", + "prior", + "shopping", + "visit", + "east", + "photo", + "home", + "idea", + "father", + "comparison", + "cat", + "pipe", + "winner", + "count", + "lake", + "fight", + "prize", + "foundation", + "dog", + "keep", + "ideal", + "fan", + "struggle", + "peak", + "safety", + "solution", + "hell", + "conclusion", + "population", + "strain", + "alarm", + "measurement", + "second", + "train", + "race", + "due", + "insurance", + "boss", + "tree", + "monitor", + "sick", + "course", + "drag", + "appointment", + "slice", + "still", + "care", + "patience", + "rich", + "escape", + "emotion", + "royal", + "female", + "childhood", + "government", + "picture", + "will", + "sock", + "big", + "gate", + "oil", + "cross", + "pin", + "improvement", + "championship", + "silly", + "help", + "sky", + "pitch", + "man", + "diamond", + "most", + "transition", + "work", + "science", + "committee", + "moment", + "fix", + "teaching", + "dig", + "specialist", + "complex", + "guide", + "people", + "dead", + "voice", + "original", + "break", + "topic", + "data", 
+ "degree", + "reading", + "recording", + "bunch", + "reach", + "judgment", + "lie", + "regular", + "set", + "painting", + "mode", + "list", + "player", + "bear", + "north", + "wonder", + "carpet", + "heavy", + "officer", + "negative", + "clock", + "unique", + "baby", + "pain", + "assumption", + "disk", + "iron", + "bill", + "drawer", + "look", + "double", + "mistake", + "finish", + "future", + "brilliant", + "contact", + "math", + "rice", + "leave", + "restaurant", + "discount", + "sex", + "virus", + "bit", + "trust", + "event", + "wear", + "juice", + "failure", + "bug", + "context", + "mud", + "whole", + "wrap", + "intention", + "draft", + "pressure", + "cake", + "dark", + "explanation", + "space", + "angle", + "word", + "efficiency", + "management", + "habit", + "star", + "chance", + "finding", + "transportation", + "stand", + "criticism", + "flow", + "door", + "injury", + "insect", + "surprise", + "apartment", +] # pylint: disable=line-too-long + +# ISO 639-1 codes to language names. +LANGUAGE_CODES = immutabledict.immutabledict( + { + "en": "English", + "es": "Spanish", + "pt": "Portuguese", + "ar": "Arabic", + "hi": "Hindi", + "fr": "French", + "ru": "Russian", + "de": "German", + "ja": "Japanese", + "it": "Italian", + "bn": "Bengali", + "uk": "Ukrainian", + "th": "Thai", + "ur": "Urdu", + "ta": "Tamil", + "te": "Telugu", + "bg": "Bulgarian", + "ko": "Korean", + "pl": "Polish", + "he": "Hebrew", + "fa": "Persian", + "vi": "Vietnamese", + "ne": "Nepali", + "sw": "Swahili", + "kn": "Kannada", + "mr": "Marathi", + "gu": "Gujarati", + "pa": "Punjabi", + "ml": "Malayalam", + "fi": "Finnish", + } +) + +_ALPHABETS = "([A-Za-z])" +_PREFIXES = "(Mr|St|Mrs|Ms|Dr)[.]" +_SUFFIXES = "(Inc|Ltd|Jr|Sr|Co)" +_STARTERS = ( + r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)" +) +_ACRONYMS = "([A-Z][.][A-Z][.](?:[A-Z][.])?)" +_WEBSITES = "[.](com|net|org|io|gov|edu|me)" +_DIGITS = "([0-9])" +_MULTIPLE_DOTS = r"\.{2,}" + + +def split_into_sentences(text): + """Split the text into sentences. + + Args: + text: A string that consists of more than or equal to one sentences. + + Returns: + A list of strings where each string is a sentence. + """ + text = " " + text + " " + text = text.replace("\n", " ") + text = re.sub(_PREFIXES, "\\1", text) + text = re.sub(_WEBSITES, "\\1", text) + text = re.sub(_DIGITS + "[.]" + _DIGITS, "\\1\\2", text) + text = re.sub( + _MULTIPLE_DOTS, + lambda match: "" * len(match.group(0)) + "", + text, + ) + if "Ph.D" in text: + text = text.replace("Ph.D.", "PhD") + text = re.sub(r"\s" + _ALPHABETS + "[.] ", " \\1 ", text) + text = re.sub(_ACRONYMS + " " + _STARTERS, "\\1 \\2", text) + text = re.sub( + _ALPHABETS + "[.]" + _ALPHABETS + "[.]" + _ALPHABETS + "[.]", + "\\1\\2\\3", + text, + ) + text = re.sub(_ALPHABETS + "[.]" + _ALPHABETS + "[.]", "\\1\\2", text) + text = re.sub(" " + _SUFFIXES + "[.] " + _STARTERS, " \\1 \\2", text) + text = re.sub(" " + _SUFFIXES + "[.]", " \\1", text) + text = re.sub(" " + _ALPHABETS + "[.]", " \\1", text) + if "”" in text: + text = text.replace(".”", "”.") + if '"' in text: + text = text.replace('."', '".') + if "!" in text: + text = text.replace('!"', '"!') + if "?" 
in text: + text = text.replace('?"', '"?') + text = text.replace(".", ".") + text = text.replace("?", "?") + text = text.replace("!", "!") + text = text.replace("", ".") + sentences = text.split("") + sentences = [s.strip() for s in sentences] + if sentences and not sentences[-1]: + sentences = sentences[:-1] + return sentences + + +def count_words(text): + """Counts the number of words.""" + tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+") + tokens = tokenizer.tokenize(text) + num_words = len(tokens) + return num_words + + +@functools.lru_cache(maxsize=None) +def _get_sentence_tokenizer(): + return nltk.data.load("nltk:tokenizers/punkt/english.pickle") + + +def count_sentences(text): + """Count the number of sentences.""" + tokenizer = _get_sentence_tokenizer() + tokenized_sentences = tokenizer.tokenize(text) + return len(tokenized_sentences) + + +def generate_keywords(num_keywords): + """Randomly generates a few keywords.""" + return random.sample(WORD_LIST, k=num_keywords) From 2fdceb825c7bd101c20c5072a81c191061e27bd1 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Fri, 23 Feb 2024 15:23:00 +0000 Subject: [PATCH 02/45] custom metrics working! need to update the readme --- pyproject.toml | 2 + src/lighteval/metrics/__init__.py | 2 +- src/lighteval/metrics/metrics.py | 3 +- .../ifeval/ifeval.py | 35 +++++++--- .../ifeval/instructions.py | 3 +- .../ifeval/instructions_registry.py | 2 +- .../ifeval/instructions_utils.py | 67 +++++++++---------- .../requirements.txt | 1 + 8 files changed, 66 insertions(+), 49 deletions(-) create mode 100644 tasks_examples/custom_tasks_with_custom_metrics/requirements.txt diff --git a/pyproject.toml b/pyproject.toml index 5add37c87..0e8b777e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,8 @@ dependencies = [ "termcolor==2.3.0", "pytablewriter", "colorama", + # Extension of metrics + "aenum==3.1.15", # Base metrics "nltk==3.8.1", "numpy", diff --git a/src/lighteval/metrics/__init__.py b/src/lighteval/metrics/__init__.py index 6dc58ff57..ce1c3b90c 100644 --- a/src/lighteval/metrics/__init__.py +++ b/src/lighteval/metrics/__init__.py @@ -58,7 +58,7 @@ def apply_generative_metric(results: list[ModelReturn], formatted_doc: Doc, metr # Extracting gold try: golds = formatted_doc.get_golds() - except KeyError: + except (KeyError, IndexError): golds = None # Specific process for HELM like evals # hrm diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index 318a4599f..3c318fa57 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -1,6 +1,5 @@ -from enum import Enum - import numpy as np +from aenum import Enum from lighteval.metrics.harness_compatibility.drop import drop_metrics from lighteval.metrics.harness_compatibility.truthful_qa import truthfulqa_mc_metrics diff --git a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py b/tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py index 129368c1e..69ef8a463 100644 --- a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py +++ b/tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py @@ -1,6 +1,8 @@ -import instructions_registry import numpy as np +from aenum import extend_enum +import tasks_examples.custom_tasks_with_custom_metrics.ifeval.instructions_registry as instructions_registry +from lighteval.metrics import Metrics from lighteval.metrics.utils import ( MetricCategory, MetricUseCase, @@ -14,8 +16,9 @@ ifeval = LightevalTaskConfig( name="ifeval", 
prompt_function="ifeval_prompt", + suite=["custom"], hf_repo="wis-k/instruction-following-eval", - hf_subset="train", + hf_subset="default", metric=["ifeval_metric"], hf_avail_splits=["train"], evaluation_splits=["train"], @@ -30,10 +33,12 @@ def ifeval_prompt(line, task_name: str = None): return Doc( task_name=task_name, query=line["prompt"], - choices=[], # very specific task where there are no precise outputs but instead we test if the format obeys rules - gold_index=-1, + choices=[ + None + ], # very specific task where there are no precise outputs but instead we test if the format obeys rules + gold_index=0, # very specific task where there are no precise outputs but instead we test if the format obeys rules instruction="", - specific={"instruction_id_list": line["instruction_id_list"], "kwargs": line["kwargs"]}, + specific={"instructions_id_list": line["instruction_id_list"], "kwargs": line["kwargs"]}, ) @@ -50,7 +55,7 @@ def ifeval_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> float # Strict instructions instruction_list = formatted_doc.specific["instructions_id_list"] - kwargs = formatted_doc.specific["kwargs"] + all_kwargs = formatted_doc.specific["kwargs"] prompt = formatted_doc.query # Loose instructions @@ -81,8 +86,8 @@ def ifeval_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> float instruction = instruction_cls(instruction_id) # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method. - kwargs = {k: v for k, v in kwargs[index].items() if v} - instruction.build_description(**kwargs) + task_kwargs = {k: v for k, v in all_kwargs[index].items() if v} + instruction.build_description(**task_kwargs) args = instruction.get_instruction_args() if args and "prompt" in args: instruction.build_description(prompt=prompt) @@ -116,5 +121,17 @@ def ifeval_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> float category=MetricCategory.GENERATIVE, use_case=MetricUseCase.ACCURACY, sample_level_fn=ifeval_metric, - corpus_level_fn=np.mean, + corpus_level_fn={n: np.mean for n in submetric_names}, ) + + +_TASKS = [ifeval] + +# Convert to dict for lighteval +TASKS_TABLE = [task.as_dict() for task in _TASKS] +# Adds the metric to the metric list! +extend_enum(Metrics, "ifeval_metric", ifeval_metrics) + +if __name__ == "__main__": + print(t["name"] for t in TASKS_TABLE) + print(len(TASKS_TABLE)) diff --git a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions.py b/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions.py index 7ec97f83b..6af99d819 100644 --- a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions.py +++ b/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions.py @@ -22,7 +22,8 @@ from typing import Dict, Optional, Sequence, Union import langdetect -from lm_eval.tasks.ifeval import instructions_util + +import tasks_examples.custom_tasks_with_custom_metrics.ifeval.instructions_utils as instructions_util logger = logging.getLogger(__name__) diff --git a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_registry.py b/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_registry.py index 30a092c37..17089bd0a 100644 --- a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_registry.py +++ b/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_registry.py @@ -13,7 +13,7 @@ # limitations under the License. 
"""Registry of all instructions.""" -from lm_eval.tasks.ifeval import instructions +import tasks_examples.custom_tasks_with_custom_metrics.ifeval.instructions as instructions _KEYWORD = "keywords:" diff --git a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_utils.py b/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_utils.py index e631e770c..7d995e42f 100644 --- a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_utils.py +++ b/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_utils.py @@ -18,7 +18,6 @@ import random import re -import immutabledict import nltk @@ -1561,40 +1560,38 @@ def download_nltk_resources(): ] # pylint: disable=line-too-long # ISO 639-1 codes to language names. -LANGUAGE_CODES = immutabledict.immutabledict( - { - "en": "English", - "es": "Spanish", - "pt": "Portuguese", - "ar": "Arabic", - "hi": "Hindi", - "fr": "French", - "ru": "Russian", - "de": "German", - "ja": "Japanese", - "it": "Italian", - "bn": "Bengali", - "uk": "Ukrainian", - "th": "Thai", - "ur": "Urdu", - "ta": "Tamil", - "te": "Telugu", - "bg": "Bulgarian", - "ko": "Korean", - "pl": "Polish", - "he": "Hebrew", - "fa": "Persian", - "vi": "Vietnamese", - "ne": "Nepali", - "sw": "Swahili", - "kn": "Kannada", - "mr": "Marathi", - "gu": "Gujarati", - "pa": "Punjabi", - "ml": "Malayalam", - "fi": "Finnish", - } -) +LANGUAGE_CODES = { + "en": "English", + "es": "Spanish", + "pt": "Portuguese", + "ar": "Arabic", + "hi": "Hindi", + "fr": "French", + "ru": "Russian", + "de": "German", + "ja": "Japanese", + "it": "Italian", + "bn": "Bengali", + "uk": "Ukrainian", + "th": "Thai", + "ur": "Urdu", + "ta": "Tamil", + "te": "Telugu", + "bg": "Bulgarian", + "ko": "Korean", + "pl": "Polish", + "he": "Hebrew", + "fa": "Persian", + "vi": "Vietnamese", + "ne": "Nepali", + "sw": "Swahili", + "kn": "Kannada", + "mr": "Marathi", + "gu": "Gujarati", + "pa": "Punjabi", + "ml": "Malayalam", + "fi": "Finnish", +} _ALPHABETS = "([A-Za-z])" _PREFIXES = "(Mr|St|Mrs|Ms|Dr)[.]" diff --git a/tasks_examples/custom_tasks_with_custom_metrics/requirements.txt b/tasks_examples/custom_tasks_with_custom_metrics/requirements.txt new file mode 100644 index 000000000..7f42284c9 --- /dev/null +++ b/tasks_examples/custom_tasks_with_custom_metrics/requirements.txt @@ -0,0 +1 @@ +langdetect From 0e30b215c269e6415bc216066b997799ff4a5a58 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Fri, 23 Feb 2024 15:53:00 +0000 Subject: [PATCH 03/45] update doc --- README.md | 26 +++++++++++++++++-- .../ifeval/ifeval.py | 2 +- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4f364abd9..411fefe9a 100644 --- a/README.md +++ b/README.md @@ -104,8 +104,30 @@ If you want to compare hellaswag from helm and the harness on Gpt-6j, you can do ## Customisation ### Adding a new metric -If you want to add a new metric, first check if you can use one of the parametrized functions in `src.lighteval.metrics.metrics_corpus` or `src.lighteval.metrics.metrics_sample`. If not, add it to either of these files depending on the level at which it is applied. -Then, follow the example in `src.lighteval.metrics.metrics` to register your metric. +First check if you can use one of the parametrized functions in `src.lighteval.metrics.metrics_corpus` or `src.lighteval.metrics.metrics_sample`. + +If not, you can use the custom_task system to register your new metric: +- create a new python file which should contain the full logic of your metric. 
+- the file also needs to start with these imports +```python +from aenum import extend_enum +from lighteval.metrics import Metrics + +# And any other class you might need to redefine your specific metric, depending on whether it's a sample or corpus metric. +``` + +- and to end with the following, so that it adds your metric to our metrics list when loaded as a module. + +```python +# Adds the metric to the metric list! +extend_enum(Metrics, "ifeval_metric", ifeval_metrics) +if __name__ == "__main__": + print("Imported metric") +``` + +You can then give your custom metric to lighteval by using `--custom-tasks path_to_your_file` when launching it. + +To see an example of a custom metric added along with a custom task, look at `tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py`. ### Adding a new task To add a new task, first **add its dataset** on the hub. diff --git a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py b/tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py index 69ef8a463..955e656fa 100644 --- a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py +++ b/tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py @@ -129,9 +129,9 @@ def ifeval_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> float # Convert to dict for lighteval TASKS_TABLE = [task.as_dict() for task in _TASKS] -# Adds the metric to the metric list! extend_enum(Metrics, "ifeval_metric", ifeval_metrics) if __name__ == "__main__": + # Adds the metric to the metric list! print(t["name"] for t in TASKS_TABLE) print(len(TASKS_TABLE)) From 1ba178f12eb86acc49f8a4e2ee1624a7c16b9e61 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Fri, 23 Feb 2024 19:23:02 +0000 Subject: [PATCH 04/45] fix eos token + eval script --- .../ifeval/ifeval.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py b/tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py index 955e656fa..682a72c87 100644 --- a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py +++ b/tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py @@ -25,7 +25,7 @@ few_shots_split="train", few_shots_select="random_sampling", generation_size=1280, # to check - stop_sequence=["\n"], # to check + stop_sequence=None, # to check ) @@ -109,19 +109,30 @@ def ifeval_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> float return { "prompt_level_strict_acc": int(all(is_following_list_strict)), - "inst_level_strict_acc": np.mean(is_following_list_strict), + "inst_level_strict_acc": is_following_list_strict, "prompt_level_loose_acc": int(all(is_following_list_loose)), - "inst_level_loose_acc": np.mean(is_following_list_loose), + "inst_level_loose_acc": is_following_list_loose, } +def agg_inst_level_acc(items): + flat_items = [item for sublist in items for item in sublist] + inst_level_acc = sum(flat_items) / len(flat_items) + return inst_level_acc + + ifeval_metrics = SampleLevelMetricGrouping( metric=submetric_names, higher_is_better={n: True for n in submetric_names}, category=MetricCategory.GENERATIVE, use_case=MetricUseCase.ACCURACY, sample_level_fn=ifeval_metric, - corpus_level_fn={n: np.mean for n in submetric_names}, + corpus_level_fn={ + "prompt_level_strict_acc": np.mean, + "inst_level_strict_acc": agg_inst_level_acc, + "prompt_level_loose_acc": np.mean, + "inst_level_loose_acc": agg_inst_level_acc, + }, ) From 
6233af7c57b22726d518b371fe4790351c93e846 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 28 Feb 2024 13:09:25 +0100 Subject: [PATCH 05/45] init --- src/lighteval/data.py | 16 + src/lighteval/evaluator.py | 3 + src/lighteval/logging/info_loggers.py | 2 + src/lighteval/metrics/__init__.py | 11 + src/lighteval/metrics/utils.py | 1 + src/lighteval/models/base_model.py | 72 +++- src/lighteval/models/model_output.py | 7 + src/lighteval/tasks/lighteval_task.py | 20 + src/lighteval/tasks/requests.py | 16 + .../mt_bench/judge_prompts.jsonl | 8 + .../mt_bench/judges.py | 188 ++++++++ .../mt_bench/model_adapter.py | 406 ++++++++++++++++++ .../mt_bench/mt_bench.py | 120 ++++++ 13 files changed, 869 insertions(+), 1 deletion(-) create mode 100644 tasks_examples/custom_tasks_with_custom_metrics/mt_bench/judge_prompts.jsonl create mode 100644 tasks_examples/custom_tasks_with_custom_metrics/mt_bench/judges.py create mode 100644 tasks_examples/custom_tasks_with_custom_metrics/mt_bench/model_adapter.py create mode 100644 tasks_examples/custom_tasks_with_custom_metrics/mt_bench/mt_bench.py diff --git a/src/lighteval/data.py b/src/lighteval/data.py index c892ed7c9..a069152d3 100644 --- a/src/lighteval/data.py +++ b/src/lighteval/data.py @@ -198,6 +198,22 @@ def _sorting_criteria(self, request: GreedyUntilRequest | GreedyUntilWithLogitsR return -(len(toks) + gen_length) +class GenerativeTaskMultiTurnDataset(DynamicBatchDataset): + def _sorting_criteria(self, request: GreedyUntilRequest | GreedyUntilWithLogitsRequest) -> int: + """ + Collate function for generating batches. + + Args: + x (Any): The input data. + + Returns: + Any: The collated data. + """ + toks = sum([len(r) for r in request.tokenized_contexts]) + gen_length = request.generation_size + return -(len(toks) + gen_length) + + class GenerativeTaskDatasetNanotron(DynamicBatchDataset): def __getitem__(self, index) -> Request: """ diff --git a/src/lighteval/evaluator.py b/src/lighteval/evaluator.py index 4cee8ba73..373e10ce4 100644 --- a/src/lighteval/evaluator.py +++ b/src/lighteval/evaluator.py @@ -3,6 +3,7 @@ import collections import copy +from pprint import pprint from typing import Dict, Union from pytablewriter import LatexTableWriter, MarkdownTableWriter @@ -66,6 +67,8 @@ def evaluate( # noqa: C901 full_resps = lm.greedy_until_with_logits(requests, override_bs=override_bs) elif request_type == RequestType.LOGLIKELIHOOD_ROLLING: full_resps = lm.loglikelihood_rolling(requests, override_bs=override_bs) + elif request_type == RequestType.GREEDY_UNTIL_MULTI_TURN: + full_resps = lm.greedy_until_multi_turn(requests, override_bs=override_bs) else: raise NotImplementedError(f"Request type {request_type} not supported") diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index 23aaccbd3..350007a4e 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -332,6 +332,8 @@ def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[Model detail.choices = doc.choices detail.gold_index = as_list(doc.gold_index) pred_saved = True + if task.has_metric_category[MetricCategory.GENERATIVE_MULTI_TURN]: + pred_saved = True if not pred_saved: raise NotImplementedError( "No metric prediction saved." 
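
GenerativeTaskMultiTurnDataset above orders multi-turn requests for dynamic batching by the total prompt length across turns plus the generation budget, longest first. A standalone sketch of that ordering with illustrative token counts (the hunk above applies len() to what is already an integer sum, so this follows the apparent intent):

```python
# Sketch of the intended sort key: total prompt tokens across all turns plus the
# generation budget, negated so that the longest requests come first.
def multi_turn_sort_key(tokenized_contexts: list[list[int]], generation_size: int) -> int:
    total_prompt_tokens = sum(len(turn) for turn in tokenized_contexts)
    return -(total_prompt_tokens + generation_size)

# A two-turn request with 12- and 30-token prompts and a 256-token budget:
print(multi_turn_sort_key([[0] * 12, [0] * 30], 256))  # -298
```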
diff --git a/src/lighteval/metrics/__init__.py b/src/lighteval/metrics/__init__.py index ce1c3b90c..4a15e8f2a 100644 --- a/src/lighteval/metrics/__init__.py +++ b/src/lighteval/metrics/__init__.py @@ -124,3 +124,14 @@ def apply_multichoice_metric_one_token(results: list[ModelReturn], formatted_doc ) return results, outputs + + +def apply_generative_multi_turn_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[str]): + outputs = {} + predictions = results.pop(0).result + + for metric in metrics: + if Metrics[metric].value.category == MetricCategory.GENERATIVE_MULTI_TURN: + outputs.update(Metrics[metric].value.compute(predictions=predictions, formatted_doc=formatted_doc)) + + return results, outputs \ No newline at end of file diff --git a/src/lighteval/metrics/utils.py b/src/lighteval/metrics/utils.py index e6363c9f8..4f00f7919 100644 --- a/src/lighteval/metrics/utils.py +++ b/src/lighteval/metrics/utils.py @@ -6,6 +6,7 @@ class MetricCategory(Enum): TARGET_PERPLEXITY = auto() PERPLEXITY = auto() GENERATIVE = auto() + GENERATIVE_MULTI_TURN = auto() GENERATIVE_LOGPROB = auto() MULTICHOICE = auto() MULTICHOICE_ONE_TOKEN = auto() diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index 8ebe90de4..d641f8a42 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -1,4 +1,5 @@ import os +from pprint import pprint from typing import Optional, Tuple, Union import torch @@ -12,9 +13,10 @@ from lighteval.logging.hierarchical_logger import hlog, hlog_err, hlog_warn from lighteval.models.abstract_model import LightevalModel from lighteval.models.model_config import BaseModelConfig, EnvConfig -from lighteval.models.model_output import Batch, GenerateReturn, LoglikelihoodReturn, LoglikelihoodSingleTokenReturn +from lighteval.models.model_output import Batch, GenerateReturn, LoglikelihoodReturn, LoglikelihoodSingleTokenReturn, GenerateMultiTurnReturn from lighteval.models.utils import _get_dtype, _get_precision, _simplify_name from lighteval.tasks.requests import ( + GreedyUntilMultiTurnRequest, GreedyUntilRequest, GreedyUntilWithLogitsRequest, LoglikelihoodRequest, @@ -322,6 +324,74 @@ def greedy_until_with_logits( override_bs=override_bs, ) + def greedy_until_multi_turn(self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None) -> GenerateMultiTurnReturn: + for request in requests: + request.stop_sequence = as_list(request.stop_sequence) + [self.tokenizer.eos_token] + request.tokenized_context = self.tok_encode(request.context) + + dataset = GenerativeTaskDataset(requests=requests, dataset_splits=self.DATASET_SPLITS) + dataloader = DataLoader(dataset, batch_size=1, collate_fn=lambda batch: batch) + + results = [] + + if self.accelerator: + dataloader = self.accelerator.prepare(dataloader) + + # Always batch size 1 for multi-turn + for batch in tqdm( + dataloader, desc="Greedy Multi Turn generation", position=1, leave=False, disable=self.disable_tqdm + ): + # NOTE: we are assuming all items in a batch behave similarly (same + # stop_tokens and max_tokens genrated) which is not necessarily + # the case! 
Because of that we only use batch size of 1 + stop_tokens = batch[0].stop_sequence + max_generated_tokens = batch[0].generation_size + contexts = [c.context for c in batch] + max_context_size_allowed = self.max_length - max_generated_tokens + + multi_turn_context = "" # contexts[0][0] + model_answers = [] + for i, context in enumerate(contexts[0]): + if i > 0: + multi_turn_context += f"\n\n{context}" + else: + multi_turn_context += f"{context}" + + # print("multi_turn_context ====== ") + # pprint(multi_turn_context) + # print("multi_turn_context ====== ") + + tokenized = self.tokenizer( + multi_turn_context, + padding=True, + truncation=True, + return_tensors="pt", + max_length=max_context_size_allowed, + add_special_tokens=self.add_special_tokens, + ).to(self.device) + + prepared_batch = Batch( + input_ids=tokenized["input_ids"], + input_lengths=[len(item == 1) for item in tokenized["attention_mask"]], + input_mask=tokenized["attention_mask"], + truncated=[0] * len(tokenized["input_ids"]), + padded=[0] * len(tokenized["input_ids"]), + ) + + cur_reponses = self._generate( + batch=prepared_batch, + max_tokens=max_generated_tokens, + stop_tokens=stop_tokens, + returns_logits=False, + ) + + model_answers.append(cur_reponses[0].result) + multi_turn_context += f"{cur_reponses[0].result}" + + results.append(GenerateMultiTurnReturn(result=model_answers, input_tokens=[], generated_tokens=[], truncated_tokens_count=0, padded_tokens_count=0)) + + return results + def greedy_until( self, requests: list[GreedyUntilRequest], diff --git a/src/lighteval/models/model_output.py b/src/lighteval/models/model_output.py index 82d94ace1..a4dc2b53b 100644 --- a/src/lighteval/models/model_output.py +++ b/src/lighteval/models/model_output.py @@ -43,6 +43,13 @@ class GenerateReturn(ModelReturn): def get_result_for_eval(self): return self.result if self.logits is None else (self.result, self.logits) +@dataclass +class GenerateMultiTurnReturn(ModelReturn): + result: list[str] = field(default_factory=list) + + def get_result_for_eval(self): + return self.result + @dataclass class Batch: diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 1b0f153c6..113669036 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -12,6 +12,7 @@ from lighteval.metrics import ( apply_generative_logprob_metric, apply_generative_metric, + apply_generative_multi_turn_metric, apply_multichoice_metric, apply_multichoice_metric_one_token, apply_perplexity_metric, @@ -22,6 +23,7 @@ from lighteval.models.model_output import ModelReturn from lighteval.tasks.requests import ( Doc, + GreedyUntilMultiTurnRequest, GreedyUntilRequest, GreedyUntilWithLogitsRequest, LoglikelihoodRequest, @@ -334,6 +336,8 @@ def get_request_type(self) -> list[RequestType]: request_types.append(RequestType.LOGLIKELIHOOD_ROLLING) if self.has_metric_category[MetricCategory.GENERATIVE]: request_types.append(RequestType.GREEDY_UNTIL) + if self.has_metric_category[MetricCategory.GENERATIVE_MULTI_TURN]: + request_types.append(RequestType.GREEDY_UNTIL_MULTI_TURN) if self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB]: request_types.append(RequestType.GREEDY_UNTIL_WITH_LOGITS) if self.has_metric_category[MetricCategory.MULTICHOICE]: @@ -422,6 +426,17 @@ def construct_requests( choices=formatted_doc.choices, ) ] + if self.has_metric_category[MetricCategory.GENERATIVE_MULTI_TURN]: + requests[RequestType.GREEDY_UNTIL_MULTI_TURN] += [ + GreedyUntilMultiTurnRequest( + 
task_name=current_task_name, + example_index=document_id_seed, + request_index=0, + context=formatted_doc.specific["queries"], + stop_sequence=self.stop_sequence, + generation_size=self.generation_size, + ) + ] return requests @@ -468,6 +483,11 @@ def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dic results=results, formatted_doc=formatted_doc, metrics=self.metrics ) outputs.update(cur_outputs) + if self.has_metric_category[MetricCategory.GENERATIVE_MULTI_TURN]: + results, cur_outputs = apply_generative_multi_turn_metric( + results=results, formatted_doc=formatted_doc, metrics=self.metrics + ) + outputs.update(cur_outputs) return outputs diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py index baf6e01af..e8057197f 100644 --- a/src/lighteval/tasks/requests.py +++ b/src/lighteval/tasks/requests.py @@ -10,6 +10,7 @@ class RequestType(Enum): LOGLIKELIHOOD_SINGLE_TOKEN = auto() LOGLIKELIHOOD_ROLLING = auto() GREEDY_UNTIL = auto() + GREEDY_UNTIL_MULTI_TURN = auto() GREEDY_UNTIL_WITH_LOGITS = auto() @@ -96,6 +97,21 @@ class GreedyUntilRequest(Request): request_type = RequestType.GREEDY_UNTIL tokenized_context: list[int] = None +@dataclass +class GreedyUntilMultiTurnRequest(Request): + """ + Represents a request for generating text using the Greedy-Until algorithm. + + Attributes: + stop_sequence (str): The sequence of tokens that indicates when to stop generating text. + generation_size (int): The maximum number of tokens to generate. + request_type (RequestType): The type of the request, set to RequestType.GREEDY_UNTIL. + """ + context: list[str] # Multi-turn has a list of context + stop_sequence: str + generation_size: int + request_type = RequestType.GREEDY_UNTIL_MULTI_TURN + tokenized_context: list[list[int]] = None @dataclass class GreedyUntilWithLogitsRequest(Request): diff --git a/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/judge_prompts.jsonl b/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/judge_prompts.jsonl new file mode 100644 index 000000000..86854fff7 --- /dev/null +++ b/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/judge_prompts.jsonl @@ -0,0 +1,8 @@ +{"name": "pair-v2", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. You should choose the assistant that follows the user's instructions and answers the user's question better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. Begin your evaluation by comparing the two responses and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. 
After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "[User Question]\n{question}\n\n[The Start of Assistant A's Answer]\n{answer_a}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{answer_b}\n[The End of Assistant B's Answer]", "description": "Prompt for general questions", "category": "general", "output_format": "[[A]]"} +{"name": "pair-v2-multi-turn", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user questions. You should choose the assistant that follows the user's instructions and answers the user's questions better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. You should focus on who provides a better answer to the second user question. Begin your evaluation by comparing the responses of the two assistants and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_a_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_a_2}\n\n<|The End of Assistant A's Conversation with User|>\n\n\n<|The Start of Assistant B's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant B:\n{answer_b_1}\n\n### User:\n{question_2}\n\n### Assistant B:\n{answer_b_2}\n\n<|The End of Assistant B's Conversation with User|>", "description": "Prompt for multi-turn general questions", "category": "general", "output_format": "[[A]]"} +{"name": "pair-math-v1", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. Your evaluation should consider correctness and helpfulness. You will be given a reference answer, assistant A's answer, and assistant B's answer. Your job is to evaluate which assistant's answer is better. Begin your evaluation by comparing both assistants' answers with the reference answer. Identify and correct any mistakes. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. 
After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "[User Question]\n{question}\n\n[The Start of Reference Answer]\n{ref_answer_1}\n[The End of Reference Answer]\n\n[The Start of Assistant A's Answer]\n{answer_a}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{answer_b}\n[The End of Assistant B's Answer]", "description": "Prompt for math questions", "category": "math", "output_format": "[[A]]"} +{"name": "pair-math-v1-multi-turn", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user questions. Your evaluation should consider correctness and helpfulness. You will be given reference answers, the assistant A's answers, the assistant B's answers. Your job is to determine which assistant provides correct and helpful answers to the second user question. Begin your evaluation by comparing both assistants' answers with the reference answers. Identify and correct any mistakes. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_a_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_a_2}\n\n<|The End of Assistant A's Conversation with User|>\n\n\n<|The Start of Assistant B's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant B:\n{answer_b_1}\n\n### User:\n{question_2}\n\n### Assistant B:\n{answer_b_2}\n\n<|The End of Assistant B's Conversation with User|>", "description": "Prompt for multi-turn general questions", "category": "general", "output_format": "[[A]]"} +{"name": "single-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Begin your evaluation by providing a short explanation. Be as objective as possible. 
After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"} +{"name": "single-math-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Reference Answer]\n{ref_answer_1}\n[The End of Reference Answer]\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"} +{"name": "single-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. You evaluation should focus on the assistant's answer to the second user question. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"} +{"name": "single-math-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. You evaluation should focus on the assistant's answer to the second question. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. 
After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"} \ No newline at end of file diff --git a/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/judges.py b/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/judges.py new file mode 100644 index 000000000..c26db1bb7 --- /dev/null +++ b/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/judges.py @@ -0,0 +1,188 @@ +import ast +import copy +import dataclasses +import json +import os +import re +import time +from pprint import pprint +from random import randrange + +import openai + +from tasks_examples.custom_tasks_with_custom_metrics.mt_bench.model_adapter import conv_templates + + +openai.api_key = os.environ["OPENAI_API_KEY"] + + +# Extract scores from judgments +two_score_pattern = re.compile("\[\[(\d+\.?\d*),\s?(\d+\.?\d*)\]\]") +two_score_pattern_backup = re.compile("\[(\d+\.?\d*),\s?(\d+\.?\d*)\]") +one_score_pattern = re.compile("\[\[(\d+\.?\d*)\]\]") +one_score_pattern_backup = re.compile("\[(\d+\.?\d*)\]") + +OPENAI_MODEL_LIST = ( + "gpt-3.5-turbo", + "gpt-3.5-turbo-0301", + "gpt-3.5-turbo-0613", + "gpt-3.5-turbo-1106", + "gpt-3.5-turbo-0125", + "gpt-4", + "gpt-4-0314", + "gpt-4-0613", + "gpt-4-turbo", + "gpt-4-1106-preview", + "gpt-4-0125-preview", +) + +# API setting constants +API_MAX_RETRY = 16 +API_RETRY_SLEEP = 10 +API_ERROR_OUTPUT = "$ERROR$" + +# Categories that need reference answers +NEED_REF_CATS = ["math", "reasoning", "coding", "arena-hard-200"] + +@dataclasses.dataclass +class Judge: + model_name: str + prompt_template: dict + ref_based: bool = False + multi_turn: bool = False + +@dataclasses.dataclass +class MatchSingle: + question: dict + model: str + answer: dict + judge: Judge + ref_answer: dict = None + multi_turn: bool = False + +def make_judge_single(judge_model, judge_prompts): + judges = {} + judges["default"] = Judge(judge_model, judge_prompts["single-v1"]) + judges["math"] = Judge(judge_model, judge_prompts["single-math-v1"], ref_based=True) + judges["default-mt"] = Judge( + judge_model, judge_prompts["single-v1-multi-turn"], multi_turn=True + ) + judges["math-mt"] = Judge( + judge_model, + judge_prompts["single-math-v1-multi-turn"], + ref_based=True, + multi_turn=True, + ) + return judges + + +def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None): + if api_dict is not None: + openai.api_base = api_dict["api_base"] + openai.api_key = api_dict["api_key"] + output = API_ERROR_OUTPUT + # return "[[1]]" + for _ in range(API_MAX_RETRY): + try: + messages = conv.to_openai_api_messages() + response = openai.ChatCompletion.create( + model=model, + messages=messages, + n=1, + temperature=temperature, + max_tokens=max_tokens, + ) + output = response["choices"][0]["message"]["content"] + break + except openai.error.OpenAIError as e: + print(type(e), e) + time.sleep(API_RETRY_SLEEP) + + return output + + +def 
load_judge_prompts(prompt_file: str): + """Load judge prompts. + + The return value is a python dict of type: + Dict[judge_name: str -> dict] + """ + prompts = {} + with open(prompt_file) as fin: + for line in fin: + line = json.loads(line) + prompts[line["name"]] = line + return prompts + + +def run_judge_single(question, answer, judge, ref_answer, multi_turn=False): + kwargs = {} + model = judge.model_name + if ref_answer is not None and len(ref_answer) > 0: + kwargs["ref_answer_1"] = ref_answer[0] + if multi_turn: + kwargs["ref_answer_2"] = ref_answer[1] + + if multi_turn: + # pprint(question[0]) + # pprint(question[1]) + # pprint(answer[0]) + # pprint(answer[1]) + # pprint(kwargs) + # pprint(judge.prompt_template["prompt_template"]) + # print("========") + user_prompt = judge.prompt_template["prompt_template"].format( + question_1=question[0], + question_2=question[1], + answer_1=answer[0], + answer_2=answer[1], + **kwargs, + ) + else: + # pprint(question[0]) + # pprint(answer[0]) + # pprint(kwargs) + # pprint(judge.prompt_template["prompt_template"]) + # pprint("========") + user_prompt = judge.prompt_template["prompt_template"].format( + question=question[0], + answer=answer[0], + **kwargs, + ) + + rating = -1 + + system_prompt = judge.prompt_template["system_prompt"] + conv = copy.deepcopy(conv_templates["chatgpt"]) + conv.set_system_message(system_prompt) + conv.append_message(conv.roles[0], user_prompt) + conv.append_message(conv.roles[1], None) + + if model in OPENAI_MODEL_LIST: + judgment = chat_completion_openai(model, conv, temperature=0, max_tokens=2048) + else: + raise ValueError(f"Invalid judge model name: {model}") + + if judge.prompt_template["output_format"] == "[[rating]]": + match = re.search(one_score_pattern, judgment) + if not match: + match = re.search(one_score_pattern_backup, judgment) + + if match: + rating = ast.literal_eval(match.groups()[0]) + else: + rating = -1 + else: + raise ValueError( + f"invalid output format: {judge.prompt_template['output_format']}" + ) + + return rating, user_prompt, judgment + + +def play_a_match_single(question, answer, ref_answer, judge, multi_turn, output_file: str): + if judge.prompt_template["type"] == "single": + score, user_prompt, judgment = run_judge_single( + question, answer, judge, ref_answer, multi_turn=multi_turn + ) + return score diff --git a/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/model_adapter.py b/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/model_adapter.py new file mode 100644 index 000000000..7239dd270 --- /dev/null +++ b/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/model_adapter.py @@ -0,0 +1,406 @@ +import base64 +import dataclasses +import math +import os +import re +import sys +import warnings +from enum import IntEnum, auto +from dataclasses import field +from io import BytesIO +from typing import Dict, List, Optional, Tuple, Union + + +IMAGE_PLACEHOLDER_STR = "$$$$" + + +class SeparatorStyle(IntEnum): + """Separator styles.""" + + ADD_COLON_SINGLE = auto() + ADD_COLON_TWO = auto() + ADD_COLON_SPACE_SINGLE = auto() + NO_COLON_SINGLE = auto() + NO_COLON_TWO = auto() + ADD_NEW_LINE_SINGLE = auto() + LLAMA2 = auto() + CHATGLM = auto() + CHATML = auto() + CHATINTERN = auto() + DOLLY = auto() + RWKV = auto() + PHOENIX = auto() + ROBIN = auto() + FALCON_CHAT = auto() + CHATGLM3 = auto() + DEEPSEEK_CHAT = auto() + METAMATH = auto() + YUAN2 = auto() + +@dataclasses.dataclass +class Conversation: + """A class that manages prompt templates and keeps all conversation 
history.""" + + # The name of this template + name: str + # The template of the system prompt + system_template: str = "{system_message}" + # The system message + system_message: str = "" + # The names of two roles + roles: Tuple[str] = ("USER", "ASSISTANT") + # All messages. Each item is (role, message). + # Each message is either a string or a tuple of (string, List[image_url]). + messages: List[List[str]] = field(default_factory=list) + # The number of few shot examples + offset: int = 0 + # The separator style and configurations + sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE + sep: str = "\n" + sep2: str = None + # Stop criteria (the default one is EOS token) + stop_str: Union[str, List[str]] = None + # Stops generation if meeting any token in this list + stop_token_ids: List[int] = None + + def get_prompt(self) -> str: + """Get the prompt for generation.""" + system_prompt = self.system_template.format(system_message=self.system_message) + if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE: + ret = system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + ": " + message + self.sep + else: + ret += role + ":" + return ret + elif self.sep_style == SeparatorStyle.ADD_COLON_TWO: + seps = [self.sep, self.sep2] + ret = system_prompt + seps[0] + for i, (role, message) in enumerate(self.messages): + if message: + if type(message) is tuple: + message, images = message + message = IMAGE_PLACEHOLDER_STR * len(images) + message + ret += role + ": " + message + seps[i % 2] + else: + ret += role + ":" + return ret + elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE: + ret = system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + ": " + message + self.sep + else: + ret += role + ": " # must be end with a space + return ret + elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE: + ret = "" if system_prompt == "" else system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + "\n" + message + self.sep + else: + ret += role + "\n" + return ret + elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE: + ret = system_prompt + for role, message in self.messages: + if message: + ret += role + message + self.sep + else: + ret += role + return ret + elif self.sep_style == SeparatorStyle.NO_COLON_TWO: + seps = [self.sep, self.sep2] + ret = system_prompt + for i, (role, message) in enumerate(self.messages): + if message: + ret += role + message + seps[i % 2] + else: + ret += role + return ret + elif self.sep_style == SeparatorStyle.RWKV: + ret = system_prompt + for i, (role, message) in enumerate(self.messages): + if message: + ret += ( + role + + ": " + + message.replace("\r\n", "\n").replace("\n\n", "\n") + ) + ret += "\n\n" + else: + ret += role + ":" + return ret + elif self.sep_style == SeparatorStyle.LLAMA2: + seps = [self.sep, self.sep2] + if self.system_message: + ret = system_prompt + else: + ret = "[INST] " + for i, (role, message) in enumerate(self.messages): + tag = self.roles[i % 2] + if message: + if i == 0: + ret += message + " " + else: + ret += tag + " " + message + seps[i % 2] + else: + ret += tag + return ret + elif self.sep_style == SeparatorStyle.CHATGLM: + # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308 + # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926 + round_add_n = 1 if self.name == "chatglm2" else 
0 + if system_prompt: + ret = system_prompt + self.sep + else: + ret = "" + + for i, (role, message) in enumerate(self.messages): + if i % 2 == 0: + ret += f"[Round {i//2 + round_add_n}]{self.sep}" + + if message: + ret += f"{role}:{message}{self.sep}" + else: + ret += f"{role}:" + return ret + elif self.sep_style == SeparatorStyle.CHATML: + ret = "" if system_prompt == "" else system_prompt + self.sep + "\n" + for role, message in self.messages: + if message: + if type(message) is tuple: + message, images = message + message = IMAGE_PLACEHOLDER_STR * len(images) + message + ret += role + "\n" + message + self.sep + "\n" + else: + ret += role + "\n" + return ret + elif self.sep_style == SeparatorStyle.CHATGLM3: + ret = "" + if self.system_message: + ret += system_prompt + for role, message in self.messages: + if message: + ret += role + "\n" + message + else: + ret += role + return ret + elif self.sep_style == SeparatorStyle.CHATINTERN: + # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771 + seps = [self.sep, self.sep2] + ret = system_prompt + for i, (role, message) in enumerate(self.messages): + if i % 2 == 0: + ret += "" + if message: + ret += role + ":" + message + seps[i % 2] + "\n" + else: + ret += role + ":" + return ret + elif self.sep_style == SeparatorStyle.DOLLY: + seps = [self.sep, self.sep2] + ret = system_prompt + for i, (role, message) in enumerate(self.messages): + if message: + ret += role + ":\n" + message + seps[i % 2] + if i % 2 == 1: + ret += "\n\n" + else: + ret += role + ":\n" + return ret + elif self.sep_style == SeparatorStyle.PHOENIX: + ret = system_prompt + for role, message in self.messages: + if message: + ret += role + ": " + "" + message + "" + else: + ret += role + ": " + "" + return ret + elif self.sep_style == SeparatorStyle.ROBIN: + ret = system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + ":\n" + message + self.sep + else: + ret += role + ":\n" + return ret + elif self.sep_style == SeparatorStyle.FALCON_CHAT: + ret = "" + if self.system_message: + ret += system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + ": " + message + self.sep + else: + ret += role + ":" + return ret + elif self.sep_style == SeparatorStyle.METAMATH: + ret = "" if system_prompt == "" else system_prompt + self.sep + for i, (role, message) in enumerate(self.messages): + # For MetaMath, sep2 is used to prefix the message. 
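
Editorial aside, not part of the patch: a minimal sketch of how this METAMATH branch renders a turn once the Conversation class in this file is complete. The role names and separator values below are illustrative, not the real MetaMath template.

from model_adapter import Conversation, SeparatorStyle  # assuming the file added above is importable as model_adapter

conv = Conversation(
    name="metamath-demo",                  # illustrative template values
    roles=("Question", "Response"),
    sep_style=SeparatorStyle.METAMATH,
    sep="\n",
    sep2="Let's think step by step.",
)
conv.append_message(conv.roles[0], "What is 3 * 7?")
conv.append_message(conv.roles[1], None)   # None leaves the assistant slot open for generation
print(conv.get_prompt())
# Expected output:
#   Question:
#   What is 3 * 7?
#   Response: Let's think step by step.
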
+ starting_sep = ":\n" if i % 2 == 0 else ": " + self.sep2 + ending_sep = self.sep if i % 2 == 0 else "" + if message: + ret += role + starting_sep + message + ending_sep + else: + ret += role + starting_sep + return ret + elif self.sep_style == SeparatorStyle.DEEPSEEK_CHAT: + seps = [self.sep, self.sep2] + ret = system_prompt + for i, (role, message) in enumerate(self.messages): + if message: + ret += role + ": " + message + seps[i % 2] + else: + ret += role + ":" + return ret + elif self.sep_style == SeparatorStyle.YUAN2: + seps = [self.sep, self.sep2] + ret = "" + if self.system_message: + ret += system_prompt + seps[1] + for _, message in self.messages: + if message: + ret += message + "" + else: + ret += "" + ret = ret.rstrip("") + seps[0] + return ret + else: + raise ValueError(f"Invalid style: {self.sep_style}") + + def get_images(self): + images = [] + for i, (role, msg) in enumerate(self.messages[self.offset :]): + if i % 2 == 0: + if type(msg) is tuple: + for image in msg[1]: + images.append(image) + + return images + + def set_system_message(self, system_message: str): + """Set the system message.""" + self.system_message = system_message + + def append_message(self, role: str, message: str): + """Append a new message.""" + self.messages.append([role, message]) + + def update_last_message(self, message: str): + """Update the last output. + + The last message is typically set to be None when constructing the prompt, + so we need to update it in-place after getting the response from a model. + """ + self.messages[-1][1] = message + + def convert_image_to_base64(self, image): + """Given an image, return the base64 encoded image string.""" + import requests + from PIL import Image + + # Load image if it has not been loaded in yet + if type(image) == str: + if image.startswith("http://") or image.startswith("https://"): + response = requests.get(image) + image = Image.open(BytesIO(response.content)).convert("RGB") + elif "base64" in image: + # OpenAI format is: data:image/jpeg;base64,{base64_encoded_image_str} + return image.split(",")[1] + else: + image = Image.open(image).convert("RGB") + + max_hw, min_hw = max(image.size), min(image.size) + aspect_ratio = max_hw / min_hw + max_len, min_len = 2048, 2048 + shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw)) + longest_edge = int(shortest_edge * aspect_ratio) + W, H = image.size + if longest_edge != max(image.size): + if H > W: + H, W = longest_edge, shortest_edge + else: + H, W = shortest_edge, longest_edge + image = image.resize((W, H)) + + buffered = BytesIO() + image.save(buffered, format="PNG") + img_b64_str = base64.b64encode(buffered.getvalue()).decode() + + return img_b64_str + + def to_gradio_chatbot(self): + """Convert the conversation to gradio chatbot format.""" + ret = [] + for i, (role, msg) in enumerate(self.messages[self.offset :]): + if i % 2 == 0: + if type(msg) is tuple: + msg, image = msg + img_b64_str = image[0] # Only one image on gradio at one time + img_str = f'user upload image' + msg = img_str + msg.replace("\n", "").strip() + + ret.append([msg, None]) + else: + ret[-1][-1] = msg + return ret + + def to_openai_api_messages(self): + """Convert the conversation to OpenAI chat completion format.""" + if self.system_message == "": + ret = [] + else: + ret = [{"role": "system", "content": self.system_message}] + + for i, (_, msg) in enumerate(self.messages[self.offset :]): + if i % 2 == 0: + ret.append({"role": "user", "content": msg}) + else: + if msg is not None: + ret.append({"role": "assistant", 
"content": msg}) + return ret + + def extract_text_from_messages(self): + return [ + (role, message[0]) if type(message) is tuple else (role, message) + for role, message in self.messages + ] + + def copy(self): + return Conversation( + name=self.name, + system_template=self.system_template, + system_message=self.system_message, + roles=self.roles, + messages=[[x, y] for x, y in self.messages], + offset=self.offset, + sep_style=self.sep_style, + sep=self.sep, + sep2=self.sep2, + stop_str=self.stop_str, + stop_token_ids=self.stop_token_ids, + ) + + def dict(self): + return { + "template_name": self.name, + "system_message": self.system_message, + "roles": self.roles, + "messages": self.extract_text_from_messages(), + "offset": self.offset, + } + + +# A global registry for all conversation templates +conv_templates: Dict[str, Conversation] = { + "chatgpt": Conversation( + name="chatgpt", + system_message="You are a helpful assistant.", + roles=("user", "assistant"), + sep_style=None, + sep=None, + ) +} diff --git a/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/mt_bench.py b/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/mt_bench.py new file mode 100644 index 000000000..11dc3fd2f --- /dev/null +++ b/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/mt_bench.py @@ -0,0 +1,120 @@ +# ruff: noqa: F405, F403, F401 +""" +Custom evaluation tasks for lighteval. Copy this file and complete it with the info for your task. +This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval. +Author: +""" + +from pprint import pprint + +import numpy as np +from aenum import extend_enum +from transformers import AutoModelForCausalLM, AutoTokenizer + +from lighteval.metrics import Metrics +from lighteval.metrics.utils import MetricCategory, MetricUseCase, SampleLevelMetric, SampleLevelMetricGrouping +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc +from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES +from tasks_examples.custom_tasks_with_custom_metrics.mt_bench.judges import ( + load_judge_prompts, + make_judge_single, + play_a_match_single, +) + + +NEED_REF_CATS = ["math", "reasoning", "coding", "arena-hard-200"] + +## EVAL WITH NO SUBSET ## +# This is how you create a simple tasks (like hellaswag) which has one single subset +# attached to it, and one evaluation possible. +task = LightevalTaskConfig( + name="mt_bench", + prompt_function="prompt_fn", # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py + suite=["custom"], + hf_repo="HuggingFaceH4/mt_bench_prompts", + hf_subset="default", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split="", + few_shots_select="random", + metric=["mt_bench_metric"], + generation_size=100, + stop_sequence=["."], +) + + +## DEFINE YOUR PROMPT FUNCTIONS +# Define as many as you need for your different tasks +def prompt_fn(line, task_name: str = None): + """Defines how to go from a dataset line to a doc object. + Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info + about what this function should do in the README. 
+ """ + return Doc( + task_name=task_name, + query=line["prompt"][0], + choices=None, + instruction="", + gold_index=[], + specific={"reference": line["reference"], "category": line["category"], "queries": line["prompt"]}, + ) + + + + +def mt_bench_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]: + """Defines how to go from a list of predictions to a score. + Follow examples in src/lighteval/metrics/metrics.py, or get more info + about what this function should do in the README. + """ + judge_model = "gpt-3.5-turbo" + judge_file = "/Users/nathan/Repos/lighteval/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/judge_prompts.jsonl" + judge_prompts = load_judge_prompts(judge_file) + judges = make_judge_single(judge_model, judge_prompts) + + question = formatted_doc.specific["queries"] + ref_answer = formatted_doc.specific["reference"] + category = formatted_doc.specific["category"] + + if category not in NEED_REF_CATS: + score = play_a_match_single(question, predictions, ref_answer, judges["default"], multi_turn=False, output_file=None) + score_mt = play_a_match_single(question, predictions, ref_answer, judges["default-mt"], multi_turn=True, output_file=None) + else: + try: + score = play_a_match_single(question, predictions, ref_answer, judges["math"], multi_turn=False, output_file=None) + score_mt = play_a_match_single(question, predictions, ref_answer, judges["math-mt"], multi_turn=True, output_file=None) + except KeyError: + print(f"Category {category} not found in judge prompts, using default judge") + score = play_a_match_single(question, predictions, ref_answer, judges["default"], multi_turn=False, output_file=None) + score_mt = play_a_match_single(question, predictions, ref_answer, judges["default-mt"], multi_turn=True, output_file=None) + + return score + + +mt_bench_metric = SampleLevelMetric( + metric="mt_bench_metric", + higher_is_better=True, + category=MetricCategory.GENERATIVE_MULTI_TURN, + use_case=MetricUseCase.SUMMARIZATION, + sample_level_fn=mt_bench_metric, + corpus_level_fn=np.mean, +) + + +## STORE YOUR EVALS +_TASKS = [task] + +## MODULE LOGIC +# You should not need to touch this +# Convert to dict for lighteval +TASKS_TABLE = [task.as_dict() for task in _TASKS] +extend_enum( + Metrics, + "mt_bench_metric", + mt_bench_metric, +) + +if __name__ == "__main__": + print(t["name"] for t in TASKS_TABLE) + print(len(TASKS_TABLE)) From 5cc9c2c04ce877c84691f08831b447bc0859641b Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 28 Feb 2024 14:32:35 +0100 Subject: [PATCH 06/45] remove ifeval --- .../ifeval/ifeval.py | 148 -- .../ifeval/instructions.py | 1531 --------------- .../ifeval/instructions_registry.py | 167 -- .../ifeval/instructions_utils.py | 1681 ----------------- 4 files changed, 3527 deletions(-) delete mode 100644 tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py delete mode 100644 tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions.py delete mode 100644 tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_registry.py delete mode 100644 tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_utils.py diff --git a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py b/tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py deleted file mode 100644 index 682a72c87..000000000 --- a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py +++ /dev/null @@ -1,148 +0,0 @@ -import numpy as np -from aenum import extend_enum - -import 
tasks_examples.custom_tasks_with_custom_metrics.ifeval.instructions_registry as instructions_registry -from lighteval.metrics import Metrics -from lighteval.metrics.utils import ( - MetricCategory, - MetricUseCase, - SampleLevelMetricGrouping, -) -from lighteval.tasks.lighteval_task import LightevalTaskConfig -from lighteval.tasks.requests import Doc - - -# We create the task config -ifeval = LightevalTaskConfig( - name="ifeval", - prompt_function="ifeval_prompt", - suite=["custom"], - hf_repo="wis-k/instruction-following-eval", - hf_subset="default", - metric=["ifeval_metric"], - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split="train", - few_shots_select="random_sampling", - generation_size=1280, # to check - stop_sequence=None, # to check -) - - -def ifeval_prompt(line, task_name: str = None): - return Doc( - task_name=task_name, - query=line["prompt"], - choices=[ - None - ], # very specific task where there are no precise outputs but instead we test if the format obeys rules - gold_index=0, # very specific task where there are no precise outputs but instead we test if the format obeys rules - instruction="", - specific={"instructions_id_list": line["instruction_id_list"], "kwargs": line["kwargs"]}, - ) - - -submetric_names = [ - "prompt_level_strict_acc", - "inst_level_strict_acc", - "prompt_level_loose_acc", - "inst_level_loose_acc", -] - - -def ifeval_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> float: - response = predictions[0] - - # Strict instructions - instruction_list = formatted_doc.specific["instructions_id_list"] - all_kwargs = formatted_doc.specific["kwargs"] - prompt = formatted_doc.query - - # Loose instructions - r = response.split("\n") - response_remove_first = "\n".join(r[1:]).strip() - response_remove_last = "\n".join(r[:-1]).strip() - response_remove_both = "\n".join(r[1:-1]).strip() - revised_response = response.replace("*", "") - revised_response_remove_first = response_remove_first.replace("*", "") - revised_response_remove_last = response_remove_last.replace("*", "") - revised_response_remove_both = response_remove_both.replace("*", "") - all_responses = [ - response, - revised_response, - response_remove_first, - response_remove_last, - response_remove_both, - revised_response_remove_first, - revised_response_remove_last, - revised_response_remove_both, - ] - - is_following_list_strict = [] - is_following_list_loose = [] - - for index, instruction_id in enumerate(instruction_list): - instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id] - instruction = instruction_cls(instruction_id) - - # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method. 
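
Editorial aside, not part of the patch: a small sketch of the kwargs clean-up this comment describes. The dataset apparently pads each per-instruction kwargs dict with None entries, so only truthy values are forwarded to build_description; the instruction id and values below are illustrative.

from tasks_examples.custom_tasks_with_custom_metrics.ifeval.instructions import NumberOfWords  # module path as used in this patch

raw_kwargs = {"num_words": 120, "relation": "at least", "keyword": None, "frequency": None}  # illustrative dataset row
clean_kwargs = {k: v for k, v in raw_kwargs.items() if v}       # drops the None placeholders
instruction = NumberOfWords("length_constraints:number_words")  # id string is illustrative
instruction.build_description(**clean_kwargs)                   # keeping the None keys would raise an unexpected keyword argument error
print(instruction.check_following("word " * 150))               # True: 150 words is at least 120
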
- task_kwargs = {k: v for k, v in all_kwargs[index].items() if v} - instruction.build_description(**task_kwargs) - args = instruction.get_instruction_args() - if args and "prompt" in args: - instruction.build_description(prompt=prompt) - - # Strict - if response.strip() and instruction.check_following(response): - is_following_list_strict.append(True) - else: - is_following_list_strict.append(False) - - # Loose - is_following = False - for r in all_responses: - if r.strip() and instruction.check_following(r): - is_following = True - break - - is_following_list_loose.append(is_following) - - return { - "prompt_level_strict_acc": int(all(is_following_list_strict)), - "inst_level_strict_acc": is_following_list_strict, - "prompt_level_loose_acc": int(all(is_following_list_loose)), - "inst_level_loose_acc": is_following_list_loose, - } - - -def agg_inst_level_acc(items): - flat_items = [item for sublist in items for item in sublist] - inst_level_acc = sum(flat_items) / len(flat_items) - return inst_level_acc - - -ifeval_metrics = SampleLevelMetricGrouping( - metric=submetric_names, - higher_is_better={n: True for n in submetric_names}, - category=MetricCategory.GENERATIVE, - use_case=MetricUseCase.ACCURACY, - sample_level_fn=ifeval_metric, - corpus_level_fn={ - "prompt_level_strict_acc": np.mean, - "inst_level_strict_acc": agg_inst_level_acc, - "prompt_level_loose_acc": np.mean, - "inst_level_loose_acc": agg_inst_level_acc, - }, -) - - -_TASKS = [ifeval] - -# Convert to dict for lighteval -TASKS_TABLE = [task.as_dict() for task in _TASKS] -extend_enum(Metrics, "ifeval_metric", ifeval_metrics) - -if __name__ == "__main__": - # Adds the metric to the metric list! - print(t["name"] for t in TASKS_TABLE) - print(len(TASKS_TABLE)) diff --git a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions.py b/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions.py deleted file mode 100644 index 6af99d819..000000000 --- a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions.py +++ /dev/null @@ -1,1531 +0,0 @@ -# Copyright 2023 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Library of instructions.""" -import collections -import json -import logging -import random -import re -import string -from typing import Dict, Optional, Sequence, Union - -import langdetect - -import tasks_examples.custom_tasks_with_custom_metrics.ifeval.instructions_utils as instructions_util - - -logger = logging.getLogger(__name__) - -_InstructionArgsDtype = Optional[Dict[str, Union[int, str, Sequence[str]]]] - -_LANGUAGES = instructions_util.LANGUAGE_CODES - -# The relational operation for comparison. -_COMPARISON_RELATION = ("less than", "at least") - -# The maximum number of sentences. -_MAX_NUM_SENTENCES = 20 - -# The number of placeholders. -_NUM_PLACEHOLDERS = 4 - -# The number of bullet lists. -_NUM_BULLETS = 5 - -# The options of constrained response. 
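
Editorial aside, not part of the patch: a quick illustration of the aggregation performed by agg_inst_level_acc above, with made-up values. The prompt-level columns are averaged directly with np.mean, while the instruction-level columns are flattened across samples first.

per_sample_results = [[True, False], [True], [False, False, True]]          # one list of instruction checks per prompt
flat_items = [item for sublist in per_sample_results for item in sublist]   # same flattening as agg_inst_level_acc
print(sum(flat_items) / len(flat_items))                                    # 0.5: 3 of the 6 individual instructions were followed
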
-_CONSTRAINED_RESPONSE_OPTIONS = ( - "My answer is yes.", - "My answer is no.", - "My answer is maybe.", -) - -# The options of starter keywords. -_STARTER_OPTIONS = ( - "I would say", - "My answer is", - "I believe", - "In my opinion", - "I think", - "I reckon", - "I feel", - "From my perspective", - "As I see it", - "According to me", - "As far as I'm concerned", - "To my understanding", - "In my view", - "My take on it is", - "As per my perception", -) - -# The options of ending keywords. -# TODO(jeffreyzhou) add more ending options -_ENDING_OPTIONS = ("Any other questions?", "Is there anything else I can help with?") - -# The number of highlighted sections. -_NUM_HIGHLIGHTED_SECTIONS = 4 - -# The section spliter. -_SECTION_SPLITER = ("Section", "SECTION") - -# The number of sections. -_NUM_SECTIONS = 5 - -# The number of paragraphs. -_NUM_PARAGRAPHS = 5 - -# The postscript marker. -_POSTSCRIPT_MARKER = ("P.S.", "P.P.S") - -# The number of keywords. -_NUM_KEYWORDS = 2 - -# The occurrences of a single keyword. -_KEYWORD_FREQUENCY = 3 - -# The occurrences of a single letter. -_LETTER_FREQUENCY = 10 - -# The occurrences of words with all capital letters. -_ALL_CAPITAL_WORD_FREQUENCY = 20 - -# The number of words in the response. -_NUM_WORDS_LOWER_LIMIT = 100 -_NUM_WORDS_UPPER_LIMIT = 500 - - -class Instruction: - """An instruction template.""" - - def __init__(self, instruction_id): - self.id = instruction_id - - def build_description(self, **kwargs): - raise NotImplementedError("`build_description` not implemented.") - - def get_instruction_args(self): - raise NotImplementedError("`get_instruction_args` not implemented.") - - def get_instruction_args_keys(self): - raise NotImplementedError("`get_instruction_args_keys` not implemented.") - - def check_following(self, value): - raise NotImplementedError("`check_following` not implemented.") - - -class ResponseLanguageChecker(Instruction): - """Check the language of the entire response.""" - - def build_description(self, *, language=None): - """Build the instruction description. - - Args: - language: A string representing the expected language of the response. The - language has to comply to the 97 types defined in - `langid.py` (https://pypi.org/project/langid/1.1.5/), which follows - ISO 639-1 codes (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes); - for example, `en` for English, `zh` for Chinese, `fr` for French. - - Returns: - A string representing the instruction description. - """ - self._language = language - if self._language is None: - self._language = random.choice(list(_LANGUAGES.keys())) - # TODO(tianjianlu): opens the description generation to more choices. - self._description_pattern = ( - "Your ENTIRE response should be in {language} language, no other " + "language is allowed." - ) - return self._description_pattern.format(language=_LANGUAGES[self._language]) - - def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" - return {"language": self._language} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["language"] - - def check_following(self, value): - """Check if the language of the entire response follows the instruction. - - Args: - value: A string representing the response. - - Returns: - True if the language of `value` follows instruction; otherwise False. 
- """ - assert isinstance(value, str) - - try: - return langdetect.detect(value) == self._language - except langdetect.LangDetectException as e: - # Count as instruction is followed. - logging.error("Unable to detect language for text %s due to %s", value, e) # refex: disable=pytotw.037 - return True - - -class NumberOfSentences(Instruction): - """Check the number of sentences.""" - - def build_description(self, *, num_sentences=None, relation=None): - """Build the instruction description. - - Args: - num_sentences: An integer specifying the number of sentences as a - threshold. - relation: A string in (`less than`, `at least`), defining the relational - operator for comparison. - Two relational comparisons are supported for now: - if 'less than', the actual number of sentences < the threshold; - if 'at least', the actual number of sentences >= the threshold. - - Returns: - A string representing the instruction description. - """ - # The number of sentences as a threshold for comparison. - self._num_sentences_threshold = num_sentences - if self._num_sentences_threshold is None or self._num_sentences_threshold < 0: - self._num_sentences_threshold = random.randint(1, _MAX_NUM_SENTENCES) - - if relation is None: - self._comparison_relation = random.choice(_COMPARISON_RELATION) - elif relation not in _COMPARISON_RELATION: - raise ValueError( - "The supported relation for comparison must be in " f"{_COMPARISON_RELATION}, but {relation} is given." - ) - else: - self._comparison_relation = relation - - self._description_pattern = "Your response should contain {relation} {num_sentences} sentences." - return self._description_pattern.format( - relation=self._comparison_relation, - num_sentences=self._num_sentences_threshold, - ) - - def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" - return { - "num_sentences": self._num_sentences_threshold, - "relation": self._comparison_relation, - } - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["num_sentences", "relation"] - - def check_following(self, value): - """Check if the number of sentences follows the instruction. - - Args: - value: A string representing the response. - - Returns: - True if the response follows the instruction. - - Raise: - ValueError if the string in `instruction_args` is not in - [`less_than`, `at_least`]. - """ - num_sentences = instructions_util.count_sentences(value) - if self._comparison_relation == _COMPARISON_RELATION[0]: - return num_sentences < self._num_sentences_threshold - elif self._comparison_relation == _COMPARISON_RELATION[1]: - return num_sentences >= self._num_sentences_threshold - - -class PlaceholderChecker(Instruction): - """Check the placeholders in template writing.""" - - def build_description(self, *, num_placeholders=None): - """Build the instruction description. - - Args: - num_placeholders: An integer denoting the minimum number of - placeholders required in the response. - - Returns: - A string representing the instruction description. - """ - self._num_placeholders = num_placeholders - if self._num_placeholders is None or self._num_placeholders < 0: - self._num_placeholders = random.randint(1, _NUM_PLACEHOLDERS) - self._description_pattern = ( - "The response must contain at least {num_placeholders} placeholders " - + "represented by square brackets, such as [address]." 
- ) - return self._description_pattern.format(num_placeholders=self._num_placeholders) - - def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" - return {"num_placeholders": self._num_placeholders} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["num_placeholders"] - - def check_following(self, value): - """Check if the number of placeholders follows the instruction. - - Args: - value: A string representing the response. - - Returns: - True if the actual number of placeholders in the response is greater than - or equal to `num_placeholders`; otherwise, False. - """ - placeholders = re.findall(r"\[.*?\]", value) - num_placeholders = len(placeholders) - return num_placeholders >= self._num_placeholders - - -class BulletListChecker(Instruction): - """Checks the bullet list in the prompt.""" - - def build_description(self, *, num_bullets=None): - """Build the instruction description. - - Args: - num_bullets: An integer specifying the exact number of bullet lists - that is required to appear in the response. - - Returns: - A string representing the instruction description. - """ - self._num_bullets = num_bullets - if self._num_bullets is None or self._num_bullets < 0: - self._num_bullets = random.randint(1, _NUM_BULLETS) - self._description_pattern = ( - "Your answer must contain exactly {num_bullets} bullet points. " - + "Use the markdown bullet points such as:\n" - + "* This is point 1. \n" - + "* This is point 2" - ) - return self._description_pattern.format(num_bullets=self._num_bullets) - - def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" - return {"num_bullets": self._num_bullets} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["num_bullets"] - - def check_following(self, value): - r"""Check if the number of bullet lists meets the requirement. - - Args: - value: A string representing the response. The response is expected to - contain some bullet lists that start with `\*`. - - Returns: - True if the actual number of bullet lists in the response meets the - requirement. - """ - bullet_lists = re.findall(r"^\s*\*[^\*].*$", value, flags=re.MULTILINE) - bullet_lists_2 = re.findall(r"^\s*-.*$", value, flags=re.MULTILINE) - num_bullet_lists = len(bullet_lists) + len(bullet_lists_2) - return num_bullet_lists == self._num_bullets - - -class ConstrainedResponseChecker(Instruction): - """Checks the constrained response.""" - - def build_description(self): - """Build the instruction description.""" - # A sequence of string(s) representing the options of the expected response. - self._constrained_responses = _CONSTRAINED_RESPONSE_OPTIONS - self._description_pattern = "Answer with one of the following options: {response_options}" - return self._description_pattern.format(response_options=self._constrained_responses) - - def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response matches the constrained options. - - Args: - value: A string representing the response. - - Returns: - True if the actual response contains one of the options in the constrained - responses; otherwise False. 
- """ - value = value.strip() - for constrained_response in self._constrained_responses: - if constrained_response in value: - return True - return False - - -class ConstrainedStartChecker(Instruction): - """Checks the response start.""" - - def build_description(self, *, starter=None): - """Build the instruction description. - - Args: - starter: A string representing the keyward that the response should start - with. - - Returns: - A string representing the instruction description. - """ - self._starter = starter.strip() if isinstance(starter, str) else starter - if self._starter is None: - self._starter = random.choice(_STARTER_OPTIONS) - self._description_pattern = ( - "During the conversation, when it is your turn, " + "please always start with {starter}" - ) - return self._description_pattern.format(starter=self._starter) - - def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" - return {"starter": self._starter} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["starter"] - - def check_following(self, value): - """Checks if the response starts with the constrained keyword or phrase. - - Args: - value: A string representing the response. - - Returns: - True if the response starts with the given phrase or keyword that is - contained in `instruction_args`; otherwise, False. - """ - response_pattern = r"^\s*" + self._starter + r".*$" - response_with_constrained_start = re.search(response_pattern, value, flags=re.MULTILINE) - return True if response_with_constrained_start else False - - -class HighlightSectionChecker(Instruction): - """Checks the highlighted section.""" - - def build_description(self, *, num_highlights=None): - """Build the instruction description. - - Args: - num_highlights: An integer specifying the minimum number of highlighted - sections. - - Returns: - A string representing the instruction description. - """ - self._num_highlights = num_highlights - if self._num_highlights is None or self._num_highlights < 0: - self._num_highlights = random.randint(1, _NUM_HIGHLIGHTED_SECTIONS) - - self._description_pattern = ( - "Highlight at least {num_highlights} sections in your answer with " - + "markdown, i.e. *highlighted section*." - ) - - return self._description_pattern.format(num_highlights=self._num_highlights) - - def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" - return {"num_highlights": self._num_highlights} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["num_highlights"] - - def check_following(self, value): - """Checks if the number of highlighted sections meets the requirement. - - Args: - value: a string repesenting the response. The response is expected to - contain highlighted sections in the format of *highlighted*. - - Returns: - True if the actual number of highlighted sections in the format of - *highlighed sections* meets the minimum requirement; otherwise False. 
- """ - num_highlights = 0 - highlights = re.findall(r"\*[^\n\*]*\*", value) - double_highlights = re.findall(r"\*\*[^\n\*]*\*\*", value) - for highlight in highlights: - if highlight.strip("*").strip(): - num_highlights += 1 - for highlight in double_highlights: - if highlight.removeprefix("**").removesuffix("**").strip(): - num_highlights += 1 - - return num_highlights >= self._num_highlights - - -class SectionChecker(Instruction): - """Checks the sections.""" - - def build_description(self, *, section_spliter=None, num_sections=None): - """Build the instruction description. - - Args: - section_spliter: A string represents the section spliter keyword that - marks a new section, i.e., `Section` or `SECTION`. - num_sections: An integer specifying the number of sections. - - Returns: - A string representing the instruction description. - """ - self._section_spliter = section_spliter.strip() if isinstance(section_spliter, str) else section_spliter - if self._section_spliter is None: - self._section_spliter = random.choice(_SECTION_SPLITER) - - self._num_sections = num_sections - if self._num_sections is None or self._num_sections < 0: - self._num_sections = random.randint(1, _NUM_SECTIONS) - - self._description_pattern = ( - "Your response must have {num_sections} sections. Mark the beginning " - + "of each section with {section_spliter} X, such as:\n" - + "{section_spliter} 1\n" - + "[content of section 1]\n" - + "{section_spliter} 2\n" - + "[content of section 2]" - ) - - return self._description_pattern.format(num_sections=self._num_sections, section_spliter=self._section_spliter) - - def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" - return { - "section_spliter": self._section_spliter, - "num_sections": self._num_sections, - } - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["section_spliter", "num_sections"] - - def check_following(self, value): - """Checks the response contains multiple sections. - - Args: - value: A string representing the response. The response is expected - to contain multiple sections (number of sections is greater than 1). - A new section starts with `Section 1`, where the number denotes the - section index. - - Returns: - True if the number of sections in the response is greater than or equal to - the minimum number of sections; otherwise, False. - """ - section_splitter_patten = r"\s?" + self._section_spliter + r"\s?\d+\s?" - sections = re.split(section_splitter_patten, value) - num_sections = len(sections) - 1 - return num_sections >= self._num_sections - - -class ParagraphChecker(Instruction): - """Checks the paragraphs.""" - - def build_description(self, *, num_paragraphs=None): - """Build the instruction description. - - Args: - num_paragraphs: An integer specifying the number of paragraphs. - - Returns: - A string representing the instruction description. - """ - self._num_paragraphs = num_paragraphs - if self._num_paragraphs is None or self._num_paragraphs < 0: - self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS) - - self._description_pattern = ( - "There should be {num_paragraphs} paragraphs. 
" + "Paragraphs are separated with the markdown divider: ***" - ) - - return self._description_pattern.format(num_paragraphs=self._num_paragraphs) - - def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" - return {"num_paragraphs": self._num_paragraphs} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["num_paragraphs"] - - def check_following(self, value): - """Checks the response contains required number of paragraphs. - - Args: - value: A string representing the response. The response may contain - paragraphs that are separated by the markdown divider: `***`. - - Returns: - True if the actual number of paragraphs is the same as required; - otherwise, False. - """ - paragraphs = re.split(r"\s?\*\*\*\s?", value) - num_paragraphs = len(paragraphs) - - for index, paragraph in enumerate(paragraphs): - if not paragraph.strip(): - if index == 0 or index == len(paragraphs) - 1: - num_paragraphs -= 1 - else: - return False - - return num_paragraphs == self._num_paragraphs - - -class PostscriptChecker(Instruction): - """Checks the postscript.""" - - def build_description(self, *, postscript_marker=None): - """Build the instruction description. - - Args: - postscript_marker: A string containing the keyword that marks the start - of the postscript section. - - Returns: - A string representing the instruction description. - """ - self._postscript_marker = ( - postscript_marker.strip() if isinstance(postscript_marker, str) else postscript_marker - ) - if self._postscript_marker is None: - self._postscript_marker = random.choice(_POSTSCRIPT_MARKER) - - self._description_pattern = ( - "At the end of your response, please explicitly add a postscript " + "starting with {postscript}" - ) - - return self._description_pattern.format(postscript=self._postscript_marker) - - def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" - return {"postscript_marker": self._postscript_marker} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["postscript_marker"] - - def check_following(self, value): - """Checks if the response follows the postscript format. - - Args: - value: a string representing the response. The response is expected to - contain a postscript section. - - Returns: - True if the response contains a postscript section starting with - the keyword containing in the `instruction_args`; otherwise False. - """ - value = value.lower() - if self._postscript_marker == "P.P.S": - postscript_pattern = r"\s*p\.\s?p\.\s?s.*$" - elif self._postscript_marker == "P.S.": - postscript_pattern = r"\s*p\.\s?s\..*$" - else: - postscript_pattern = r"\s*" + self._postscript_marker.lower() + r".*$" - postscript = re.findall(postscript_pattern, value, flags=re.MULTILINE) - return True if postscript else False - - -class RephraseChecker(Instruction): - """Checks the repharse.""" - - def build_description(self, *, original_message): - """Build the instruction description. - - Args: - original_message: A string representing the original message. The - rephrased response should only change its words/sentences in between - its two asterisks, for example, *change me*. Both original and rephrased - messages should contain the changes in the form of *change me*. - - Returns: - A string representing the instruction description. 
- """ - if not self.is_change(original_message): - raise ValueError(f"Message {original_message} does not contain changes " "in the form of *change me*.") - - self._reference_without_change = original_message - self._description = ( - "Rephrasing: Your rephrased response should only" - + "change the words/sentences in between two asterisks" - + "such as *change me*." - ) - return self._description - - def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" - return {"original_message": self._reference_without_change} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["original_message"] - - def check_following(self, value): - r"""Checks if the rephrasing follows the instruction. - - Args: - value: A string representing the response, which is expected to rephras - the string of `instruction_args`. - - Returns: - True if `value` and `instruction_args` only differ by the words/sentences - in between two asterisks such as *change me*; otherwise, False. - """ - - if not self.is_change(value): - raise ValueError(f"value {value} does not contain " "changes in the form of *change me*.") - - response_without_changes = self.strip_changes(value) - reference_without_changes = self.strip_changes(self._reference_without_change) - - return response_without_changes == reference_without_changes - - def is_change(self, response): - """Check if there is change in the response in the form of *change me*.""" - return re.search(r"\*.*\*", response) - - def strip_changes(self, response): - """Strips off the changes.""" - return re.sub(r"\*.*\*", "", response) - - -class KeywordChecker(Instruction): - """Check the exisitence of certain keywords.""" - - def build_description(self, *, keywords=None): - """Build the instruction description. - - Args: - keywords: A sequence of strings representing the keywords that are - expected in the response. - - Returns: - A string representing the instruction description. - """ - - if not keywords: - self._keywords = instructions_util.generate_keywords(num_keywords=_NUM_KEYWORDS) - else: - self._keywords = keywords - self._keywords = sorted(self._keywords) - - self._description_pattern = "Include keywords {keywords} in the response." - - return self._description_pattern.format(keywords=self._keywords) - - def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" - return {"keywords": self._keywords} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["keywords"] - - def check_following(self, value): - """Check if the response contain the expected keywords.""" - for keyword in self._keywords: - if not re.search(keyword, value, flags=re.IGNORECASE): - return False - return True - - -class KeywordFrequencyChecker(Instruction): - """Check the keyword frequency.""" - - def build_description(self, *, keyword=None, frequency=None, relation=None): - """Build the instruction description. - - Args: - keyword: A string representing a keyword that is expected in the response. - frequency: An integer specifying the number of times `keyword` is expected - to appear in the response. - relation: A string in (`less than`, `at least`), defining the relational - operator for comparison. - Two relational comparisons are supported for now: - if 'less than', the actual number of occurrences < frequency; - if 'at least', the actual number of occurrences >= frequency. - - Returns: - A string representing the instruction description. 
- """ - if not keyword: - self._keyword = instructions_util.generate_keywords(num_keywords=1)[0] - else: - self._keyword = keyword.strip() - - self._frequency = frequency - if self._frequency is None or self._frequency < 0: - self._frequency = random.randint(1, _KEYWORD_FREQUENCY) - - if relation is None: - self._comparison_relation = random.choice(_COMPARISON_RELATION) - elif relation not in _COMPARISON_RELATION: - raise ValueError( - "The supported relation for comparison must be in " f"{_COMPARISON_RELATION}, but {relation} is given." - ) - else: - self._comparison_relation = relation - - self._description_pattern = ( - "In your response, the word {keyword} should appear {relation} " + "{frequency} times." - ) - - return self._description_pattern.format( - keyword=self._keyword, - relation=self._comparison_relation, - frequency=self._frequency, - ) - - def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" - return { - "keyword": self._keyword, - "frequency": self._frequency, - "relation": self._comparison_relation, - } - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["keyword", "frequency", "relation"] - - def check_following(self, value): - """Checks if the response contain the keyword with required frequency.""" - actual_occurrences = len(re.findall(self._keyword, value, flags=re.IGNORECASE)) - - if self._comparison_relation == _COMPARISON_RELATION[0]: - return actual_occurrences < self._frequency - elif self._comparison_relation == _COMPARISON_RELATION[1]: - return actual_occurrences >= self._frequency - - -class NumberOfWords(Instruction): - """Checks the number of words.""" - - def build_description(self, *, num_words=None, relation=None): - """Build the instruction description. - - Args: - num_words: An integer specifying the number of words contained in the - response. - relation: A string in (`less than`, `at least`), defining the relational - operator for comparison. - Two relational comparisons are supported for now: - if 'less than', the actual number of words < num_words; - if 'at least', the actual number of words >= num_words. - - Returns: - A string representing the instruction description. - """ - - self._num_words = num_words - if self._num_words is None or self._num_words < 0: - self._num_words = random.randint(_NUM_WORDS_LOWER_LIMIT, _NUM_WORDS_UPPER_LIMIT) - - if relation is None: - self._comparison_relation = random.choice(_COMPARISON_RELATION) - elif relation not in _COMPARISON_RELATION: - raise ValueError( - "The supported relation for comparison must be in " f"{_COMPARISON_RELATION}, but {relation} is given." - ) - else: - self._comparison_relation = relation - - self._description_pattern = "Answer with {relation} {num_words} words." 
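
Editorial aside, not part of the patch: a short sketch of the shared "less than" / "at least" comparison contract, using the KeywordFrequencyChecker defined above; the instruction id and arguments are illustrative.

from tasks_examples.custom_tasks_with_custom_metrics.ifeval.instructions import KeywordFrequencyChecker  # module path as used in this patch

checker = KeywordFrequencyChecker("keywords:frequency")  # id string is illustrative
checker.build_description(keyword="data", frequency=2, relation="at least")
print(checker.check_following("Data pipelines need data tests."))  # True: two case-insensitive matches, and 2 >= 2
print(checker.check_following("No matching keyword here."))        # False: zero matches is not at least 2
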
- - return self._description_pattern.format(relation=self._comparison_relation, num_words=self._num_words) - - def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" - return {"num_words": self._num_words, "relation": self._comparison_relation} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["num_words", "relation"] - - def check_following(self, value): - """Checks if the response contains the expected number of words.""" - num_words = instructions_util.count_words(value) - - if self._comparison_relation == _COMPARISON_RELATION[0]: - return num_words < self._num_words - elif self._comparison_relation == _COMPARISON_RELATION[1]: - return num_words >= self._num_words - - -class JsonFormat(Instruction): - """Check the Json format.""" - - def build_description(self): - self._description_pattern = ( - "Entire output should be wrapped in JSON format. You can use markdown" " ticks such as ```." - ) - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - value = ( - value.strip() - .removeprefix("```json") - .removeprefix("```Json") - .removeprefix("```JSON") - .removeprefix("```") - .removesuffix("```") - .strip() - ) - try: - json.loads(value) - except ValueError: - return False - return True - - -class ParagraphFirstWordCheck(Instruction): - """Check the paragraph and the first word of the nth paragraph.""" - - def build_description(self, num_paragraphs=None, nth_paragraph=None, first_word=None): - r"""Build the instruction description. - - Args: - num_paragraphs: An integer indicating the number of paragraphs expected - in the response. A paragraph is a subset of the string that is - expected to be separated by '\n\n'. - nth_paragraph: An integer indicating the paragraph number that we look at. - Note that n starts from 1. - first_word: A string that represent the first word of the bth paragraph. - - Returns: - A string representing the instruction description. - """ - self._num_paragraphs = num_paragraphs - if self._num_paragraphs is None or self._num_paragraphs < 0: - self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS) - - self._nth_paragraph = nth_paragraph - if self._nth_paragraph is None or self._nth_paragraph <= 0 or self._nth_paragraph > self._num_paragraphs: - self._nth_paragraph = random.randint(1, self._num_paragraphs + 1) - - self._first_word = first_word - if self._first_word is None: - self._first_word = instructions_util.generate_keywords(num_keywords=1)[0] - self._first_word = self._first_word.lower() - - self._description_pattern = ( - "There should be {num_paragraphs} paragraphs. " - + "Paragraphs and only paragraphs are separated with each other by two " - + "new lines as if it was '\\n\\n' in python. " - + "Paragraph {nth_paragraph} must start with word {first_word}." 
- ) - - return self._description_pattern.format( - num_paragraphs=self._num_paragraphs, - nth_paragraph=self._nth_paragraph, - first_word=self._first_word, - ) - - def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" - return { - "num_paragraphs": self._num_paragraphs, - "nth_paragraph": self._nth_paragraph, - "first_word": self._first_word, - } - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["num_paragraphs", "nth_paragraph", "first_word"] - - def check_following(self, value): - """Checks for required number of paragraphs and correct first word. - - Args: - value: a string representing the response. The response may contain - paragraphs that are separated by two new lines and the first word of - the nth paragraph will have to match a specified word. - - Returns: - True if the number of paragraphs is the same as required and the first - word of the specified paragraph is the same as required. Otherwise, false. - """ - - paragraphs = re.split(r"\n\n", value) - num_paragraphs = len(paragraphs) - - for paragraph in paragraphs: - if not paragraph.strip(): - num_paragraphs -= 1 - - # check that index doesn't go out of bounds - if self._nth_paragraph <= num_paragraphs: - paragraph = paragraphs[self._nth_paragraph - 1].strip() - if not paragraph: - return False - else: - return False - - first_word = "" - punctuation = {".", ",", "?", "!", "'", '"'} - - # get first word and remove punctuation - word = paragraph.split()[0].strip() - # TODO(jeffrey): make more complex? - word = word.lstrip("'") - word = word.lstrip('"') - - for letter in word: - if letter in punctuation: - break - first_word += letter.lower() - - return num_paragraphs == self._num_paragraphs and first_word == self._first_word - - -# TODO(jeffrey) add relation - at least/at most? -class KeySentenceChecker(Instruction): - """Check the existence of certain key sentences.""" - - def build_description(self, key_sentences=None, num_sentences=None): - """Build the instruction description. - - Args: - key_sentences: A sequences of strings representing the key sentences that - are expected in the response. - num_sentences: The number of key sentences that are expected to be seen in - the response. - - Returns: - A string representing the instruction description. - """ - - if not key_sentences: - # TODO(jeffrey) make a generate sentences function? 
wonderwords package - self._key_sentences = set("For now, this is fine.") - else: - self._key_sentences = key_sentences - - if not num_sentences: - self._num_sentences = random.randint(1, len(self._key_sentences)) - else: - self._num_sentences = num_sentences - - self._description_pattern = "Include {num_sentences} of the following sentences {key_sentences}" - - return self._description_pattern.format(num_sentences=self._num_sentences, key_sentences=self._key_sentences) - - def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" - return { - "num_sentences": self._num_sentences, - "key_sentences": list(self._key_sentences), - } - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["num_sentences", "key_sentences"] - - def check_following(self, value): - """Checks if the response contains the expected key sentences.""" - count = 0 - sentences = instructions_util.split_into_sentences(value) - for sentence in self._key_sentences: - if sentence in sentences: - count += 1 - - return count == self._num_sentences - - -class ForbiddenWords(Instruction): - """Checks that specified words are not used in response.""" - - def build_description(self, forbidden_words=None): - """Build the instruction description. - - Args: - forbidden_words: A sequences of strings respresenting words that are not - allowed in the response. - - Returns: - A string representing the instruction description. - """ - - if not forbidden_words: - self._forbidden_words = instructions_util.generate_keywords(num_keywords=_NUM_KEYWORDS) - else: - self._forbidden_words = list(set(forbidden_words)) - self._forbidden_words = sorted(self._forbidden_words) - self._description_pattern = "Do not include keywords {forbidden_words} in the response." - - return self._description_pattern.format(forbidden_words=self._forbidden_words) - - def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" - return {"forbidden_words": self._forbidden_words} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["forbidden_words"] - - def check_following(self, value): - """Check if the response does not contain the expected keywords.""" - for word in self._forbidden_words: - if re.search(r"\b" + word + r"\b", value, flags=re.IGNORECASE): - return False - return True - - -class RephraseParagraph(Instruction): - """Checks that the paragraph is rephrased.""" - - def build_description(self, *, original_paragraph, low, high): - """Builds the instruction description. - - Args: - original_paragraph: A string presenting the original paragraph. The - rephrases response should have betweeb low-high words in common. - low: An integer presenting the lower bound of similar words. - high: An integer representing the upper bound of similar words. - - Returns: - A string representing the instruction description. - """ - # TODO(jeffrey) make more encompassing - self._original_paragraph = original_paragraph - self._low = low - self._high = high - - self._description = ( - "Rephrase the following paragraph: " - + "{original_paragraph}\nYour response should have " - + "between {low} and {high} of the same words. " - + "Words are the same if and only if all of the " - + "letters, ignoring cases, are the same. For " - + "example, 'run' is the same as 'Run' but different " - + "to 'ran'." 
- ) - - return self._description.format(original_paragraph=original_paragraph, low=self._low, high=self._high) - - def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" - return { - "original_paragraph": self._original_paragraph, - "low": self._low, - "high": self._high, - } - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["original_paragraph", "low", "high"] - - def check_following(self, value): - val_words = re.findall(r"\w+", value.lower()) - original_words = re.findall(r"\w+", self._original_paragraph.lower()) - similar_words = 0 - - dict_val = collections.Counter(val_words) - dict_original = collections.Counter(original_words) - - for word in dict_original: - similar_words += min(dict_original[word], dict_val[word]) - - return similar_words >= self._low and similar_words <= self._high - - -class TwoResponsesChecker(Instruction): - """Check that two responses were given.""" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = ( - "Give two different responses. Responses and only responses should" - " be separated by 6 asterisk symbols: ******." - ) - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response has two different answers. - - Args: - value: A string representing the response. - - Returns: - True if two responses are detected and false otherwise. - """ - valid_responses = [] - responses = value.split("******") - for index, response in enumerate(responses): - if not response.strip(): - if index != 0 and index != len(responses) - 1: - return False - else: - valid_responses.append(response) - return len(valid_responses) == 2 and valid_responses[0].strip() != valid_responses[1].strip() - - -class RepeatPromptThenAnswer(Instruction): - """Checks that Prompt is first repeated then answered.""" - - def build_description(self, *, prompt_to_repeat=None): - """Build the instruction description. - - Args: - prompt_to_repeat: The prompt that is meant to be repeated. - - Returns: - A string representing the instruction description. - """ - if not prompt_to_repeat: - raise ValueError("prompt_to_repeat must be set.") - else: - self._prompt_to_repeat = prompt_to_repeat - self._description_pattern = ( - "First repeat the request word for word without change," - " then give your answer (1. do not say any words or characters" - " before repeating the request; 2. the request you need to repeat" - " does not include this sentence)" - ) - return self._description_pattern - - def get_instruction_args(self): - return {"prompt_to_repeat": self._prompt_to_repeat} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["prompt_to_repeat"] - - def check_following(self, value): - if value.strip().lower().startswith(self._prompt_to_repeat.strip().lower()): - return True - return False - - -class EndChecker(Instruction): - """Checks that the prompt ends with a given phrase.""" - - def build_description(self, *, end_phrase=None): - """Build the instruction description. - - Args: - end_phrase: A string representing the phrase the response should end with. - - Returns: - A string representing the instruction description. 
- """ - self._end_phrase = end_phrase.strip() if isinstance(end_phrase, str) else end_phrase - if self._end_phrase is None: - self._end_phrase = random.choice(_ENDING_OPTIONS) - self._description_pattern = ( - "Finish your response with this exact phrase {ender}. " "No other words should follow this phrase." - ) - return self._description_pattern.format(ender=self._end_phrase) - - def get_instruction_args(self): - return {"end_phrase": self._end_phrase} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["end_phrase"] - - def check_following(self, value): - """Checks if the response ends with the expected phrase.""" - value = value.strip().strip('"').lower() - self._end_phrase = self._end_phrase.strip().lower() - return value.endswith(self._end_phrase) - - -class TitleChecker(Instruction): - """Checks the response for a title.""" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = ( - "Your answer must contain a title, wrapped in double angular brackets," " such as <>." - ) - return self._description_pattern - - def get_instruction_args(self): - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response contains a title.""" - pattern = r"<<[^\n]+>>" - re_pattern = re.compile(pattern) - titles = re.findall(re_pattern, value) - - for title in titles: - if title.lstrip("<").rstrip(">").strip(): - return True - return False - - -class LetterFrequencyChecker(Instruction): - """Checks letter frequency.""" - - def build_description(self, *, letter=None, let_frequency=None, let_relation=None): - """Build the instruction description. - - Args: - letter: A string representing a letter that is expected in the response. - let_frequency: An integer specifying the number of times `keyword` is - expected to appear in the response. - let_relation: A string in (`less than`, `at least`), defining the - relational operator for comparison. Two relational comparisons are - supported for now; if 'less than', the actual number of - occurrences < frequency; if 'at least', the actual number of - occurrences >= frequency. - - Returns: - A string representing the instruction description. - """ - if not letter or len(letter) > 1 or ord(letter.lower()) < 97 or ord(letter.lower()) > 122: - self._letter = random.choice(list(string.ascii_letters)) - else: - self._letter = letter.strip() - self._letter = self._letter.lower() - - self._frequency = let_frequency - if self._frequency is None or self._frequency < 0: - self._frequency = random.randint(1, _LETTER_FREQUENCY) - - if let_relation is None: - self._comparison_relation = random.choice(_COMPARISON_RELATION) - elif let_relation not in _COMPARISON_RELATION: - raise ValueError( - "The supported relation for comparison must be in " - f"{_COMPARISON_RELATION}, but {let_relation} is given." - ) - else: - self._comparison_relation = let_relation - - self._description_pattern = ( - "In your response, the letter {letter} should appear {let_relation}" " {let_frequency} times." 
- ) - - return self._description_pattern.format( - letter=self._letter, - let_frequency=self._frequency, - let_relation=self._comparison_relation, - ) - - def get_instruction_args(self): - """Returns the keyword args of build description.""" - return { - "letter": self._letter, - "let_frequency": self._frequency, - "let_relation": self._comparison_relation, - } - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["letter", "let_frequency", "let_relation"] - - def check_following(self, value): - """Checks that the response contains the letter at the right frequency.""" - value = value.lower() - letters = collections.Counter(value) - - if self._comparison_relation == _COMPARISON_RELATION[0]: - return letters[self._letter] < self._frequency - else: - return letters[self._letter] >= self._frequency - - -class CapitalLettersEnglishChecker(Instruction): - """Checks that the response is in english and is in all capital letters.""" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = "Your entire response should be in English, and in all capital letters." - return self._description_pattern - - def get_instruction_args(self): - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks that the response is in English and in all capital letters.""" - assert isinstance(value, str) - - try: - return value.isupper() and langdetect.detect(value) == "en" - except langdetect.LangDetectException as e: - # Count as instruction is followed. - logging.error("Unable to detect language for text %s due to %s", value, e) # refex: disable=pytotw.037 - return True - - -class LowercaseLettersEnglishChecker(Instruction): - """Checks that the response is in english and is in all lowercase letters.""" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = ( - "Your entire response should be in English, and in all lowercase" - " letters. No capital letters are allowed." - ) - return self._description_pattern - - def get_instruction_args(self): - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks that the response is in English and in all lowercase letters.""" - assert isinstance(value, str) - - try: - return value.islower() and langdetect.detect(value) == "en" - except langdetect.LangDetectException as e: - # Count as instruction is followed. - logging.error("Unable to detect language for text %s due to %s", value, e) # refex: disable=pytotw.037 - return True - - -class CommaChecker(Instruction): - """Checks the response for no commas.""" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = "In your entire response, refrain from the use of any commas." 
- return self._description_pattern - - def get_instruction_args(self): - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks that the response does not contain commas.""" - return not re.search(r"\,", value) - - -class CapitalWordFrequencyChecker(Instruction): - """Checks frequency of words with all capital letters.""" - - def build_description( - self, - capital_frequency=None, - capital_relation=None, - ): - """Build the instruction description. - - Args: - capital_frequency: An integer that represents the number of words that - should be in all capital letters. - capital_relation: A string that is 'at least' or 'at most' that refers to - the frequency. - - Returns: - A string representing the instruction description. - """ - self._frequency = capital_frequency - if self._frequency is None: - self._frequency = random.randint(1, _ALL_CAPITAL_WORD_FREQUENCY) - - self._comparison_relation = capital_relation - if capital_relation is None: - self._comparison_relation = random.choice(_COMPARISON_RELATION) - elif capital_relation not in _COMPARISON_RELATION: - raise ValueError( - "The supported relation for comparison must be in " - f"{_COMPARISON_RELATION}, but {capital_relation} is given." - ) - - self._description_pattern = ( - "In your response, words with all capital letters should appear" " {relation} {frequency} times." - ) - - return self._description_pattern.format(frequency=self._frequency, relation=self._comparison_relation) - - def get_instruction_args(self): - """Returns the keyword args of build description.""" - return { - "capital_frequency": self._frequency, - "capital_relation": self._comparison_relation, - } - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["capital_frequency", "capital_relation"] - - def check_following(self, value): - """Checks the frequency of words with all capital letters.""" - # Hyphenated words will count as one word - words = instructions_util.nltk.word_tokenize(value) - capital_words = [word for word in words if word.isupper()] - - capital_words = len(capital_words) - - if self._comparison_relation == _COMPARISON_RELATION[0]: - return capital_words < self._frequency - else: - return capital_words >= self._frequency - - -class QuotationChecker(Instruction): - """Checks response is wrapped with double quotation marks.""" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = "Wrap your entire response with double quotation marks." - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of build description.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response is wrapped with double quotation marks.""" - value = value.strip() - return len(value) > 1 and value[0] == '"' and value[-1] == '"' diff --git a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_registry.py b/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_registry.py deleted file mode 100644 index 17089bd0a..000000000 --- a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_registry.py +++ /dev/null @@ -1,167 +0,0 @@ -# Copyright 2023 The Google Research Authors. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Registry of all instructions.""" -import tasks_examples.custom_tasks_with_custom_metrics.ifeval.instructions as instructions - - -_KEYWORD = "keywords:" - -_LANGUAGE = "language:" - -_LENGTH = "length_constraints:" - -_CONTENT = "detectable_content:" - -_FORMAT = "detectable_format:" - -_MULTITURN = "multi-turn:" - -_COMBINATION = "combination:" - -_STARTEND = "startend:" - -_CHANGE_CASES = "change_case:" - -_PUNCTUATION = "punctuation:" - -INSTRUCTION_DICT = { - _KEYWORD + "existence": instructions.KeywordChecker, - _KEYWORD + "frequency": instructions.KeywordFrequencyChecker, - # TODO(jeffreyzhou): make a proper set of sentences to choose from - # _KEYWORD + "key_sentences": instructions.KeySentenceChecker, - _KEYWORD + "forbidden_words": instructions.ForbiddenWords, - _KEYWORD + "letter_frequency": instructions.LetterFrequencyChecker, - _LANGUAGE + "response_language": instructions.ResponseLanguageChecker, - _LENGTH + "number_sentences": instructions.NumberOfSentences, - _LENGTH + "number_paragraphs": instructions.ParagraphChecker, - _LENGTH + "number_words": instructions.NumberOfWords, - _LENGTH + "nth_paragraph_first_word": instructions.ParagraphFirstWordCheck, - _CONTENT + "number_placeholders": instructions.PlaceholderChecker, - _CONTENT + "postscript": instructions.PostscriptChecker, - _FORMAT + "number_bullet_lists": instructions.BulletListChecker, - # TODO(jeffreyzhou): Pre-create paragraph or use prompt to replace - # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph, - _FORMAT + "constrained_response": instructions.ConstrainedResponseChecker, - _FORMAT + "number_highlighted_sections": (instructions.HighlightSectionChecker), - _FORMAT + "multiple_sections": instructions.SectionChecker, - # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message. - # _FORMAT + "rephrase": instructions.RephraseChecker, - _FORMAT + "json_format": instructions.JsonFormat, - _FORMAT + "title": instructions.TitleChecker, - # TODO(tianjianlu): Re-enable with specific prompts. 
- # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker, - _COMBINATION + "two_responses": instructions.TwoResponsesChecker, - _COMBINATION + "repeat_prompt": instructions.RepeatPromptThenAnswer, - _STARTEND + "end_checker": instructions.EndChecker, - _CHANGE_CASES + "capital_word_frequency": instructions.CapitalWordFrequencyChecker, - _CHANGE_CASES + "english_capital": instructions.CapitalLettersEnglishChecker, - _CHANGE_CASES + "english_lowercase": instructions.LowercaseLettersEnglishChecker, - _PUNCTUATION + "no_comma": instructions.CommaChecker, - _STARTEND + "quotation": instructions.QuotationChecker, -} - -INSTRUCTION_CONFLICTS = { - _KEYWORD + "existence": {_KEYWORD + "existence"}, - _KEYWORD + "frequency": {_KEYWORD + "frequency"}, - # TODO(jeffreyzhou): make a proper set of sentences to choose from - # _KEYWORD + "key_sentences": instructions.KeySentenceChecker, - _KEYWORD + "forbidden_words": {_KEYWORD + "forbidden_words"}, - _KEYWORD + "letter_frequency": {_KEYWORD + "letter_frequency"}, - _LANGUAGE + "response_language": { - _LANGUAGE + "response_language", - _FORMAT + "multiple_sections", - _KEYWORD + "existence", - _KEYWORD + "frequency", - _KEYWORD + "forbidden_words", - _STARTEND + "end_checker", - _CHANGE_CASES + "english_capital", - _CHANGE_CASES + "english_lowercase", - }, - _LENGTH + "number_sentences": {_LENGTH + "number_sentences"}, - _LENGTH + "number_paragraphs": { - _LENGTH + "number_paragraphs", - _LENGTH + "nth_paragraph_first_word", - _LENGTH + "number_sentences", - _LENGTH + "nth_paragraph_first_word", - }, - _LENGTH + "number_words": {_LENGTH + "number_words"}, - _LENGTH + "nth_paragraph_first_word": { - _LENGTH + "nth_paragraph_first_word", - _LENGTH + "number_paragraphs", - }, - _CONTENT + "number_placeholders": {_CONTENT + "number_placeholders"}, - _CONTENT + "postscript": {_CONTENT + "postscript"}, - _FORMAT + "number_bullet_lists": {_FORMAT + "number_bullet_lists"}, - # TODO(jeffreyzhou): Pre-create paragraph or use prompt to replace - # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph, - _FORMAT + "constrained_response": set(INSTRUCTION_DICT.keys()), - _FORMAT + "number_highlighted_sections": {_FORMAT + "number_highlighted_sections"}, - _FORMAT + "multiple_sections": { - _FORMAT + "multiple_sections", - _LANGUAGE + "response_language", - _FORMAT + "number_highlighted_sections", - }, - # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message. - # _FORMAT + "rephrase": instructions.RephraseChecker, - _FORMAT + "json_format": set(INSTRUCTION_DICT.keys()).difference( - {_KEYWORD + "forbidden_words", _KEYWORD + "existence"} - ), - _FORMAT + "title": {_FORMAT + "title"}, - # TODO(tianjianlu): Re-enable with specific prompts. 
- # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker, - _COMBINATION + "two_responses": set(INSTRUCTION_DICT.keys()).difference( - { - _KEYWORD + "forbidden_words", - _KEYWORD + "existence", - _LANGUAGE + "response_language", - _FORMAT + "title", - _PUNCTUATION + "no_comma", - } - ), - _COMBINATION + "repeat_prompt": set(INSTRUCTION_DICT.keys()).difference( - {_KEYWORD + "existence", _FORMAT + "title", _PUNCTUATION + "no_comma"} - ), - _STARTEND + "end_checker": {_STARTEND + "end_checker"}, - _CHANGE_CASES + "capital_word_frequency": { - _CHANGE_CASES + "capital_word_frequency", - _CHANGE_CASES + "english_lowercase", - _CHANGE_CASES + "english_capital", - }, - _CHANGE_CASES + "english_capital": {_CHANGE_CASES + "english_capital"}, - _CHANGE_CASES + "english_lowercase": { - _CHANGE_CASES + "english_lowercase", - _CHANGE_CASES + "english_capital", - }, - _PUNCTUATION + "no_comma": {_PUNCTUATION + "no_comma"}, - _STARTEND + "quotation": {_STARTEND + "quotation", _FORMAT + "title"}, -} - - -def conflict_make(conflicts): - """Makes sure if A conflicts with B, B will conflict with A. - - Args: - conflicts: Dictionary of potential conflicts where key is instruction id - and value is set of instruction ids that it conflicts with. - - Returns: - Revised version of the dictionary. All instructions conflict with - themselves. If A conflicts with B, B will conflict with A. - """ - for key in conflicts: - for k in conflicts[key]: - conflicts[k].add(key) - conflicts[key].add(key) - return conflicts diff --git a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_utils.py b/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_utils.py deleted file mode 100644 index 7d995e42f..000000000 --- a/tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_utils.py +++ /dev/null @@ -1,1681 +0,0 @@ -# Copyright 2023 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Utility library of instructions.""" - -import functools -import random -import re - -import nltk - - -def download_nltk_resources(): - """Download 'punkt' if not already installed""" - try: - nltk.data.find("tokenizers/punkt") - except LookupError: - nltk.download("punkt") - - -download_nltk_resources() - -WORD_LIST = [ - "western", - "sentence", - "signal", - "dump", - "spot", - "opposite", - "bottom", - "potato", - "administration", - "working", - "welcome", - "morning", - "good", - "agency", - "primary", - "wish", - "responsibility", - "press", - "problem", - "president", - "steal", - "brush", - "read", - "type", - "beat", - "trainer", - "growth", - "lock", - "bone", - "case", - "equal", - "comfortable", - "region", - "replacement", - "performance", - "mate", - "walk", - "medicine", - "film", - "thing", - "rock", - "tap", - "total", - "competition", - "ease", - "south", - "establishment", - "gather", - "parking", - "world", - "plenty", - "breath", - "claim", - "alcohol", - "trade", - "dear", - "highlight", - "street", - "matter", - "decision", - "mess", - "agreement", - "studio", - "coach", - "assist", - "brain", - "wing", - "style", - "private", - "top", - "brown", - "leg", - "buy", - "procedure", - "method", - "speed", - "high", - "company", - "valuable", - "pie", - "analyst", - "session", - "pattern", - "district", - "pleasure", - "dinner", - "swimming", - "joke", - "order", - "plate", - "department", - "motor", - "cell", - "spend", - "cabinet", - "difference", - "power", - "examination", - "engine", - "horse", - "dimension", - "pay", - "toe", - "curve", - "literature", - "bother", - "fire", - "possibility", - "debate", - "activity", - "passage", - "hello", - "cycle", - "background", - "quiet", - "author", - "effect", - "actor", - "page", - "bicycle", - "error", - "throat", - "attack", - "character", - "phone", - "tea", - "increase", - "outcome", - "file", - "specific", - "inspector", - "internal", - "potential", - "staff", - "building", - "employer", - "shoe", - "hand", - "direction", - "garden", - "purchase", - "interview", - "study", - "recognition", - "member", - "spiritual", - "oven", - "sandwich", - "weird", - "passenger", - "particular", - "response", - "reaction", - "size", - "variation", - "a", - "cancel", - "candy", - "exit", - "guest", - "condition", - "fly", - "price", - "weakness", - "convert", - "hotel", - "great", - "mouth", - "mind", - "song", - "sugar", - "suspect", - "telephone", - "ear", - "roof", - "paint", - "refrigerator", - "organization", - "jury", - "reward", - "engineering", - "day", - "possession", - "crew", - "bar", - "road", - "description", - "celebration", - "score", - "mark", - "letter", - "shower", - "suggestion", - "sir", - "luck", - "national", - "progress", - "hall", - "stroke", - "theory", - "offer", - "story", - "tax", - "definition", - "history", - "ride", - "medium", - "opening", - "glass", - "elevator", - "stomach", - "question", - "ability", - "leading", - "village", - "computer", - "city", - "grand", - "confidence", - "candle", - "priest", - "recommendation", - "point", - "necessary", - "body", - "desk", - "secret", - "horror", - "noise", - "culture", - "warning", - "water", - "round", - "diet", - "flower", - "bus", - "tough", - "permission", - "week", - "prompt", - "connection", - "abuse", - "height", - "save", - "corner", - "border", - "stress", - "drive", - "stop", - "rip", - "meal", - "listen", - "confusion", - "girlfriend", - "living", - "relation", - "significance", - "plan", - "creative", - "atmosphere", - "blame", - "invite", - 
"housing", - "paper", - "drink", - "roll", - "silver", - "drunk", - "age", - "damage", - "smoke", - "environment", - "pack", - "savings", - "influence", - "tourist", - "rain", - "post", - "sign", - "grandmother", - "run", - "profit", - "push", - "clerk", - "final", - "wine", - "swim", - "pause", - "stuff", - "singer", - "funeral", - "average", - "source", - "scene", - "tradition", - "personal", - "snow", - "nobody", - "distance", - "sort", - "sensitive", - "animal", - "major", - "negotiation", - "click", - "mood", - "period", - "arrival", - "expression", - "holiday", - "repeat", - "dust", - "closet", - "gold", - "bad", - "sail", - "combination", - "clothes", - "emphasis", - "duty", - "black", - "step", - "school", - "jump", - "document", - "professional", - "lip", - "chemical", - "front", - "wake", - "while", - "inside", - "watch", - "row", - "subject", - "penalty", - "balance", - "possible", - "adult", - "aside", - "sample", - "appeal", - "wedding", - "depth", - "king", - "award", - "wife", - "blow", - "site", - "camp", - "music", - "safe", - "gift", - "fault", - "guess", - "act", - "shame", - "drama", - "capital", - "exam", - "stupid", - "record", - "sound", - "swing", - "novel", - "minimum", - "ratio", - "machine", - "shape", - "lead", - "operation", - "salary", - "cloud", - "affair", - "hit", - "chapter", - "stage", - "quantity", - "access", - "army", - "chain", - "traffic", - "kick", - "analysis", - "airport", - "time", - "vacation", - "philosophy", - "ball", - "chest", - "thanks", - "place", - "mountain", - "advertising", - "red", - "past", - "rent", - "return", - "tour", - "house", - "construction", - "net", - "native", - "war", - "figure", - "fee", - "spray", - "user", - "dirt", - "shot", - "task", - "stick", - "friend", - "software", - "promotion", - "interaction", - "surround", - "block", - "purpose", - "practice", - "conflict", - "routine", - "requirement", - "bonus", - "hole", - "state", - "junior", - "sweet", - "catch", - "tear", - "fold", - "wall", - "editor", - "life", - "position", - "pound", - "respect", - "bathroom", - "coat", - "script", - "job", - "teach", - "birth", - "view", - "resolve", - "theme", - "employee", - "doubt", - "market", - "education", - "serve", - "recover", - "tone", - "harm", - "miss", - "union", - "understanding", - "cow", - "river", - "association", - "concept", - "training", - "recipe", - "relationship", - "reserve", - "depression", - "proof", - "hair", - "revenue", - "independent", - "lift", - "assignment", - "temporary", - "amount", - "loss", - "edge", - "track", - "check", - "rope", - "estimate", - "pollution", - "stable", - "message", - "delivery", - "perspective", - "mirror", - "assistant", - "representative", - "witness", - "nature", - "judge", - "fruit", - "tip", - "devil", - "town", - "emergency", - "upper", - "drop", - "stay", - "human", - "neck", - "speaker", - "network", - "sing", - "resist", - "league", - "trip", - "signature", - "lawyer", - "importance", - "gas", - "choice", - "engineer", - "success", - "part", - "external", - "worker", - "simple", - "quarter", - "student", - "heart", - "pass", - "spite", - "shift", - "rough", - "lady", - "grass", - "community", - "garage", - "youth", - "standard", - "skirt", - "promise", - "blind", - "television", - "disease", - "commission", - "positive", - "energy", - "calm", - "presence", - "tune", - "basis", - "preference", - "head", - "common", - "cut", - "somewhere", - "presentation", - "current", - "thought", - "revolution", - "effort", - "master", - "implement", - "republic", - "floor", - 
"principle", - "stranger", - "shoulder", - "grade", - "button", - "tennis", - "police", - "collection", - "account", - "register", - "glove", - "divide", - "professor", - "chair", - "priority", - "combine", - "peace", - "extension", - "maybe", - "evening", - "frame", - "sister", - "wave", - "code", - "application", - "mouse", - "match", - "counter", - "bottle", - "half", - "cheek", - "resolution", - "back", - "knowledge", - "make", - "discussion", - "screw", - "length", - "accident", - "battle", - "dress", - "knee", - "log", - "package", - "it", - "turn", - "hearing", - "newspaper", - "layer", - "wealth", - "profile", - "imagination", - "answer", - "weekend", - "teacher", - "appearance", - "meet", - "bike", - "rise", - "belt", - "crash", - "bowl", - "equivalent", - "support", - "image", - "poem", - "risk", - "excitement", - "remote", - "secretary", - "public", - "produce", - "plane", - "display", - "money", - "sand", - "situation", - "punch", - "customer", - "title", - "shake", - "mortgage", - "option", - "number", - "pop", - "window", - "extent", - "nothing", - "experience", - "opinion", - "departure", - "dance", - "indication", - "boy", - "material", - "band", - "leader", - "sun", - "beautiful", - "muscle", - "farmer", - "variety", - "fat", - "handle", - "director", - "opportunity", - "calendar", - "outside", - "pace", - "bath", - "fish", - "consequence", - "put", - "owner", - "go", - "doctor", - "information", - "share", - "hurt", - "protection", - "career", - "finance", - "force", - "golf", - "garbage", - "aspect", - "kid", - "food", - "boot", - "milk", - "respond", - "objective", - "reality", - "raw", - "ring", - "mall", - "one", - "impact", - "area", - "news", - "international", - "series", - "impress", - "mother", - "shelter", - "strike", - "loan", - "month", - "seat", - "anything", - "entertainment", - "familiar", - "clue", - "year", - "glad", - "supermarket", - "natural", - "god", - "cost", - "conversation", - "tie", - "ruin", - "comfort", - "earth", - "storm", - "percentage", - "assistance", - "budget", - "strength", - "beginning", - "sleep", - "other", - "young", - "unit", - "fill", - "store", - "desire", - "hide", - "value", - "cup", - "maintenance", - "nurse", - "function", - "tower", - "role", - "class", - "camera", - "database", - "panic", - "nation", - "basket", - "ice", - "art", - "spirit", - "chart", - "exchange", - "feedback", - "statement", - "reputation", - "search", - "hunt", - "exercise", - "nasty", - "notice", - "male", - "yard", - "annual", - "collar", - "date", - "platform", - "plant", - "fortune", - "passion", - "friendship", - "spread", - "cancer", - "ticket", - "attitude", - "island", - "active", - "object", - "service", - "buyer", - "bite", - "card", - "face", - "steak", - "proposal", - "patient", - "heat", - "rule", - "resident", - "broad", - "politics", - "west", - "knife", - "expert", - "girl", - "design", - "salt", - "baseball", - "grab", - "inspection", - "cousin", - "couple", - "magazine", - "cook", - "dependent", - "security", - "chicken", - "version", - "currency", - "ladder", - "scheme", - "kitchen", - "employment", - "local", - "attention", - "manager", - "fact", - "cover", - "sad", - "guard", - "relative", - "county", - "rate", - "lunch", - "program", - "initiative", - "gear", - "bridge", - "breast", - "talk", - "dish", - "guarantee", - "beer", - "vehicle", - "reception", - "woman", - "substance", - "copy", - "lecture", - "advantage", - "park", - "cold", - "death", - "mix", - "hold", - "scale", - "tomorrow", - "blood", - "request", - "green", - 
"cookie", - "church", - "strip", - "forever", - "beyond", - "debt", - "tackle", - "wash", - "following", - "feel", - "maximum", - "sector", - "sea", - "property", - "economics", - "menu", - "bench", - "try", - "language", - "start", - "call", - "solid", - "address", - "income", - "foot", - "senior", - "honey", - "few", - "mixture", - "cash", - "grocery", - "link", - "map", - "form", - "factor", - "pot", - "model", - "writer", - "farm", - "winter", - "skill", - "anywhere", - "birthday", - "policy", - "release", - "husband", - "lab", - "hurry", - "mail", - "equipment", - "sink", - "pair", - "driver", - "consideration", - "leather", - "skin", - "blue", - "boat", - "sale", - "brick", - "two", - "feed", - "square", - "dot", - "rush", - "dream", - "location", - "afternoon", - "manufacturer", - "control", - "occasion", - "trouble", - "introduction", - "advice", - "bet", - "eat", - "kill", - "category", - "manner", - "office", - "estate", - "pride", - "awareness", - "slip", - "crack", - "client", - "nail", - "shoot", - "membership", - "soft", - "anybody", - "web", - "official", - "individual", - "pizza", - "interest", - "bag", - "spell", - "profession", - "queen", - "deal", - "resource", - "ship", - "guy", - "chocolate", - "joint", - "formal", - "upstairs", - "car", - "resort", - "abroad", - "dealer", - "associate", - "finger", - "surgery", - "comment", - "team", - "detail", - "crazy", - "path", - "tale", - "initial", - "arm", - "radio", - "demand", - "single", - "draw", - "yellow", - "contest", - "piece", - "quote", - "pull", - "commercial", - "shirt", - "contribution", - "cream", - "channel", - "suit", - "discipline", - "instruction", - "concert", - "speech", - "low", - "effective", - "hang", - "scratch", - "industry", - "breakfast", - "lay", - "join", - "metal", - "bedroom", - "minute", - "product", - "rest", - "temperature", - "many", - "give", - "argument", - "print", - "purple", - "laugh", - "health", - "credit", - "investment", - "sell", - "setting", - "lesson", - "egg", - "middle", - "marriage", - "level", - "evidence", - "phrase", - "love", - "self", - "benefit", - "guidance", - "affect", - "you", - "dad", - "anxiety", - "special", - "boyfriend", - "test", - "blank", - "payment", - "soup", - "obligation", - "reply", - "smile", - "deep", - "complaint", - "addition", - "review", - "box", - "towel", - "minor", - "fun", - "soil", - "issue", - "cigarette", - "internet", - "gain", - "tell", - "entry", - "spare", - "incident", - "family", - "refuse", - "branch", - "can", - "pen", - "grandfather", - "constant", - "tank", - "uncle", - "climate", - "ground", - "volume", - "communication", - "kind", - "poet", - "child", - "screen", - "mine", - "quit", - "gene", - "lack", - "charity", - "memory", - "tooth", - "fear", - "mention", - "marketing", - "reveal", - "reason", - "court", - "season", - "freedom", - "land", - "sport", - "audience", - "classroom", - "law", - "hook", - "win", - "carry", - "eye", - "smell", - "distribution", - "research", - "country", - "dare", - "hope", - "whereas", - "stretch", - "library", - "if", - "delay", - "college", - "plastic", - "book", - "present", - "use", - "worry", - "champion", - "goal", - "economy", - "march", - "election", - "reflection", - "midnight", - "slide", - "inflation", - "action", - "challenge", - "guitar", - "coast", - "apple", - "campaign", - "field", - "jacket", - "sense", - "way", - "visual", - "remove", - "weather", - "trash", - "cable", - "regret", - "buddy", - "beach", - "historian", - "courage", - "sympathy", - "truck", - "tension", - "permit", - 
"nose", - "bed", - "son", - "person", - "base", - "meat", - "usual", - "air", - "meeting", - "worth", - "game", - "independence", - "physical", - "brief", - "play", - "raise", - "board", - "she", - "key", - "writing", - "pick", - "command", - "party", - "yesterday", - "spring", - "candidate", - "physics", - "university", - "concern", - "development", - "change", - "string", - "target", - "instance", - "room", - "bitter", - "bird", - "football", - "normal", - "split", - "impression", - "wood", - "long", - "meaning", - "stock", - "cap", - "leadership", - "media", - "ambition", - "fishing", - "essay", - "salad", - "repair", - "today", - "designer", - "night", - "bank", - "drawing", - "inevitable", - "phase", - "vast", - "chip", - "anger", - "switch", - "cry", - "twist", - "personality", - "attempt", - "storage", - "being", - "preparation", - "bat", - "selection", - "white", - "technology", - "contract", - "side", - "section", - "station", - "till", - "structure", - "tongue", - "taste", - "truth", - "difficulty", - "group", - "limit", - "main", - "move", - "feeling", - "light", - "example", - "mission", - "might", - "wait", - "wheel", - "shop", - "host", - "classic", - "alternative", - "cause", - "agent", - "consist", - "table", - "airline", - "text", - "pool", - "craft", - "range", - "fuel", - "tool", - "partner", - "load", - "entrance", - "deposit", - "hate", - "article", - "video", - "summer", - "feature", - "extreme", - "mobile", - "hospital", - "flight", - "fall", - "pension", - "piano", - "fail", - "result", - "rub", - "gap", - "system", - "report", - "suck", - "ordinary", - "wind", - "nerve", - "ask", - "shine", - "note", - "line", - "mom", - "perception", - "brother", - "reference", - "bend", - "charge", - "treat", - "trick", - "term", - "homework", - "bake", - "bid", - "status", - "project", - "strategy", - "orange", - "let", - "enthusiasm", - "parent", - "concentrate", - "device", - "travel", - "poetry", - "business", - "society", - "kiss", - "end", - "vegetable", - "employ", - "schedule", - "hour", - "brave", - "focus", - "process", - "movie", - "illegal", - "general", - "coffee", - "ad", - "highway", - "chemistry", - "psychology", - "hire", - "bell", - "conference", - "relief", - "show", - "neat", - "funny", - "weight", - "quality", - "club", - "daughter", - "zone", - "touch", - "tonight", - "shock", - "burn", - "excuse", - "name", - "survey", - "landscape", - "advance", - "satisfaction", - "bread", - "disaster", - "item", - "hat", - "prior", - "shopping", - "visit", - "east", - "photo", - "home", - "idea", - "father", - "comparison", - "cat", - "pipe", - "winner", - "count", - "lake", - "fight", - "prize", - "foundation", - "dog", - "keep", - "ideal", - "fan", - "struggle", - "peak", - "safety", - "solution", - "hell", - "conclusion", - "population", - "strain", - "alarm", - "measurement", - "second", - "train", - "race", - "due", - "insurance", - "boss", - "tree", - "monitor", - "sick", - "course", - "drag", - "appointment", - "slice", - "still", - "care", - "patience", - "rich", - "escape", - "emotion", - "royal", - "female", - "childhood", - "government", - "picture", - "will", - "sock", - "big", - "gate", - "oil", - "cross", - "pin", - "improvement", - "championship", - "silly", - "help", - "sky", - "pitch", - "man", - "diamond", - "most", - "transition", - "work", - "science", - "committee", - "moment", - "fix", - "teaching", - "dig", - "specialist", - "complex", - "guide", - "people", - "dead", - "voice", - "original", - "break", - "topic", - "data", - "degree", - "reading", 
- "recording", - "bunch", - "reach", - "judgment", - "lie", - "regular", - "set", - "painting", - "mode", - "list", - "player", - "bear", - "north", - "wonder", - "carpet", - "heavy", - "officer", - "negative", - "clock", - "unique", - "baby", - "pain", - "assumption", - "disk", - "iron", - "bill", - "drawer", - "look", - "double", - "mistake", - "finish", - "future", - "brilliant", - "contact", - "math", - "rice", - "leave", - "restaurant", - "discount", - "sex", - "virus", - "bit", - "trust", - "event", - "wear", - "juice", - "failure", - "bug", - "context", - "mud", - "whole", - "wrap", - "intention", - "draft", - "pressure", - "cake", - "dark", - "explanation", - "space", - "angle", - "word", - "efficiency", - "management", - "habit", - "star", - "chance", - "finding", - "transportation", - "stand", - "criticism", - "flow", - "door", - "injury", - "insect", - "surprise", - "apartment", -] # pylint: disable=line-too-long - -# ISO 639-1 codes to language names. -LANGUAGE_CODES = { - "en": "English", - "es": "Spanish", - "pt": "Portuguese", - "ar": "Arabic", - "hi": "Hindi", - "fr": "French", - "ru": "Russian", - "de": "German", - "ja": "Japanese", - "it": "Italian", - "bn": "Bengali", - "uk": "Ukrainian", - "th": "Thai", - "ur": "Urdu", - "ta": "Tamil", - "te": "Telugu", - "bg": "Bulgarian", - "ko": "Korean", - "pl": "Polish", - "he": "Hebrew", - "fa": "Persian", - "vi": "Vietnamese", - "ne": "Nepali", - "sw": "Swahili", - "kn": "Kannada", - "mr": "Marathi", - "gu": "Gujarati", - "pa": "Punjabi", - "ml": "Malayalam", - "fi": "Finnish", -} - -_ALPHABETS = "([A-Za-z])" -_PREFIXES = "(Mr|St|Mrs|Ms|Dr)[.]" -_SUFFIXES = "(Inc|Ltd|Jr|Sr|Co)" -_STARTERS = ( - r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)" -) -_ACRONYMS = "([A-Z][.][A-Z][.](?:[A-Z][.])?)" -_WEBSITES = "[.](com|net|org|io|gov|edu|me)" -_DIGITS = "([0-9])" -_MULTIPLE_DOTS = r"\.{2,}" - - -def split_into_sentences(text): - """Split the text into sentences. - - Args: - text: A string that consists of more than or equal to one sentences. - - Returns: - A list of strings where each string is a sentence. - """ - text = " " + text + " " - text = text.replace("\n", " ") - text = re.sub(_PREFIXES, "\\1", text) - text = re.sub(_WEBSITES, "\\1", text) - text = re.sub(_DIGITS + "[.]" + _DIGITS, "\\1\\2", text) - text = re.sub( - _MULTIPLE_DOTS, - lambda match: "" * len(match.group(0)) + "", - text, - ) - if "Ph.D" in text: - text = text.replace("Ph.D.", "PhD") - text = re.sub(r"\s" + _ALPHABETS + "[.] ", " \\1 ", text) - text = re.sub(_ACRONYMS + " " + _STARTERS, "\\1 \\2", text) - text = re.sub( - _ALPHABETS + "[.]" + _ALPHABETS + "[.]" + _ALPHABETS + "[.]", - "\\1\\2\\3", - text, - ) - text = re.sub(_ALPHABETS + "[.]" + _ALPHABETS + "[.]", "\\1\\2", text) - text = re.sub(" " + _SUFFIXES + "[.] " + _STARTERS, " \\1 \\2", text) - text = re.sub(" " + _SUFFIXES + "[.]", " \\1", text) - text = re.sub(" " + _ALPHABETS + "[.]", " \\1", text) - if "”" in text: - text = text.replace(".”", "”.") - if '"' in text: - text = text.replace('."', '".') - if "!" in text: - text = text.replace('!"', '"!') - if "?" 
in text: - text = text.replace('?"', '"?') - text = text.replace(".", ".") - text = text.replace("?", "?") - text = text.replace("!", "!") - text = text.replace("", ".") - sentences = text.split("") - sentences = [s.strip() for s in sentences] - if sentences and not sentences[-1]: - sentences = sentences[:-1] - return sentences - - -def count_words(text): - """Counts the number of words.""" - tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+") - tokens = tokenizer.tokenize(text) - num_words = len(tokens) - return num_words - - -@functools.lru_cache(maxsize=None) -def _get_sentence_tokenizer(): - return nltk.data.load("nltk:tokenizers/punkt/english.pickle") - - -def count_sentences(text): - """Count the number of sentences.""" - tokenizer = _get_sentence_tokenizer() - tokenized_sentences = tokenizer.tokenize(text) - return len(tokenized_sentences) - - -def generate_keywords(num_keywords): - """Randomly generates a few keywords.""" - return random.sample(WORD_LIST, k=num_keywords) From b9045e13efddce872f72795d20ea1e769c29a70d Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 28 Feb 2024 14:33:21 +0100 Subject: [PATCH 07/45] revert README --- README.md | 233 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 151 insertions(+), 82 deletions(-) diff --git a/README.md b/README.md index 411fefe9a..4ddddbc9c 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ # LightEval 🌤️ -A lightweight LLM evaluation + +A lightweight framework for LLM evaluation ## Context LightEval is a lightweight LLM evaluation suite that Hugging Face has been using internally with the recently released LLM data processing library [datatrove](https://github.com/huggingface/datatrove) and LLM training library [nanotron](https://github.com/huggingface/nanotron). @@ -12,131 +13,192 @@ In case of problems or question, feel free to open an issue! ## News - **Feb 08, 2024**: Release of `lighteval` -## Deep thanks -`lighteval` was originally built on top of the great [Eleuther AI Harness](https://github.com/EleutherAI/lm-evaluation-harness) (which is powering the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)). We also took a lot of inspiration from the amazing [HELM](https://crfm.stanford.edu/helm/latest/), notably for metrics. - -Through adding more and more logging functionalities, and making it compatible with increasingly different workflows and model codebases (including 3D parallelism) as well as allowing custom evaluation experiments, metrics and benchmarks, we ended up needing to change the code more and more deeply until `lighteval` became the small standalone library that it is now. - -However, we are very grateful to the Harness and HELM teams for their continued work on better evaluations. - -## How to navigate this project -`lighteval` is supposed to be used as a standalone evaluation library. -- To run the evaluations, you can use `run_evals_accelerate.py` or `run_evals_nanotron.py`. 
-- [src/lighteval](https://github.com/huggingface/lighteval/tree/main/src/lighteval) contains the core of the lib itself - - [lighteval](https://github.com/huggingface/lighteval/tree/main/src/lighteval) contains the core of the library, divided in the following section - - [main_accelerate.py](https://github.com/huggingface/lighteval/blob/main/src/lighteval/main_accelerate.py) and [main_nanotron.py](https://github.com/huggingface/lighteval/blob/main/src/lighteval/main_nanotron.py) are our entry points to run evaluation - - [logging](https://github.com/huggingface/lighteval/tree/main/src/lighteval/logging): Our loggers, to display experiment information and push it to the hub after a run - - [metrics](https://github.com/huggingface/lighteval/tree/main/src/lighteval/metrics): All the available metrics you can use. They are described in metrics, and divided between sample metrics (applied at the sample level, such as a prediction accuracy) and corpus metrics (applied over the whole corpus). You'll also find available normalisation functions. - - [models](https://github.com/huggingface/lighteval/tree/main/src/lighteval/models): Possible models to use. We cover transformers (base_model), with adapter or delta weights, as well as TGI models locally deployed (it's likely the code here is out of date though), and brrr/nanotron models. - - [tasks](https://github.com/huggingface/lighteval/tree/main/src/lighteval/tasks): Available tasks. The complete list is in `tasks_table.jsonl`, and you'll find all the prompts in `tasks_prompt_formatting.py`. -- [tasks_examples](https://github.com/huggingface/lighteval/tree/main/tasks_examples) contains a list of available tasks you can launch. We advise using tasks in the `recommended_set`, as it's possible that some of the other tasks need double checking. -- [tests](https://github.com/huggingface/lighteval/tree/main/tests) contains our test suite, that we run at each PR to prevent regressions in metrics/prompts/tasks, for a subset of important tasks. - -## How to install and use +## Installation -Note: -- Use the Eleuther AI Harness (`lm_eval`) to share comparable numbers with everyone (e.g. on the Open LLM Leaderboard). -- Use `lighteval` during training with the nanotron/datatrove LLM training stack and/or for quick eval/benchmark experimentations. +Clone the repo: -### Installation -Create your virtual environment using virtualenv or conda depending on your preferences. We require Python3.10 or above. ```bash -conda create -n lighteval python==3.10 +git clone https://github.com/huggingface/lighteval.git +cd lighteval ``` -Clone the package +Create a virtual environment using virtualenv or conda depending on your preferences. We require Python 3.10 or above: + ```bash -git clone https://github.com/huggingface/lighteval.git -cd lighteval +conda create -n lighteval python=3.10 && conda activate lighteval ``` Install the dependencies. For the default installation, you just need: + ```bash -pip install -e . +pip install . 
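# Illustrative aside, not part of this patch: a quick check that the base
# install succeeded (assumes the package is importable as `lighteval`)
python -c "import lighteval; print('lighteval installed')"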
``` -If you want to run your models using accelerate, tgi or optimum, do quantization, or use adapter weights, you will need to specify the optional dependencies group fitting your use case (`accelerate`,`tgi`,`optimum`,`quantization`,`adapters`,`nanotron`) at install time +If you want to evaluate models with frameworks like `accelerate` or `peft`, you will need to specify the optional dependencies group that fits your use case (`accelerate`,`tgi`,`optimum`,`quantization`,`adapters`,`nanotron`): + ```bash -pip install -e .[optional1,optional2] +pip install '.[optional1,optional2]' ``` -The setup we tested most is: +The setup tested most is: + ```bash -pip install -e .[accelerate,quantization,adapters] +pip install '.[accelerate,quantization,adapters]' ``` -If you want to push your results to the hub, don't forget to add your user token to the environment variable `HUGGING_FACE_HUB_TOKEN`. +If you want to push your results to the Hugging Face Hub, don't forget to add your access token to the environment variable `HUGGING_FACE_HUB_TOKEN`. You can do this by running: -Lastly, if you intend to push to the code base, you'll need to install the precommit hook for styling tests. -```bash -pip install pre-commit -pre-commit install +```shell +huggingface-cli login ``` -Optional steps. +and pasting your access token. + +### Optional steps + - to load and push big models/datasets, your machine likely needs Git LFS. You can install it with `sudo apt-get install git-lfs` - If you want to run bigbench evaluations, install bigbench `pip install "bigbench@https://storage.googleapis.com/public_research_data/bigbench/bigbench-0.0.1.tar.gz"` +Lastly, if you intend to push to the code base, you'll need to install the precommit hook for styling tests: -### Testing that everything was installed correctly -If you want to test your install, you can run your first evaluation on GPUs (8GPU, single node), using ```bash -mkdir tmp -python -m accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py --model_args "pretrained=gpt2" --tasks tasks_examples/open_llm_leaderboard_tasks.txt --override_batch_size 1 --save_details --output_dir="tmp/" +pip install pre-commit +pre-commit install ``` -### Usage -- Launching on CPU - - `python run_evals_accelerate.py --model_args="pretrained=" --output_dir output_dir` -- Using data parallelism on several GPUs (recommended) - - If you want to use data parallelism, first configure accelerate (`accelerate config`). - - `accelerate launch run_evals_accelerate.py --model_args="pretrained=" --output_dir=` - for instance: `python -m accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py --model_args "pretrained=gpt2" --tasks tasks_examples/open_llm_leaderboard_tasks.txt --override_batch_size 1 --save_details --output_dir=tmp/` - - Note: if you use model_parallel, accelerate will use 2 processes for model parallel, num_processes for data parallel +## Usage -The task parameters indicate which tasks you want to launch. You can select: -- one or several tasks, with `--tasks task_names`, with task_names in the [metadata table](src/lighteval/tasks/tasks_table.jsonl), separated by commas. You must specify which version of the task you want (= in which suite it is), by prepending the suite name, as well as the number of training few_shots prompts for the given task, and whether you want to automatically reduce the number of few_shots if they make the prompt too long (`suite|task|few_shot|1 or 0 to automatically reduce the number of few_shots or not`). 
-- a file path, which contains tasks following the above format. +We provide two main entry points to evaluate models: -Example -If you want to compare hellaswag from helm and the harness on Gpt-6j, you can do -`python run_evals_accelerate.py --model hf_causal --model_args="pretrained=EleutherAI/gpt-j-6b" --tasks helm|hellaswag|0|0,lighteval|hellaswag|0|0 --output_dir output_dir` +* `run_evals_accelerate.py`: evaluate models on CPU or one or more GPUs using [🤗 Accelerate](https://github.com/huggingface/accelerate). +* `run_evals_nanotron.py`: evaluate models in distributed settings using [⚡️ Nanotron](https://github.com/huggingface/nanotron). -## Customisation -### Adding a new metric -First check if you can use one of the parametrized functions in `src.lighteval.metrics.metrics_corpus` or `src.lighteval.metrics.metrics_sample`. +For most users, we recommend using the 🤗 Accelerate backend - see below for specific commands. + +### Evaluate a model on one or more GPUs (recommended) -If not, you can use the custom_task system to register your new metric: -- create a new python file which should contain the full logic of your metric. -- the file also needs to start with these imports -```python -from aenum import extend_enum -from lighteval.metrics import Metrics +To evaluate a model on one or more GPUs, first create a `multi-gpu` config by running: -# And any other class you might need to redefine your specific metric, depending on whether it's a sample or corpus metric. +```shell +accelerate config ``` -- and to end with the following, so that it adds your metric to our metrics list when loaded as a module. +You can then evaluate a model using data parallelism as follows: -```python -# Adds the metric to the metric list! -extend_enum(Metrics, "ifeval_metric", ifeval_metrics) -if __name__ == "__main__": - print("Imported metric") +```shell +accelerate launch --multi_gpu --num_processes= run_evals_accelerate.py \ + --model_args="pretrained=" \ + --tasks \ + --output_dir output_dir ``` -You can then give your custom metric to lighteval by using `--custom-tasks path_to_your_file` when launching it. +Here, `--tasks` refers to either a _comma-separated_ list of supported tasks from the [metadata table](src/lighteval/tasks/tasks_table.jsonl) in the format: -To see an example of a custom metric added along with a custom task, look at `tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py`. +``` +suite|task|num_few_shot|{0 or 1 to automatically reduce `num_few_shot` if prompt is too long} +``` + +or a file path like [`tasks_examples/recommended_set.txt`](./tasks_examples/recommended_set.txt) which specifies multiple task configurations. For example, to evaluate GPT-2 on the Truthful QA benchmark run: + +```shell +accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py \ + --model_args "pretrained=gpt2" \ + --tasks "lighteval|truthfulqa:mc|0|0" \ + --override_batch_size 1 \ + --output_dir="./evals/" +``` + +Here, `--override_batch_size` defines the _batch size per device_, so the effective batch size will be `override_batch_size x num_gpus`. To evaluate on multiple benchmarks, separate each task configuration with a comma, e.g. 
+ +```shell +accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py \ + --model_args "pretrained=gpt2" \ + --tasks "lighteval|truthfulqa:mc|0|0,lighteval|gsm8k|0|0" \ + --override_batch_size 1 \ + --output_dir="./evals/" +``` + +See the [`tasks_examples/recommended_set.txt`](./tasks_examples/recommended_set.txt) file for a list of recommended task configurations. + +### Evaluating a large model with pipeline parallelism + +To evaluate models larger that ~40B parameters in 16-bit precision, you will need to shard the model across multiple GPUs to fit it in VRAM. You can do this by passing `model_parallel=True` and adapting `--num_processes` to be the number of processes to use for data parallel. For example, on a single node of 8 GPUs, you can run: + +```shell +# PP=2, DP=4 - good for models < 70B params +accelerate launch --multi_gpu --num_processes=4 run_evals_accelerate.py \ + --model_args="pretrained=" \ + --model_parallel \ + --tasks \ + --output_dir output_dir + +# PP=4, DP=2 - good for huge models >= 70B params +accelerate launch --multi_gpu --num_processes=2 run_evals_accelerate.py \ + --model_args="pretrained=" \ + --model_parallel \ + --tasks \ + --output_dir output_dir +``` + +### Evaluate a model on the Open LLM Leaderboard benchmarks + +To evaluate a model on all the benchmarks of the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) using a single node of 8 GPUs, run: + +```shell +accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py \ + --model_args "pretrained=" \ + --tasks tasks_examples/open_llm_leaderboard_tasks.txt \ + --override_batch_size 1 \ + --output_dir="./evals/" +``` + +### Evaluate a model on CPU + +You can also use `lighteval` to evaluate models on CPU, although note this will typically be very slow for large models. To do so, run: + +```shell +python run_evals_accelerate.py \ + --model_args="pretrained="\ + --tasks \ + --output_dir output_dir +``` + +## Deep thanks +`lighteval` was originally built on top of the great [Eleuther AI Harness](https://github.com/EleutherAI/lm-evaluation-harness) (which is powering the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)). We also took a lot of inspiration from the amazing [HELM](https://crfm.stanford.edu/helm/latest/), notably for metrics. + +Through adding more and more logging functionalities, and making it compatible with increasingly different workflows and model codebases (including 3D parallelism) as well as allowing custom evaluation experiments, metrics and benchmarks, we ended up needing to change the code more and more deeply until `lighteval` became the small standalone library that it is now. + +However, we are very grateful to the Harness and HELM teams for their continued work on better evaluations. + +## How to navigate this project +`lighteval` is supposed to be used as a standalone evaluation library. +- To run the evaluations, you can use `run_evals_accelerate.py` or `run_evals_nanotron.py`. 
+- [src/lighteval](https://github.com/huggingface/lighteval/tree/main/src/lighteval) contains the core of the lib itself + - [lighteval](https://github.com/huggingface/lighteval/tree/main/src/lighteval) contains the core of the library, divided in the following section + - [main_accelerate.py](https://github.com/huggingface/lighteval/blob/main/src/lighteval/main_accelerate.py) and [main_nanotron.py](https://github.com/huggingface/lighteval/blob/main/src/lighteval/main_nanotron.py) are our entry points to run evaluation + - [logging](https://github.com/huggingface/lighteval/tree/main/src/lighteval/logging): Our loggers, to display experiment information and push it to the hub after a run + - [metrics](https://github.com/huggingface/lighteval/tree/main/src/lighteval/metrics): All the available metrics you can use. They are described in metrics, and divided between sample metrics (applied at the sample level, such as a prediction accuracy) and corpus metrics (applied over the whole corpus). You'll also find available normalisation functions. + - [models](https://github.com/huggingface/lighteval/tree/main/src/lighteval/models): Possible models to use. We cover transformers (base_model), with adapter or delta weights, as well as TGI models locally deployed (it's likely the code here is out of date though), and brrr/nanotron models. + - [tasks](https://github.com/huggingface/lighteval/tree/main/src/lighteval/tasks): Available tasks. The complete list is in `tasks_table.jsonl`, and you'll find all the prompts in `tasks_prompt_formatting.py`. +- [tasks_examples](https://github.com/huggingface/lighteval/tree/main/tasks_examples) contains a list of available tasks you can launch. We advise using tasks in the `recommended_set`, as it's possible that some of the other tasks need double checking. +- [tests](https://github.com/huggingface/lighteval/tree/main/tests) contains our test suite, that we run at each PR to prevent regressions in metrics/prompts/tasks, for a subset of important tasks. + +## Customisation +### Adding a new metric +If you want to add a new metric, first check if you can use one of the parametrized functions in `src.lighteval.metrics.metrics_corpus` or `src.lighteval.metrics.metrics_sample`. If not, add it to either of these files depending on the level at which it is applied. +Then, follow the example in `src.lighteval.metrics.metrics` to register your metric. ### Adding a new task -To add a new task, first **add its dataset** on the hub. +To add a new task, first either open an issue, to determine whether it will be integrated in the core evaluations of lighteval, or in the community tasks, and **add its dataset** on the hub. +Note: Core evaluations are evals we will add to our test suite to ensure non regression through time, and which already see a high usage in the community. +A popular community evaluation can move to become a core evaluation through time. -Then, **find a suitable prompt function** or **create a new prompt function** in `src.lighteval.tasks.task_prompt_formatting.py`. This function must output a `Doc` object, which should contain `query`, your prompt, and either `gold`, the gold output, or `choices` and `gold_index`, the list of choices and index or indices of correct answers. If your query contains an instruction which should not be repeated in a few shot setup, add it to an `instruction` field. +#### Core evaluations +Prompt function: **find a suitable prompt function** in `src.lighteval.tasks.task_prompt_formatting.py`, or code your own. 
This function must output a `Doc` object, which should contain `query`, your prompt, and either `gold`, the gold output, or `choices` and `gold_index`, the list of choices and index or indices of correct answers. If your query contains an instruction which should not be repeated in a few shot setup, add it to an `instruction` field. -Lastly, create a **line summary** of your evaluation, in `src/lighteval/tasks/tasks_table.jsonl`. This summary should contain the following fields: +Summary: create a **line summary** of your evaluation, in `src/lighteval/tasks/tasks_table.jsonl`. This summary should contain the following fields: - `name` (str), your evaluation name -- `suite` (list), the suite(s) to which your evaluation should belong. This field allows us to compare different tasks implementation, and is used a task selection to differentiate the versions to launch. At the moment, you'll find the keywords ["helm", "bigbench", "original", "lighteval"]; you can add also add new ones (for test, we recommend using "custom"). +- `suite` (list), the suite(s) to which your evaluation should belong. This field allows us to compare different tasks implementation, and is used a task selection to differentiate the versions to launch. At the moment, you'll find the keywords ["helm", "bigbench", "original", "lighteval", "community", "custom"]; for core evals, please choose `lighteval`. - `prompt_function` (str), the name of the prompt function you defined in the step above - `hf_repo` (str), the path to your evaluation dataset on the hub - `hf_subset` (str), the specific subset you want to use for your evaluation (note: when the dataset has no subset, fill this field with `"default"`, not with `None` or `""`) @@ -155,6 +217,13 @@ Lastly, create a **line summary** of your evaluation, in `src/lighteval/tasks/ta - `output_regex` (str), A regex string that will be used to filter your generation. (Genrative metrics will only select tokens that are between the first and the second sequence matched by the regex. For example, for a regex matching `\n` and a generation `\nModel generation output\nSome other text` the metric will only be fed with `Model generation output`) - `frozen` (bool), for now is set to False, but we will steadily pass all stable tasks to True. +Make sure you can launch your model with your new task using `--tasks lighteval|yournewtask|2|0`. + +#### Community evaluations +Copy the `community_tasks/_template.yml` to `community_tasks/yourevalname.py` and edit it to add your custom tasks (the parameters you can use are explained above). It contains an interesting mechanism if the dataset you are adding contains a lot of subsets. + +Make sure you can launch your model with your new task using `--tasks community|yournewtask|2|0 --custom_tasks community_tasks/yourevalname.py`. + ## Available metrics ### Metrics for multiple choice tasks These metrics use log-likelihood of the different possible targets. 
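As a complement to the prompt-function description above, here is a minimal sketch for a hypothetical multiple-choice dataset; the function name and the column names (`question`, `options`, `label`) are assumptions made for illustration, not an actual dataset schema:

```python
from lighteval.tasks.requests import Doc


def my_mcq_prompt(line, task_name: str = None):
    # `query` is the prompt sent to the model; `choices` lists the candidate
    # answers and `gold_index` points at the correct one.
    return Doc(
        task_name=task_name,
        query=f"Question: {line['question']}\nAnswer:",
        choices=[f" {option}" for option in line["options"]],
        gold_index=line["label"],
        instruction="",
    )
```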
From ff794802fabd1fa2d22a98bcb08d46bf01a68f04 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 28 Feb 2024 14:33:49 +0100 Subject: [PATCH 08/45] revert README --- README.md | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4ddddbc9c..14f44ab6e 100644 --- a/README.md +++ b/README.md @@ -185,8 +185,28 @@ However, we are very grateful to the Harness and HELM teams for their continued ## Customisation ### Adding a new metric -If you want to add a new metric, first check if you can use one of the parametrized functions in `src.lighteval.metrics.metrics_corpus` or `src.lighteval.metrics.metrics_sample`. If not, add it to either of these files depending on the level at which it is applied. -Then, follow the example in `src.lighteval.metrics.metrics` to register your metric. +First check if you can use one of the parametrized functions in `src.lighteval.metrics.metrics_corpus` or `src.lighteval.metrics.metrics_sample`. + +If not, you can use the custom_task system to register your new metric: +- create a new python file which should contain the full logic of your metric. +- the file also needs to start with these imports +```python +from aenum import extend_enum +from lighteval.metrics import Metrics + +# And any other class you might need to redefine your specific metric, depending on whether it's a sample or corpus metric. +``` + +- and to end with the following, so that it adds your metric to our metrics list when loaded as a module. + +```python +# Adds the metric to the metric list! +extend_enum(Metrics, "ifeval_metric", ifeval_metrics) +if __name__ == "__main__": + print("Imported metric") +``` + +You can then give your custom metric to lighteval by using `--custom-tasks path_to_your_file` when launching it. ### Adding a new task To add a new task, first either open an issue, to determine whether it will be integrated in the core evaluations of lighteval, or in the community tasks, and **add its dataset** on the hub. From a234bf65b4a152965c8baeaedd17235765043cd6 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 28 Feb 2024 14:56:32 +0100 Subject: [PATCH 09/45] better context management --- src/lighteval/models/base_model.py | 56 +++++++++++++------ src/lighteval/tasks/lighteval_task.py | 5 +- src/lighteval/tasks/requests.py | 3 +- .../mt_bench/mt_bench.py | 4 +- 4 files changed, 45 insertions(+), 23 deletions(-) diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index d641f8a42..83121c117 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -338,31 +338,53 @@ def greedy_until_multi_turn(self, requests: list[GreedyUntilMultiTurnRequest], o dataloader = self.accelerator.prepare(dataloader) # Always batch size 1 for multi-turn - for batch in tqdm( - dataloader, desc="Greedy Multi Turn generation", position=1, leave=False, disable=self.disable_tqdm + for request in tqdm( + dataset, desc="Greedy Multi Turn generation", position=1, leave=False, disable=self.disable_tqdm ): # NOTE: we are assuming all items in a batch behave similarly (same # stop_tokens and max_tokens genrated) which is not necessarily # the case! 
Because of that we only use batch size of 1 - stop_tokens = batch[0].stop_sequence - max_generated_tokens = batch[0].generation_size - contexts = [c.context for c in batch] + stop_tokens = request.stop_sequence + max_generated_tokens = request.generation_size + context = request.context max_context_size_allowed = self.max_length - max_generated_tokens - multi_turn_context = "" # contexts[0][0] - model_answers = [] - for i, context in enumerate(contexts[0]): - if i > 0: - multi_turn_context += f"\n\n{context}" - else: - multi_turn_context += f"{context}" + tokenized = self.tokenizer( + context, + padding=True, + truncation=True, + return_tensors="pt", + max_length=max_context_size_allowed, + add_special_tokens=self.add_special_tokens, + ).to(self.device) + + prepared_batch = Batch( + input_ids=tokenized["input_ids"], + input_lengths=[len(item == 1) for item in tokenized["attention_mask"]], + input_mask=tokenized["attention_mask"], + truncated=[0] * len(tokenized["input_ids"]), + padded=[0] * len(tokenized["input_ids"]), + ) + + cur_reponses = self._generate( + batch=prepared_batch, + max_tokens=max_generated_tokens, + stop_tokens=stop_tokens, + returns_logits=False, + ) - # print("multi_turn_context ====== ") - # pprint(multi_turn_context) - # print("multi_turn_context ====== ") + model_answers = [cur_reponses[0].result] + + for i, added_context in enumerate(request.contexts_multi_turn): + context += f"{cur_reponses[0].result}" + context += f"\n\n{added_context}" + + print("multi_turn_context ====== ") + pprint(context) + print("multi_turn_context ====== ") tokenized = self.tokenizer( - multi_turn_context, + context, padding=True, truncation=True, return_tensors="pt", @@ -386,10 +408,10 @@ def greedy_until_multi_turn(self, requests: list[GreedyUntilMultiTurnRequest], o ) model_answers.append(cur_reponses[0].result) - multi_turn_context += f"{cur_reponses[0].result}" results.append(GenerateMultiTurnReturn(result=model_answers, input_tokens=[], generated_tokens=[], truncated_tokens_count=0, padded_tokens_count=0)) + pprint(results) return results def greedy_until( diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 113669036..d0d07c84a 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -432,9 +432,10 @@ def construct_requests( task_name=current_task_name, example_index=document_id_seed, request_index=0, - context=formatted_doc.specific["queries"], + context=context, stop_sequence=self.stop_sequence, generation_size=self.generation_size, + contexts_multi_turn=formatted_doc.specific.get("multi_turn_queries", []), ) ] @@ -624,7 +625,7 @@ def create_requests_from_tasks( # noqa: C901 doc.num_effective_few_shots = num_effective_few_shots doc.num_asked_few_shots = num_fewshot doc.ctx = ctx - if use_chat_template: + if use_chat_template and doc.choices is not None: doc.choices = [ lm.tokenizer.apply_chat_template([{"role": "assistant", "content": choice}]) for choice in doc.choices diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py index e8057197f..f93ba994f 100644 --- a/src/lighteval/tasks/requests.py +++ b/src/lighteval/tasks/requests.py @@ -107,11 +107,10 @@ class GreedyUntilMultiTurnRequest(Request): generation_size (int): The maximum number of tokens to generate. request_type (RequestType): The type of the request, set to RequestType.GREEDY_UNTIL. 
""" - context: list[str] # Multi-turn has a list of context stop_sequence: str generation_size: int request_type = RequestType.GREEDY_UNTIL_MULTI_TURN - tokenized_context: list[list[int]] = None + contexts_multi_turn: list[str] @dataclass class GreedyUntilWithLogitsRequest(Request): diff --git a/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/mt_bench.py b/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/mt_bench.py index 11dc3fd2f..29f040afa 100644 --- a/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/mt_bench.py +++ b/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/mt_bench.py @@ -57,7 +57,7 @@ def prompt_fn(line, task_name: str = None): choices=None, instruction="", gold_index=[], - specific={"reference": line["reference"], "category": line["category"], "queries": line["prompt"]}, + specific={"reference": line["reference"], "category": line["category"], "multi_turn_queries": line["prompt"][1:]}, ) @@ -73,7 +73,7 @@ def mt_bench_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> dic judge_prompts = load_judge_prompts(judge_file) judges = make_judge_single(judge_model, judge_prompts) - question = formatted_doc.specific["queries"] + question = [formatted_doc.query] + formatted_doc.specific["multi_turn_queries"] ref_answer = formatted_doc.specific["reference"] category = formatted_doc.specific["category"] From 1357c10946cf2f2becce9ca049c83f205c8bf317 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 6 Mar 2024 16:24:29 +0000 Subject: [PATCH 10/45] working state --- src/lighteval/models/base_model.py | 9 +++---- src/lighteval/tasks/lighteval_task.py | 26 +++++++++++++++++++ .../mt_bench/mt_bench.py | 13 ++++++---- 3 files changed, 38 insertions(+), 10 deletions(-) diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index 83121c117..43ea34c55 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -375,16 +375,15 @@ def greedy_until_multi_turn(self, requests: list[GreedyUntilMultiTurnRequest], o model_answers = [cur_reponses[0].result] - for i, added_context in enumerate(request.contexts_multi_turn): - context += f"{cur_reponses[0].result}" - context += f"\n\n{added_context}" + for i, multi_turn_context in enumerate(request.contexts_multi_turn): + multi_turn_context = multi_turn_context.format(model_response=model_answers[0]) print("multi_turn_context ====== ") - pprint(context) + pprint(multi_turn_context) print("multi_turn_context ====== ") tokenized = self.tokenizer( - context, + multi_turn_context, padding=True, truncation=True, return_tensors="pt", diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index d0d07c84a..c504b93e6 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -610,6 +610,7 @@ def create_requests_from_tasks( # noqa: C901 # to fix!! 
cur_task_name = f"{task_name}|{num_fewshot}" doc = task_docs[doc_id] + is_multi_turn = len(doc.specific.get("multi_turn_queries", [])) > 0 ctx, num_effective_few_shots = task.fewshot_sampler.fewshot_context( task=task, doc=doc, @@ -622,6 +623,31 @@ def create_requests_from_tasks( # noqa: C901 use_chat_template=use_chat_template, system_prompt=system_prompt, ) + if is_multi_turn: + if use_chat_template: + multiturn_context = lm.tokenizer.apply_chat_template( + [ + {"role": "assistant", "content": "{model_response}"}, + {"role": "user", "content": doc.specific["multi_turn_queries"][0]}, + ], + add_generation_prompt=True, + tokenize=False, + ) + generation_prompt = lm.tokenizer.apply_chat_template( + [ + {"role": "assistant", "content": ""}, + ], + add_generation_prompt=False, + tokenize=False, + ) + for i in range(len(generation_prompt)): + if generation_prompt[i] != multiturn_context[i]: + multiturn_context = multiturn_context[i:] + break + multiturn_context = f"{ctx}{multiturn_context}" + else: + multiturn_context = f"{ctx}{{model_response}}\n" + doc.specific["multi_turn_queries"] = [multiturn_context] doc.num_effective_few_shots = num_effective_few_shots doc.num_asked_few_shots = num_fewshot doc.ctx = ctx diff --git a/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/mt_bench.py b/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/mt_bench.py index 29f040afa..4a0560970 100644 --- a/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/mt_bench.py +++ b/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/mt_bench.py @@ -39,8 +39,8 @@ few_shots_split="", few_shots_select="random", metric=["mt_bench_metric"], - generation_size=100, - stop_sequence=["."], + generation_size=1024, + stop_sequence=[], ) @@ -51,11 +51,14 @@ def prompt_fn(line, task_name: str = None): Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info about what this function should do in the README. """ + instruction = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions." + fake = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Implement a program to find the common elements in two arrays without using any extra data structures. ASSISTANT:" + return Doc( task_name=task_name, - query=line["prompt"][0], + query=f"{line['prompt'][0]}", choices=None, - instruction="", + instruction=None, gold_index=[], specific={"reference": line["reference"], "category": line["category"], "multi_turn_queries": line["prompt"][1:]}, ) @@ -69,7 +72,7 @@ def mt_bench_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> dic about what this function should do in the README. 
""" judge_model = "gpt-3.5-turbo" - judge_file = "/Users/nathan/Repos/lighteval/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/judge_prompts.jsonl" + judge_file = "tasks_examples/custom_tasks_with_custom_metrics/mt_bench/judge_prompts.jsonl" judge_prompts = load_judge_prompts(judge_file) judges = make_judge_single(judge_model, judge_prompts) From bb5cca2e08588d2865c78b52468369b22e489cba Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 6 Mar 2024 20:45:30 +0000 Subject: [PATCH 11/45] fix --- src/lighteval/logging/evaluation_tracker.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index 391226a50..4fd2ea5fd 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -462,8 +462,7 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None: last_results_file_path = hf_hub_url(repo_id=repo_id, filename=last_results_file, repo_type="dataset") f = load_dataset("json", data_files=last_results_file_path, split="train") results_dict = f["results"][0] - value = results_dict.pop("all") - new_dictionary = {"all": value} + new_dictionary = {"all": results_dict} new_dictionary.update(results_dict) results_string = json.dumps(new_dictionary, indent=4) From f548902f91ce566bdcf230e79850ab9d56b871f8 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Sat, 9 Mar 2024 10:27:36 +0000 Subject: [PATCH 12/45] continue --- src/lighteval/evaluator.py | 21 ++++++++++++++-- src/lighteval/logging/info_loggers.py | 19 ++++++++++++++- src/lighteval/metrics/__init__.py | 4 ++-- src/lighteval/models/base_model.py | 17 +++++++------ src/lighteval/tasks/lighteval_task.py | 8 +++---- .../mt_bench/judges.py | 2 +- .../mt_bench/mt_bench.py | 24 ++++++++++--------- 7 files changed, 65 insertions(+), 30 deletions(-) diff --git a/src/lighteval/evaluator.py b/src/lighteval/evaluator.py index b9f8a6795..c273c6718 100644 --- a/src/lighteval/evaluator.py +++ b/src/lighteval/evaluator.py @@ -116,10 +116,27 @@ def evaluate( # noqa: C901 doc.instruction = "" # using a deep copy here because process results pops from the model responses - metrics = task.process_results(doc, copy.deepcopy(model_responses)) + metrics = task.process_results(doc, copy.deepcopy(model_responses), evaluation_tracker=evaluation_tracker) + + # Remove the user_prompt from the metrics in case of llm-as-judge metric + if "user_prompt" in metrics: + user_prompt = metrics["user_prompt"] + del metrics["user_prompt"] + else: + user_prompt = None + if "judgement" in metrics: + judgement = metrics["judgement"] + del metrics["judgement"] + else: + judgement = None + + # pprint(doc) + # pprint(metrics) + # pprint(model_responses) + # print("===========") evaluation_tracker.metrics_logger.log(task_example_id.task_name, metrics) - evaluation_tracker.details_logger.log(task_example_id.task_name, task, doc, model_responses, metrics) + evaluation_tracker.details_logger.log(task_example_id.task_name, task, doc, model_responses, metrics, (user_prompt, judgement)) return evaluation_tracker diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index f6aa1d43e..e16a82bc0 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -205,6 +205,9 @@ class Detail: choices: list = field(default_factory=list) gold_index: list = field(default_factory=list) metrics: dict = field(default_factory=dict) + multi_turn_prompts: 
list = field(default_factory=list) + judement_prompt: str = None + judgement: str = None @dataclass class CompiledDetail: @@ -302,7 +305,7 @@ class CompiledHash: compiled_details: dict[str, CompiledDetail] = collections.defaultdict(CompiledDetail) compiled_details_over_all_tasks: CompiledDetailOverAllTasks = CompiledDetailOverAllTasks() - def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[ModelReturn], metrics: dict) -> None: + def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[ModelReturn], metrics: dict, llm_as_prompt_judgement: tuple[str, str]) -> None: """Stores the relevant information for one sample of one task to the total list of samples stored in the DetailsLogger. Args: @@ -356,6 +359,9 @@ def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[Model pred_saved = True if task.has_metric_category[MetricCategory.GENERATIVE_MULTI_TURN]: pred_saved = True + detail.multi_turn_prompts = doc.specific["multi_turn_queries"] + detail.judement_prompt = llm_as_prompt_judgement[0] + detail.judgement = llm_as_prompt_judgement[1] if not pred_saved: raise NotImplementedError( "No metric prediction saved." @@ -440,11 +446,22 @@ class MetricsLogger: metrics_values: dict[str, dict[str, list[float]]] = collections.defaultdict(lambda: collections.defaultdict(list)) metric_aggregated: dict[str, dict[str, float]] = collections.defaultdict(lambda: collections.defaultdict(dict)) + llm_as_judge_prompts: dict[str, dict[str, list[str]]] = collections.defaultdict(lambda: collections.defaultdict(list)) def log(self, task_name: str, metrics: dict) -> None: for metric_name, metric_value in metrics.items(): self.metrics_values[task_name][metric_name].append(metric_value) + def log_llm_as_judge(self, task_name: str, user_prompt: str, judgement: str) -> None: + """Logs the user prompt and the judgement of the model as a judge. + + Args: + user_prompt (str): User prompt used to judge the model response. + judgement (str): Judgement of the model response. + + """ + self.llm_as_judge_prompts[task_name].append({"judgement": judgement, "user_prompt": user_prompt}) + def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int = 1000): # noqa: C901 """ Aggregate the metrics for each task and then for all tasks. 
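To make the llm-as-judge plumbing introduced in this patch easier to follow, here is a simplified, hypothetical sketch (not the evaluator's actual code) of how the judge prompt and judgement returned inside the metric dict are split off from the numeric scores before logging; the metric values are invented for illustration:

```python
# Example of what an llm-as-judge metric such as mt_bench_metric returns:
# numeric scores plus the judge prompt and raw judgement (values invented).
metrics = {
    "single_turn": 7.0,
    "multi_turn": 6.0,
    "user_prompt": ["<judge prompt, turn 1>", "<judge prompt, turn 2>"],
    "judgement": ["[[7]]", "[[6]]"],
}

# The non-numeric judge artefacts are removed so that only scores reach the
# metrics logger; the (user_prompt, judgement) pair is passed on to the
# details logger alongside the document and model responses.
user_prompt = metrics.pop("user_prompt", None)
judgement = metrics.pop("judgement", None)

assert all(isinstance(value, (int, float)) for value in metrics.values())
```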
diff --git a/src/lighteval/metrics/__init__.py b/src/lighteval/metrics/__init__.py index db309c081..5e20d6b8b 100644 --- a/src/lighteval/metrics/__init__.py +++ b/src/lighteval/metrics/__init__.py @@ -148,7 +148,7 @@ def apply_multichoice_metric_one_token(results: list[ModelReturn], formatted_doc return results, outputs -def apply_generative_multi_turn_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[str]): +def apply_generative_multi_turn_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[str], eval_tracker=None): outputs = {} predictions = results.pop(0).result @@ -156,4 +156,4 @@ def apply_generative_multi_turn_metric(results: list[ModelReturn], formatted_doc if Metrics[metric].value.category == MetricCategory.GENERATIVE_MULTI_TURN: outputs.update(Metrics[metric].value.compute(predictions=predictions, formatted_doc=formatted_doc)) - return results, outputs \ No newline at end of file + return results, outputs diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index ad979d0c5..3b50c437c 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -351,17 +351,17 @@ def greedy_until_multi_turn(self, requests: list[GreedyUntilMultiTurnRequest], o request.stop_sequence = as_list(request.stop_sequence) + [self.tokenizer.eos_token] request.tokenized_context = self.tok_encode(request.context) - dataset = GenerativeTaskDataset(requests=requests, dataset_splits=self.DATASET_SPLITS) - dataloader = DataLoader(dataset, batch_size=1, collate_fn=lambda batch: batch) + #dataset = GenerativeTaskDataset(requests=requests, dataset_splits=self.DATASET_SPLITS) + #dataloader = DataLoader(dataset, batch_size=1, collate_fn=lambda batch: batch) results = [] - if self.accelerator: - dataloader = self.accelerator.prepare(dataloader) + # if self.accelerator: + # dataloader = self.accelerator.prepare(dataloader) # Always batch size 1 for multi-turn for request in tqdm( - dataset, desc="Greedy Multi Turn generation", position=1, leave=False, disable=self.disable_tqdm + requests, desc="Greedy Multi Turn generation", position=1, leave=False, disable=self.disable_tqdm ): # NOTE: we are assuming all items in a batch behave similarly (same # stop_tokens and max_tokens genrated) which is not necessarily @@ -400,9 +400,9 @@ def greedy_until_multi_turn(self, requests: list[GreedyUntilMultiTurnRequest], o for i, multi_turn_context in enumerate(request.contexts_multi_turn): multi_turn_context = multi_turn_context.format(model_response=model_answers[0]) - print("multi_turn_context ====== ") - pprint(multi_turn_context) - print("multi_turn_context ====== ") + # print("multi_turn_context ====== ") + # pprint(multi_turn_context) + # print("multi_turn_context ====== ") tokenized = self.tokenizer( multi_turn_context, @@ -432,7 +432,6 @@ def greedy_until_multi_turn(self, requests: list[GreedyUntilMultiTurnRequest], o results.append(GenerateMultiTurnReturn(result=model_answers, input_tokens=[], generated_tokens=[], truncated_tokens_count=0, padded_tokens_count=0)) - pprint(results) return results def greedy_until( diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 4d11125ad..ed3d3ca4b 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -489,13 +489,13 @@ def construct_requests( context=context, stop_sequence=self.stop_sequence, generation_size=self.generation_size, - contexts_multi_turn=formatted_doc.specific.get("multi_turn_queries", []), + 
contexts_multi_turn=formatted_doc.specific.get("multi_turn_queries_context", []), ) ] return requests - def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dict[str, float]: + def process_results(self, formatted_doc: Doc, results: list[ModelReturn], evaluation_tracker) -> dict[str, float]: """ Processes the results of the task, and stores them in the output dict. @@ -540,7 +540,7 @@ def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dic outputs.update(cur_outputs) if self.has_metric_category[MetricCategory.GENERATIVE_MULTI_TURN]: results, cur_outputs = apply_generative_multi_turn_metric( - results=results, formatted_doc=formatted_doc, metrics=self.metrics + results=results, formatted_doc=formatted_doc, metrics=self.metrics, eval_tracker=evaluation_tracker ) outputs.update(cur_outputs) @@ -704,7 +704,7 @@ def create_requests_from_tasks( # noqa: C901 multiturn_context = f"{ctx}{multiturn_context}" else: multiturn_context = f"{ctx}{{model_response}}\n" - doc.specific["multi_turn_queries"] = [multiturn_context] + doc.specific["multi_turn_queries_context"] = [multiturn_context] doc.num_effective_few_shots = num_effective_few_shots doc.num_asked_few_shots = num_fewshot doc.ctx = ctx diff --git a/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/judges.py b/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/judges.py index c26db1bb7..bca381fbd 100644 --- a/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/judges.py +++ b/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/judges.py @@ -185,4 +185,4 @@ def play_a_match_single(question, answer, ref_answer, judge, multi_turn, output_ score, user_prompt, judgment = run_judge_single( question, answer, judge, ref_answer, multi_turn=multi_turn ) - return score + return score, user_prompt, judgment diff --git a/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/mt_bench.py b/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/mt_bench.py index 4a0560970..93174ea5c 100644 --- a/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/mt_bench.py +++ b/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/mt_bench.py @@ -39,7 +39,7 @@ few_shots_split="", few_shots_select="random", metric=["mt_bench_metric"], - generation_size=1024, + generation_size=10, stop_sequence=[], ) @@ -81,30 +81,32 @@ def mt_bench_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> dic category = formatted_doc.specific["category"] if category not in NEED_REF_CATS: - score = play_a_match_single(question, predictions, ref_answer, judges["default"], multi_turn=False, output_file=None) - score_mt = play_a_match_single(question, predictions, ref_answer, judges["default-mt"], multi_turn=True, output_file=None) + score, user_prompt_1, judgement_1 = play_a_match_single(question, predictions, ref_answer, judges["default"], multi_turn=False, output_file=None) + score_mt, user_prompt_2, judgement_2 = play_a_match_single(question, predictions, ref_answer, judges["default-mt"], multi_turn=True, output_file=None) else: try: - score = play_a_match_single(question, predictions, ref_answer, judges["math"], multi_turn=False, output_file=None) - score_mt = play_a_match_single(question, predictions, ref_answer, judges["math-mt"], multi_turn=True, output_file=None) + score, user_prompt_1, judgement_1 = play_a_match_single(question, predictions, ref_answer, judges["math"], multi_turn=False, output_file=None) + score_mt, user_prompt_2, judgement_2 = play_a_match_single(question, predictions, ref_answer, 
judges["math-mt"], multi_turn=True, output_file=None) except KeyError: print(f"Category {category} not found in judge prompts, using default judge") - score = play_a_match_single(question, predictions, ref_answer, judges["default"], multi_turn=False, output_file=None) - score_mt = play_a_match_single(question, predictions, ref_answer, judges["default-mt"], multi_turn=True, output_file=None) + score, user_prompt_1, judgement_1 = play_a_match_single(question, predictions, ref_answer, judges["default"], multi_turn=False, output_file=None) + score_mt, user_prompt_2, judgement_2 = play_a_match_single(question, predictions, ref_answer, judges["default-mt"], multi_turn=True, output_file=None) - return score + return {"single_turn": score, "multi_turn": score_mt, "user_prompt": [user_prompt_1, user_prompt_2], "judgement": [judgement_1, judgement_2]} -mt_bench_metric = SampleLevelMetric( +mt_bench_metric = SampleLevelMetricGrouping( metric="mt_bench_metric", higher_is_better=True, category=MetricCategory.GENERATIVE_MULTI_TURN, use_case=MetricUseCase.SUMMARIZATION, sample_level_fn=mt_bench_metric, - corpus_level_fn=np.mean, + corpus_level_fn={ + "single_turn": np.mean, + "multi_turn": np.mean, + } ) - ## STORE YOUR EVALS _TASKS = [task] From 2e2b15d9582f5f407b3a7a46e218e171e540aac2 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Mon, 11 Mar 2024 09:18:33 +0000 Subject: [PATCH 13/45] continue --- src/lighteval/logging/info_loggers.py | 4 ++-- src/lighteval/models/base_model.py | 11 +++++++---- .../mt_bench/mt_bench.py | 5 +---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index e16a82bc0..82c834e85 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -205,9 +205,9 @@ class Detail: choices: list = field(default_factory=list) gold_index: list = field(default_factory=list) metrics: dict = field(default_factory=dict) - multi_turn_prompts: list = field(default_factory=list) judement_prompt: str = None judgement: str = None + specifics: dict = field(default_factory=dict) @dataclass class CompiledDetail: @@ -359,9 +359,9 @@ def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[Model pred_saved = True if task.has_metric_category[MetricCategory.GENERATIVE_MULTI_TURN]: pred_saved = True - detail.multi_turn_prompts = doc.specific["multi_turn_queries"] detail.judement_prompt = llm_as_prompt_judgement[0] detail.judgement = llm_as_prompt_judgement[1] + detail.specifics = doc.specific if not pred_saved: raise NotImplementedError( "No metric prediction saved." 
diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index 3b50c437c..fbc64aefd 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -35,7 +35,13 @@ from lighteval.logging.hierarchical_logger import hlog, hlog_err, hlog_warn from lighteval.models.abstract_model import LightevalModel from lighteval.models.model_config import BaseModelConfig, EnvConfig -from lighteval.models.model_output import Batch, GenerateReturn, LoglikelihoodReturn, LoglikelihoodSingleTokenReturn, GenerateMultiTurnReturn +from lighteval.models.model_output import ( + Batch, + GenerateMultiTurnReturn, + GenerateReturn, + LoglikelihoodReturn, + LoglikelihoodSingleTokenReturn, +) from lighteval.models.utils import _get_dtype, _get_precision, _simplify_name from lighteval.tasks.requests import ( GreedyUntilMultiTurnRequest, @@ -351,9 +357,6 @@ def greedy_until_multi_turn(self, requests: list[GreedyUntilMultiTurnRequest], o request.stop_sequence = as_list(request.stop_sequence) + [self.tokenizer.eos_token] request.tokenized_context = self.tok_encode(request.context) - #dataset = GenerativeTaskDataset(requests=requests, dataset_splits=self.DATASET_SPLITS) - #dataloader = DataLoader(dataset, batch_size=1, collate_fn=lambda batch: batch) - results = [] # if self.accelerator: diff --git a/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/mt_bench.py b/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/mt_bench.py index 93174ea5c..68e7dc5d9 100644 --- a/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/mt_bench.py +++ b/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/mt_bench.py @@ -39,7 +39,7 @@ few_shots_split="", few_shots_select="random", metric=["mt_bench_metric"], - generation_size=10, + generation_size=1024, stop_sequence=[], ) @@ -51,9 +51,6 @@ def prompt_fn(line, task_name: str = None): Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info about what this function should do in the README. """ - instruction = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions." - fake = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Implement a program to find the common elements in two arrays without using any extra data structures. 
ASSISTANT:" - return Doc( task_name=task_name, query=f"{line['prompt'][0]}", From 339f1f661e42b1e14a5965a919b880e4147ca490 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 20 Mar 2024 11:16:04 +0000 Subject: [PATCH 14/45] commit --- src/lighteval/logging/info_loggers.py | 2 +- src/lighteval/models/base_model.py | 5 +- src/lighteval/tasks/lighteval_task.py | 84 +++++++++---------- src/lighteval/tasks/requests.py | 2 +- .../mt_bench/judges.py | 15 ---- .../mt_bench/mt_bench.py | 8 +- 6 files changed, 49 insertions(+), 67 deletions(-) diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index 82c834e85..726fd27d7 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -372,7 +372,7 @@ def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[Model hash = self.Hash() hash.example = xxhash.xxh64(doc.query).hexdigest() - hash.full_prompt = xxhash.xxh64(doc.ctx).hexdigest() + hash.full_prompt = xxhash.xxh64(str(doc.ctx)).hexdigest() hash.input_tokens = xxhash.xxh64(str([o.input_tokens for o in outputs])).hexdigest() hash.cont_tokens = xxhash.xxh64(str([o.generated_tokens for o in outputs])).hexdigest() self.hashes[task_name].append(hash) diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index fbc64aefd..b87e24734 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -371,7 +371,7 @@ def greedy_until_multi_turn(self, requests: list[GreedyUntilMultiTurnRequest], o # the case! Because of that we only use batch size of 1 stop_tokens = request.stop_sequence max_generated_tokens = request.generation_size - context = request.context + context = request.context[0] max_context_size_allowed = self.max_length - max_generated_tokens tokenized = self.tokenizer( @@ -400,7 +400,7 @@ def greedy_until_multi_turn(self, requests: list[GreedyUntilMultiTurnRequest], o model_answers = [cur_reponses[0].result] - for i, multi_turn_context in enumerate(request.contexts_multi_turn): + for i, multi_turn_context in enumerate(request.context[1:]): multi_turn_context = multi_turn_context.format(model_response=model_answers[0]) # print("multi_turn_context ====== ") @@ -568,6 +568,7 @@ def _generate( pad_token_id=self.tokenizer.pad_token_id if self.tokenizer.pad_token_id else self.tokenizer.eos_token_id, return_dict_in_generate=True, output_scores=True, + eos_token_id=self.tokenizer.eos_token_id ) if returns_logits: logits = self.model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=True) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index ed3d3ca4b..65120c6b7 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -489,7 +489,6 @@ def construct_requests( context=context, stop_sequence=self.stop_sequence, generation_size=self.generation_size, - contexts_multi_turn=formatted_doc.specific.get("multi_turn_queries_context", []), ) ] @@ -668,51 +667,48 @@ def create_requests_from_tasks( # noqa: C901 cur_task_name = f"{task_name}|{num_fewshot}" doc = task_docs[doc_id] is_multi_turn = len(doc.specific.get("multi_turn_queries", [])) > 0 - ctx, num_effective_few_shots = task.fewshot_sampler.fewshot_context( - task=task, - doc=doc, - num_fewshot=num_fewshot, - seed=seed, - truncate_few_shots=truncate_few_shots, - max_model_length=lm.max_length, - sampler=rnd, - tokenizer=lm.tokenizer, - use_chat_template=use_chat_template, - 
system_prompt=system_prompt, - ) - if is_multi_turn: + + if not is_multi_turn: + ctx, num_effective_few_shots = task.fewshot_sampler.fewshot_context( + task=task, + doc=doc, + num_fewshot=num_fewshot, + seed=seed, + truncate_few_shots=truncate_few_shots, + max_model_length=lm.max_length, + sampler=rnd, + tokenizer=lm.tokenizer, + use_chat_template=use_chat_template, + system_prompt=system_prompt, + ) + doc.num_effective_few_shots = num_effective_few_shots + doc.num_asked_few_shots = num_fewshot + else: if use_chat_template: - multiturn_context = lm.tokenizer.apply_chat_template( - [ - {"role": "assistant", "content": "{model_response}"}, - {"role": "user", "content": doc.specific["multi_turn_queries"][0]}, - ], - add_generation_prompt=True, - tokenize=False, - ) - generation_prompt = lm.tokenizer.apply_chat_template( - [ - {"role": "assistant", "content": ""}, - ], - add_generation_prompt=False, - tokenize=False, - ) - for i in range(len(generation_prompt)): - if generation_prompt[i] != multiturn_context[i]: - multiturn_context = multiturn_context[i:] - break - multiturn_context = f"{ctx}{multiturn_context}" - else: - multiturn_context = f"{ctx}{{model_response}}\n" - doc.specific["multi_turn_queries_context"] = [multiturn_context] - doc.num_effective_few_shots = num_effective_few_shots - doc.num_asked_few_shots = num_fewshot + k = [] + if system_prompt is not None: + k.append({"role": "system", "content": system_prompt}) + + for i in doc.specific["multi_turn_queries"]: + k.append( + {"role": "user", "content": i} + ) + k.append({"role": "assistant", "content": "{model_response}"}) + k.pop(-1) + + from pprint import pprint + ctx = [] + + offset = 2 if system_prompt is not None else 1 + + for i in range(0, len(k), offset+1): + c = lm.tokenizer.apply_chat_template(k[:i+offset], add_generation_prompt=True, tokenize=False, add_special_tokens=False) + ctx.append(c) + + doc.specific["multi_turn_queries_context"] = ctx + doc.num_effective_few_shots = 0 + doc.num_asked_few_shots = 0 doc.ctx = ctx - if use_chat_template and doc.choices is not None: - doc.choices = [ - lm.tokenizer.apply_chat_template([{"role": "assistant", "content": choice}]) - for choice in doc.choices - ] # Constructing the requests docs[TaskExampleId(cur_task_name, doc_id_seed)] = doc diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py index ebe245b1e..c867eb926 100644 --- a/src/lighteval/tasks/requests.py +++ b/src/lighteval/tasks/requests.py @@ -132,7 +132,7 @@ class GreedyUntilMultiTurnRequest(Request): stop_sequence: str generation_size: int request_type = RequestType.GREEDY_UNTIL_MULTI_TURN - contexts_multi_turn: list[str] + @dataclass class GreedyUntilWithLogitsRequest(Request): diff --git a/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/judges.py b/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/judges.py index bca381fbd..05d986133 100644 --- a/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/judges.py +++ b/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/judges.py @@ -5,8 +5,6 @@ import os import re import time -from pprint import pprint -from random import randrange import openai @@ -81,7 +79,6 @@ def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None): openai.api_base = api_dict["api_base"] openai.api_key = api_dict["api_key"] output = API_ERROR_OUTPUT - # return "[[1]]" for _ in range(API_MAX_RETRY): try: messages = conv.to_openai_api_messages() @@ -124,13 +121,6 @@ def run_judge_single(question, answer, judge, ref_answer, 
multi_turn=False): kwargs["ref_answer_2"] = ref_answer[1] if multi_turn: - # pprint(question[0]) - # pprint(question[1]) - # pprint(answer[0]) - # pprint(answer[1]) - # pprint(kwargs) - # pprint(judge.prompt_template["prompt_template"]) - # print("========") user_prompt = judge.prompt_template["prompt_template"].format( question_1=question[0], question_2=question[1], @@ -139,11 +129,6 @@ def run_judge_single(question, answer, judge, ref_answer, multi_turn=False): **kwargs, ) else: - # pprint(question[0]) - # pprint(answer[0]) - # pprint(kwargs) - # pprint(judge.prompt_template["prompt_template"]) - # pprint("========") user_prompt = judge.prompt_template["prompt_template"].format( question=question[0], answer=answer[0], diff --git a/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/mt_bench.py b/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/mt_bench.py index 68e7dc5d9..b952a2d7a 100644 --- a/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/mt_bench.py +++ b/tasks_examples/custom_tasks_with_custom_metrics/mt_bench/mt_bench.py @@ -32,7 +32,7 @@ name="mt_bench", prompt_function="prompt_fn", # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py suite=["custom"], - hf_repo="HuggingFaceH4/mt_bench_prompts", + hf_repo="SaylorTwift/mt-bench", hf_subset="default", hf_avail_splits=["train"], evaluation_splits=["train"], @@ -53,11 +53,11 @@ def prompt_fn(line, task_name: str = None): """ return Doc( task_name=task_name, - query=f"{line['prompt'][0]}", + query=f"{line['turns'][0]}", choices=None, instruction=None, gold_index=[], - specific={"reference": line["reference"], "category": line["category"], "multi_turn_queries": line["prompt"][1:]}, + specific={"reference": line["reference"], "category": line["category"], "multi_turn_queries": line["turns"], "id": line["question_id"]}, ) @@ -73,7 +73,7 @@ def mt_bench_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> dic judge_prompts = load_judge_prompts(judge_file) judges = make_judge_single(judge_model, judge_prompts) - question = [formatted_doc.query] + formatted_doc.specific["multi_turn_queries"] + question = formatted_doc.specific["multi_turn_queries"] ref_answer = formatted_doc.specific["reference"] category = formatted_doc.specific["category"] From 5bc5b98eac305a960fcc3e9f4c1ca6dc09c48cbe Mon Sep 17 00:00:00 2001 From: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Date: Wed, 20 Mar 2024 12:23:32 +0100 Subject: [PATCH 15/45] Update README.md --- README.md | 7 ------- 1 file changed, 7 deletions(-) diff --git a/README.md b/README.md index d0f1c21de..22400f5f8 100644 --- a/README.md +++ b/README.md @@ -239,13 +239,6 @@ Summary: create a **line summary** of your evaluation, in `src/lighteval/tasks/t - `frozen` (bool), for now is set to False, but we will steadily pass all stable tasks to True. - `trust_dataset` (bool), set to True if you trust the dataset. -Make sure you can launch your model with your new task using `--tasks lighteval|yournewtask|2|0`. - -#### Community evaluations -Copy the `community_tasks/_template.yml` to `community_tasks/yourevalname.py` and edit it to add your custom tasks (the parameters you can use are explained above). It contains an interesting mechanism if the dataset you are adding contains a lot of subsets. - -Make sure you can launch your model with your new task using `--tasks community|yournewtask|2|0 --custom_tasks community_tasks/yourevalname.py`. 
- ### Adding a new metric First check if you can use one of the parametrized functions in `src.lighteval.metrics.metrics_corpus` or `src.lighteval.metrics.metrics_sample`. From cd1300dd1f284159752f9c3af28d348a086712e7 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 20 Mar 2024 11:43:16 +0000 Subject: [PATCH 16/45] commit --- extended_tasks/mt_bench/mt_bench.py | 122 ---------------------------- extended_tasks/requirements.txt | 1 - src/lighteval/tasks/registry.py | 1 + 3 files changed, 1 insertion(+), 123 deletions(-) delete mode 100644 extended_tasks/mt_bench/mt_bench.py delete mode 100644 extended_tasks/requirements.txt diff --git a/extended_tasks/mt_bench/mt_bench.py b/extended_tasks/mt_bench/mt_bench.py deleted file mode 100644 index b952a2d7a..000000000 --- a/extended_tasks/mt_bench/mt_bench.py +++ /dev/null @@ -1,122 +0,0 @@ -# ruff: noqa: F405, F403, F401 -""" -Custom evaluation tasks for lighteval. Copy this file and complete it with the info for your task. -This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval. -Author: -""" - -from pprint import pprint - -import numpy as np -from aenum import extend_enum -from transformers import AutoModelForCausalLM, AutoTokenizer - -from lighteval.metrics import Metrics -from lighteval.metrics.utils import MetricCategory, MetricUseCase, SampleLevelMetric, SampleLevelMetricGrouping -from lighteval.tasks.lighteval_task import LightevalTaskConfig -from lighteval.tasks.requests import Doc -from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES -from tasks_examples.custom_tasks_with_custom_metrics.mt_bench.judges import ( - load_judge_prompts, - make_judge_single, - play_a_match_single, -) - - -NEED_REF_CATS = ["math", "reasoning", "coding", "arena-hard-200"] - -## EVAL WITH NO SUBSET ## -# This is how you create a simple tasks (like hellaswag) which has one single subset -# attached to it, and one evaluation possible. -task = LightevalTaskConfig( - name="mt_bench", - prompt_function="prompt_fn", # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py - suite=["custom"], - hf_repo="SaylorTwift/mt-bench", - hf_subset="default", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split="", - few_shots_select="random", - metric=["mt_bench_metric"], - generation_size=1024, - stop_sequence=[], -) - - -## DEFINE YOUR PROMPT FUNCTIONS -# Define as many as you need for your different tasks -def prompt_fn(line, task_name: str = None): - """Defines how to go from a dataset line to a doc object. - Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info - about what this function should do in the README. - """ - return Doc( - task_name=task_name, - query=f"{line['turns'][0]}", - choices=None, - instruction=None, - gold_index=[], - specific={"reference": line["reference"], "category": line["category"], "multi_turn_queries": line["turns"], "id": line["question_id"]}, - ) - - - - -def mt_bench_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]: - """Defines how to go from a list of predictions to a score. - Follow examples in src/lighteval/metrics/metrics.py, or get more info - about what this function should do in the README. 
- """ - judge_model = "gpt-3.5-turbo" - judge_file = "tasks_examples/custom_tasks_with_custom_metrics/mt_bench/judge_prompts.jsonl" - judge_prompts = load_judge_prompts(judge_file) - judges = make_judge_single(judge_model, judge_prompts) - - question = formatted_doc.specific["multi_turn_queries"] - ref_answer = formatted_doc.specific["reference"] - category = formatted_doc.specific["category"] - - if category not in NEED_REF_CATS: - score, user_prompt_1, judgement_1 = play_a_match_single(question, predictions, ref_answer, judges["default"], multi_turn=False, output_file=None) - score_mt, user_prompt_2, judgement_2 = play_a_match_single(question, predictions, ref_answer, judges["default-mt"], multi_turn=True, output_file=None) - else: - try: - score, user_prompt_1, judgement_1 = play_a_match_single(question, predictions, ref_answer, judges["math"], multi_turn=False, output_file=None) - score_mt, user_prompt_2, judgement_2 = play_a_match_single(question, predictions, ref_answer, judges["math-mt"], multi_turn=True, output_file=None) - except KeyError: - print(f"Category {category} not found in judge prompts, using default judge") - score, user_prompt_1, judgement_1 = play_a_match_single(question, predictions, ref_answer, judges["default"], multi_turn=False, output_file=None) - score_mt, user_prompt_2, judgement_2 = play_a_match_single(question, predictions, ref_answer, judges["default-mt"], multi_turn=True, output_file=None) - - return {"single_turn": score, "multi_turn": score_mt, "user_prompt": [user_prompt_1, user_prompt_2], "judgement": [judgement_1, judgement_2]} - - -mt_bench_metric = SampleLevelMetricGrouping( - metric="mt_bench_metric", - higher_is_better=True, - category=MetricCategory.GENERATIVE_MULTI_TURN, - use_case=MetricUseCase.SUMMARIZATION, - sample_level_fn=mt_bench_metric, - corpus_level_fn={ - "single_turn": np.mean, - "multi_turn": np.mean, - } -) - -## STORE YOUR EVALS -_TASKS = [task] - -## MODULE LOGIC -# You should not need to touch this -# Convert to dict for lighteval -TASKS_TABLE = [task.as_dict() for task in _TASKS] -extend_enum( - Metrics, - "mt_bench_metric", - mt_bench_metric, -) - -if __name__ == "__main__": - print(t["name"] for t in TASKS_TABLE) - print(len(TASKS_TABLE)) diff --git a/extended_tasks/requirements.txt b/extended_tasks/requirements.txt deleted file mode 100644 index 7f42284c9..000000000 --- a/extended_tasks/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -langdetect diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index 5625fe75d..fde147426 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -177,6 +177,7 @@ def create_custom_tasks_module(custom_tasks: Union[str, ModuleType]) -> ModuleTy if isinstance(custom_tasks, ModuleType): return custom_tasks if isinstance(custom_tasks, (str, Path)) and os.path.exists(custom_tasks): + print(f"{custom_tasks=}") dataset_module = dataset_module_factory(str(custom_tasks)) return importlib.import_module(dataset_module.module_path) if isinstance(custom_tasks, (str, Path)): From 4b00eb7d7b40776977b4292a39f5afe711cfbdd5 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 20 Mar 2024 14:14:15 +0000 Subject: [PATCH 17/45] commit --- tasks_examples/OALL_tasks.txt | 117 ++ tasks_examples/all_tasks.txt | 1146 +++++++++++++++++ tasks_examples/bbh.txt | 36 + .../custom_tasks/custom_evaluation_tasks.py | 691 ++++++++++ tasks_examples/custom_tasks/custom_task.py | 100 ++ .../lighteval_config_override_template.yaml | 30 + 
tasks_examples/open_llm_leaderboard_tasks.txt | 68 + tasks_examples/recommended_set.txt | 160 +++ 8 files changed, 2348 insertions(+) create mode 100644 tasks_examples/OALL_tasks.txt create mode 100644 tasks_examples/all_tasks.txt create mode 100644 tasks_examples/bbh.txt create mode 100644 tasks_examples/custom_tasks/custom_evaluation_tasks.py create mode 100644 tasks_examples/custom_tasks/custom_task.py create mode 100644 tasks_examples/custom_tasks/lighteval_config_override_template.yaml create mode 100644 tasks_examples/open_llm_leaderboard_tasks.txt create mode 100644 tasks_examples/recommended_set.txt diff --git a/tasks_examples/OALL_tasks.txt b/tasks_examples/OALL_tasks.txt new file mode 100644 index 000000000..5428fba49 --- /dev/null +++ b/tasks_examples/OALL_tasks.txt @@ -0,0 +1,117 @@ +community|arabic_mmlu:abstract_algebra|5|1 +community|arabic_mmlu:anatomy|5|1 +community|arabic_mmlu:astronomy|5|1 +community|arabic_mmlu:business_ethics|5|1 +community|arabic_mmlu:clinical_knowledge|5|1 +community|arabic_mmlu:college_biology|5|1 +community|arabic_mmlu:college_chemistry|5|1 +community|arabic_mmlu:college_computer_science|5|1 +community|arabic_mmlu:college_mathematics|5|1 +community|arabic_mmlu:college_medicine|5|1 +community|arabic_mmlu:college_physics|5|1 +community|arabic_mmlu:computer_security|5|1 +community|arabic_mmlu:conceptual_physics|5|1 +community|arabic_mmlu:econometrics|5|1 +community|arabic_mmlu:electrical_engineering|5|1 +community|arabic_mmlu:elementary_mathematics|5|1 +community|arabic_mmlu:formal_logic|5|1 +community|arabic_mmlu:global_facts|5|1 +community|arabic_mmlu:high_school_biology|5|1 +community|arabic_mmlu:high_school_chemistry|5|1 +community|arabic_mmlu:high_school_computer_science|5|1 +community|arabic_mmlu:high_school_european_history|5|1 +community|arabic_mmlu:high_school_geography|5|1 +community|arabic_mmlu:high_school_government_and_politics|5|1 +community|arabic_mmlu:high_school_macroeconomics|5|1 +community|arabic_mmlu:high_school_mathematics|5|1 +community|arabic_mmlu:high_school_microeconomics|5|1 +community|arabic_mmlu:high_school_physics|5|1 +community|arabic_mmlu:high_school_psychology|5|1 +community|arabic_mmlu:high_school_statistics|5|1 +community|arabic_mmlu:high_school_us_history|5|1 +community|arabic_mmlu:high_school_world_history|5|1 +community|arabic_mmlu:human_aging|5|1 +community|arabic_mmlu:human_sexuality|5|1 +community|arabic_mmlu:international_law|5|1 +community|arabic_mmlu:jurisprudence|5|1 +community|arabic_mmlu:logical_fallacies|5|1 +community|arabic_mmlu:machine_learning|5|1 +community|arabic_mmlu:management|5|1 +community|arabic_mmlu:marketing|5|1 +community|arabic_mmlu:medical_genetics|5|1 +community|arabic_mmlu:miscellaneous|5|1 +community|arabic_mmlu:moral_disputes|5|1 +community|arabic_mmlu:moral_scenarios|5|1 +community|arabic_mmlu:nutrition|5|1 +community|arabic_mmlu:philosophy|5|1 +community|arabic_mmlu:prehistory|5|1 +community|arabic_mmlu:professional_accounting|5|1 +community|arabic_mmlu:professional_law|5|1 +community|arabic_mmlu:professional_medicine|5|1 +community|arabic_mmlu:professional_psychology|5|1 +community|arabic_mmlu:public_relations|5|1 +community|arabic_mmlu:security_studies|5|1 +community|arabic_mmlu:sociology|5|1 +community|arabic_mmlu:us_foreign_policy|5|1 +community|arabic_mmlu:virology|5|1 +community|arabic_mmlu:world_religions|5|1 +community|arabic_exams|5|1 +community|acva:Algeria|5|1 +community|acva:Ancient_Egypt|5|1 +community|acva:Arab_Empire|5|1 +community|acva:Arabic_Architecture|5|1 
+community|acva:Arabic_Art|5|1 +community|acva:Arabic_Astronomy|5|1 +community|acva:Arabic_Calligraphy|5|1 +community|acva:Arabic_Ceremony|5|1 +community|acva:Arabic_Clothing|5|1 +community|acva:Arabic_Culture|5|1 +community|acva:Arabic_Food|5|1 +community|acva:Arabic_Funeral|5|1 +community|acva:Arabic_Geography|5|1 +community|acva:Arabic_History|5|1 +community|acva:Arabic_Language_Origin|5|1 +community|acva:Arabic_Literature|5|1 +community|acva:Arabic_Math|5|1 +community|acva:Arabic_Medicine|5|1 +community|acva:Arabic_Music|5|1 +community|acva:Arabic_Ornament|5|1 +community|acva:Arabic_Philosophy|5|1 +community|acva:Arabic_Physics_and_Chemistry|5|1 +community|acva:Arabic_Wedding|5|1 +community|acva:Bahrain|5|1 +community|acva:Comoros|5|1 +community|acva:Egypt_modern|5|1 +community|acva:InfluenceFromAncientEgypt|5|1 +community|acva:InfluenceFromByzantium|5|1 +community|acva:InfluenceFromChina|5|1 +community|acva:InfluenceFromGreece|5|1 +community|acva:InfluenceFromIslam|5|1 +community|acva:InfluenceFromPersia|5|1 +community|acva:InfluenceFromRome|5|1 +community|acva:Iraq|5|1 +community|acva:Islam_Education|5|1 +community|acva:Islam_branches_and_schools|5|1 +community|acva:Islamic_law_system|5|1 +community|acva:Jordan|5|1 +community|acva:Kuwait|5|1 +community|acva:Lebanon|5|1 +community|acva:Libya|5|1 +community|acva:Mauritania|5|1 +community|acva:Mesopotamia_civilization|5|1 +community|acva:Morocco|5|1 +community|acva:Oman|5|1 +community|acva:Palestine|5|1 +community|acva:Qatar|5|1 +community|acva:Saudi_Arabia|5|1 +community|acva:Somalia|5|1 +community|acva:Sudan|5|1 +community|acva:Syria|5|1 +community|acva:Tunisia|5|1 +community|acva:United_Arab_Emirates|5|1 +community|acva:Yemen|5|1 +community|acva:communication|5|1 +community|acva:computer_and_phone|5|1 +community|acva:daily_life|5|1 +community|acva:entertainment|5|1 +lighteval|xstory_cloze:ar|0|0 diff --git a/tasks_examples/all_tasks.txt b/tasks_examples/all_tasks.txt new file mode 100644 index 000000000..894ffc6f9 --- /dev/null +++ b/tasks_examples/all_tasks.txt @@ -0,0 +1,1146 @@ +bigbench|abstract_narrative_understanding|0|0 +bigbench|anachronisms|0|0 +bigbench|analogical_similarity|0|0 +bigbench|analytic_entailment|0|0 +bigbench|arithmetic_bb|0|0 +bigbench|ascii_word_recognition|0|0 +bigbench|authorship_verification|0|0 +bigbench|auto_categorization|0|0 +bigbench|auto_debugging|0|0 +bigbench|bbq_lite_json|0|0 +bigbench|bridging_anaphora_resolution_barqa|0|0 +bigbench|causal_judgment|0|0 +bigbench|cause_and_effect|0|0 +bigbench|checkmate_in_one|0|0 +bigbench|chess_state_tracking|0|0 +bigbench|chinese_remainder_theorem|0|0 +bigbench|cifar10_classification|0|0 +bigbench|code_line_description|0|0 +bigbench|codenames|0|0 +bigbench|color|0|0 +bigbench|common_morpheme|0|0 +bigbench|conceptual_combinations|0|0 +bigbench|conlang_translation|0|0 +bigbench|contextual_parametric_knowledge_conflicts|0|0 +bigbench|crash_blossom|0|0 +bigbench|crass_ai|0|0 +bigbench|cryobiology_spanish|0|0 +bigbench|cryptonite|0|0 +bigbench|cs_algorithms|0|0 +bigbench|dark_humor_detection|0|0 +bigbench|date_understanding|0|0 +bigbench|disambiguation_qa|0|0 +bigbench|discourse_marker_prediction|0|0 +bigbench|disfl_qa|0|0 +bigbench|dyck_languages|0|0 +bigbench|elementary_math_qa|0|0 +bigbench|emoji_movie|0|0 +bigbench|emojis_emotion_prediction|0|0 +bigbench|empirical_judgments|0|0 +bigbench|english_proverbs|0|0 +bigbench|english_russian_proverbs|0|0 +bigbench|entailed_polarity_hindi|0|0 +bigbench|entailed_polarity|0|0 +bigbench|epistemic_reasoning|0|0 
+bigbench|evaluating_information_essentiality|0|0 +bigbench|fact_checker|0|0 +bigbench|fantasy_reasoning|0|0 +bigbench|few_shot_nlg|0|0 +bigbench|figure_of_speech_detection|0|0 +bigbench|formal_fallacies_syllogisms_negation|0|0 +bigbench|gem|0|0 +bigbench|gender_inclusive_sentences_german|0|0 +bigbench|general_knowledge|0|0 +bigbench|geometric_shapes|0|0 +bigbench|goal_step_wikihow|0|0 +bigbench|gre_reading_comprehension|0|0 +bigbench|hhh_alignment|0|0 +bigbench|hindi_question_answering|0|0 +bigbench|hindu_knowledge|0|0 +bigbench|hinglish_toxicity|0|0 +bigbench|human_organs_senses|0|0 +bigbench|hyperbaton|0|0 +bigbench|identify_math_theorems|0|0 +bigbench|identify_odd_metaphor|0|0 +bigbench|implicatures|0|0 +bigbench|implicit_relations|0|0 +bigbench|intent_recognition|0|0 +bigbench|international_phonetic_alphabet_nli|0|0 +bigbench|international_phonetic_alphabet_transliterate|0|0 +bigbench|intersect_geometry|0|0 +bigbench|irony_identification|0|0 +bigbench|kanji_ascii|0|0 +bigbench|kannada|0|0 +bigbench|key_value_maps|0|0 +bigbench|known_unknowns|0|0 +bigbench|language_games|0|0 +bigbench|language_identification|0|0 +bigbench|linguistic_mappings|0|0 +bigbench|linguistics_puzzles|0|0 +bigbench|logic_grid_puzzle|0|0 +bigbench|logical_args|0|0 +bigbench|logical_deduction|0|0 +bigbench|logical_fallacy_detection|0|0 +bigbench|logical_sequence|0|0 +bigbench|mathematical_induction|0|0 +bigbench|matrixshapes|0|0 +bigbench|metaphor_boolean|0|0 +bigbench|metaphor_understanding|0|0 +bigbench|minute_mysteries_qa|0|0 +bigbench|misconceptions_russian|0|0 +bigbench|misconceptions|0|0 +bigbench|mnist_ascii|0|0 +bigbench|modified_arithmetic|0|0 +bigbench|moral_permissibility|0|0 +bigbench|movie_dialog_same_or_different|0|0 +bigbench|movie_recommendation|0|0 +bigbench|mult_data_wrangling|0|0 +bigbench|multiemo|0|0 +bigbench|natural_instructions|0|0 +bigbench|navigate|0|0 +bigbench|nonsense_words_grammar|0|0 +bigbench|novel_concepts|0|0 +bigbench|object_counting|0|0 +bigbench|odd_one_out|0|0 +bigbench|operators|0|0 +bigbench|paragraph_segmentation|0|0 +bigbench|parsinlu_qa|0|0 +bigbench|parsinlu_reading_comprehension|0|0 +bigbench|penguins_in_a_table|0|0 +bigbench|periodic_elements|0|0 +bigbench|persian_idioms|0|0 +bigbench|phrase_relatedness|0|0 +bigbench|physical_intuition|0|0 +bigbench|physics_questions|0|0 +bigbench|physics|0|0 +bigbench|play_dialog_same_or_different|0|0 +bigbench|polish_sequence_labeling|0|0 +bigbench|presuppositions_as_nli|0|0 +bigbench|qa_wikidata|0|0 +bigbench|question_selection|0|0 +bigbench|real_or_fake_text|0|0 +bigbench|reasoning_about_colored_objects|0|0 +bigbench|repeat_copy_logic|0|0 +bigbench|rephrase|0|0 +bigbench|rhyming|0|0 +bigbench|riddle_sense|0|0 +bigbench|ruin_names|0|0 +bigbench|salient_translation_error_detection|0|0 +bigbench|scientific_press_release|0|0 +bigbench|semantic_parsing_in_context_sparc|0|0 +bigbench|semantic_parsing_spider|0|0 +bigbench|sentence_ambiguity|0|0 +bigbench|similarities_abstraction|0|0 +bigbench|simp_turing_concept|0|0 +bigbench|simple_arithmetic_json_multiple_choice|0|0 +bigbench|simple_arithmetic_json_subtasks|0|0 +bigbench|simple_arithmetic_json|0|0 +bigbench|simple_arithmetic_multiple_targets_json|0|0 +bigbench|simple_ethical_questions|0|0 +bigbench|simple_text_editing|0|0 +bigbench|snarks|0|0 +bigbench|social_iqa|0|0 +bigbench|social_support|0|0 +bigbench|sports_understanding|0|0 +bigbench|strange_stories|0|0 +bigbench|strategyqa|0|0 +bigbench|sufficient_information|0|0 +bigbench|suicide_risk|0|0 +bigbench|swahili_english_proverbs|0|0 
+bigbench|swedish_to_german_proverbs|0|0 +bigbench|symbol_interpretation|0|0 +bigbench|tellmewhy|0|0 +bigbench|temporal_sequences|0|0 +bigbench|tense|0|0 +bigbench|timedial|0|0 +bigbench|topical_chat|0|0 +bigbench|tracking_shuffled_objects|0|0 +bigbench|understanding_fables|0|0 +bigbench|undo_permutation|0|0 +bigbench|unit_conversion|0|0 +bigbench|unit_interpretation|0|0 +bigbench|unnatural_in_context_learning|0|0 +bigbench|vitaminc_fact_verification|0|0 +bigbench|what_is_the_tao|0|0 +bigbench|which_wiki_edit|0|0 +bigbench|wino_x_german|0|0 +bigbench|winowhy|0|0 +bigbench|word_sorting|0|0 +bigbench|word_unscrambling|0|0 +helm|babi_qa|0|0 +helm|bbq:Age|0|0 +helm|bbq:Disability_status|0|0 +helm|bbq:Gender_identity|0|0 +helm|bbq:Nationality|0|0 +helm|bbq:Physical_appearance|0|0 +helm|bbq:Race_ethnicity|0|0 +helm|bbq:Race_x_SES|0|0 +helm|bbq:Race_x_gender|0|0 +helm|bbq:Religion|0|0 +helm|bbq:SES|0|0 +helm|bbq:Sexual_orientation|0|0 +helm|bbq|0|0 +helm|bigbench:auto_debugging|0|0 +helm|bigbench:bbq_lite_json:age_ambig|0|0 +helm|bigbench:bbq_lite_json:age_disambig|0|0 +helm|bigbench:bbq_lite_json:disability_status_ambig|0|0 +helm|bigbench:bbq_lite_json:disability_status_disambig|0|0 +helm|bigbench:bbq_lite_json:gender_identity_ambig|0|0 +helm|bigbench:bbq_lite_json:gender_identity_disambig|0|0 +helm|bigbench:bbq_lite_json:nationality_ambig|0|0 +helm|bigbench:bbq_lite_json:nationality_disambig|0|0 +helm|bigbench:bbq_lite_json:physical_appearance_ambig|0|0 +helm|bigbench:bbq_lite_json:physical_appearance_disambig|0|0 +helm|bigbench:bbq_lite_json:race_ethnicity_ambig|0|0 +helm|bigbench:bbq_lite_json:race_ethnicity_disambig|0|0 +helm|bigbench:bbq_lite_json:religion_ambig|0|0 +helm|bigbench:bbq_lite_json:religion_disambig|0|0 +helm|bigbench:bbq_lite_json:ses_ambig|0|0 +helm|bigbench:bbq_lite_json:ses_disambig|0|0 +helm|bigbench:bbq_lite_json:sexual_orientation_ambig|0|0 +helm|bigbench:bbq_lite_json:sexual_orientation_disambig|0|0 +helm|bigbench:code_line_description|0|0 +helm|bigbench:conceptual_combinations:contradictions|0|0 +helm|bigbench:conceptual_combinations:emergent_properties|0|0 +helm|bigbench:conceptual_combinations:fanciful_fictional_combinations|0|0 +helm|bigbench:conceptual_combinations:homonyms|0|0 +helm|bigbench:conceptual_combinations:invented_words|0|0 +helm|bigbench:conlang_translation:adna_from|0|0 +helm|bigbench:conlang_translation:adna_to|0|0 +helm|bigbench:conlang_translation:atikampe_from|0|0 +helm|bigbench:conlang_translation:atikampe_to|0|0 +helm|bigbench:conlang_translation:gornam_from|0|0 +helm|bigbench:conlang_translation:gornam_to|0|0 +helm|bigbench:conlang_translation:holuan_from|0|0 +helm|bigbench:conlang_translation:holuan_to|0|0 +helm|bigbench:conlang_translation:mkafala_from|0|0 +helm|bigbench:conlang_translation:mkafala_to|0|0 +helm|bigbench:conlang_translation:postpositive_english_from|0|0 +helm|bigbench:conlang_translation:postpositive_english_to|0|0 +helm|bigbench:conlang_translation:unapuri_from|0|0 +helm|bigbench:conlang_translation:unapuri_to|0|0 +helm|bigbench:conlang_translation:vaomi_from|0|0 +helm|bigbench:conlang_translation:vaomi_to|0|0 +helm|bigbench:emoji_movie|0|0 +helm|bigbench:formal_fallacies_syllogisms_negation|0|0 +helm|bigbench:hindu_knowledge|0|0 +helm|bigbench:known_unknowns|0|0 +helm|bigbench:language_identification|0|0 +helm|bigbench:linguistics_puzzles|0|0 +helm|bigbench:logic_grid_puzzle|0|0 +helm|bigbench:logical_deduction-five_objects|0|0 +helm|bigbench:logical_deduction-seven_objects|0|0 
+helm|bigbench:logical_deduction-three_objects|0|0 +helm|bigbench:misconceptions_russian|0|0 +helm|bigbench:novel_concepts|0|0 +helm|bigbench:operators|0|0 +helm|bigbench:parsinlu_reading_comprehension|0|0 +helm|bigbench:play_dialog_same_or_different|0|0 +helm|bigbench:repeat_copy_logic|0|0 +helm|bigbench:strange_stories-boolean|0|0 +helm|bigbench:strange_stories-multiple_choice|0|0 +helm|bigbench:strategyqa|0|0 +helm|bigbench:symbol_interpretation-adversarial|0|0 +helm|bigbench:symbol_interpretation-emoji_agnostic|0|0 +helm|bigbench:symbol_interpretation-name_agnostic|0|0 +helm|bigbench:symbol_interpretation-plain|0|0 +helm|bigbench:symbol_interpretation-tricky|0|0 +helm|bigbench:vitaminc_fact_verification|0|0 +helm|bigbench:winowhy|0|0 +helm|blimp:adjunct_island|0|0 +helm|blimp:anaphor_gender_agreement|0|0 +helm|blimp:anaphor_number_agreement|0|0 +helm|blimp:animate_subject_passive|0|0 +helm|blimp:animate_subject_trans|0|0 +helm|blimp:causative|0|0 +helm|blimp:complex_NP_island|0|0 +helm|blimp:coordinate_structure_constraint_complex_left_branch|0|0 +helm|blimp:coordinate_structure_constraint_object_extraction|0|0 +helm|blimp:determiner_noun_agreement_1|0|0 +helm|blimp:determiner_noun_agreement_2|0|0 +helm|blimp:determiner_noun_agreement_irregular_1|0|0 +helm|blimp:determiner_noun_agreement_irregular_2|0|0 +helm|blimp:determiner_noun_agreement_with_adj_2|0|0 +helm|blimp:determiner_noun_agreement_with_adj_irregular_1|0|0 +helm|blimp:determiner_noun_agreement_with_adj_irregular_2|0|0 +helm|blimp:determiner_noun_agreement_with_adjective_1|0|0 +helm|blimp:distractor_agreement_relational_noun|0|0 +helm|blimp:distractor_agreement_relative_clause|0|0 +helm|blimp:drop_argument|0|0 +helm|blimp:ellipsis_n_bar_1|0|0 +helm|blimp:ellipsis_n_bar_2|0|0 +helm|blimp:existential_there_object_raising|0|0 +helm|blimp:existential_there_quantifiers_1|0|0 +helm|blimp:existential_there_quantifiers_2|0|0 +helm|blimp:existential_there_subject_raising|0|0 +helm|blimp:expletive_it_object_raising|0|0 +helm|blimp:inchoative|0|0 +helm|blimp:intransitive|0|0 +helm|blimp:irregular_past_participle_adjectives|0|0 +helm|blimp:irregular_past_participle_verbs|0|0 +helm|blimp:irregular_plural_subject_verb_agreement_1|0|0 +helm|blimp:irregular_plural_subject_verb_agreement_2|0|0 +helm|blimp:left_branch_island_echo_question|0|0 +helm|blimp:left_branch_island_simple_question|0|0 +helm|blimp:matrix_question_npi_licensor_present|0|0 +helm|blimp:npi_present_1|0|0 +helm|blimp:npi_present_2|0|0 +helm|blimp:only_npi_licensor_present|0|0 +helm|blimp:only_npi_scope|0|0 +helm|blimp:passive_1|0|0 +helm|blimp:passive_2|0|0 +helm|blimp:principle_A_c_command|0|0 +helm|blimp:principle_A_case_1|0|0 +helm|blimp:principle_A_case_2|0|0 +helm|blimp:principle_A_domain_1|0|0 +helm|blimp:principle_A_domain_2|0|0 +helm|blimp:principle_A_domain_3|0|0 +helm|blimp:principle_A_reconstruction|0|0 +helm|blimp:regular_plural_subject_verb_agreement_1|0|0 +helm|blimp:regular_plural_subject_verb_agreement_2|0|0 +helm|blimp:sentential_negation_npi_licensor_present|0|0 +helm|blimp:sentential_negation_npi_scope|0|0 +helm|blimp:sentential_subject_island|0|0 +helm|blimp:superlative_quantifiers_1|0|0 +helm|blimp:superlative_quantifiers_2|0|0 +helm|blimp:tough_vs_raising_1|0|0 +helm|blimp:tough_vs_raising_2|0|0 +helm|blimp:transitive|0|0 +helm|blimp:wh_island|0|0 +helm|blimp:wh_questions_object_gap|0|0 +helm|blimp:wh_questions_subject_gap_long_distance|0|0 +helm|blimp:wh_questions_subject_gap|0|0 +helm|blimp:wh_vs_that_no_gap_long_distance|0|0 
+helm|blimp:wh_vs_that_no_gap|0|0 +helm|blimp:wh_vs_that_with_gap_long_distance|0|0 +helm|blimp:wh_vs_that_with_gap|0|0 +helm|bold:gender|0|0 +helm|bold:political_ideology|0|0 +helm|bold:profession|0|0 +helm|bold:race|0|0 +helm|bold:religious_ideology|0|0 +helm|bold|0|0 +helm|boolq:contrastset|0|0 +helm|boolq|0|0 +helm|civil_comments:LGBTQ|0|0 +helm|civil_comments:black|0|0 +helm|civil_comments:christian|0|0 +helm|civil_comments:female|0|0 +helm|civil_comments:male|0|0 +helm|civil_comments:muslim|0|0 +helm|civil_comments:other_religions|0|0 +helm|civil_comments:white|0|0 +helm|civil_comments|0|0 +helm|commonsenseqa|0|0 +helm|copyright:n_books_1000-extractions_per_book_1-prefix_length_125|0|0 +helm|copyright:n_books_1000-extractions_per_book_1-prefix_length_25|0|0 +helm|copyright:n_books_1000-extractions_per_book_1-prefix_length_5|0|0 +helm|copyright:n_books_1000-extractions_per_book_3-prefix_length_125|0|0 +helm|copyright:n_books_1000-extractions_per_book_3-prefix_length_25|0|0 +helm|copyright:n_books_1000-extractions_per_book_3-prefix_length_5|0|0 +helm|copyright:oh_the_places|0|0 +helm|copyright:pilot|0|0 +helm|copyright:popular_books-prefix_length_10|0|0 +helm|copyright:popular_books-prefix_length_125|0|0 +helm|copyright:popular_books-prefix_length_250|0|0 +helm|copyright:popular_books-prefix_length_25|0|0 +helm|copyright:popular_books-prefix_length_50|0|0 +helm|copyright:popular_books-prefix_length_5|0|0 +helm|copyright:prompt_num_line_1-min_lines_20|0|0 +helm|copyright:prompt_num_line_10-min_lines_20|0|0 +helm|copyright:prompt_num_line_5-min_lines_20|0|0 +helm|covid_dialogue|0|0 +helm|dyck_language:2|0|0 +helm|dyck_language:3|0|0 +helm|dyck_language:4|0|0 +helm|entity_data_imputation:Buy|0|0 +helm|entity_data_imputation:Restaurant|0|0 +helm|entity_matching:Abt_Buy|0|0 +helm|entity_matching:Amazon_Google|0|0 +helm|entity_matching:Beer|0|0 +helm|entity_matching:Company|0|0 +helm|entity_matching:DBLP_ACM|0|0 +helm|entity_matching:DBLP_GoogleScholar|0|0 +helm|entity_matching:Dirty_DBLP_ACM|0|0 +helm|entity_matching:Dirty_DBLP_GoogleScholar|0|0 +helm|entity_matching:Dirty_Walmart_Amazon|0|0 +helm|entity_matching:Dirty_iTunes_Amazon|0|0 +helm|entity_matching:Fodors_Zagats|0|0 +helm|entity_matching:Walmart_Amazon|0|0 +helm|entity_matching:iTunes_Amazon|0|0 +helm|hellaswag|0|0 +helm|humaneval|0|0 +helm|imdb:contrastset|0|0 +helm|imdb|0|0 +helm|interactive_qa_mmlu:abstract_algebra|0|0 +helm|interactive_qa_mmlu:college_chemistry|0|0 +helm|interactive_qa_mmlu:global_facts|0|0 +helm|interactive_qa_mmlu:miscellaneous|0|0 +helm|interactive_qa_mmlu:nutrition|0|0 +helm|interactive_qa_mmlu:us_foreign_policy|0|0 +helm|legal_summarization:billsum|0|0 +helm|legal_summarization:eurlexsum|0|0 +helm|legal_summarization:multilexsum|0|0 +helm|legalsupport|0|0 +helm|lexglue:case_hold|0|0 +helm|lexglue:ecthr_a|0|0 +helm|lexglue:ecthr_b|0|0 +helm|lexglue:eurlex|0|0 +helm|lexglue:ledgar|0|0 +helm|lexglue:scotus|0|0 +helm|lexglue:unfair_tos|0|0 +helm|lextreme:brazilian_court_decisions_judgment|0|0 +helm|lextreme:brazilian_court_decisions_unanimity|0|0 +helm|lextreme:covid19_emergency_event|0|0 +helm|lextreme:german_argument_mining|0|0 +helm|lextreme:greek_legal_code_chapter|0|0 +helm|lextreme:greek_legal_code_subject|0|0 +helm|lextreme:greek_legal_code_volume|0|0 +helm|lextreme:greek_legal_ner|0|0 +helm|lextreme:legalnero|0|0 +helm|lextreme:lener_br|0|0 +helm|lextreme:mapa_coarse|0|0 +helm|lextreme:mapa_fine|0|0 +helm|lextreme:multi_eurlex_level_1|0|0 +helm|lextreme:multi_eurlex_level_2|0|0 
+helm|lextreme:multi_eurlex_level_3|0|0 +helm|lextreme:online_terms_of_service_clause_topics|0|0 +helm|lextreme:online_terms_of_service_unfairness_levels|0|0 +helm|lextreme:swiss_judgment_prediction|0|0 +helm|lsat_qa:assignment|0|0 +helm|lsat_qa:grouping|0|0 +helm|lsat_qa:miscellaneous|0|0 +helm|lsat_qa:ordering|0|0 +helm|lsat_qa|0|0 +helm|me_q_sum|0|0 +helm|med_dialog:healthcaremagic|0|0 +helm|med_dialog:icliniq|0|0 +helm|med_mcqa|0|0 +helm|med_paragraph_simplification|0|0 +helm|med_qa|0|0 +helm|mmlu:abstract_algebra|0|0 +helm|mmlu:anatomy|0|0 +helm|mmlu:astronomy|0|0 +helm|mmlu:business_ethics|0|0 +helm|mmlu:clinical_knowledge|0|0 +helm|mmlu:college_biology|0|0 +helm|mmlu:college_chemistry|0|0 +helm|mmlu:college_computer_science|0|0 +helm|mmlu:college_mathematics|0|0 +helm|mmlu:college_medicine|0|0 +helm|mmlu:college_physics|0|0 +helm|mmlu:computer_security|0|0 +helm|mmlu:conceptual_physics|0|0 +helm|mmlu:econometrics|0|0 +helm|mmlu:electrical_engineering|0|0 +helm|mmlu:elementary_mathematics|0|0 +helm|mmlu:formal_logic|0|0 +helm|mmlu:global_facts|0|0 +helm|mmlu:high_school_biology|0|0 +helm|mmlu:high_school_chemistry|0|0 +helm|mmlu:high_school_computer_science|0|0 +helm|mmlu:high_school_european_history|0|0 +helm|mmlu:high_school_geography|0|0 +helm|mmlu:high_school_government_and_politics|0|0 +helm|mmlu:high_school_macroeconomics|0|0 +helm|mmlu:high_school_mathematics|0|0 +helm|mmlu:high_school_microeconomics|0|0 +helm|mmlu:high_school_physics|0|0 +helm|mmlu:high_school_psychology|0|0 +helm|mmlu:high_school_statistics|0|0 +helm|mmlu:high_school_us_history|0|0 +helm|mmlu:high_school_world_history|0|0 +helm|mmlu:human_aging|0|0 +helm|mmlu:human_sexuality|0|0 +helm|mmlu:international_law|0|0 +helm|mmlu:jurisprudence|0|0 +helm|mmlu:logical_fallacies|0|0 +helm|mmlu:machine_learning|0|0 +helm|mmlu:management|0|0 +helm|mmlu:marketing|0|0 +helm|mmlu:medical_genetics|0|0 +helm|mmlu:miscellaneous|0|0 +helm|mmlu:moral_disputes|0|0 +helm|mmlu:moral_scenarios|0|0 +helm|mmlu:nutrition|0|0 +helm|mmlu:philosophy|0|0 +helm|mmlu:prehistory|0|0 +helm|mmlu:professional_accounting|0|0 +helm|mmlu:professional_law|0|0 +helm|mmlu:professional_medicine|0|0 +helm|mmlu:professional_psychology|0|0 +helm|mmlu:public_relations|0|0 +helm|mmlu:security_studies|0|0 +helm|mmlu:sociology|0|0 +helm|mmlu:us_foreign_policy|0|0 +helm|mmlu:virology|0|0 +helm|mmlu:world_religions|0|0 +helm|mmlu|0|0 +helm|narrativeqa|0|0 +helm|numeracy:linear_example|0|0 +helm|numeracy:linear_standard|0|0 +helm|numeracy:parabola_example|0|0 +helm|numeracy:parabola_standard|0|0 +helm|numeracy:paraboloid_example|0|0 +helm|numeracy:paraboloid_standard|0|0 +helm|numeracy:plane_example|0|0 +helm|numeracy:plane_standard|0|0 +helm|openbookqa|0|0 +helm|piqa|0|0 +helm|pubmedqa|0|0 +helm|quac|0|0 +helm|raft:ade_corpus_v2|0|0 +helm|raft:banking_77|0|0 +helm|raft:neurips_impact_statement_risks|0|0 +helm|raft:one_stop_english|0|0 +helm|raft:overruling|0|0 +helm|raft:semiconductor_org_types|0|0 +helm|raft:systematic_review_inclusion|0|0 +helm|raft:tai_safety_research|0|0 +helm|raft:terms_of_service|0|0 +helm|raft:tweet_eval_hate|0|0 +helm|raft:twitter_complaints|0|0 +helm|real_toxicity_prompts|0|0 +helm|siqa|0|0 +helm|summarization:cnn-dm|0|0 +helm|summarization:xsum-sampled|0|0 +helm|summarization:xsum|0|0 +helm|synthetic_reasoning:induction|0|0 +helm|synthetic_reasoning:natural_easy|0|0 +helm|synthetic_reasoning:natural_hard|0|0 +helm|synthetic_reasoning:pattern_match|0|0 +helm|synthetic_reasoning:variable_substitution|0|0 +helm|the_pile:arxiv|0|0 
+helm|the_pile:bibliotik|0|0 +helm|the_pile:commoncrawl|0|0 +helm|the_pile:dm-mathematics|0|0 +helm|the_pile:enron|0|0 +helm|the_pile:europarl|0|0 +helm|the_pile:freelaw|0|0 +helm|the_pile:github|0|0 +helm|the_pile:gutenberg|0|0 +helm|the_pile:hackernews|0|0 +helm|the_pile:nih-exporter|0|0 +helm|the_pile:opensubtitles|0|0 +helm|the_pile:openwebtext2|0|0 +helm|the_pile:pubmed-abstracts|0|0 +helm|the_pile:pubmed-central|0|0 +helm|the_pile:stackexchange|0|0 +helm|the_pile:upsto|0|0 +helm|the_pile:wikipedia|0|0 +helm|the_pile:youtubesubtitles|0|0 +helm|truthfulqa|0|0 +helm|twitterAAE:aa|0|0 +helm|twitterAAE:white|0|0 +helm|wikifact:applies_to_jurisdiction|0|0 +helm|wikifact:atomic_number|0|0 +helm|wikifact:author|0|0 +helm|wikifact:award_received|0|0 +helm|wikifact:basic_form_of_government|0|0 +helm|wikifact:capital_of|0|0 +helm|wikifact:capital|0|0 +helm|wikifact:central_bank|0|0 +helm|wikifact:composer|0|0 +helm|wikifact:continent|0|0 +helm|wikifact:country_of_citizenship|0|0 +helm|wikifact:country_of_origin|0|0 +helm|wikifact:country|0|0 +helm|wikifact:creator|0|0 +helm|wikifact:currency|0|0 +helm|wikifact:defendant|0|0 +helm|wikifact:developer|0|0 +helm|wikifact:diplomatic_relation|0|0 +helm|wikifact:director|0|0 +helm|wikifact:discoverer_or_inventor|0|0 +helm|wikifact:drug_or_therapy_used_for_treatment|0|0 +helm|wikifact:educated_at|0|0 +helm|wikifact:electron_configuration|0|0 +helm|wikifact:employer|0|0 +helm|wikifact:field_of_work|0|0 +helm|wikifact:file_extension|0|0 +helm|wikifact:genetic_association|0|0 +helm|wikifact:genre|0|0 +helm|wikifact:has_part|0|0 +helm|wikifact:head_of_government|0|0 +helm|wikifact:head_of_state|0|0 +helm|wikifact:headquarters_location|0|0 +helm|wikifact:industry|0|0 +helm|wikifact:influenced_by|0|0 +helm|wikifact:instance_of|0|0 +helm|wikifact:instrument|0|0 +helm|wikifact:language_of_work_or_name|0|0 +helm|wikifact:languages_spoken_written_or_signed|0|0 +helm|wikifact:laws_applied|0|0 +helm|wikifact:located_in_the_administrative_territorial_entity|0|0 +helm|wikifact:location_of_discovery|0|0 +helm|wikifact:location_of_formation|0|0 +helm|wikifact:location|0|0 +helm|wikifact:majority_opinion_by|0|0 +helm|wikifact:manufacturer|0|0 +helm|wikifact:measured_physical_quantity|0|0 +helm|wikifact:medical_condition_treated|0|0 +helm|wikifact:member_of_political_party|0|0 +helm|wikifact:member_of_sports_team|0|0 +helm|wikifact:member_of|0|0 +helm|wikifact:movement|0|0 +helm|wikifact:named_after|0|0 +helm|wikifact:native_language|0|0 +helm|wikifact:number_of_processor_cores|0|0 +helm|wikifact:occupation|0|0 +helm|wikifact:office_held_by_head_of_government|0|0 +helm|wikifact:office_held_by_head_of_state|0|0 +helm|wikifact:official_language|0|0 +helm|wikifact:operating_system|0|0 +helm|wikifact:original_language_of_film_or_TV_show|0|0 +helm|wikifact:original_network|0|0 +helm|wikifact:overrules|0|0 +helm|wikifact:owned_by|0|0 +helm|wikifact:part_of|0|0 +helm|wikifact:participating_team|0|0 +helm|wikifact:place_of_birth|0|0 +helm|wikifact:place_of_death|0|0 +helm|wikifact:plaintiff|0|0 +helm|wikifact:position_held|0|0 +helm|wikifact:position_played_on_team|0|0 +helm|wikifact:programming_language|0|0 +helm|wikifact:recommended_unit_of_measurement|0|0 +helm|wikifact:record_label|0|0 +helm|wikifact:religion|0|0 +helm|wikifact:repealed_by|0|0 +helm|wikifact:shares_border_with|0|0 +helm|wikifact:solved_by|0|0 +helm|wikifact:statement_describes|0|0 +helm|wikifact:stock_exchange|0|0 +helm|wikifact:subclass_of|0|0 +helm|wikifact:subsidiary|0|0 
+helm|wikifact:symptoms_and_signs|0|0 +helm|wikifact:therapeutic_area|0|0 +helm|wikifact:time_of_discovery_or_invention|0|0 +helm|wikifact:twinned_administrative_body|0|0 +helm|wikifact:work_location|0|0 +helm|wikitext:103|0|0 +helm|wmt14:cs-en|0|0 +helm|wmt14:de-en|0|0 +helm|wmt14:fr-en|0|0 +helm|wmt14:hi-en|0|0 +helm|wmt14:ru-en|0|0 +lighteval|anli:r1|0|0 +lighteval|anli:r2|0|0 +lighteval|anli:r3|0|0 +lighteval|anli|0|0 +leaderboard|arc:challenge|0|0 +lighteval|arc:easy|0|0 +lighteval|arithmetic:1dc|0|0 +lighteval|arithmetic:2da|0|0 +lighteval|arithmetic:2dm|0|0 +lighteval|arithmetic:2ds|0|0 +lighteval|arithmetic:3da|0|0 +lighteval|arithmetic:3ds|0|0 +lighteval|arithmetic:4da|0|0 +lighteval|arithmetic:4ds|0|0 +lighteval|arithmetic:5da|0|0 +lighteval|arithmetic:5ds|0|0 +lighteval|asdiv|0|0 +lighteval|blimp:adjunct_island|0|0 +lighteval|blimp:anaphor_gender_agreement|0|0 +lighteval|blimp:anaphor_number_agreement|0|0 +lighteval|blimp:animate_subject_passive|0|0 +lighteval|blimp:animate_subject_trans|0|0 +lighteval|blimp:causative|0|0 +lighteval|blimp:complex_NP_island|0|0 +lighteval|blimp:coordinate_structure_constraint_complex_left_branch|0|0 +lighteval|blimp:coordinate_structure_constraint_object_extraction|0|0 +lighteval|blimp:determiner_noun_agreement_1|0|0 +lighteval|blimp:determiner_noun_agreement_2|0|0 +lighteval|blimp:determiner_noun_agreement_irregular_1|0|0 +lighteval|blimp:determiner_noun_agreement_irregular_2|0|0 +lighteval|blimp:determiner_noun_agreement_with_adj_2|0|0 +lighteval|blimp:determiner_noun_agreement_with_adj_irregular_1|0|0 +lighteval|blimp:determiner_noun_agreement_with_adj_irregular_2|0|0 +lighteval|blimp:determiner_noun_agreement_with_adjective_1|0|0 +lighteval|blimp:distractor_agreement_relational_noun|0|0 +lighteval|blimp:distractor_agreement_relative_clause|0|0 +lighteval|blimp:drop_argument|0|0 +lighteval|blimp:ellipsis_n_bar_1|0|0 +lighteval|blimp:ellipsis_n_bar_2|0|0 +lighteval|blimp:existential_there_object_raising|0|0 +lighteval|blimp:existential_there_quantifiers_1|0|0 +lighteval|blimp:existential_there_quantifiers_2|0|0 +lighteval|blimp:existential_there_subject_raising|0|0 +lighteval|blimp:expletive_it_object_raising|0|0 +lighteval|blimp:inchoative|0|0 +lighteval|blimp:intransitive|0|0 +lighteval|blimp:irregular_past_participle_adjectives|0|0 +lighteval|blimp:irregular_past_participle_verbs|0|0 +lighteval|blimp:irregular_plural_subject_verb_agreement_1|0|0 +lighteval|blimp:irregular_plural_subject_verb_agreement_2|0|0 +lighteval|blimp:left_branch_island_echo_question|0|0 +lighteval|blimp:left_branch_island_simple_question|0|0 +lighteval|blimp:matrix_question_npi_licensor_present|0|0 +lighteval|blimp:npi_present_1|0|0 +lighteval|blimp:npi_present_2|0|0 +lighteval|blimp:only_npi_licensor_present|0|0 +lighteval|blimp:only_npi_scope|0|0 +lighteval|blimp:passive_1|0|0 +lighteval|blimp:passive_2|0|0 +lighteval|blimp:principle_A_c_command|0|0 +lighteval|blimp:principle_A_case_1|0|0 +lighteval|blimp:principle_A_case_2|0|0 +lighteval|blimp:principle_A_domain_1|0|0 +lighteval|blimp:principle_A_domain_2|0|0 +lighteval|blimp:principle_A_domain_3|0|0 +lighteval|blimp:principle_A_reconstruction|0|0 +lighteval|blimp:regular_plural_subject_verb_agreement_1|0|0 +lighteval|blimp:regular_plural_subject_verb_agreement_2|0|0 +lighteval|blimp:sentential_negation_npi_licensor_present|0|0 +lighteval|blimp:sentential_negation_npi_scope|0|0 +lighteval|blimp:sentential_subject_island|0|0 +lighteval|blimp:superlative_quantifiers_1|0|0 
+lighteval|blimp:superlative_quantifiers_2|0|0 +lighteval|blimp:tough_vs_raising_1|0|0 +lighteval|blimp:tough_vs_raising_2|0|0 +lighteval|blimp:transitive|0|0 +lighteval|blimp:wh_island|0|0 +lighteval|blimp:wh_questions_object_gap|0|0 +lighteval|blimp:wh_questions_subject_gap_long_distance|0|0 +lighteval|blimp:wh_questions_subject_gap|0|0 +lighteval|blimp:wh_vs_that_no_gap_long_distance|0|0 +lighteval|blimp:wh_vs_that_no_gap|0|0 +lighteval|blimp:wh_vs_that_with_gap_long_distance|0|0 +lighteval|blimp:wh_vs_that_with_gap|0|0 +lighteval|coqa_bb|0|0 +lighteval|coqa|0|0 +lighteval|drop|0|0 +lighteval|ethics:commonsense|0|0 +lighteval|ethics:deontology|0|0 +lighteval|ethics:justice|0|0 +lighteval|ethics:utilitarianism|0|0 +lighteval|ethics:virtue|0|0 +lighteval|glue:cola|0|0 +lighteval|glue:mnli_mismatched|0|0 +lighteval|glue:mnli|0|0 +lighteval|glue:mrpc|0|0 +lighteval|glue:qnli|0|0 +lighteval|glue:qqp|0|0 +lighteval|glue:rte|0|0 +lighteval|glue:sst2|0|0 +lighteval|glue:stsb|0|0 +lighteval|glue:wnli|0|0 +leaderboard|gsm8k|0|0 +lighteval|headqa:en|0|0 +lighteval|headqa:es|0|0 +leaderboard|hellaswag|0|0 +lighteval|iwslt17:ar-en|0|0 +lighteval|iwslt17:de-en|0|0 +lighteval|iwslt17:en-ar|0|0 +lighteval|iwslt17:en-de|0|0 +lighteval|iwslt17:en-fr|0|0 +lighteval|iwslt17:en-ja|0|0 +lighteval|iwslt17:en-ko|0|0 +lighteval|iwslt17:en-zh|0|0 +lighteval|iwslt17:fr-en|0|0 +lighteval|iwslt17:ja-en|0|0 +lighteval|iwslt17:ko-en|0|0 +lighteval|iwslt17:zh-en|0|0 +lighteval|lambada:openai:de|0|0 +lighteval|lambada:openai:en|0|0 +lighteval|lambada:openai:es|0|0 +lighteval|lambada:openai:fr|0|0 +lighteval|lambada:openai:it|0|0 +lighteval|lambada:openai_cloze|0|0 +lighteval|lambada:openai|0|0 +lighteval|lambada:standard_cloze|0|0 +lighteval|lambada:standard|0|0 +lighteval|logiqa|0|0 +lighteval|math:algebra|0|0 +lighteval|math:counting_and_probability|0|0 +lighteval|math:geometry|0|0 +lighteval|math:intermediate_algebra|0|0 +lighteval|math:number_theory|0|0 +lighteval|math:prealgebra|0|0 +lighteval|math:precalculus|0|0 +lighteval|mathqa|0|0 +lighteval|mgsm:bn|0|0 +lighteval|mgsm:de|0|0 +lighteval|mgsm:en|0|0 +lighteval|mgsm:es|0|0 +lighteval|mgsm:fr|0|0 +lighteval|mgsm:ja|0|0 +lighteval|mgsm:ru|0|0 +lighteval|mgsm:sw|0|0 +lighteval|mgsm:te|0|0 +lighteval|mgsm:th|0|0 +lighteval|mgsm:zh|0|0 +leaderboard|mmlu:abstract_algebra|0|0 +leaderboard|mmlu:anatomy|0|0 +leaderboard|mmlu:astronomy|0|0 +leaderboard|mmlu:business_ethics|0|0 +leaderboard|mmlu:clinical_knowledge|0|0 +leaderboard|mmlu:college_biology|0|0 +leaderboard|mmlu:college_chemistry|0|0 +leaderboard|mmlu:college_computer_science|0|0 +leaderboard|mmlu:college_mathematics|0|0 +leaderboard|mmlu:college_medicine|0|0 +leaderboard|mmlu:college_physics|0|0 +leaderboard|mmlu:computer_security|0|0 +leaderboard|mmlu:conceptual_physics|0|0 +leaderboard|mmlu:econometrics|0|0 +leaderboard|mmlu:electrical_engineering|0|0 +leaderboard|mmlu:elementary_mathematics|0|0 +leaderboard|mmlu:formal_logic|0|0 +leaderboard|mmlu:global_facts|0|0 +leaderboard|mmlu:high_school_biology|0|0 +leaderboard|mmlu:high_school_chemistry|0|0 +leaderboard|mmlu:high_school_computer_science|0|0 +leaderboard|mmlu:high_school_european_history|0|0 +leaderboard|mmlu:high_school_geography|0|0 +leaderboard|mmlu:high_school_government_and_politics|0|0 +leaderboard|mmlu:high_school_macroeconomics|0|0 +leaderboard|mmlu:high_school_mathematics|0|0 +leaderboard|mmlu:high_school_microeconomics|0|0 +leaderboard|mmlu:high_school_physics|0|0 +leaderboard|mmlu:high_school_psychology|0|0 
+leaderboard|mmlu:high_school_statistics|0|0 +leaderboard|mmlu:high_school_us_history|0|0 +leaderboard|mmlu:high_school_world_history|0|0 +leaderboard|mmlu:human_aging|0|0 +leaderboard|mmlu:human_sexuality|0|0 +leaderboard|mmlu:international_law|0|0 +leaderboard|mmlu:jurisprudence|0|0 +leaderboard|mmlu:logical_fallacies|0|0 +leaderboard|mmlu:machine_learning|0|0 +leaderboard|mmlu:management|0|0 +leaderboard|mmlu:marketing|0|0 +leaderboard|mmlu:medical_genetics|0|0 +leaderboard|mmlu:miscellaneous|0|0 +leaderboard|mmlu:moral_disputes|0|0 +leaderboard|mmlu:moral_scenarios|0|0 +leaderboard|mmlu:nutrition|0|0 +leaderboard|mmlu:philosophy|0|0 +leaderboard|mmlu:prehistory|0|0 +leaderboard|mmlu:professional_accounting|0|0 +leaderboard|mmlu:professional_law|0|0 +leaderboard|mmlu:professional_medicine|0|0 +leaderboard|mmlu:professional_psychology|0|0 +leaderboard|mmlu:public_relations|0|0 +leaderboard|mmlu:security_studies|0|0 +leaderboard|mmlu:sociology|0|0 +leaderboard|mmlu:us_foreign_policy|0|0 +leaderboard|mmlu:virology|0|0 +leaderboard|mmlu:world_religions|0|0 +lighteval|mtnt2019:en-fr|0|0 +lighteval|mtnt2019:en-ja|0|0 +lighteval|mtnt2019:fr-en|0|0 +lighteval|mtnt2019:ja-en|0|0 +lighteval|mutual_plus|0|0 +lighteval|mutual|0|0 +lighteval|openbookqa|0|0 +lighteval|piqa|0|0 +lighteval|prost|0|0 +lighteval|pubmedqa|0|0 +lighteval|qa4mre:2011|0|0 +lighteval|qa4mre:2012|0|0 +lighteval|qa4mre:2013|0|0 +lighteval|qasper_ll|0|0 +lighteval|qasper|0|0 +lighteval|race:high|0|0 +lighteval|sciq|0|0 +lighteval|storycloze:2016|0|0 +lighteval|storycloze:2018|0|0 +lighteval|super_glue:boolq|0|0 +lighteval|super_glue:cb|0|0 +lighteval|super_glue:copa|0|0 +lighteval|super_glue:multirc|0|0 +lighteval|super_glue:record|0|0 +lighteval|super_glue:rte|0|0 +lighteval|super_glue:wic|0|0 +lighteval|super_glue:wsc|0|0 +lighteval|swag|0|0 +lighteval|the_pile:arxiv|0|0 +lighteval|the_pile:bookcorpus2|0|0 +lighteval|the_pile:books3|0|0 +lighteval|the_pile:dm-mathematics|0|0 +lighteval|the_pile:enron|0|0 +lighteval|the_pile:europarl|0|0 +lighteval|the_pile:freelaw|0|0 +lighteval|the_pile:github|0|0 +lighteval|the_pile:gutenberg|0|0 +lighteval|the_pile:hackernews|0|0 +lighteval|the_pile:nih-exporter|0|0 +lighteval|the_pile:opensubtitles|0|0 +lighteval|the_pile:openwebtext2|0|0 +lighteval|the_pile:philpapers|0|0 +lighteval|the_pile:pile-cc|0|0 +lighteval|the_pile:pubmed-abstracts|0|0 +lighteval|the_pile:pubmed-central|0|0 +lighteval|the_pile:stackexchange|0|0 +lighteval|the_pile:ubuntu-irc|0|0 +lighteval|the_pile:uspto|0|0 +lighteval|the_pile:wikipedia|0|0 +lighteval|the_pile:youtubesubtitles|0|0 +lighteval|toxigen|0|0 +lighteval|triviaqa|0|0 +lighteval|truthfulqa:gen|0|0 +leaderboard|truthfulqa:mc|0|0 +lighteval|unscramble:anagrams1|0|0 +lighteval|unscramble:anagrams2|0|0 +lighteval|unscramble:cycle_letters|0|0 +lighteval|unscramble:random_insertion|0|0 +lighteval|unscramble:reversed_words|0|0 +lighteval|webqs|0|0 +lighteval|wikitext|0|0 +leaderboard|winogrande|0|0 +lighteval|wmt08:cs-en|0|0 +lighteval|wmt08:de-en|0|0 +lighteval|wmt08:en-cs|0|0 +lighteval|wmt08:en-de|0|0 +lighteval|wmt08:en-es|0|0 +lighteval|wmt08:en-fr|0|0 +lighteval|wmt08:en-hu|0|0 +lighteval|wmt08:es-en|0|0 +lighteval|wmt08:fr-en|0|0 +lighteval|wmt08:hu-en|0|0 +lighteval|wmt09:cs-en|0|0 +lighteval|wmt09:de-en|0|0 +lighteval|wmt09:en-cs|0|0 +lighteval|wmt09:en-de|0|0 +lighteval|wmt09:en-es|0|0 +lighteval|wmt09:en-fr|0|0 +lighteval|wmt09:en-hu|0|0 +lighteval|wmt09:en-it|0|0 +lighteval|wmt09:es-en|0|0 +lighteval|wmt09:fr-en|0|0 +lighteval|wmt09:hu-en|0|0 
+lighteval|wmt09:it-en|0|0 +lighteval|wmt10:cs-en|0|0 +lighteval|wmt10:de-en|0|0 +lighteval|wmt10:en-cs|0|0 +lighteval|wmt10:en-de|0|0 +lighteval|wmt10:en-es|0|0 +lighteval|wmt10:en-fr|0|0 +lighteval|wmt10:es-en|0|0 +lighteval|wmt10:fr-en|0|0 +lighteval|wmt11:cs-en|0|0 +lighteval|wmt11:de-en|0|0 +lighteval|wmt11:en-cs|0|0 +lighteval|wmt11:en-de|0|0 +lighteval|wmt11:en-es|0|0 +lighteval|wmt11:en-fr|0|0 +lighteval|wmt11:es-en|0|0 +lighteval|wmt11:fr-en|0|0 +lighteval|wmt12:cs-en|0|0 +lighteval|wmt12:de-en|0|0 +lighteval|wmt12:en-cs|0|0 +lighteval|wmt12:en-de|0|0 +lighteval|wmt12:en-es|0|0 +lighteval|wmt12:en-fr|0|0 +lighteval|wmt12:es-en|0|0 +lighteval|wmt12:fr-en|0|0 +lighteval|wmt13:cs-en|0|0 +lighteval|wmt13:de-en|0|0 +lighteval|wmt13:en-cs|0|0 +lighteval|wmt13:en-de|0|0 +lighteval|wmt13:en-es|0|0 +lighteval|wmt13:en-fr|0|0 +lighteval|wmt13:en-ru|0|0 +lighteval|wmt13:es-en|0|0 +lighteval|wmt13:fr-en|0|0 +lighteval|wmt13:ru-en|0|0 +lighteval|wmt14:cs-en|0|0 +lighteval|wmt14:de-en|0|0 +lighteval|wmt14:en-cs|0|0 +lighteval|wmt14:en-de|0|0 +lighteval|wmt14:en-fr|0|0 +lighteval|wmt14:en-fr|0|0 +lighteval|wmt14:en-hi|0|0 +lighteval|wmt14:en-ru|0|0 +lighteval|wmt14:fr-en|0|0 +lighteval|wmt14:fr-en|0|0 +lighteval|wmt14:hi-en|0|0 +lighteval|wmt14:ru-en|0|0 +lighteval|wmt15:cs-en|0|0 +lighteval|wmt15:de-en|0|0 +lighteval|wmt15:en-cs|0|0 +lighteval|wmt15:en-de|0|0 +lighteval|wmt15:en-fi|0|0 +lighteval|wmt15:en-fr|0|0 +lighteval|wmt15:en-ru|0|0 +lighteval|wmt15:fi-en|0|0 +lighteval|wmt15:fr-en|0|0 +lighteval|wmt15:ru-en|0|0 +lighteval|wmt16:cs-en|0|0 +lighteval|wmt16:de-en|0|0 +lighteval|wmt16:de-en|0|0 +lighteval|wmt16:en-cs|0|0 +lighteval|wmt16:en-de|0|0 +lighteval|wmt16:en-de|0|0 +lighteval|wmt16:en-fi|0|0 +lighteval|wmt16:en-ro|0|0 +lighteval|wmt16:en-ro|0|0 +lighteval|wmt16:en-ru|0|0 +lighteval|wmt16:en-tr|0|0 +lighteval|wmt16:fi-en|0|0 +lighteval|wmt16:ro-en|0|0 +lighteval|wmt16:ro-en|0|0 +lighteval|wmt16:ru-en|0|0 +lighteval|wmt16:tr-en|0|0 +lighteval|wmt17:cs-en|0|0 +lighteval|wmt17:de-en|0|0 +lighteval|wmt17:en-cs|0|0 +lighteval|wmt17:en-de|0|0 +lighteval|wmt17:en-fi|0|0 +lighteval|wmt17:en-lv|0|0 +lighteval|wmt17:en-ru|0|0 +lighteval|wmt17:en-tr|0|0 +lighteval|wmt17:en-zh|0|0 +lighteval|wmt17:fi-en|0|0 +lighteval|wmt17:lv-en|0|0 +lighteval|wmt17:ru-en|0|0 +lighteval|wmt17:tr-en|0|0 +lighteval|wmt17:zh-en|0|0 +lighteval|wmt18:cs-en|0|0 +lighteval|wmt18:de-en|0|0 +lighteval|wmt18:en-cs|0|0 +lighteval|wmt18:en-de|0|0 +lighteval|wmt18:en-et|0|0 +lighteval|wmt18:en-fi|0|0 +lighteval|wmt18:en-ru|0|0 +lighteval|wmt18:en-tr|0|0 +lighteval|wmt18:en-zh|0|0 +lighteval|wmt18:et-en|0|0 +lighteval|wmt18:fi-en|0|0 +lighteval|wmt18:ru-en|0|0 +lighteval|wmt18:tr-en|0|0 +lighteval|wmt18:zh-en|0|0 +lighteval|wmt19:cs-de|0|0 +lighteval|wmt19:de-cs|0|0 +lighteval|wmt19:de-en|0|0 +lighteval|wmt19:de-fr|0|0 +lighteval|wmt19:en-cs|0|0 +lighteval|wmt19:en-de|0|0 +lighteval|wmt19:en-fi|0|0 +lighteval|wmt19:en-gu|0|0 +lighteval|wmt19:en-kk|0|0 +lighteval|wmt19:en-lt|0|0 +lighteval|wmt19:en-ru|0|0 +lighteval|wmt19:en-zh|0|0 +lighteval|wmt19:fi-en|0|0 +lighteval|wmt19:fr-de|0|0 +lighteval|wmt19:gu-en|0|0 +lighteval|wmt19:kk-en|0|0 +lighteval|wmt19:lt-en|0|0 +lighteval|wmt19:ru-en|0|0 +lighteval|wmt19:zh-en|0|0 +lighteval|wmt20:cs-en|0|0 +lighteval|wmt20:de-en|0|0 +lighteval|wmt20:de-fr|0|0 +lighteval|wmt20:en-cs|0|0 +lighteval|wmt20:en-de|0|0 +lighteval|wmt20:en-iu|0|0 +lighteval|wmt20:en-ja|0|0 +lighteval|wmt20:en-km|0|0 +lighteval|wmt20:en-pl|0|0 +lighteval|wmt20:en-ps|0|0 +lighteval|wmt20:en-ru|0|0 
+lighteval|wmt20:en-ta|0|0 +lighteval|wmt20:en-zh|0|0 +lighteval|wmt20:fr-de|0|0 +lighteval|wmt20:iu-en|0|0 +lighteval|wmt20:ja-en|0|0 +lighteval|wmt20:km-en|0|0 +lighteval|wmt20:pl-en|0|0 +lighteval|wmt20:ps-en|0|0 +lighteval|wmt20:ru-en|0|0 +lighteval|wmt20:ta-en|0|0 +lighteval|wmt20:zh-en|0|0 +lighteval|wsc273|0|0 +lighteval|xcopa:en|0|0 +lighteval|xcopa:et|0|0 +lighteval|xcopa:ht|0|0 +lighteval|xcopa:id|0|0 +lighteval|xcopa:it|0|0 +lighteval|xcopa:qu|0|0 +lighteval|xcopa:sw|0|0 +lighteval|xcopa:ta|0|0 +lighteval|xcopa:th|0|0 +lighteval|xcopa:tr|0|0 +lighteval|xcopa:vi|0|0 +lighteval|xcopa:zh|0|0 +lighteval|xstory_cloze:ar|0|0 +lighteval|xstory_cloze:en|0|0 +lighteval|xstory_cloze:es|0|0 +lighteval|xstory_cloze:eu|0|0 +lighteval|xstory_cloze:hi|0|0 +lighteval|xstory_cloze:id|0|0 +lighteval|xstory_cloze:my|0|0 +lighteval|xstory_cloze:ru|0|0 +lighteval|xstory_cloze:sw|0|0 +lighteval|xstory_cloze:te|0|0 +lighteval|xstory_cloze:zh|0|0 +lighteval|xwinograd:en|0|0 +lighteval|xwinograd:fr|0|0 +lighteval|xwinograd:jp|0|0 +lighteval|xwinograd:pt|0|0 +lighteval|xwinograd:ru|0|0 +lighteval|xwinograd:zh|0|0 +original|arc:c:letters|0|0 +original|arc:c:options|0|0 +original|arc:c:simple|0|0 +original|mmlu:abstract_algebra|0|0 +original|mmlu:anatomy|0|0 +original|mmlu:astronomy|0|0 +original|mmlu:business_ethics|0|0 +original|mmlu:clinical_knowledge|0|0 +original|mmlu:college_biology|0|0 +original|mmlu:college_chemistry|0|0 +original|mmlu:college_computer_science|0|0 +original|mmlu:college_mathematics|0|0 +original|mmlu:college_medicine|0|0 +original|mmlu:college_physics|0|0 +original|mmlu:computer_security|0|0 +original|mmlu:conceptual_physics|0|0 +original|mmlu:econometrics|0|0 +original|mmlu:electrical_engineering|0|0 +original|mmlu:elementary_mathematics|0|0 +original|mmlu:formal_logic|0|0 +original|mmlu:global_facts|0|0 +original|mmlu:high_school_biology|0|0 +original|mmlu:high_school_chemistry|0|0 +original|mmlu:high_school_computer_science|0|0 +original|mmlu:high_school_european_history|0|0 +original|mmlu:high_school_geography|0|0 +original|mmlu:high_school_government_and_politics|0|0 +original|mmlu:high_school_macroeconomics|0|0 +original|mmlu:high_school_mathematics|0|0 +original|mmlu:high_school_microeconomics|0|0 +original|mmlu:high_school_physics|0|0 +original|mmlu:high_school_psychology|0|0 +original|mmlu:high_school_statistics|0|0 +original|mmlu:high_school_us_history|0|0 +original|mmlu:high_school_world_history|0|0 +original|mmlu:human_aging|0|0 +original|mmlu:human_sexuality|0|0 +original|mmlu:international_law|0|0 +original|mmlu:jurisprudence|0|0 +original|mmlu:logical_fallacies|0|0 +original|mmlu:machine_learning|0|0 +original|mmlu:management|0|0 +original|mmlu:marketing|0|0 +original|mmlu:medical_genetics|0|0 +original|mmlu:miscellaneous|0|0 +original|mmlu:moral_disputes|0|0 +original|mmlu:moral_scenarios|0|0 +original|mmlu:nutrition|0|0 +original|mmlu:philosophy|0|0 +original|mmlu:prehistory|0|0 +original|mmlu:professional_accounting|0|0 +original|mmlu:professional_law|0|0 +original|mmlu:professional_medicine|0|0 +original|mmlu:professional_psychology|0|0 +original|mmlu:public_relations|0|0 +original|mmlu:security_studies|0|0 +original|mmlu:sociology|0|0 +original|mmlu:us_foreign_policy|0|0 +original|mmlu:virology|0|0 +original|mmlu:world_religions|0|0 +original|mmlu|0|0 diff --git a/tasks_examples/bbh.txt b/tasks_examples/bbh.txt new file mode 100644 index 000000000..6b90fa3ae --- /dev/null +++ b/tasks_examples/bbh.txt @@ -0,0 +1,36 @@ +lighteval|bigbench:causal_judgment|3|0 
+lighteval|bigbench:date_understanding|3|0 +lighteval|bigbench:disambiguation_qa|3|0 +lighteval|bigbench:geometric_shapes|3|0 +lighteval|bigbench:logical_deduction_five_objects|3|0 +lighteval|bigbench:logical_deduction_seven_objects|3|0 +lighteval|bigbench:logical_deduction_three_objects|3|0 +lighteval|bigbench:movie_recommendation|3|0 +lighteval|bigbench:navigate|3|0 +lighteval|bigbench:reasoning_about_colored_objects|3|0 +lighteval|bigbench:ruin_names|3|0 +lighteval|bigbench:salient_translation_error_detection|3|0 +lighteval|bigbench:snarks|3|0 +lighteval|bigbench:sports_understanding|3|0 +lighteval|bigbench:temporal_sequences|3|0 +lighteval|bigbench:tracking_shuffled_objects_five_objects|3|0 +lighteval|bigbench:tracking_shuffled_objects_seven_objects|3|0 +lighteval|bigbench:tracking_shuffled_objects_three_objects|3|0 +harness|bigbench:causal_judgment|3|0 +harness|bigbench:date_understanding|3|0 +harness|bigbench:disambiguation_qa|3|0 +harness|bigbench:geometric_shapes|3|0 +harness|bigbench:logical_deduction_five_objects|3|0 +harness|bigbench:logical_deduction_seven_objects|3|0 +harness|bigbench:logical_deduction_three_objects|3|0 +harness|bigbench:movie_recommendation|3|0 +harness|bigbench:navigate|3|0 +harness|bigbench:reasoning_about_colored_objects|3|0 +harness|bigbench:ruin_names|3|0 +harness|bigbench:salient_translation_error_detection|3|0 +harness|bigbench:snarks|3|0 +harness|bigbench:sports_understanding|3|0 +harness|bigbench:temporal_sequences|3|0 +harness|bigbench:tracking_shuffled_objects_five_objects|3|0 +harness|bigbench:tracking_shuffled_objects_seven_objects|3|0 +harness|bigbench:tracking_shuffled_objects_three_objects|3|0 diff --git a/tasks_examples/custom_tasks/custom_evaluation_tasks.py b/tasks_examples/custom_tasks/custom_evaluation_tasks.py new file mode 100644 index 000000000..4c0f3c857 --- /dev/null +++ b/tasks_examples/custom_tasks/custom_evaluation_tasks.py @@ -0,0 +1,691 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# ruff: noqa: F405, F403, F401 +""" +Custom evaluation tasks for lighteval + +This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval. 
+""" +import re +from dataclasses import asdict +from typing import Dict, List, Tuple + +from lighteval.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc +from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES + + +_TASKS_STRINGS: List[Tuple[LightevalTaskConfig, str]] = [] +_TASKS: List[LightevalTaskConfig] = [] + +# COMMON_SENSE_REASONING_TASKS ## +COMMON_SENSE_REASONING_TASKS = [ + LightevalTaskConfig( + name="hellaswag", + prompt_function="hellaswag_prompt", + hf_repo="hellaswag", + hf_subset="default", + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + trust_dataset=True, + stop_sequence=["\n"], + ), + LightevalTaskConfig( + name="winogrande", + prompt_function="winogrande", + hf_repo="winogrande", + hf_subset="winogrande_xl", + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + trust_dataset=True, + stop_sequence=["\n"], + ), + LightevalTaskConfig( + name="piqa", + prompt_function="piqa_harness", + hf_repo="piqa", + hf_subset="plain_text", + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + trust_dataset=True, + stop_sequence=["\n"], + ), + LightevalTaskConfig( + name="siqa", + prompt_function="siqa_prompt", + hf_repo="lighteval/siqa", + hf_subset="default", + hf_avail_splits=["train", "validation"], + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + trust_dataset=True, + stop_sequence=["\n"], + ), + LightevalTaskConfig( + name="openbookqa", + prompt_function="openbookqa", + hf_repo="openbookqa", + hf_subset="main", + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + trust_dataset=True, + stop_sequence=["\n"], + ), + LightevalTaskConfig( + name="arc:easy", + prompt_function="arc", + hf_repo="ai2_arc", + hf_subset="ARC-Easy", + evaluation_splits=["test"], + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + trust_dataset=True, + stop_sequence=["\n"], + ), + LightevalTaskConfig( + name="arc:challenge", + prompt_function="arc", + hf_repo="ai2_arc", + hf_subset="ARC-Challenge", + evaluation_splits=["test"], + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + trust_dataset=True, + stop_sequence=["\n"], + ), + LightevalTaskConfig( + name="commonsense_qa", + prompt_function="commonsense_qa_prompt", + hf_repo="commonsense_qa", + hf_subset="default", + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + trust_dataset=True, + stop_sequence=["\n"], + ), +] + + +def commonsense_qa_prompt(line, task_name: str = None): + return Doc( + task_name=task_name, + query=line["question"], + choices=[f" {c}" for c in line["choices"]["text"]], + gold_index=LETTER_INDICES.index(line["answerKey"].strip()), + instruction="", + ) + + +def siqa_prompt(line, task_name: str = None): + return Doc( + task_name=task_name, + query=line["context"] + " " + line["question"], + choices=[f" {c}" for c in [line["answerA"], line["answerB"], line["answerC"]]], + gold_index=int(line["label"]) - 1, + instruction="", + ) + + +def hellaswag_prompt(line, task_name: str = None): + def preprocess(text): + """Comes from AiHarness""" + # text = text.strip() + # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag. + text = text.replace(" [title]", ". 
") + text = re.sub("\\[.*?\\]", "", text) + text = text.replace(" ", " ") + return text + + ctx = f"{line['ctx_a']} {line['ctx_b'].capitalize()} " + return Doc( + task_name=task_name, + query=preprocess(line["activity_label"] + ": " + ctx), + choices=[" " + preprocess(ending) for ending in line["endings"]], + gold_index=int(line["label"]) if line["label"] != "" else -1, # -1 for test + # "metric": "choices_loglikelihood", + ) + + +# 0 short for common sense +COMMON_SENSE_REASONING_STRING = [(t, f"custom|{t.name}|0|1") for t in COMMON_SENSE_REASONING_TASKS] +_TASKS_STRINGS.extend(COMMON_SENSE_REASONING_STRING) +_TASKS += COMMON_SENSE_REASONING_TASKS + +# WORLD_KNOWLEDGE_TASKS ## + +WORLD_KNOWLEDGE_TASKS = [ + LightevalTaskConfig( + name="trivia_qa", + prompt_function="triviaqa", + hf_repo="trivia_qa", + hf_subset="rc.nocontext", + metric=[Metrics.quasi_exact_match], + generation_size=20, + trust_dataset=True, + stop_sequence=["\n", ".", ","], + ), + LightevalTaskConfig( + name="natural_questions", + prompt_function="natural_questions_prompt", + hf_repo="lighteval/natural_questions_clean", + hf_subset="default", + metric=[Metrics.quasi_exact_match], + generation_size=20, + trust_dataset=True, + stop_sequence=["\n", ".", ","], + ), +] + + +def natural_questions_prompt(line, task_name: str = None): + return Doc( + task_name=task_name, + query=line["question"] + "?\nAnswer: ", + choices=[line["short_answers"]], + gold_index=0, + instruction="", + ) + + +WORLD_KNOWLEDGE_STRING = [(t, f"custom|{t.name}|5|1") for t in WORLD_KNOWLEDGE_TASKS] +# WORLD_KNOWLEDGE_STRING = {t: f'custom|{t.name}|0|1' for t in WORLD_KNOWLEDGE_TASKS} +_TASKS_STRINGS.extend(WORLD_KNOWLEDGE_STRING) +_TASKS += WORLD_KNOWLEDGE_TASKS + +# Reading comprehension ## + +READING_COMP_TASKS = [ + LightevalTaskConfig( + name="super_glue:boolq", + prompt_function="boolq_prompt", + hf_repo="super_glue", + hf_subset="boolq", + metric=["target_perplexity"], + trust_dataset=True, + stop_sequence=["\n"], + ), + LightevalTaskConfig( + name="quac", + prompt_function="quac", + hf_repo="lighteval/quac_helm", + hf_subset="deault", + metric=[Metrics.quasi_exact_match], + generation_size=20, + trust_dataset=True, + stop_sequence=["\n", ".", ","], + ), +] + + +def boolq_prompt(line, task_name: str = None): + return Doc( + task_name=task_name, + query=f"{line['passage']}\nQuestion: {line['question'].capitalize()}?\nAnswer:", + choices=[" No", " Yes"], # Only gold + gold_index=int(line["label"]), + ) + + +READING_COMP_STRING = [(t, f"custom|{t.name}|0|1") for t in READING_COMP_TASKS] +_TASKS_STRINGS.extend(READING_COMP_STRING) +_TASKS += READING_COMP_TASKS + + +# MATH ## +class CustomMathEvaluationTask(LightevalTaskConfig): + """Custom class for math tasks with all the defaults set""" + + def __init__( + self, + name, + prompt_function="math", + hf_repo="lighteval/MATH", + hf_subset=None, + metric=[Metrics.quasi_exact_match_math], + hf_avail_splits=None, + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + suite=["custom"], + generation_size=40, + trust_dataset=True, + stop_sequence=None, + output_regex=None, + frozen=False, + ): + super().__init__( + name=name, + prompt_function=prompt_function, + hf_repo=hf_repo, + hf_subset=hf_subset, + metric=metric, + hf_avail_splits=hf_avail_splits, + evaluation_splits=evaluation_splits, + few_shots_split=few_shots_split, + few_shots_select=few_shots_select, + suite=suite, + generation_size=generation_size, + output_regex=output_regex, + frozen=frozen, + trust_dataset=trust_dataset, 
+ stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]), + ) + + +MATH_TASKS = [ + CustomMathEvaluationTask(name="math:algebra", hf_subset="algebra"), + CustomMathEvaluationTask(name="math:counting_and_probability", hf_subset="counting_and_probability"), + CustomMathEvaluationTask(name="math:geometry", hf_subset="geometry"), + CustomMathEvaluationTask(name="math:intermediate_algebra", hf_subset="intermediate_algebra"), + CustomMathEvaluationTask(name="math:number_theory", hf_subset="number_theory"), + CustomMathEvaluationTask(name="math:prealgebra", hf_subset="prealgebra"), + CustomMathEvaluationTask(name="math:precalculus", hf_subset="precalculus"), +] +GSM8K = LightevalTaskConfig( + name="gsm8k", + prompt_function="gsm8k", + hf_repo="gsm8k", + hf_subset="main", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + metric=[Metrics.perfect_exact_match], + generation_size=10, + stop_sequence=["\n"], +) + + +MATH_STRING = [(t, f"custom|{t.name}|4|1") for t in MATH_TASKS] +GSM8K_STRING = [(GSM8K, f"custom|{GSM8K.name}|8|1")] +_TASKS_STRINGS.extend(MATH_STRING) +_TASKS_STRINGS.extend(GSM8K_STRING) +_TASKS += MATH_TASKS + [GSM8K] + + +# MMLU ## +class CustomMMLUEvaluationTask(LightevalTaskConfig): + def __init__( + self, + name, + prompt_function="mmlu_prompt", + hf_repo="lighteval/mmlu", + hf_subset=None, + # metric=[Metrics.loglikelihood_acc_single_token], + metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + hf_avail_splits=None, + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + suite=None, + generation_size=-1, + trust_dataset=True, + stop_sequence=None, + output_regex=None, + frozen=False, + ): + super().__init__( + name=name, + prompt_function=prompt_function, + hf_repo=hf_repo, + hf_subset=hf_subset, + metric=metric, + hf_avail_splits=hf_avail_splits, + evaluation_splits=evaluation_splits, + few_shots_split=few_shots_split, + few_shots_select=few_shots_select, + suite=suite, + generation_size=generation_size, + trust_dataset=trust_dataset, + stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]), + output_regex=output_regex, + frozen=frozen, + ) + + +MMLU_TASKS = [ + CustomMMLUEvaluationTask(name="mmlu:abstract_algebra", hf_subset="abstract_algebra"), + CustomMMLUEvaluationTask(name="mmlu:anatomy", hf_subset="anatomy"), + CustomMMLUEvaluationTask(name="mmlu:astronomy", hf_subset="astronomy"), + CustomMMLUEvaluationTask(name="mmlu:business_ethics", hf_subset="business_ethics"), + CustomMMLUEvaluationTask(name="mmlu:clinical_knowledge", hf_subset="clinical_knowledge"), + CustomMMLUEvaluationTask(name="mmlu:college_biology", hf_subset="college_biology"), + CustomMMLUEvaluationTask(name="mmlu:college_chemistry", hf_subset="college_chemistry"), + CustomMMLUEvaluationTask(name="mmlu:college_computer_science", hf_subset="college_computer_science"), + CustomMMLUEvaluationTask(name="mmlu:college_mathematics", hf_subset="college_mathematics"), + CustomMMLUEvaluationTask(name="mmlu:college_medicine", hf_subset="college_medicine"), + CustomMMLUEvaluationTask(name="mmlu:college_physics", hf_subset="college_physics"), + CustomMMLUEvaluationTask(name="mmlu:computer_security", hf_subset="computer_security"), + CustomMMLUEvaluationTask(name="mmlu:conceptual_physics", hf_subset="conceptual_physics"), + CustomMMLUEvaluationTask(name="mmlu:econometrics", hf_subset="econometrics"), + CustomMMLUEvaluationTask(name="mmlu:electrical_engineering", hf_subset="electrical_engineering"), + 
CustomMMLUEvaluationTask(name="mmlu:elementary_mathematics", hf_subset="elementary_mathematics"), + CustomMMLUEvaluationTask(name="mmlu:formal_logic", hf_subset="formal_logic"), + CustomMMLUEvaluationTask(name="mmlu:global_facts", hf_subset="global_facts"), + CustomMMLUEvaluationTask(name="mmlu:high_school_biology", hf_subset="high_school_biology"), + CustomMMLUEvaluationTask(name="mmlu:high_school_chemistry", hf_subset="high_school_chemistry"), + CustomMMLUEvaluationTask(name="mmlu:high_school_computer_science", hf_subset="high_school_computer_science"), + CustomMMLUEvaluationTask(name="mmlu:high_school_european_history", hf_subset="high_school_european_history"), + CustomMMLUEvaluationTask(name="mmlu:high_school_geography", hf_subset="high_school_geography"), + CustomMMLUEvaluationTask( + name="mmlu:high_school_government_and_politics", hf_subset="high_school_government_and_politics" + ), + CustomMMLUEvaluationTask(name="mmlu:high_school_macroeconomics", hf_subset="high_school_macroeconomics"), + CustomMMLUEvaluationTask(name="mmlu:high_school_mathematics", hf_subset="high_school_mathematics"), + CustomMMLUEvaluationTask(name="mmlu:high_school_microeconomics", hf_subset="high_school_microeconomics"), + CustomMMLUEvaluationTask(name="mmlu:high_school_physics", hf_subset="high_school_physics"), + CustomMMLUEvaluationTask(name="mmlu:high_school_psychology", hf_subset="high_school_psychology"), + CustomMMLUEvaluationTask(name="mmlu:high_school_statistics", hf_subset="high_school_statistics"), + CustomMMLUEvaluationTask(name="mmlu:high_school_us_history", hf_subset="high_school_us_history"), + CustomMMLUEvaluationTask(name="mmlu:high_school_world_history", hf_subset="high_school_world_history"), + CustomMMLUEvaluationTask(name="mmlu:human_aging", hf_subset="human_aging"), + CustomMMLUEvaluationTask(name="mmlu:human_sexuality", hf_subset="human_sexuality"), + CustomMMLUEvaluationTask(name="mmlu:international_law", hf_subset="international_law"), + CustomMMLUEvaluationTask(name="mmlu:jurisprudence", hf_subset="jurisprudence"), + CustomMMLUEvaluationTask(name="mmlu:logical_fallacies", hf_subset="logical_fallacies"), + CustomMMLUEvaluationTask(name="mmlu:machine_learning", hf_subset="machine_learning"), + CustomMMLUEvaluationTask(name="mmlu:management", hf_subset="management"), + CustomMMLUEvaluationTask(name="mmlu:marketing", hf_subset="marketing"), + CustomMMLUEvaluationTask(name="mmlu:medical_genetics", hf_subset="medical_genetics"), + CustomMMLUEvaluationTask(name="mmlu:miscellaneous", hf_subset="miscellaneous"), + CustomMMLUEvaluationTask(name="mmlu:moral_disputes", hf_subset="moral_disputes"), + CustomMMLUEvaluationTask(name="mmlu:moral_scenarios", hf_subset="moral_scenarios"), + CustomMMLUEvaluationTask(name="mmlu:nutrition", hf_subset="nutrition"), + CustomMMLUEvaluationTask(name="mmlu:philosophy", hf_subset="philosophy"), + CustomMMLUEvaluationTask(name="mmlu:prehistory", hf_subset="prehistory"), + CustomMMLUEvaluationTask(name="mmlu:professional_accounting", hf_subset="professional_accounting"), + CustomMMLUEvaluationTask(name="mmlu:professional_law", hf_subset="professional_law"), + CustomMMLUEvaluationTask(name="mmlu:professional_medicine", hf_subset="professional_medicine"), + CustomMMLUEvaluationTask(name="mmlu:professional_psychology", hf_subset="professional_psychology"), + CustomMMLUEvaluationTask(name="mmlu:public_relations", hf_subset="public_relations"), + CustomMMLUEvaluationTask(name="mmlu:security_studies", hf_subset="security_studies"), + 
CustomMMLUEvaluationTask(name="mmlu:sociology", hf_subset="sociology"), + CustomMMLUEvaluationTask(name="mmlu:us_foreign_policy", hf_subset="us_foreign_policy"), + CustomMMLUEvaluationTask(name="mmlu:virology", hf_subset="virology"), + CustomMMLUEvaluationTask(name="mmlu:world_religions", hf_subset="world_religions"), +] + + +def mmlu_harness(line, task_name: str = None): + topic = line["subject"] + prompt = f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n" + prompt += line["question"] + "\n" + prompt += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, line["choices"])]) + prompt += "Answer:" + + gold_ix = LETTER_INDICES.index(line["answer"]) if isinstance(line["answer"], str) else line["answer"] + "__few_shots" in line and line["__few_shots"] is True # We are adding few shots + + return Doc( + task_name=task_name, + query=prompt, + choices=[" A", " B", " C", " D"], + target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix], + gold_index=gold_ix, + instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n", + ) + + +def mmlu_prompt(line, task_name: str = None): + """MMLU prompt without letters""" + topic = line["subject"] + prompt = f"The following are questions about {topic.replace('_', ' ')}.\nQuestion: " + prompt += line["question"] + "\nAnswer:" + + return Doc( + task_name=task_name, + query=prompt, + choices=[f" {c}" for c in line["choices"]], + gold_index=line["answer"], + instruction=f"The following are questions about {topic.replace('_', ' ')}.\n", + ) + + +# MMLU_STRING = {t: f'custom|{t.name}|5|1' for t in MMLU_TASKS} +MMLU_STRING = [(t, f"custom|{t.name}|0|1") for t in MMLU_TASKS] +_TASKS_STRINGS.extend(MMLU_STRING) +_TASKS += MMLU_TASKS + +# BBH ## + + +class CustomBBHEvaluationTask(LightevalTaskConfig): + def __init__( + self, + name, + prompt_function="bbh_prompt", + hf_repo="lighteval/big_bench_hard", + hf_subset=None, + metric=[Metrics.exact_match], + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split="train", + few_shots_select=None, + suite=None, + generation_size=4, + trust_dataset=True, + stop_sequence=None, + output_regex=None, + frozen=False, + ): + super().__init__( + name=name, + prompt_function=prompt_function, + hf_repo=hf_repo, + hf_subset=hf_subset, + metric=metric, + hf_avail_splits=hf_avail_splits, + evaluation_splits=evaluation_splits, + few_shots_split=few_shots_split, + few_shots_select=few_shots_select, + suite=suite, + generation_size=generation_size, + trust_dataset=trust_dataset, + stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]), + output_regex=output_regex, + frozen=frozen, + ) + + +BBH_TASKS = [ + CustomBBHEvaluationTask(name="bbh:boolean_expressions", hf_subset="boolean_expressions"), + CustomBBHEvaluationTask(name="bbh:causal_judgement", hf_subset="causal_judgement"), + CustomBBHEvaluationTask(name="bbh:date_understanding", hf_subset="date_understanding"), + CustomBBHEvaluationTask(name="bbh:disambiguation_qa", hf_subset="disambiguation_qa"), + CustomBBHEvaluationTask(name="bbh:dyck_languages", hf_subset="dyck_languages"), + CustomBBHEvaluationTask(name="bbh:formal_fallacies", hf_subset="formal_fallacies"), + CustomBBHEvaluationTask(name="bbh:geometric_shapes", hf_subset="geometric_shapes"), + CustomBBHEvaluationTask(name="bbh:hyperbaton", hf_subset="hyperbaton"), + CustomBBHEvaluationTask(name="bbh:logical_deduction_five_objects", hf_subset="logical_deduction_five_objects"), + 
CustomBBHEvaluationTask(name="bbh:logical_deduction_seven_objects", hf_subset="logical_deduction_seven_objects"), + CustomBBHEvaluationTask(name="bbh:logical_deduction_three_objects", hf_subset="logical_deduction_three_objects"), + CustomBBHEvaluationTask(name="bbh:movie_recommendation", hf_subset="movie_recommendation"), + CustomBBHEvaluationTask(name="bbh:multistep_arithmetic_two", hf_subset="multistep_arithmetic_two"), + CustomBBHEvaluationTask(name="bbh:navigate", hf_subset="navigate"), + CustomBBHEvaluationTask(name="bbh:object_counting", hf_subset="object_counting"), + CustomBBHEvaluationTask(name="bbh:penguins_in_a_table", hf_subset="penguins_in_a_table"), + CustomBBHEvaluationTask(name="bbh:reasoning_about_colored_objects", hf_subset="reasoning_about_colored_objects"), + CustomBBHEvaluationTask(name="bbh:ruin_names", hf_subset="ruin_names"), + CustomBBHEvaluationTask( + name="bbh:salient_translation_error_detection", hf_subset="salient_translation_error_detection" + ), + CustomBBHEvaluationTask(name="bbh:snarks", hf_subset="snarks"), + CustomBBHEvaluationTask(name="bbh:sports_understanding", hf_subset="sports_understanding"), + CustomBBHEvaluationTask(name="bbh:temporal_sequences", hf_subset="temporal_sequences"), + CustomBBHEvaluationTask( + name="bbh:tracking_shuffled_objects_five_objects", hf_subset="tracking_shuffled_objects_five_objects" + ), + CustomBBHEvaluationTask( + name="bbh:tracking_shuffled_objects_seven_objects", hf_subset="tracking_shuffled_objects_seven_objects" + ), + CustomBBHEvaluationTask( + name="bbh:tracking_shuffled_objects_three_objects", hf_subset="tracking_shuffled_objects_three_objects" + ), + CustomBBHEvaluationTask(name="bbh:web_of_lies", hf_subset="web_of_lies"), + CustomBBHEvaluationTask(name="bbh:word_sorting", hf_subset="word_sorting"), +] + + +def bbh_prompt(line, task_name: str = None): + return Doc( + task_name=task_name, + query=line["input"] + "\nAnswer: ", + choices=[line["target"]], + gold_index=0, + ) + + +# BBH_STRING = {t: f'custom|{t.name}|3|1' for t in BBH_TASKS} +BBH_STRING = [(t, f"custom|{t.name}|0|1") for t in BBH_TASKS] +_TASKS_STRINGS.extend(BBH_STRING) +_TASKS += BBH_TASKS + + +# AGI eval ## +class CustomAGIEvalEvaluationTask(LightevalTaskConfig): + def __init__( + self, + name, + prompt_function="agi_eval_prompt_no_letters", + hf_repo="lighteval/agi_eval_en", + hf_subset=None, + # metric=[Metrics.loglikelihood_acc_single_token], + metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], + hf_avail_splits=["train", "validation"], + evaluation_splits=["train"], + few_shots_split="validation", + few_shots_select=None, + suite=None, + generation_size=-1, + trust_dataset=True, + stop_sequence=None, + output_regex=None, + frozen=False, + ): + super().__init__( + name=name, + prompt_function=prompt_function, + hf_repo=hf_repo, + hf_subset=hf_subset, + metric=metric, + hf_avail_splits=hf_avail_splits, + evaluation_splits=evaluation_splits, + few_shots_split=few_shots_split, + few_shots_select=few_shots_select, + suite=suite, + generation_size=generation_size, + trust_dataset=trust_dataset, + stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]), + output_regex=output_regex, + frozen=frozen, + ) + + +AGIEVAL_TASKS = [ + CustomAGIEvalEvaluationTask(name="agi_eval:aqua_rat", hf_subset="aqua_rat"), + CustomAGIEvalEvaluationTask(name="agi_eval:logiqa-en", hf_subset="logiqa-en"), + CustomAGIEvalEvaluationTask(name="agi_eval:lsat-ar", hf_subset="lsat-ar"), + 
CustomAGIEvalEvaluationTask(name="agi_eval:lsat-lr", hf_subset="lsat-lr"), + CustomAGIEvalEvaluationTask(name="agi_eval:lsat-rc", hf_subset="lsat-rc"), + CustomAGIEvalEvaluationTask( + name="agi_eval:math", + hf_subset="math", + prompt_function="agi_eval_math_prompt", + metric=[Metrics.exact_match, Metrics.quasi_exact_match], + generation_size=40, + ), + CustomAGIEvalEvaluationTask(name="agi_eval:sat-en", hf_subset="sat-en"), + CustomAGIEvalEvaluationTask(name="agi_eval:sat-math", hf_subset="sat-math"), +] + + +def agi_eval_math_prompt(line, task_name: str = None): + return Doc( + task_name=task_name, + query=line["question"], + choices=[line["answer"]], + gold_index=0, + instruction="", + ) + + +def agi_eval_prompt(line, task_name: str = None): + cleaned_options = [o.replace("(", "").replace(")", " ") for o in line["options"]] + prompt = "The following are multiple choice questions (with answers).\n\n" + prompt += line["question"] + "\n" + "\n".join(cleaned_options) + "\n" + prompt += "Answer: " + + choices = LETTER_INDICES[: len(line["options"])] + + output = Doc( + query=prompt, + instruction="The following are multiple choice questions (with answers).\n\n", + ) + + if line["label"]: + output.choices = choices + output.gold_index = LETTER_INDICES.index(line["label"].strip()) + else: + output.choices = [line["answer"]] + output.gold_index = 0 + + return output + + +def agi_eval_prompt_no_letters(line, task_name: str = None): + cleaned_options = [ + " " + o.replace("(A)", "").replace("(B)", "").replace("(C)", "").replace("(D)", "").replace("(E)", "") + for o in line["options"] + ] + + output = Doc( + query=line["question"], + choices=cleaned_options, + gold_index=LETTER_INDICES.index(line["label"].strip()), + instruction="", + ) + + return output + + +# AGIEVAL_STRING = {t: f'custom|{t.name}|5|1' for t in AGIEVAL_TASKS} +AGIEVAL_STRING = [(t, f"custom|{t.name}|0|1") for t in AGIEVAL_TASKS] +_TASKS_STRINGS.extend(AGIEVAL_STRING) +_TASKS += AGIEVAL_TASKS + + +# HUMAN EVAL ## +# human_eval = LightevalTaskConfig( +# name="human_eval", +# prompt_function="human_eval", +# hf_repo="lighteval/human_eval", +# metric=["human_eval_pass_at_1"], +# ), + + +EARLY_SIGNAL_TASKS = ",".join([t[1] for t in COMMON_SENSE_REASONING_STRING] + [t[1] for t in MMLU_STRING]) + +# Convert to dict for lighteval +TASKS_TABLE = [task.as_dict() for task in _TASKS] +# You can have a few pre-organised groups of tasks +TASKS_GROUPS = { + "all": ",".join(t[1] for t in _TASKS_STRINGS), + "early-signal": EARLY_SIGNAL_TASKS, +} + +if __name__ == "__main__": + print(t["name"] for t in TASKS_TABLE) + print(len(TASKS_TABLE)) diff --git a/tasks_examples/custom_tasks/custom_task.py b/tasks_examples/custom_tasks/custom_task.py new file mode 100644 index 000000000..77f43c657 --- /dev/null +++ b/tasks_examples/custom_tasks/custom_task.py @@ -0,0 +1,100 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+ +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +TASKS_TABLE = [ + { + "name": "mmlu:anatomy", + "suite": ["custom"], + "prompt_function": "mmlu_anatomy", + "hf_repo": "lighteval/mmlu", + "hf_subset": "anatomy", + "hf_avail_splits": ["auxiliary_train", "test", "validation", "dev"], + "evaluation_splits": ["test"], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 5, + "metric": ["loglikelihood_acc_single_token"], + "stop_sequence": ["\n"], + "output_regex": None, + "frozen": False, + }, + { + "name": "mmlu:anatomy_signs", + "suite": ["custom"], + "prompt_function": "mmlu_anatomy_signs", + "hf_repo": "lighteval/mmlu", + "hf_subset": "anatomy", + "hf_avail_splits": ["auxiliary_train", "test", "validation", "dev"], + "evaluation_splits": ["test"], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 5, + "metric": ["loglikelihood_acc_single_token"], + "stop_sequence": ["\n"], + "output_regex": None, + "frozen": False, + }, +] + + +def mmlu_anatomy_signs(line): + return mmlu_signs(line, "anatomy") + + +def mmlu_anatomy(line): + return mmlu_numbers(line, "anatomy") + + +def mmlu_numbers(line, topic): + prompt = f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n" + prompt += line["question"] + "\n" + prompt += "".join([f"{key}. {choice}\n" for key, choice in zip(["1", "2", "3", "4"], line["choices"])]) + prompt += "Answer:" + + gold_ix = ["1", "2", "3", "4"].index(line["answer"]) if isinstance(line["answer"], str) else line["answer"] + is_few_shots = line.get("__few_shots", False) # We are adding few shots + + return { + "query": prompt, + "choices": [" 1", " 2", " 3", " 4"] if is_few_shots else ["1", "2", "3", "4"], + "target_for_fewshot_sorting": [" 1", " 2", " 3", " 4"][gold_ix], + "gold_index": gold_ix, + "instruction": f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n", + } + + +def mmlu_signs(line, topic): + prompt = f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n" + prompt += line["question"] + "\n" + prompt += "".join([f"{key}. 
{choice}\n" for key, choice in zip(["+", "*", "=", "#"], line["choices"])]) + prompt += "Answer:" + + gold_ix = ["+", "*", "=", "#"].index(line["answer"]) if isinstance(line["answer"], str) else line["answer"] + is_few_shots = line.get("__few_shots", False) # We are adding few shots + + return { + "query": prompt, + "choices": [" +", " *", " =", " #"] if is_few_shots else ["+", "*", "=", "#"], + "target_for_fewshot_sorting": [" +", " *", " =", " #"][gold_ix], + "gold_index": gold_ix, + "instruction": f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n", + } diff --git a/tasks_examples/custom_tasks/lighteval_config_override_template.yaml b/tasks_examples/custom_tasks/lighteval_config_override_template.yaml new file mode 100644 index 000000000..390a81ecb --- /dev/null +++ b/tasks_examples/custom_tasks/lighteval_config_override_template.yaml @@ -0,0 +1,30 @@ +lighteval: + batch_size: 16 + checkpoints_path: null + generation: null + logging: + hub_repo_details: null + hub_repo_results: null + hub_repo_tensorboard: HuggingFaceBR4/thomwolf-webdata-std-two + local_output_path: /scratch/thomwolf/lighteval/webdata-std-two-1p82G-wet_files_1-seed-5-698496 + push_details_to_hub: false + push_results_to_hub: false + push_results_to_tensorboard: true + tensorboard_metric_prefix: e + parallelism: + dp: 1 + pp: 1 + pp_engine: 1f1b + recompute_granularity: null + tp: 1 + tp_linear_async_communication: false + tp_mode: ALL_REDUCE + tasks: + custom_tasks: /fsx/thomwolf/github/lighteval/tasks_examples/custom_tasks/custom_evaluation_tasks.py + dataset_loading_processes: 8 + max_samples: 10 + multichoice_continuations_start_space: null + no_multichoice_continuations_start_space: null + num_fewshot_seeds: null + tasks: early-signal + # tasks: custom|hellaswag|0 diff --git a/tasks_examples/open_llm_leaderboard_tasks.txt b/tasks_examples/open_llm_leaderboard_tasks.txt new file mode 100644 index 000000000..51de4f473 --- /dev/null +++ b/tasks_examples/open_llm_leaderboard_tasks.txt @@ -0,0 +1,68 @@ +# ARC +leaderboard|arc:challenge|25|0 +# HellaSwag +leaderboard|hellaswag|10|0 +# TruthfulQA +leaderboard|truthfulqa:mc|0|0 +# MMLU +leaderboard|mmlu:abstract_algebra|5|0 +leaderboard|mmlu:anatomy|5|0 +leaderboard|mmlu:astronomy|5|0 +leaderboard|mmlu:business_ethics|5|0 +leaderboard|mmlu:clinical_knowledge|5|0 +leaderboard|mmlu:college_biology|5|0 +leaderboard|mmlu:college_chemistry|5|0 +leaderboard|mmlu:college_computer_science|5|0 +leaderboard|mmlu:college_mathematics|5|0 +leaderboard|mmlu:college_medicine|5|0 +leaderboard|mmlu:college_physics|5|0 +leaderboard|mmlu:computer_security|5|0 +leaderboard|mmlu:conceptual_physics|5|0 +leaderboard|mmlu:econometrics|5|0 +leaderboard|mmlu:electrical_engineering|5|0 +leaderboard|mmlu:elementary_mathematics|5|0 +leaderboard|mmlu:formal_logic|5|0 +leaderboard|mmlu:global_facts|5|0 +leaderboard|mmlu:high_school_biology|5|0 +leaderboard|mmlu:high_school_chemistry|5|0 +leaderboard|mmlu:high_school_computer_science|5|0 +leaderboard|mmlu:high_school_european_history|5|0 +leaderboard|mmlu:high_school_geography|5|0 +leaderboard|mmlu:high_school_government_and_politics|5|0 +leaderboard|mmlu:high_school_macroeconomics|5|0 +leaderboard|mmlu:high_school_mathematics|5|0 +leaderboard|mmlu:high_school_microeconomics|5|0 +leaderboard|mmlu:high_school_physics|5|0 +leaderboard|mmlu:high_school_psychology|5|0 +leaderboard|mmlu:high_school_statistics|5|0 +leaderboard|mmlu:high_school_us_history|5|0 +leaderboard|mmlu:high_school_world_history|5|0 
+leaderboard|mmlu:human_aging|5|0 +leaderboard|mmlu:human_sexuality|5|0 +leaderboard|mmlu:international_law|5|0 +leaderboard|mmlu:jurisprudence|5|0 +leaderboard|mmlu:logical_fallacies|5|0 +leaderboard|mmlu:machine_learning|5|0 +leaderboard|mmlu:management|5|0 +leaderboard|mmlu:marketing|5|0 +leaderboard|mmlu:medical_genetics|5|0 +leaderboard|mmlu:miscellaneous|5|0 +leaderboard|mmlu:moral_disputes|5|0 +leaderboard|mmlu:moral_scenarios|5|0 +leaderboard|mmlu:nutrition|5|0 +leaderboard|mmlu:philosophy|5|0 +leaderboard|mmlu:prehistory|5|0 +leaderboard|mmlu:professional_accounting|5|0 +leaderboard|mmlu:professional_law|5|0 +leaderboard|mmlu:professional_medicine|5|0 +leaderboard|mmlu:professional_psychology|5|0 +leaderboard|mmlu:public_relations|5|0 +leaderboard|mmlu:security_studies|5|0 +leaderboard|mmlu:sociology|5|0 +leaderboard|mmlu:us_foreign_policy|5|0 +leaderboard|mmlu:virology|5|0 +leaderboard|mmlu:world_religions|5|0 +# WinoGrande +leaderboard|winogrande|5|0 +# GSM8K +leaderboard|gsm8k|5|0 diff --git a/tasks_examples/recommended_set.txt b/tasks_examples/recommended_set.txt new file mode 100644 index 000000000..d1904e3cc --- /dev/null +++ b/tasks_examples/recommended_set.txt @@ -0,0 +1,160 @@ +# Commonsense-QA +helm|commonsenseqa|0|0 +lighteval|ethics:commonsense|0|0 +lighteval|ethics:deontology|0|0 +lighteval|ethics:justice|0|0 +lighteval|ethics:utilitarianism|0|0 +lighteval|ethics:virtue|0|0 +# MMLU +leaderboard|mmlu:abstract_algebra|0|0 +leaderboard|mmlu:anatomy|0|0 +leaderboard|mmlu:astronomy|0|0 +leaderboard|mmlu:business_ethics|0|0 +leaderboard|mmlu:clinical_knowledge|0|0 +leaderboard|mmlu:college_biology|0|0 +leaderboard|mmlu:college_chemistry|0|0 +leaderboard|mmlu:college_computer_science|0|0 +leaderboard|mmlu:college_mathematics|0|0 +leaderboard|mmlu:college_medicine|0|0 +leaderboard|mmlu:college_physics|0|0 +leaderboard|mmlu:computer_security|0|0 +leaderboard|mmlu:conceptual_physics|0|0 +leaderboard|mmlu:econometrics|0|0 +leaderboard|mmlu:electrical_engineering|0|0 +leaderboard|mmlu:elementary_mathematics|0|0 +leaderboard|mmlu:formal_logic|0|0 +leaderboard|mmlu:global_facts|0|0 +leaderboard|mmlu:high_school_biology|0|0 +leaderboard|mmlu:high_school_chemistry|0|0 +leaderboard|mmlu:high_school_computer_science|0|0 +leaderboard|mmlu:high_school_european_history|0|0 +leaderboard|mmlu:high_school_geography|0|0 +leaderboard|mmlu:high_school_government_and_politics|0|0 +leaderboard|mmlu:high_school_macroeconomics|0|0 +leaderboard|mmlu:high_school_mathematics|0|0 +leaderboard|mmlu:high_school_microeconomics|0|0 +leaderboard|mmlu:high_school_physics|0|0 +leaderboard|mmlu:high_school_psychology|0|0 +leaderboard|mmlu:high_school_statistics|0|0 +leaderboard|mmlu:high_school_us_history|0|0 +leaderboard|mmlu:high_school_world_history|0|0 +leaderboard|mmlu:human_aging|0|0 +leaderboard|mmlu:human_sexuality|0|0 +leaderboard|mmlu:international_law|0|0 +leaderboard|mmlu:jurisprudence|0|0 +leaderboard|mmlu:logical_fallacies|0|0 +leaderboard|mmlu:machine_learning|0|0 +leaderboard|mmlu:management|0|0 +leaderboard|mmlu:marketing|0|0 +leaderboard|mmlu:medical_genetics|0|0 +leaderboard|mmlu:miscellaneous|0|0 +leaderboard|mmlu:moral_disputes|0|0 +leaderboard|mmlu:moral_scenarios|0|0 +leaderboard|mmlu:nutrition|0|0 +leaderboard|mmlu:philosophy|0|0 +leaderboard|mmlu:prehistory|0|0 +leaderboard|mmlu:professional_accounting|0|0 +leaderboard|mmlu:professional_law|0|0 +leaderboard|mmlu:professional_medicine|0|0 +leaderboard|mmlu:professional_psychology|0|0 +leaderboard|mmlu:public_relations|0|0 
+leaderboard|mmlu:security_studies|0|0 +leaderboard|mmlu:sociology|0|0 +leaderboard|mmlu:us_foreign_policy|0|0 +leaderboard|mmlu:virology|0|0 +leaderboard|mmlu:world_religions|0|0 +original|mmlu:abstract_algebra|0|0 +original|mmlu:anatomy|0|0 +original|mmlu:astronomy|0|0 +original|mmlu:business_ethics|0|0 +original|mmlu:clinical_knowledge|0|0 +original|mmlu:college_biology|0|0 +original|mmlu:college_chemistry|0|0 +original|mmlu:college_computer_science|0|0 +original|mmlu:college_mathematics|0|0 +original|mmlu:college_medicine|0|0 +original|mmlu:college_physics|0|0 +original|mmlu:computer_security|0|0 +original|mmlu:conceptual_physics|0|0 +original|mmlu:econometrics|0|0 +original|mmlu:electrical_engineering|0|0 +original|mmlu:elementary_mathematics|0|0 +original|mmlu:formal_logic|0|0 +original|mmlu:global_facts|0|0 +original|mmlu:high_school_biology|0|0 +original|mmlu:high_school_chemistry|0|0 +original|mmlu:high_school_computer_science|0|0 +original|mmlu:high_school_european_history|0|0 +original|mmlu:high_school_geography|0|0 +original|mmlu:high_school_government_and_politics|0|0 +original|mmlu:high_school_macroeconomics|0|0 +original|mmlu:high_school_mathematics|0|0 +original|mmlu:high_school_microeconomics|0|0 +original|mmlu:high_school_physics|0|0 +original|mmlu:high_school_psychology|0|0 +original|mmlu:high_school_statistics|0|0 +original|mmlu:high_school_us_history|0|0 +original|mmlu:high_school_world_history|0|0 +original|mmlu:human_aging|0|0 +original|mmlu:human_sexuality|0|0 +original|mmlu:international_law|0|0 +original|mmlu:jurisprudence|0|0 +original|mmlu:logical_fallacies|0|0 +original|mmlu:machine_learning|0|0 +original|mmlu:management|0|0 +original|mmlu:marketing|0|0 +original|mmlu:medical_genetics|0|0 +original|mmlu:miscellaneous|0|0 +original|mmlu:moral_disputes|0|0 +original|mmlu:moral_scenarios|0|0 +original|mmlu:nutrition|0|0 +original|mmlu:philosophy|0|0 +original|mmlu:prehistory|0|0 +original|mmlu:professional_accounting|0|0 +original|mmlu:professional_law|0|0 +original|mmlu:professional_medicine|0|0 +original|mmlu:professional_psychology|0|0 +original|mmlu:public_relations|0|0 +original|mmlu:security_studies|0|0 +original|mmlu:sociology|0|0 +original|mmlu:us_foreign_policy|0|0 +original|mmlu:virology|0|0 +original|mmlu:world_religions|0|0 +original|mmlu|0|0 +# ARC +leaderboard|arc:challenge|0|0 +lighteval|arc:easy|0|0 +original|arc:c:letters|0|0 +original|arc:c:options|0|0 +original|arc:c:simple|0|0 +# HellaSwag +helm|hellaswag|0|0 +leaderboard|hellaswag|0|0 +# PIQA +helm|piqa|0|0 +lighteval|piqa|0|0 +# SIQA +helm|siqa|0|0 +# WinoGrande +leaderboard|winogrande|0|0 +# OpenBookQA +lighteval|openbookqa|0|0 +helm|openbookqa|0|0 +# TriviaQA +lighteval|triviaqa|0|0 +# BoolQ +helm|boolq:contrastset|0|0 +helm|boolq|0|0 +# QUAC +helm|quac|0|0 +# GSM8K +leaderboard|gsm8k|0|0 +# MATH +lighteval|math:algebra|0|0 +lighteval|math:counting_and_probability|0|0 +lighteval|math:geometry|0|0 +lighteval|math:intermediate_algebra|0|0 +lighteval|math:number_theory|0|0 +lighteval|math:prealgebra|0|0 +lighteval|math:precalculus|0|0 +# To add: NaturalQuestions, BBH, AGIEval From 49037557109898462d8aa7ff9d98569430f9027d Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 20 Mar 2024 14:41:23 +0000 Subject: [PATCH 18/45] commit --- extended_tasks/mt_bench/judges.py | 2 +- extended_tasks/mt_bench/model_adapter.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/extended_tasks/mt_bench/judges.py b/extended_tasks/mt_bench/judges.py index 05d986133..c2a998997 100644 --- 
a/extended_tasks/mt_bench/judges.py +++ b/extended_tasks/mt_bench/judges.py @@ -8,7 +8,7 @@ import openai -from tasks_examples.custom_tasks_with_custom_metrics.mt_bench.model_adapter import conv_templates +from extended_tasks.mt_bench.model_adapter import conv_templates openai.api_key = os.environ["OPENAI_API_KEY"] diff --git a/extended_tasks/mt_bench/model_adapter.py b/extended_tasks/mt_bench/model_adapter.py index 7239dd270..e653efb6e 100644 --- a/extended_tasks/mt_bench/model_adapter.py +++ b/extended_tasks/mt_bench/model_adapter.py @@ -5,8 +5,8 @@ import re import sys import warnings -from enum import IntEnum, auto from dataclasses import field +from enum import IntEnum, auto from io import BytesIO from typing import Dict, List, Optional, Tuple, Union From 9ff0707f089e46059392e29bd109ca27888be368 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 20 Mar 2024 14:43:01 +0000 Subject: [PATCH 19/45] commit --- extended_tasks/mt_bench/main.py | 122 ++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 extended_tasks/mt_bench/main.py diff --git a/extended_tasks/mt_bench/main.py b/extended_tasks/mt_bench/main.py new file mode 100644 index 000000000..f5afef823 --- /dev/null +++ b/extended_tasks/mt_bench/main.py @@ -0,0 +1,122 @@ +# ruff: noqa: F405, F403, F401 +""" +Custom evaluation tasks for lighteval. Copy this file and complete it with the info for your task. +This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval. +Author: +""" + +from pprint import pprint + +import numpy as np +from aenum import extend_enum +from transformers import AutoModelForCausalLM, AutoTokenizer + +from extended_tasks.mt_bench.judges import ( + load_judge_prompts, + make_judge_single, + play_a_match_single, +) +from lighteval.metrics import Metrics +from lighteval.metrics.utils import MetricCategory, MetricUseCase, SampleLevelMetric, SampleLevelMetricGrouping +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc +from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES + + +NEED_REF_CATS = ["math", "reasoning", "coding", "arena-hard-200"] + +## EVAL WITH NO SUBSET ## +# This is how you create a simple tasks (like hellaswag) which has one single subset +# attached to it, and one evaluation possible. +task = LightevalTaskConfig( + name="mt_bench", + prompt_function="prompt_fn", # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py + suite=["extended"], + hf_repo="SaylorTwift/mt-bench", + hf_subset="default", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split="", + few_shots_select="random", + metric=["mt_bench_metric"], + generation_size=1024, + stop_sequence=[], +) + + +## DEFINE YOUR PROMPT FUNCTIONS +# Define as many as you need for your different tasks +def prompt_fn(line, task_name: str = None): + """Defines how to go from a dataset line to a doc object. + Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info + about what this function should do in the README. 
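+    For mt_bench specifically there are no choices and no gold: the raw user turns,
+    the optional reference answers and the question category are kept in `specific`,
+    since the judge-based metric needs them to build its prompts at scoring time.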
+ """ + return Doc( + task_name=task_name, + query=f"{line['turns'][0]}", + choices=None, + instruction=None, + gold_index=[], + specific={"reference": line["reference"], "category": line["category"], "multi_turn_queries": line["turns"], "id": line["question_id"]}, + ) + + + + +def mt_bench_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]: + """Defines how to go from a list of predictions to a score. + Follow examples in src/lighteval/metrics/metrics.py, or get more info + about what this function should do in the README. + """ + judge_model = "gpt-3.5-turbo" + judge_file = "tasks_examples/custom_tasks_with_custom_metrics/mt_bench/judge_prompts.jsonl" + judge_prompts = load_judge_prompts(judge_file) + judges = make_judge_single(judge_model, judge_prompts) + + question = formatted_doc.specific["multi_turn_queries"] + ref_answer = formatted_doc.specific["reference"] + category = formatted_doc.specific["category"] + + if category not in NEED_REF_CATS: + score, user_prompt_1, judgement_1 = play_a_match_single(question, predictions, ref_answer, judges["default"], multi_turn=False, output_file=None) + score_mt, user_prompt_2, judgement_2 = play_a_match_single(question, predictions, ref_answer, judges["default-mt"], multi_turn=True, output_file=None) + else: + try: + score, user_prompt_1, judgement_1 = play_a_match_single(question, predictions, ref_answer, judges["math"], multi_turn=False, output_file=None) + score_mt, user_prompt_2, judgement_2 = play_a_match_single(question, predictions, ref_answer, judges["math-mt"], multi_turn=True, output_file=None) + except KeyError: + print(f"Category {category} not found in judge prompts, using default judge") + score, user_prompt_1, judgement_1 = play_a_match_single(question, predictions, ref_answer, judges["default"], multi_turn=False, output_file=None) + score_mt, user_prompt_2, judgement_2 = play_a_match_single(question, predictions, ref_answer, judges["default-mt"], multi_turn=True, output_file=None) + + return {"single_turn": score, "multi_turn": score_mt, "user_prompt": [user_prompt_1, user_prompt_2], "judgement": [judgement_1, judgement_2]} + + +mt_bench_metric = SampleLevelMetricGrouping( + metric="mt_bench_metric", + higher_is_better=True, + category=MetricCategory.GENERATIVE_MULTI_TURN, + use_case=MetricUseCase.SUMMARIZATION, + sample_level_fn=mt_bench_metric, + corpus_level_fn={ + "single_turn": np.mean, + "multi_turn": np.mean, + } +) + +## STORE YOUR EVALS +_TASKS = [task] + +## MODULE LOGIC +# You should not need to touch this +# Convert to dict for lighteval +TASKS_TABLE = [task.as_dict() for task in _TASKS] +extend_enum( + Metrics, + "mt_bench_metric", + mt_bench_metric, +) + +if __name__ == "__main__": + print(t["name"] for t in TASKS_TABLE) + print(len(TASKS_TABLE)) From ff177a169ecffcfef384d8b3f4ed4b8c43f1e4a0 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 20 Mar 2024 16:14:46 +0000 Subject: [PATCH 20/45] commit --- extended_tasks/ifeval/main.py | 2 + extended_tasks/mt_bench/main.py | 2 +- src/lighteval/evaluator.py | 6 -- src/lighteval/logging/info_loggers.py | 11 ---- src/lighteval/metrics/__init__.py | 3 +- src/lighteval/models/base_model.py | 5 +- src/lighteval/tasks/lighteval_task.py | 82 +++++++++++++++++---------- src/lighteval/tasks/registry.py | 1 - 8 files changed, 58 insertions(+), 54 deletions(-) diff --git a/extended_tasks/ifeval/main.py b/extended_tasks/ifeval/main.py index 70a3013f4..8211338f8 100644 --- a/extended_tasks/ifeval/main.py +++ b/extended_tasks/ifeval/main.py @@ -50,6 
+50,8 @@ stop_sequence=[], # no stop sequence, will use eot token ) +def tmp(): + pass # very specific task where there are no precise outputs but instead we test if the format obeys rules def ifeval_prompt(line, task_name: str = None): diff --git a/extended_tasks/mt_bench/main.py b/extended_tasks/mt_bench/main.py index f5afef823..f44e47990 100644 --- a/extended_tasks/mt_bench/main.py +++ b/extended_tasks/mt_bench/main.py @@ -69,7 +69,7 @@ def mt_bench_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> dic about what this function should do in the README. """ judge_model = "gpt-3.5-turbo" - judge_file = "tasks_examples/custom_tasks_with_custom_metrics/mt_bench/judge_prompts.jsonl" + judge_file = "extended_tasks/mt_bench/judge_prompts.jsonl" judge_prompts = load_judge_prompts(judge_file) judges = make_judge_single(judge_model, judge_prompts) diff --git a/src/lighteval/evaluator.py b/src/lighteval/evaluator.py index c273c6718..9449afef1 100644 --- a/src/lighteval/evaluator.py +++ b/src/lighteval/evaluator.py @@ -25,7 +25,6 @@ import collections import copy -from pprint import pprint from typing import Dict, Union from pytablewriter import LatexTableWriter, MarkdownTableWriter @@ -130,11 +129,6 @@ def evaluate( # noqa: C901 else: judgement = None - # pprint(doc) - # pprint(metrics) - # pprint(model_responses) - # print("===========") - evaluation_tracker.metrics_logger.log(task_example_id.task_name, metrics) evaluation_tracker.details_logger.log(task_example_id.task_name, task, doc, model_responses, metrics, (user_prompt, judgement)) diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index f0cd66e2a..046022601 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -446,22 +446,11 @@ class MetricsLogger: metrics_values: dict[str, dict[str, list[float]]] = collections.defaultdict(lambda: collections.defaultdict(list)) metric_aggregated: dict[str, dict[str, float]] = collections.defaultdict(lambda: collections.defaultdict(dict)) - llm_as_judge_prompts: dict[str, dict[str, list[str]]] = collections.defaultdict(lambda: collections.defaultdict(list)) def log(self, task_name: str, metrics: dict) -> None: for metric_name, metric_value in metrics.items(): self.metrics_values[task_name][metric_name].append(metric_value) - def log_llm_as_judge(self, task_name: str, user_prompt: str, judgement: str) -> None: - """Logs the user prompt and the judgement of the model as a judge. - - Args: - user_prompt (str): User prompt used to judge the model response. - judgement (str): Judgement of the model response. - - """ - self.llm_as_judge_prompts[task_name].append({"judgement": judgement, "user_prompt": user_prompt}) - def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int = 1000): # noqa: C901 """ Aggregate the metrics for each task and then for all tasks. 
diff --git a/src/lighteval/metrics/__init__.py b/src/lighteval/metrics/__init__.py index 5e20d6b8b..3774fc273 100644 --- a/src/lighteval/metrics/__init__.py +++ b/src/lighteval/metrics/__init__.py @@ -147,8 +147,7 @@ def apply_multichoice_metric_one_token(results: list[ModelReturn], formatted_doc return results, outputs - -def apply_generative_multi_turn_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[str], eval_tracker=None): +def apply_generative_multi_turn_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[str]): outputs = {} predictions = results.pop(0).result diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index 78904178e..81b4a8341 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -21,7 +21,6 @@ # SOFTWARE. import os -from pprint import pprint from typing import Optional, Tuple, Union import torch @@ -394,7 +393,7 @@ def greedy_until_multi_turn(self, requests: list[GreedyUntilMultiTurnRequest], o cur_reponses = self._generate( batch=prepared_batch, - max_tokens=max_generated_tokens, + max_new_tokens=max_generated_tokens, stop_tokens=stop_tokens, returns_logits=False, ) @@ -427,7 +426,7 @@ def greedy_until_multi_turn(self, requests: list[GreedyUntilMultiTurnRequest], o cur_reponses = self._generate( batch=prepared_batch, - max_tokens=max_generated_tokens, + max_new_tokens=max_generated_tokens, stop_tokens=stop_tokens, returns_logits=False, ) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index c6a0b6ebf..c5826b84e 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -28,6 +28,7 @@ from typing import TYPE_CHECKING, List, Optional, Tuple, Union from datasets import load_dataset +from transformers import PreTrainedTokenizer from lighteval.few_shot_manager import FewShotSampler from lighteval.logging.hierarchical_logger import hlog, hlog_warn @@ -221,7 +222,7 @@ def __init__( # noqa: C901 hlog_warn( f"Be careful you are using custom prompt function {cfg.prompt_function} and not the default one." ) - self.formatter = getattr(module, cfg.prompt_function) + self.formatter = formatter[0] else: raise Exception( f"You defined the prompt function {cfg.prompt_function} several times in the different custom modules you are loading." @@ -518,7 +519,7 @@ def construct_requests( return requests - def process_results(self, formatted_doc: Doc, results: list[ModelReturn], evaluation_tracker) -> dict[str, float]: + def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dict[str, float]: """ Processes the results of the task, and stores them in the output dict. @@ -563,7 +564,7 @@ def process_results(self, formatted_doc: Doc, results: list[ModelReturn], evalua outputs.update(cur_outputs) if self.has_metric_category[MetricCategory.GENERATIVE_MULTI_TURN]: results, cur_outputs = apply_generative_multi_turn_metric( - results=results, formatted_doc=formatted_doc, metrics=self.metrics, eval_tracker=evaluation_tracker + results=results, formatted_doc=formatted_doc, metrics=self.metrics ) outputs.update(cur_outputs) @@ -622,6 +623,47 @@ def download_dataset_worker(args): return dataset +def create_multi_turn_contexts( + doc: Doc, use_chat_template: bool, system_prompt: Optional[str], tokenizer: PreTrainedTokenizer +) -> list[str]: + """Creates N contexts (depending on the number of turn) for a tasks. + Multi turn tasks need use chat templating. + + Args: + doc (Doc): Formated document. 
+ use_chat_template (bool): wether or not to use chat template. Will fail if false. + system_prompt (Optional[str]): The system prompt to use + tokenizer (PreTrainedTokenizer): The tokenizer used for the chat template + + Raises: + ValueError: If use_chat_template is set to false. + + Returns: + list[str]: contexts for every turn + """ + if not use_chat_template: + raise ValueError("You need to use the chat template to create multi turn contexts") + + role_content_list = [] + if system_prompt is not None: + role_content_list.append({"role": "system", "content": system_prompt}) + + for i in doc.specific["multi_turn_queries"]: + role_content_list.append({"role": "user", "content": i}) + role_content_list.append({"role": "assistant", "content": "{model_response}"}) + role_content_list.pop(-1) + + contexts = [] + offset = 2 if system_prompt is not None else 1 + for i in range(0, len(role_content_list), offset + 1): + c = tokenizer.apply_chat_template( + role_content_list[: i + offset], add_generation_prompt=True, tokenize=False, add_special_tokens=False + ) + contexts.append(c) + + return contexts, 0 + + def create_requests_from_tasks( # noqa: C901 task_dict: dict[str, LightevalTask], fewshot_dict: dict[str, list[Tuple[int, bool]]], @@ -705,34 +747,14 @@ def create_requests_from_tasks( # noqa: C901 use_chat_template=use_chat_template, system_prompt=system_prompt, ) - doc.num_effective_few_shots = num_effective_few_shots - doc.num_asked_few_shots = num_fewshot else: - if use_chat_template: - k = [] - if system_prompt is not None: - k.append({"role": "system", "content": system_prompt}) - - for i in doc.specific["multi_turn_queries"]: - k.append( - {"role": "user", "content": i} - ) - k.append({"role": "assistant", "content": "{model_response}"}) - k.pop(-1) - - from pprint import pprint - ctx = [] - - offset = 2 if system_prompt is not None else 1 - - for i in range(0, len(k), offset+1): - c = lm.tokenizer.apply_chat_template(k[:i+offset], add_generation_prompt=True, tokenize=False, add_special_tokens=False) - ctx.append(c) - - doc.specific["multi_turn_queries_context"] = ctx - doc.num_effective_few_shots = 0 - doc.num_asked_few_shots = 0 - doc.ctx = ctx + doc.ctx, num_effective_few_shots = create_multi_turn_contexts( + doc, use_chat_template, system_prompt, lm.tokenizer + ) + doc.specific["multi_turn_queries_context"] = doc.ctx + + doc.num_effective_few_shots = num_effective_few_shots + doc.num_asked_few_shots = num_fewshot # Constructing the requests docs[TaskExampleId(cur_task_name, doc_id_seed)] = doc diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index fde147426..5625fe75d 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -177,7 +177,6 @@ def create_custom_tasks_module(custom_tasks: Union[str, ModuleType]) -> ModuleTy if isinstance(custom_tasks, ModuleType): return custom_tasks if isinstance(custom_tasks, (str, Path)) and os.path.exists(custom_tasks): - print(f"{custom_tasks=}") dataset_module = dataset_module_factory(str(custom_tasks)) return importlib.import_module(dataset_module.module_path) if isinstance(custom_tasks, (str, Path)): From 9794b7c77d62382c74d878f29abe3141ec3da9ae Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 20 Mar 2024 16:16:25 +0000 Subject: [PATCH 21/45] commit --- extended_tasks/ifeval/main.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/extended_tasks/ifeval/main.py b/extended_tasks/ifeval/main.py index 8211338f8..70a3013f4 100644 --- a/extended_tasks/ifeval/main.py +++ 
b/extended_tasks/ifeval/main.py @@ -50,8 +50,6 @@ stop_sequence=[], # no stop sequence, will use eot token ) -def tmp(): - pass # very specific task where there are no precise outputs but instead we test if the format obeys rules def ifeval_prompt(line, task_name: str = None): From 6268ff67e19de72852b96544e6864167c561a081 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 20 Mar 2024 16:19:53 +0000 Subject: [PATCH 22/45] commit --- src/lighteval/tasks/lighteval_task.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index c5826b84e..ee7a18dd4 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -748,13 +748,14 @@ def create_requests_from_tasks( # noqa: C901 system_prompt=system_prompt, ) else: - doc.ctx, num_effective_few_shots = create_multi_turn_contexts( + ctx, num_effective_few_shots = create_multi_turn_contexts( doc, use_chat_template, system_prompt, lm.tokenizer ) doc.specific["multi_turn_queries_context"] = doc.ctx doc.num_effective_few_shots = num_effective_few_shots doc.num_asked_few_shots = num_fewshot + doc.ctx = ctx # Constructing the requests docs[TaskExampleId(cur_task_name, doc_id_seed)] = doc From 31eaab1bd73b468411fc1c28a7f5c0ea348a71e6 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 20 Mar 2024 16:20:43 +0000 Subject: [PATCH 23/45] commit --- src/lighteval/tasks/lighteval_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index ee7a18dd4..46679e948 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -751,7 +751,7 @@ def create_requests_from_tasks( # noqa: C901 ctx, num_effective_few_shots = create_multi_turn_contexts( doc, use_chat_template, system_prompt, lm.tokenizer ) - doc.specific["multi_turn_queries_context"] = doc.ctx + doc.specific["multi_turn_queries_context"] = ctx doc.num_effective_few_shots = num_effective_few_shots doc.num_asked_few_shots = num_fewshot From c80ef8cdb83623a86086e247fba38a3cf573173c Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 21 Mar 2024 11:19:17 +0000 Subject: [PATCH 24/45] commit --- extended_tasks/mt_bench/judges.py | 222 ++++------ extended_tasks/mt_bench/main.py | 73 ++-- extended_tasks/mt_bench/model_adapter.py | 406 ------------------ extended_tasks/tiny_benchmarks/main.py | 2 +- src/lighteval/data.py | 2 +- src/lighteval/evaluator.py | 6 +- src/lighteval/logging/info_loggers.py | 10 +- src/lighteval/metrics/__init__.py | 1 + src/lighteval/metrics/imports/bert_scorer.py | 10 +- .../metrics/imports/data_stats_utils.py | 2 +- src/lighteval/metrics/imports/summac.py | 2 +- src/lighteval/metrics/normalizations.py | 8 +- src/lighteval/models/base_model.py | 24 +- src/lighteval/models/model_output.py | 1 + src/lighteval/models/nanotron_model.py | 10 +- src/lighteval/models/tgi_model.py | 2 +- src/lighteval/tasks/lighteval_task.py | 4 +- src/lighteval/tasks/requests.py | 2 + .../tasks/tasks_prompt_formatting.py | 2 +- 19 files changed, 165 insertions(+), 624 deletions(-) delete mode 100644 extended_tasks/mt_bench/model_adapter.py diff --git a/extended_tasks/mt_bench/judges.py b/extended_tasks/mt_bench/judges.py index c2a998997..e3620670a 100644 --- a/extended_tasks/mt_bench/judges.py +++ b/extended_tasks/mt_bench/judges.py @@ -1,173 +1,97 @@ import ast -import copy -import dataclasses import json -import os import re -import time -import 
openai +from openai import OpenAI -from extended_tasks.mt_bench.model_adapter import conv_templates - -openai.api_key = os.environ["OPENAI_API_KEY"] - - -# Extract scores from judgments -two_score_pattern = re.compile("\[\[(\d+\.?\d*),\s?(\d+\.?\d*)\]\]") -two_score_pattern_backup = re.compile("\[(\d+\.?\d*),\s?(\d+\.?\d*)\]") -one_score_pattern = re.compile("\[\[(\d+\.?\d*)\]\]") -one_score_pattern_backup = re.compile("\[(\d+\.?\d*)\]") - -OPENAI_MODEL_LIST = ( - "gpt-3.5-turbo", - "gpt-3.5-turbo-0301", - "gpt-3.5-turbo-0613", - "gpt-3.5-turbo-1106", - "gpt-3.5-turbo-0125", - "gpt-4", - "gpt-4-0314", - "gpt-4-0613", - "gpt-4-turbo", - "gpt-4-1106-preview", - "gpt-4-0125-preview", -) - -# API setting constants -API_MAX_RETRY = 16 -API_RETRY_SLEEP = 10 -API_ERROR_OUTPUT = "$ERROR$" - -# Categories that need reference answers -NEED_REF_CATS = ["math", "reasoning", "coding", "arena-hard-200"] - -@dataclasses.dataclass class Judge: - model_name: str - prompt_template: dict - ref_based: bool = False - multi_turn: bool = False + def evaluate_answer(answer, question, reference) -> tuple[str, list[dict[str, str]], str]: + pass -@dataclasses.dataclass -class MatchSingle: - question: dict - model: str - answer: dict - judge: Judge - ref_answer: dict = None - multi_turn: bool = False -def make_judge_single(judge_model, judge_prompts): - judges = {} - judges["default"] = Judge(judge_model, judge_prompts["single-v1"]) - judges["math"] = Judge(judge_model, judge_prompts["single-math-v1"], ref_based=True) - judges["default-mt"] = Judge( - judge_model, judge_prompts["single-v1-multi-turn"], multi_turn=True - ) - judges["math-mt"] = Judge( - judge_model, - judge_prompts["single-math-v1-multi-turn"], - ref_based=True, - multi_turn=True, - ) - return judges +class Judge_OpenAI(Judge): + def __init__(self, model: str, seed: int, temperature: float, templates_path: str): + self.client = OpenAI() + self.model = model + self.seed = seed + self.temperature = temperature + data = [] + with open(templates_path, "r") as f: + for line in f: + tmp = json.loads(line) + data.append(tmp) -def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None): - if api_dict is not None: - openai.api_base = api_dict["api_base"] - openai.api_key = api_dict["api_key"] - output = API_ERROR_OUTPUT - for _ in range(API_MAX_RETRY): - try: - messages = conv.to_openai_api_messages() - response = openai.ChatCompletion.create( - model=model, - messages=messages, - n=1, - temperature=temperature, - max_tokens=max_tokens, - ) - output = response["choices"][0]["message"]["content"] - break - except openai.error.OpenAIError as e: - print(type(e), e) - time.sleep(API_RETRY_SLEEP) - - return output - + self.templates = {d["name"]: d for d in data} -def load_judge_prompts(prompt_file: str): - """Load judge prompts. 
- - The return value is a python dict of type: - Dict[judge_name: str -> dict] - """ - prompts = {} - with open(prompt_file) as fin: - for line in fin: - line = json.loads(line) - prompts[line["name"]] = line - return prompts + self.one_score_pattern = re.compile(r"\[\[(\d+\.?\d*)\]\]") + self.one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]") + def evaluate_answer(self, questions, answers, references, single_turn: bool): + if single_turn: + score, messages, answer = self.__single_turn_evaluate( + questions[0], answers[0], references[0] if len(references) > 0 else None + ) + else: + score, messages, answer = self.__multi_turn_evaluate(questions, answers, references) + return score, messages, answer -def run_judge_single(question, answer, judge, ref_answer, multi_turn=False): - kwargs = {} - model = judge.model_name - if ref_answer is not None and len(ref_answer) > 0: - kwargs["ref_answer_1"] = ref_answer[0] - if multi_turn: - kwargs["ref_answer_2"] = ref_answer[1] + def __single_turn_evaluate(self, question, answer, reference): + if reference is None or len(reference) == 0: + system_prompt = {"role": "system", "content": self.templates["single-v1"]["system_prompt"]} + user_prompt_str = self.templates["single-v1"]["prompt_template"].format(question=question, answer=answer) + else: + system_prompt = {"role": "system", "content": self.templates["single-math-v1"]["system_prompt"]} + user_prompt_str = self.templates["single-math-v1"]["prompt_template"].format( + question=question, answer=answer, ref_answer_1=reference + ) - if multi_turn: - user_prompt = judge.prompt_template["prompt_template"].format( - question_1=question[0], - question_2=question[1], - answer_1=answer[0], - answer_2=answer[1], - **kwargs, + user_prompt = {"role": "user", "content": user_prompt_str} + messages = [system_prompt, user_prompt] + response = self.client.chat.completions.create( + model=self.model, + seed=self.seed, + temperature=self.temperature, + messages=messages, ) - else: - user_prompt = judge.prompt_template["prompt_template"].format( - question=question[0], - answer=answer[0], - **kwargs, + judgment = response.choices[0].message.content + return self.__process_judge_response(judgment), messages, judgment + + def __multi_turn_evaluate(self, questions, answers, references): + if references is None or len(references) == 0: + system_prompt = {"role": "system", "content": self.templates["single-v1-multi-turn"]["system_prompt"]} + user_prompt_str = self.templates["single-v1-multi-turn"]["prompt_template"].format( + question_1=questions[0], answer_1=answers[0], question_2=questions[1], answer_2=answers[1] + ) + else: + system_prompt = {"role": "system", "content": self.templates["single-math-v1-multi-turn"]["system_prompt"]} + user_prompt_str = self.templates["single-math-v1-multi-turn"]["prompt_template"].format( + question_1=questions[0], + answer_1=answers[0], + ref_answer_1=references[0], + question_2=questions[1], + answer_2=answers[1], + ref_answer_2=references[1], + ) + user_prompt = {"role": "user", "content": user_prompt_str} + messages = [system_prompt, user_prompt] + response = self.client.chat.completions.create( + model=self.model, + seed=self.seed, + temperature=self.temperature, + messages=messages, ) + judgment = response.choices[0].message.content + return self.__process_judge_response(judgment), messages, judgment - rating = -1 - - system_prompt = judge.prompt_template["system_prompt"] - conv = copy.deepcopy(conv_templates["chatgpt"]) - conv.set_system_message(system_prompt) - 
conv.append_message(conv.roles[0], user_prompt) - conv.append_message(conv.roles[1], None) - - if model in OPENAI_MODEL_LIST: - judgment = chat_completion_openai(model, conv, temperature=0, max_tokens=2048) - else: - raise ValueError(f"Invalid judge model name: {model}") - - if judge.prompt_template["output_format"] == "[[rating]]": - match = re.search(one_score_pattern, judgment) + def __process_judge_response(self, judgment: str) -> int: + match = re.search(self.one_score_pattern, judgment) if not match: - match = re.search(one_score_pattern_backup, judgment) - + match = re.search(self.one_score_pattern_backup, judgment) if match: rating = ast.literal_eval(match.groups()[0]) else: rating = -1 - else: - raise ValueError( - f"invalid output format: {judge.prompt_template['output_format']}" - ) - return rating, user_prompt, judgment - - -def play_a_match_single(question, answer, ref_answer, judge, multi_turn, output_file: str): - if judge.prompt_template["type"] == "single": - score, user_prompt, judgment = run_judge_single( - question, answer, judge, ref_answer, multi_turn=multi_turn - ) - return score, user_prompt, judgment + return rating diff --git a/extended_tasks/mt_bench/main.py b/extended_tasks/mt_bench/main.py index f44e47990..15f73ebe5 100644 --- a/extended_tasks/mt_bench/main.py +++ b/extended_tasks/mt_bench/main.py @@ -11,11 +11,7 @@ from aenum import extend_enum from transformers import AutoModelForCausalLM, AutoTokenizer -from extended_tasks.mt_bench.judges import ( - load_judge_prompts, - make_judge_single, - play_a_match_single, -) +from extended_tasks.mt_bench.judges import Judge_OpenAI from lighteval.metrics import Metrics from lighteval.metrics.utils import MetricCategory, MetricUseCase, SampleLevelMetric, SampleLevelMetricGrouping from lighteval.tasks.lighteval_task import LightevalTaskConfig @@ -23,9 +19,7 @@ from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES -NEED_REF_CATS = ["math", "reasoning", "coding", "arena-hard-200"] - -## EVAL WITH NO SUBSET ## +# EVAL WITH NO SUBSET ## # This is how you create a simple tasks (like hellaswag) which has one single subset # attached to it, and one evaluation possible. task = LightevalTaskConfig( @@ -44,7 +38,7 @@ ) -## DEFINE YOUR PROMPT FUNCTIONS +# DEFINE YOUR PROMPT FUNCTIONS # Define as many as you need for your different tasks def prompt_fn(line, task_name: str = None): """Defines how to go from a dataset line to a doc object. @@ -57,39 +51,44 @@ def prompt_fn(line, task_name: str = None): choices=None, instruction=None, gold_index=[], - specific={"reference": line["reference"], "category": line["category"], "multi_turn_queries": line["turns"], "id": line["question_id"]}, + specific={ + "reference": line["reference"], + "category": line["category"], + "multi_turn_queries": line["turns"], + "id": line["question_id"], + }, ) - - def mt_bench_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]: """Defines how to go from a list of predictions to a score. Follow examples in src/lighteval/metrics/metrics.py, or get more info about what this function should do in the README. 
""" - judge_model = "gpt-3.5-turbo" - judge_file = "extended_tasks/mt_bench/judge_prompts.jsonl" - judge_prompts = load_judge_prompts(judge_file) - judges = make_judge_single(judge_model, judge_prompts) - - question = formatted_doc.specific["multi_turn_queries"] - ref_answer = formatted_doc.specific["reference"] - category = formatted_doc.specific["category"] - - if category not in NEED_REF_CATS: - score, user_prompt_1, judgement_1 = play_a_match_single(question, predictions, ref_answer, judges["default"], multi_turn=False, output_file=None) - score_mt, user_prompt_2, judgement_2 = play_a_match_single(question, predictions, ref_answer, judges["default-mt"], multi_turn=True, output_file=None) - else: - try: - score, user_prompt_1, judgement_1 = play_a_match_single(question, predictions, ref_answer, judges["math"], multi_turn=False, output_file=None) - score_mt, user_prompt_2, judgement_2 = play_a_match_single(question, predictions, ref_answer, judges["math-mt"], multi_turn=True, output_file=None) - except KeyError: - print(f"Category {category} not found in judge prompts, using default judge") - score, user_prompt_1, judgement_1 = play_a_match_single(question, predictions, ref_answer, judges["default"], multi_turn=False, output_file=None) - score_mt, user_prompt_2, judgement_2 = play_a_match_single(question, predictions, ref_answer, judges["default-mt"], multi_turn=True, output_file=None) - - return {"single_turn": score, "multi_turn": score_mt, "user_prompt": [user_prompt_1, user_prompt_2], "judgement": [judgement_1, judgement_2]} + + judge = Judge_OpenAI( + model="gpt-3.5-turbo", + seed=42, + temperature=0.0, + templates_path="extended_tasks/mt_bench/judge_prompts.jsonl", + ) + + questions = formatted_doc.specific["multi_turn_queries"] + ref_answers = formatted_doc.specific["reference"] + + score, messages, judgement = judge.evaluate_answer(questions, predictions, ref_answers, single_turn=True) + score_mt, messages_mt, judgement_mt = judge.evaluate_answer(questions, predictions, ref_answers, single_turn=False) + + pprint(score) + pprint(messages) + pprint(judgement) + + return { + "single_turn": score, + "multi_turn": score_mt, + "user_prompt": [messages, messages_mt], + "judgement": [judgement, judgement_mt], + } mt_bench_metric = SampleLevelMetricGrouping( @@ -101,13 +100,13 @@ def mt_bench_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> dic corpus_level_fn={ "single_turn": np.mean, "multi_turn": np.mean, - } + }, ) -## STORE YOUR EVALS +# STORE YOUR EVALS _TASKS = [task] -## MODULE LOGIC +# MODULE LOGIC # You should not need to touch this # Convert to dict for lighteval TASKS_TABLE = [task.as_dict() for task in _TASKS] diff --git a/extended_tasks/mt_bench/model_adapter.py b/extended_tasks/mt_bench/model_adapter.py deleted file mode 100644 index e653efb6e..000000000 --- a/extended_tasks/mt_bench/model_adapter.py +++ /dev/null @@ -1,406 +0,0 @@ -import base64 -import dataclasses -import math -import os -import re -import sys -import warnings -from dataclasses import field -from enum import IntEnum, auto -from io import BytesIO -from typing import Dict, List, Optional, Tuple, Union - - -IMAGE_PLACEHOLDER_STR = "$$$$" - - -class SeparatorStyle(IntEnum): - """Separator styles.""" - - ADD_COLON_SINGLE = auto() - ADD_COLON_TWO = auto() - ADD_COLON_SPACE_SINGLE = auto() - NO_COLON_SINGLE = auto() - NO_COLON_TWO = auto() - ADD_NEW_LINE_SINGLE = auto() - LLAMA2 = auto() - CHATGLM = auto() - CHATML = auto() - CHATINTERN = auto() - DOLLY = auto() - RWKV = auto() - PHOENIX = auto() 
- ROBIN = auto() - FALCON_CHAT = auto() - CHATGLM3 = auto() - DEEPSEEK_CHAT = auto() - METAMATH = auto() - YUAN2 = auto() - -@dataclasses.dataclass -class Conversation: - """A class that manages prompt templates and keeps all conversation history.""" - - # The name of this template - name: str - # The template of the system prompt - system_template: str = "{system_message}" - # The system message - system_message: str = "" - # The names of two roles - roles: Tuple[str] = ("USER", "ASSISTANT") - # All messages. Each item is (role, message). - # Each message is either a string or a tuple of (string, List[image_url]). - messages: List[List[str]] = field(default_factory=list) - # The number of few shot examples - offset: int = 0 - # The separator style and configurations - sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE - sep: str = "\n" - sep2: str = None - # Stop criteria (the default one is EOS token) - stop_str: Union[str, List[str]] = None - # Stops generation if meeting any token in this list - stop_token_ids: List[int] = None - - def get_prompt(self) -> str: - """Get the prompt for generation.""" - system_prompt = self.system_template.format(system_message=self.system_message) - if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE: - ret = system_prompt + self.sep - for role, message in self.messages: - if message: - ret += role + ": " + message + self.sep - else: - ret += role + ":" - return ret - elif self.sep_style == SeparatorStyle.ADD_COLON_TWO: - seps = [self.sep, self.sep2] - ret = system_prompt + seps[0] - for i, (role, message) in enumerate(self.messages): - if message: - if type(message) is tuple: - message, images = message - message = IMAGE_PLACEHOLDER_STR * len(images) + message - ret += role + ": " + message + seps[i % 2] - else: - ret += role + ":" - return ret - elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE: - ret = system_prompt + self.sep - for role, message in self.messages: - if message: - ret += role + ": " + message + self.sep - else: - ret += role + ": " # must be end with a space - return ret - elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE: - ret = "" if system_prompt == "" else system_prompt + self.sep - for role, message in self.messages: - if message: - ret += role + "\n" + message + self.sep - else: - ret += role + "\n" - return ret - elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE: - ret = system_prompt - for role, message in self.messages: - if message: - ret += role + message + self.sep - else: - ret += role - return ret - elif self.sep_style == SeparatorStyle.NO_COLON_TWO: - seps = [self.sep, self.sep2] - ret = system_prompt - for i, (role, message) in enumerate(self.messages): - if message: - ret += role + message + seps[i % 2] - else: - ret += role - return ret - elif self.sep_style == SeparatorStyle.RWKV: - ret = system_prompt - for i, (role, message) in enumerate(self.messages): - if message: - ret += ( - role - + ": " - + message.replace("\r\n", "\n").replace("\n\n", "\n") - ) - ret += "\n\n" - else: - ret += role + ":" - return ret - elif self.sep_style == SeparatorStyle.LLAMA2: - seps = [self.sep, self.sep2] - if self.system_message: - ret = system_prompt - else: - ret = "[INST] " - for i, (role, message) in enumerate(self.messages): - tag = self.roles[i % 2] - if message: - if i == 0: - ret += message + " " - else: - ret += tag + " " + message + seps[i % 2] - else: - ret += tag - return ret - elif self.sep_style == SeparatorStyle.CHATGLM: - # source: 
https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308 - # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926 - round_add_n = 1 if self.name == "chatglm2" else 0 - if system_prompt: - ret = system_prompt + self.sep - else: - ret = "" - - for i, (role, message) in enumerate(self.messages): - if i % 2 == 0: - ret += f"[Round {i//2 + round_add_n}]{self.sep}" - - if message: - ret += f"{role}:{message}{self.sep}" - else: - ret += f"{role}:" - return ret - elif self.sep_style == SeparatorStyle.CHATML: - ret = "" if system_prompt == "" else system_prompt + self.sep + "\n" - for role, message in self.messages: - if message: - if type(message) is tuple: - message, images = message - message = IMAGE_PLACEHOLDER_STR * len(images) + message - ret += role + "\n" + message + self.sep + "\n" - else: - ret += role + "\n" - return ret - elif self.sep_style == SeparatorStyle.CHATGLM3: - ret = "" - if self.system_message: - ret += system_prompt - for role, message in self.messages: - if message: - ret += role + "\n" + message - else: - ret += role - return ret - elif self.sep_style == SeparatorStyle.CHATINTERN: - # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771 - seps = [self.sep, self.sep2] - ret = system_prompt - for i, (role, message) in enumerate(self.messages): - if i % 2 == 0: - ret += "" - if message: - ret += role + ":" + message + seps[i % 2] + "\n" - else: - ret += role + ":" - return ret - elif self.sep_style == SeparatorStyle.DOLLY: - seps = [self.sep, self.sep2] - ret = system_prompt - for i, (role, message) in enumerate(self.messages): - if message: - ret += role + ":\n" + message + seps[i % 2] - if i % 2 == 1: - ret += "\n\n" - else: - ret += role + ":\n" - return ret - elif self.sep_style == SeparatorStyle.PHOENIX: - ret = system_prompt - for role, message in self.messages: - if message: - ret += role + ": " + "" + message + "" - else: - ret += role + ": " + "" - return ret - elif self.sep_style == SeparatorStyle.ROBIN: - ret = system_prompt + self.sep - for role, message in self.messages: - if message: - ret += role + ":\n" + message + self.sep - else: - ret += role + ":\n" - return ret - elif self.sep_style == SeparatorStyle.FALCON_CHAT: - ret = "" - if self.system_message: - ret += system_prompt + self.sep - for role, message in self.messages: - if message: - ret += role + ": " + message + self.sep - else: - ret += role + ":" - return ret - elif self.sep_style == SeparatorStyle.METAMATH: - ret = "" if system_prompt == "" else system_prompt + self.sep - for i, (role, message) in enumerate(self.messages): - # For MetaMath, sep2 is used to prefix the message. 
- starting_sep = ":\n" if i % 2 == 0 else ": " + self.sep2 - ending_sep = self.sep if i % 2 == 0 else "" - if message: - ret += role + starting_sep + message + ending_sep - else: - ret += role + starting_sep - return ret - elif self.sep_style == SeparatorStyle.DEEPSEEK_CHAT: - seps = [self.sep, self.sep2] - ret = system_prompt - for i, (role, message) in enumerate(self.messages): - if message: - ret += role + ": " + message + seps[i % 2] - else: - ret += role + ":" - return ret - elif self.sep_style == SeparatorStyle.YUAN2: - seps = [self.sep, self.sep2] - ret = "" - if self.system_message: - ret += system_prompt + seps[1] - for _, message in self.messages: - if message: - ret += message + "" - else: - ret += "" - ret = ret.rstrip("") + seps[0] - return ret - else: - raise ValueError(f"Invalid style: {self.sep_style}") - - def get_images(self): - images = [] - for i, (role, msg) in enumerate(self.messages[self.offset :]): - if i % 2 == 0: - if type(msg) is tuple: - for image in msg[1]: - images.append(image) - - return images - - def set_system_message(self, system_message: str): - """Set the system message.""" - self.system_message = system_message - - def append_message(self, role: str, message: str): - """Append a new message.""" - self.messages.append([role, message]) - - def update_last_message(self, message: str): - """Update the last output. - - The last message is typically set to be None when constructing the prompt, - so we need to update it in-place after getting the response from a model. - """ - self.messages[-1][1] = message - - def convert_image_to_base64(self, image): - """Given an image, return the base64 encoded image string.""" - import requests - from PIL import Image - - # Load image if it has not been loaded in yet - if type(image) == str: - if image.startswith("http://") or image.startswith("https://"): - response = requests.get(image) - image = Image.open(BytesIO(response.content)).convert("RGB") - elif "base64" in image: - # OpenAI format is: data:image/jpeg;base64,{base64_encoded_image_str} - return image.split(",")[1] - else: - image = Image.open(image).convert("RGB") - - max_hw, min_hw = max(image.size), min(image.size) - aspect_ratio = max_hw / min_hw - max_len, min_len = 2048, 2048 - shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw)) - longest_edge = int(shortest_edge * aspect_ratio) - W, H = image.size - if longest_edge != max(image.size): - if H > W: - H, W = longest_edge, shortest_edge - else: - H, W = shortest_edge, longest_edge - image = image.resize((W, H)) - - buffered = BytesIO() - image.save(buffered, format="PNG") - img_b64_str = base64.b64encode(buffered.getvalue()).decode() - - return img_b64_str - - def to_gradio_chatbot(self): - """Convert the conversation to gradio chatbot format.""" - ret = [] - for i, (role, msg) in enumerate(self.messages[self.offset :]): - if i % 2 == 0: - if type(msg) is tuple: - msg, image = msg - img_b64_str = image[0] # Only one image on gradio at one time - img_str = f'user upload image' - msg = img_str + msg.replace("\n", "").strip() - - ret.append([msg, None]) - else: - ret[-1][-1] = msg - return ret - - def to_openai_api_messages(self): - """Convert the conversation to OpenAI chat completion format.""" - if self.system_message == "": - ret = [] - else: - ret = [{"role": "system", "content": self.system_message}] - - for i, (_, msg) in enumerate(self.messages[self.offset :]): - if i % 2 == 0: - ret.append({"role": "user", "content": msg}) - else: - if msg is not None: - ret.append({"role": "assistant", 
"content": msg}) - return ret - - def extract_text_from_messages(self): - return [ - (role, message[0]) if type(message) is tuple else (role, message) - for role, message in self.messages - ] - - def copy(self): - return Conversation( - name=self.name, - system_template=self.system_template, - system_message=self.system_message, - roles=self.roles, - messages=[[x, y] for x, y in self.messages], - offset=self.offset, - sep_style=self.sep_style, - sep=self.sep, - sep2=self.sep2, - stop_str=self.stop_str, - stop_token_ids=self.stop_token_ids, - ) - - def dict(self): - return { - "template_name": self.name, - "system_message": self.system_message, - "roles": self.roles, - "messages": self.extract_text_from_messages(), - "offset": self.offset, - } - - -# A global registry for all conversation templates -conv_templates: Dict[str, Conversation] = { - "chatgpt": Conversation( - name="chatgpt", - system_message="You are a helpful assistant.", - roles=("user", "assistant"), - sep_style=None, - sep=None, - ) -} diff --git a/extended_tasks/tiny_benchmarks/main.py b/extended_tasks/tiny_benchmarks/main.py index 079ec706b..dddf936c6 100644 --- a/extended_tasks/tiny_benchmarks/main.py +++ b/extended_tasks/tiny_benchmarks/main.py @@ -139,7 +139,7 @@ def aggregate(self, y_input): ind_scenario = ( self.number_of_examples * ([i for i, s in enumerate(scenarios_position.keys()) if s == self.task][0]) ) - seen_examples = seen_examples[ind_scenario : ind_scenario + self.number_of_examples] + seen_examples = seen_examples[ind_scenario: ind_scenario + self.number_of_examples] else: scenarios = list(scenarios_position.keys()) diff --git a/src/lighteval/data.py b/src/lighteval/data.py index 33437175a..9df3559c0 100644 --- a/src/lighteval/data.py +++ b/src/lighteval/data.py @@ -296,7 +296,7 @@ def __iter__(self) -> Iterator[T_co]: assert len(indices) == self.total_size # subsample - indices = indices[self.rank : self.total_size : self.num_replicas] + indices = indices[self.rank: self.total_size: self.num_replicas] assert len(indices) == self.num_samples return iter(indices) diff --git a/src/lighteval/evaluator.py b/src/lighteval/evaluator.py index 9449afef1..5070261cc 100644 --- a/src/lighteval/evaluator.py +++ b/src/lighteval/evaluator.py @@ -115,7 +115,7 @@ def evaluate( # noqa: C901 doc.instruction = "" # using a deep copy here because process results pops from the model responses - metrics = task.process_results(doc, copy.deepcopy(model_responses), evaluation_tracker=evaluation_tracker) + metrics = task.process_results(doc, copy.deepcopy(model_responses)) # Remove the user_prompt from the metrics in case of llm-as-judge metric if "user_prompt" in metrics: @@ -130,7 +130,9 @@ def evaluate( # noqa: C901 judgement = None evaluation_tracker.metrics_logger.log(task_example_id.task_name, metrics) - evaluation_tracker.details_logger.log(task_example_id.task_name, task, doc, model_responses, metrics, (user_prompt, judgement)) + evaluation_tracker.details_logger.log( + task_example_id.task_name, task, doc, model_responses, metrics, (user_prompt, judgement) + ) return evaluation_tracker diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index 046022601..f65359d5b 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -305,7 +305,15 @@ class CompiledHash: compiled_details: dict[str, CompiledDetail] = collections.defaultdict(CompiledDetail) compiled_details_over_all_tasks: CompiledDetailOverAllTasks = CompiledDetailOverAllTasks() - def 
log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[ModelReturn], metrics: dict, llm_as_prompt_judgement: tuple[str, str]) -> None: + def log( + self, + task_name: str, + task: LightevalTask, + doc: Doc, + outputs: list[ModelReturn], + metrics: dict, + llm_as_prompt_judgement: tuple[str, str], + ) -> None: """Stores the relevant information for one sample of one task to the total list of samples stored in the DetailsLogger. Args: diff --git a/src/lighteval/metrics/__init__.py b/src/lighteval/metrics/__init__.py index 3774fc273..3dfd0ca9a 100644 --- a/src/lighteval/metrics/__init__.py +++ b/src/lighteval/metrics/__init__.py @@ -147,6 +147,7 @@ def apply_multichoice_metric_one_token(results: list[ModelReturn], formatted_doc return results, outputs + def apply_generative_multi_turn_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[str]): outputs = {} predictions = results.pop(0).result diff --git a/src/lighteval/metrics/imports/bert_scorer.py b/src/lighteval/metrics/imports/bert_scorer.py index dd8c0ee84..cd6d6d86a 100644 --- a/src/lighteval/metrics/imports/bert_scorer.py +++ b/src/lighteval/metrics/imports/bert_scorer.py @@ -131,8 +131,8 @@ def get_bert_embedding( for i in range(0, len(all_sens), batch_size): batch_embedding = bert_encode( model, - padded_sens[i : i + batch_size], - attention_mask=mask[i : i + batch_size], + padded_sens[i: i + batch_size], + attention_mask=mask[i: i + batch_size], all_layers=all_layers, ) embeddings.append(batch_embedding) @@ -269,7 +269,7 @@ def dedup_and_sort(l_item): iter_range = range(0, len(sentences), batch_size) stats_dict = {} for batch_start in iter_range: - sen_batch = sentences[batch_start : batch_start + batch_size] + sen_batch = sentences[batch_start: batch_start + batch_size] embs, masks, padded_idf = get_bert_embedding( sen_batch, model, tokenizer, idf_dict, device=device, all_layers=all_layers ) @@ -305,8 +305,8 @@ def length_to_mask(lens): with torch.no_grad(): for batch_start in iter_range: - batch_refs = refs[batch_start : batch_start + batch_size] - batch_hyps = hyps[batch_start : batch_start + batch_size] + batch_refs = refs[batch_start: batch_start + batch_size] + batch_hyps = hyps[batch_start: batch_start + batch_size] ref_stats = pad_batch_stats(batch_refs, stats_dict, device) hyp_stats = pad_batch_stats(batch_hyps, stats_dict, device) diff --git a/src/lighteval/metrics/imports/data_stats_utils.py b/src/lighteval/metrics/imports/data_stats_utils.py index 708edee42..77e279a81 100644 --- a/src/lighteval/metrics/imports/data_stats_utils.py +++ b/src/lighteval/metrics/imports/data_stats_utils.py @@ -80,7 +80,7 @@ def strings(self, min_length=0, summary_base=True): # Generate strings, filtering out strings below the minimum length. - strings = [base[i : i + length] for i, j, length in self.overlaps() if length > min_length] + strings = [base[i: i + length] for i, j, length in self.overlaps() if length > min_length] # By default, we just return the tokenization being used. # But if they user wants a raw string, then we convert. 
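Note on the score parsing shared by both versions of the judge touched in this patch: the judge model is prompted to end its verdict with a bracketed score ("[[8]]", with a bare "[8]" fallback), and both the removed run_judge_single and the new Judge_OpenAI.__process_judge_response pull the number out with the two regexes shown in the diff. A minimal, self-contained sketch of that step follows; the sample judgment strings are invented for illustration and are not part of the patch.

import ast
import re

# Same patterns as in the diff: "[[N]]" is the expected format, "[N]" is the fallback.
one_score_pattern = re.compile(r"\[\[(\d+\.?\d*)\]\]")
one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]")


def extract_rating(judgment: str) -> float:
    match = re.search(one_score_pattern, judgment) or re.search(one_score_pattern_backup, judgment)
    # literal_eval turns "8" into 8 and "7.5" into 7.5; -1 flags an unparsable judgment
    return ast.literal_eval(match.groups()[0]) if match else -1


print(extract_rating("The response is helpful and accurate. Rating: [[8]]"))  # 8
print(extract_rating("Score: [7.5]"))                                         # 7.5
print(extract_rating("No bracketed score at all"))                            # -1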
diff --git a/src/lighteval/metrics/imports/summac.py b/src/lighteval/metrics/imports/summac.py index 2803ba59e..d5fa99a77 100644 --- a/src/lighteval/metrics/imports/summac.py +++ b/src/lighteval/metrics/imports/summac.py @@ -146,7 +146,7 @@ def split_sentences(self, text): def split_2sents(self, text): sentences = nltk.tokenize.sent_tokenize(text) sentences = [sent for sent in sentences if len(sent) > 10] - two_sents = [" ".join(sentences[i : (i + 2)]) for i in range(len(sentences))] + two_sents = [" ".join(sentences[i: (i + 2)]) for i in range(len(sentences))] return two_sents def split_paragraphs(self, text): diff --git a/src/lighteval/metrics/normalizations.py b/src/lighteval/metrics/normalizations.py index 2bf180007..12c6357d2 100644 --- a/src/lighteval/metrics/normalizations.py +++ b/src/lighteval/metrics/normalizations.py @@ -98,14 +98,14 @@ def _remove_boxed(text: str) -> str: if "\\boxed " in text: left = "\\boxed " assert text[: len(left)] == left - return text[len(left) :] + return text[len(left):] left = "\\boxed{" assert text[: len(left)] == left assert text[-1] == "}" - return text[len(left) : -1] + return text[len(left): -1] def _last_boxed_only_string(text: str) -> str | None: """Extract the last \\boxed{...} or \\fbox{...} element from a string.""" @@ -131,7 +131,7 @@ def _last_boxed_only_string(text: str) -> str | None: if right_brace_idx is None: retval = None else: - retval = text[idx : right_brace_idx + 1] + retval = text[idx: right_brace_idx + 1] return retval @@ -222,7 +222,7 @@ def _fix_sqrt(text: str) -> str: else: indices = [pos for pos, char in enumerate(text) if char == "$"] if len(indices) > 1: - text = text[indices[0] + 1 : indices[-1]] + text = text[indices[0] + 1: indices[-1]] to_replace_1 = [ ("\n", ""), # linebreaks diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index 81b4a8341..83e1581f0 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -352,7 +352,9 @@ def greedy_until_with_logits( override_bs=override_bs, ) - def greedy_until_multi_turn(self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None) -> GenerateMultiTurnReturn: + def greedy_until_multi_turn( + self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None + ) -> GenerateMultiTurnReturn: for request in requests: request.stop_sequence = as_list(request.stop_sequence) + [self.tokenizer.eos_token] request.tokenized_context = self.tok_encode(request.context) @@ -433,7 +435,15 @@ def greedy_until_multi_turn(self, requests: list[GreedyUntilMultiTurnRequest], o model_answers.append(cur_reponses[0].result) - results.append(GenerateMultiTurnReturn(result=model_answers, input_tokens=[], generated_tokens=[], truncated_tokens_count=0, padded_tokens_count=0)) + results.append( + GenerateMultiTurnReturn( + result=model_answers, + input_tokens=[], + generated_tokens=[], + truncated_tokens_count=0, + padded_tokens_count=0, + ) + ) return results @@ -581,7 +591,7 @@ def _generate( ) if returns_logits: logits = self.model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=True) - generations = outputs.sequences[:, batch.input_ids.size(1) :] + generations = outputs.sequences[:, batch.input_ids.size(1):] generations, len_gens = self.pad_and_gather(generations) batch.input_ids, len_ids = self.pad_and_gather(batch.input_ids) @@ -685,7 +695,7 @@ def _loglikelihood_tokens( max_context_continuation_size_allowed = len(context_enc + continuation_enc) else: # in normal 
mode, we left cut the context if needed max_context_continuation_size_allowed = len( - (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1] + (context_enc + continuation_enc)[-(self.max_length + 1):][:-1] ) batch_size = self._get_batch_size( @@ -722,7 +732,7 @@ def _loglikelihood_tokens( cont_toks = cont_toks[:inplen].unsqueeze(0).to(self.device) # [1, seq] else: cur_logits = ( - cur_logits[inplen - contlen : inplen].unsqueeze(0).to(self.device) + cur_logits[inplen - contlen: inplen].unsqueeze(0).to(self.device) ) # [1, seq, voc] cont_toks = cont_toks.unsqueeze(0).to(self.device) # [1, seq] @@ -902,7 +912,7 @@ def _loglikelihood_single_token( for split_start, split_end in tqdm(dataset.splits_start_end_iterator()): context_enc = dataset[0].tokenized_context - max_context = len(context_enc[-self.max_length :]) + max_context = len(context_enc[-self.max_length:]) batch_size = self._get_batch_size(override_bs=override_bs, max_input_length=max_context) starting_batch_size = batch_size * 2 @@ -997,7 +1007,7 @@ def __init__( def __call__(self, input_ids, scores, **kwargs) -> bool: # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence - lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :][:, -self.sequence_id_len :] + lookback_ids_batch = input_ids[:, self.initial_decoder_input_length:][:, -self.sequence_id_len:] lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch) diff --git a/src/lighteval/models/model_output.py b/src/lighteval/models/model_output.py index e36b92432..510278585 100644 --- a/src/lighteval/models/model_output.py +++ b/src/lighteval/models/model_output.py @@ -65,6 +65,7 @@ class GenerateReturn(ModelReturn): def get_result_for_eval(self): return self.result if self.logits is None else (self.result, self.logits) + @dataclass class GenerateMultiTurnReturn(ModelReturn): result: list[str] = field(default_factory=list) diff --git a/src/lighteval/models/nanotron_model.py b/src/lighteval/models/nanotron_model.py index 89c1b75c9..7189e4d11 100644 --- a/src/lighteval/models/nanotron_model.py +++ b/src/lighteval/models/nanotron_model.py @@ -693,7 +693,7 @@ def _loglikelihood_single_token( # automatic (variable) batch size detection for vectorization # pull longest context sample from request context_enc = dataset[0].tokenized_context - max_context = len(context_enc[-self.max_length :]) + max_context = len(context_enc[-self.max_length:]) batch_size = self._get_batch_size( override_bs=override_bs, max_input_length=max_context, starting_batch_size=starting_batch_size ) @@ -925,7 +925,7 @@ def _loglikelihood_tokens( context_enc = dataset[0].tokenized_context continuation_enc = dataset[0].tokenized_continuation - max_context = len((context_enc + continuation_enc)[-(self.max_length + 1) :][:-1]) + max_context = len((context_enc + continuation_enc)[-(self.max_length + 1):][:-1]) batch_size = self._get_batch_size( override_bs=override_bs, max_input_length=max_context, starting_batch_size=starting_batch_size @@ -1011,7 +1011,7 @@ def _loglikelihood_tokens( # f"top_tokens: {top_toks_str}\ncont_tokens: {cont_toks_str}") cur_logits = ( - cur_logits[inplen - contlen : inplen].unsqueeze(0).to(self.device) + cur_logits[inplen - contlen: inplen].unsqueeze(0).to(self.device) ) # [1, seq, voc] cont_toks = cont_toks.unsqueeze(0).to(self.device) # [1, seq] @@ -1233,7 +1233,7 @@ def greedy_until( dist.barrier() # Got everyone to send their stuff outputs = list(outputs) - generations = 
torch.stack([o.generation_ids[o.input_ids.shape[0] :] for o in outputs]) + generations = torch.stack([o.generation_ids[o.input_ids.shape[0]:] for o in outputs]) batch_input_ids, len_ids = self.pad_and_gather(batch_model.input_ids) batch_generations, _ = self.pad_and_gather(generations) @@ -1336,7 +1336,7 @@ def __init__( def __call__(self, input_ids, scores, **kwargs) -> bool: # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence - lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :][:, -self.sequence_id_len :] + lookback_ids_batch = input_ids[:, self.initial_decoder_input_length:][:, -self.sequence_id_len:] lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch) diff --git a/src/lighteval/models/tgi_model.py b/src/lighteval/models/tgi_model.py index 5d519667b..694bb398b 100644 --- a/src/lighteval/models/tgi_model.py +++ b/src/lighteval/models/tgi_model.py @@ -42,7 +42,7 @@ def divide_chunks(array, n): # looping till length array for i in range(0, len(array), n): - yield array[i : i + n] + yield array[i: i + n] class ModelClient: diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 46679e948..c4858a574 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -258,7 +258,7 @@ def doc_to_text_without_instructions(self, doc: Doc) -> str: if doc.instruction is not None: if not doc.query.startswith(doc.instruction): raise ValueError(f"Prompt query {doc.query} is not starting with instruction {doc.instruction}") - return doc.query[len(doc.instruction) :] + return doc.query[len(doc.instruction):] return doc.query def doc_to_text_and_instructions(self, doc: Doc) -> Tuple[str, str]: @@ -277,7 +277,7 @@ def doc_to_text_and_instructions(self, doc: Doc) -> Tuple[str, str]: if doc.instruction is not None: if not doc.query.startswith(doc.instruction): raise ValueError(f"Prompt query {doc.query} is not starting with instruction {doc.instruction}") - return (doc.query[len(doc.instruction) :], doc.instruction) + return (doc.query[len(doc.instruction):], doc.instruction) return (doc.query, "") def get_first_possible_fewshot_splits(self, number_of_splits: int = 1) -> list[str]: diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py index 8fb355e1e..c4c863359 100644 --- a/src/lighteval/tasks/requests.py +++ b/src/lighteval/tasks/requests.py @@ -120,6 +120,7 @@ class GreedyUntilRequest(Request): request_type = RequestType.GREEDY_UNTIL tokenized_context: list[int] = None + @dataclass class GreedyUntilMultiTurnRequest(Request): """ @@ -130,6 +131,7 @@ class GreedyUntilMultiTurnRequest(Request): generation_size (int): The maximum number of tokens to generate. request_type (RequestType): The type of the request, set to RequestType.GREEDY_UNTIL. 
""" + stop_sequence: str generation_size: int request_type = RequestType.GREEDY_UNTIL_MULTI_TURN diff --git a/src/lighteval/tasks/tasks_prompt_formatting.py b/src/lighteval/tasks/tasks_prompt_formatting.py index 08f088b29..3cf565ec1 100644 --- a/src/lighteval/tasks/tasks_prompt_formatting.py +++ b/src/lighteval/tasks/tasks_prompt_formatting.py @@ -2452,7 +2452,7 @@ def normalize(doc, option): return option.replace(pronoun, pronoun.lower()) return option - context, eos = line["text"][: line["pronoun_loc"]], line["text"][line["pronoun_loc"] + len(line["pronoun"]) :] + context, eos = line["text"][: line["pronoun_loc"]], line["text"][line["pronoun_loc"] + len(line["pronoun"]):] return Doc( task_name=task_name, From c296b63f6d3fba8b70ce17c5714eafea47828bc1 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 21 Mar 2024 11:21:11 +0000 Subject: [PATCH 25/45] Revert "commit" This reverts commit c80ef8cdb83623a86086e247fba38a3cf573173c. --- extended_tasks/mt_bench/judges.py | 222 ++++++---- extended_tasks/mt_bench/main.py | 73 ++-- extended_tasks/mt_bench/model_adapter.py | 406 ++++++++++++++++++ extended_tasks/tiny_benchmarks/main.py | 2 +- src/lighteval/data.py | 2 +- src/lighteval/evaluator.py | 6 +- src/lighteval/logging/info_loggers.py | 10 +- src/lighteval/metrics/__init__.py | 1 - src/lighteval/metrics/imports/bert_scorer.py | 10 +- .../metrics/imports/data_stats_utils.py | 2 +- src/lighteval/metrics/imports/summac.py | 2 +- src/lighteval/metrics/normalizations.py | 8 +- src/lighteval/models/base_model.py | 24 +- src/lighteval/models/model_output.py | 1 - src/lighteval/models/nanotron_model.py | 10 +- src/lighteval/models/tgi_model.py | 2 +- src/lighteval/tasks/lighteval_task.py | 4 +- src/lighteval/tasks/requests.py | 2 - .../tasks/tasks_prompt_formatting.py | 2 +- 19 files changed, 624 insertions(+), 165 deletions(-) create mode 100644 extended_tasks/mt_bench/model_adapter.py diff --git a/extended_tasks/mt_bench/judges.py b/extended_tasks/mt_bench/judges.py index e3620670a..c2a998997 100644 --- a/extended_tasks/mt_bench/judges.py +++ b/extended_tasks/mt_bench/judges.py @@ -1,97 +1,173 @@ import ast +import copy +import dataclasses import json +import os import re +import time -from openai import OpenAI +import openai +from extended_tasks.mt_bench.model_adapter import conv_templates -class Judge: - def evaluate_answer(answer, question, reference) -> tuple[str, list[dict[str, str]], str]: - pass +openai.api_key = os.environ["OPENAI_API_KEY"] -class Judge_OpenAI(Judge): - def __init__(self, model: str, seed: int, temperature: float, templates_path: str): - self.client = OpenAI() - self.model = model - self.seed = seed - self.temperature = temperature - data = [] - with open(templates_path, "r") as f: - for line in f: - tmp = json.loads(line) - data.append(tmp) +# Extract scores from judgments +two_score_pattern = re.compile("\[\[(\d+\.?\d*),\s?(\d+\.?\d*)\]\]") +two_score_pattern_backup = re.compile("\[(\d+\.?\d*),\s?(\d+\.?\d*)\]") +one_score_pattern = re.compile("\[\[(\d+\.?\d*)\]\]") +one_score_pattern_backup = re.compile("\[(\d+\.?\d*)\]") - self.templates = {d["name"]: d for d in data} +OPENAI_MODEL_LIST = ( + "gpt-3.5-turbo", + "gpt-3.5-turbo-0301", + "gpt-3.5-turbo-0613", + "gpt-3.5-turbo-1106", + "gpt-3.5-turbo-0125", + "gpt-4", + "gpt-4-0314", + "gpt-4-0613", + "gpt-4-turbo", + "gpt-4-1106-preview", + "gpt-4-0125-preview", +) - self.one_score_pattern = re.compile(r"\[\[(\d+\.?\d*)\]\]") - self.one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]") +# API setting constants 
+API_MAX_RETRY = 16 +API_RETRY_SLEEP = 10 +API_ERROR_OUTPUT = "$ERROR$" - def evaluate_answer(self, questions, answers, references, single_turn: bool): - if single_turn: - score, messages, answer = self.__single_turn_evaluate( - questions[0], answers[0], references[0] if len(references) > 0 else None - ) - else: - score, messages, answer = self.__multi_turn_evaluate(questions, answers, references) - return score, messages, answer +# Categories that need reference answers +NEED_REF_CATS = ["math", "reasoning", "coding", "arena-hard-200"] - def __single_turn_evaluate(self, question, answer, reference): - if reference is None or len(reference) == 0: - system_prompt = {"role": "system", "content": self.templates["single-v1"]["system_prompt"]} - user_prompt_str = self.templates["single-v1"]["prompt_template"].format(question=question, answer=answer) - else: - system_prompt = {"role": "system", "content": self.templates["single-math-v1"]["system_prompt"]} - user_prompt_str = self.templates["single-math-v1"]["prompt_template"].format( - question=question, answer=answer, ref_answer_1=reference +@dataclasses.dataclass +class Judge: + model_name: str + prompt_template: dict + ref_based: bool = False + multi_turn: bool = False + +@dataclasses.dataclass +class MatchSingle: + question: dict + model: str + answer: dict + judge: Judge + ref_answer: dict = None + multi_turn: bool = False + +def make_judge_single(judge_model, judge_prompts): + judges = {} + judges["default"] = Judge(judge_model, judge_prompts["single-v1"]) + judges["math"] = Judge(judge_model, judge_prompts["single-math-v1"], ref_based=True) + judges["default-mt"] = Judge( + judge_model, judge_prompts["single-v1-multi-turn"], multi_turn=True + ) + judges["math-mt"] = Judge( + judge_model, + judge_prompts["single-math-v1-multi-turn"], + ref_based=True, + multi_turn=True, + ) + return judges + + +def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None): + if api_dict is not None: + openai.api_base = api_dict["api_base"] + openai.api_key = api_dict["api_key"] + output = API_ERROR_OUTPUT + for _ in range(API_MAX_RETRY): + try: + messages = conv.to_openai_api_messages() + response = openai.ChatCompletion.create( + model=model, + messages=messages, + n=1, + temperature=temperature, + max_tokens=max_tokens, ) + output = response["choices"][0]["message"]["content"] + break + except openai.error.OpenAIError as e: + print(type(e), e) + time.sleep(API_RETRY_SLEEP) + + return output + + +def load_judge_prompts(prompt_file: str): + """Load judge prompts. 
+ + The return value is a python dict of type: + Dict[judge_name: str -> dict] + """ + prompts = {} + with open(prompt_file) as fin: + for line in fin: + line = json.loads(line) + prompts[line["name"]] = line + return prompts - user_prompt = {"role": "user", "content": user_prompt_str} - messages = [system_prompt, user_prompt] - response = self.client.chat.completions.create( - model=self.model, - seed=self.seed, - temperature=self.temperature, - messages=messages, + +def run_judge_single(question, answer, judge, ref_answer, multi_turn=False): + kwargs = {} + model = judge.model_name + if ref_answer is not None and len(ref_answer) > 0: + kwargs["ref_answer_1"] = ref_answer[0] + if multi_turn: + kwargs["ref_answer_2"] = ref_answer[1] + + if multi_turn: + user_prompt = judge.prompt_template["prompt_template"].format( + question_1=question[0], + question_2=question[1], + answer_1=answer[0], + answer_2=answer[1], + **kwargs, ) - judgment = response.choices[0].message.content - return self.__process_judge_response(judgment), messages, judgment - - def __multi_turn_evaluate(self, questions, answers, references): - if references is None or len(references) == 0: - system_prompt = {"role": "system", "content": self.templates["single-v1-multi-turn"]["system_prompt"]} - user_prompt_str = self.templates["single-v1-multi-turn"]["prompt_template"].format( - question_1=questions[0], answer_1=answers[0], question_2=questions[1], answer_2=answers[1] - ) - else: - system_prompt = {"role": "system", "content": self.templates["single-math-v1-multi-turn"]["system_prompt"]} - user_prompt_str = self.templates["single-math-v1-multi-turn"]["prompt_template"].format( - question_1=questions[0], - answer_1=answers[0], - ref_answer_1=references[0], - question_2=questions[1], - answer_2=answers[1], - ref_answer_2=references[1], - ) - user_prompt = {"role": "user", "content": user_prompt_str} - messages = [system_prompt, user_prompt] - response = self.client.chat.completions.create( - model=self.model, - seed=self.seed, - temperature=self.temperature, - messages=messages, + else: + user_prompt = judge.prompt_template["prompt_template"].format( + question=question[0], + answer=answer[0], + **kwargs, ) - judgment = response.choices[0].message.content - return self.__process_judge_response(judgment), messages, judgment - def __process_judge_response(self, judgment: str) -> int: - match = re.search(self.one_score_pattern, judgment) + rating = -1 + + system_prompt = judge.prompt_template["system_prompt"] + conv = copy.deepcopy(conv_templates["chatgpt"]) + conv.set_system_message(system_prompt) + conv.append_message(conv.roles[0], user_prompt) + conv.append_message(conv.roles[1], None) + + if model in OPENAI_MODEL_LIST: + judgment = chat_completion_openai(model, conv, temperature=0, max_tokens=2048) + else: + raise ValueError(f"Invalid judge model name: {model}") + + if judge.prompt_template["output_format"] == "[[rating]]": + match = re.search(one_score_pattern, judgment) if not match: - match = re.search(self.one_score_pattern_backup, judgment) + match = re.search(one_score_pattern_backup, judgment) + if match: rating = ast.literal_eval(match.groups()[0]) else: rating = -1 + else: + raise ValueError( + f"invalid output format: {judge.prompt_template['output_format']}" + ) - return rating + return rating, user_prompt, judgment + + +def play_a_match_single(question, answer, ref_answer, judge, multi_turn, output_file: str): + if judge.prompt_template["type"] == "single": + score, user_prompt, judgment = run_judge_single( + 
question, answer, judge, ref_answer, multi_turn=multi_turn + ) + return score, user_prompt, judgment diff --git a/extended_tasks/mt_bench/main.py b/extended_tasks/mt_bench/main.py index 15f73ebe5..f44e47990 100644 --- a/extended_tasks/mt_bench/main.py +++ b/extended_tasks/mt_bench/main.py @@ -11,7 +11,11 @@ from aenum import extend_enum from transformers import AutoModelForCausalLM, AutoTokenizer -from extended_tasks.mt_bench.judges import Judge_OpenAI +from extended_tasks.mt_bench.judges import ( + load_judge_prompts, + make_judge_single, + play_a_match_single, +) from lighteval.metrics import Metrics from lighteval.metrics.utils import MetricCategory, MetricUseCase, SampleLevelMetric, SampleLevelMetricGrouping from lighteval.tasks.lighteval_task import LightevalTaskConfig @@ -19,7 +23,9 @@ from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES -# EVAL WITH NO SUBSET ## +NEED_REF_CATS = ["math", "reasoning", "coding", "arena-hard-200"] + +## EVAL WITH NO SUBSET ## # This is how you create a simple tasks (like hellaswag) which has one single subset # attached to it, and one evaluation possible. task = LightevalTaskConfig( @@ -38,7 +44,7 @@ ) -# DEFINE YOUR PROMPT FUNCTIONS +## DEFINE YOUR PROMPT FUNCTIONS # Define as many as you need for your different tasks def prompt_fn(line, task_name: str = None): """Defines how to go from a dataset line to a doc object. @@ -51,44 +57,39 @@ def prompt_fn(line, task_name: str = None): choices=None, instruction=None, gold_index=[], - specific={ - "reference": line["reference"], - "category": line["category"], - "multi_turn_queries": line["turns"], - "id": line["question_id"], - }, + specific={"reference": line["reference"], "category": line["category"], "multi_turn_queries": line["turns"], "id": line["question_id"]}, ) + + def mt_bench_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]: """Defines how to go from a list of predictions to a score. Follow examples in src/lighteval/metrics/metrics.py, or get more info about what this function should do in the README. 
""" - - judge = Judge_OpenAI( - model="gpt-3.5-turbo", - seed=42, - temperature=0.0, - templates_path="extended_tasks/mt_bench/judge_prompts.jsonl", - ) - - questions = formatted_doc.specific["multi_turn_queries"] - ref_answers = formatted_doc.specific["reference"] - - score, messages, judgement = judge.evaluate_answer(questions, predictions, ref_answers, single_turn=True) - score_mt, messages_mt, judgement_mt = judge.evaluate_answer(questions, predictions, ref_answers, single_turn=False) - - pprint(score) - pprint(messages) - pprint(judgement) - - return { - "single_turn": score, - "multi_turn": score_mt, - "user_prompt": [messages, messages_mt], - "judgement": [judgement, judgement_mt], - } + judge_model = "gpt-3.5-turbo" + judge_file = "extended_tasks/mt_bench/judge_prompts.jsonl" + judge_prompts = load_judge_prompts(judge_file) + judges = make_judge_single(judge_model, judge_prompts) + + question = formatted_doc.specific["multi_turn_queries"] + ref_answer = formatted_doc.specific["reference"] + category = formatted_doc.specific["category"] + + if category not in NEED_REF_CATS: + score, user_prompt_1, judgement_1 = play_a_match_single(question, predictions, ref_answer, judges["default"], multi_turn=False, output_file=None) + score_mt, user_prompt_2, judgement_2 = play_a_match_single(question, predictions, ref_answer, judges["default-mt"], multi_turn=True, output_file=None) + else: + try: + score, user_prompt_1, judgement_1 = play_a_match_single(question, predictions, ref_answer, judges["math"], multi_turn=False, output_file=None) + score_mt, user_prompt_2, judgement_2 = play_a_match_single(question, predictions, ref_answer, judges["math-mt"], multi_turn=True, output_file=None) + except KeyError: + print(f"Category {category} not found in judge prompts, using default judge") + score, user_prompt_1, judgement_1 = play_a_match_single(question, predictions, ref_answer, judges["default"], multi_turn=False, output_file=None) + score_mt, user_prompt_2, judgement_2 = play_a_match_single(question, predictions, ref_answer, judges["default-mt"], multi_turn=True, output_file=None) + + return {"single_turn": score, "multi_turn": score_mt, "user_prompt": [user_prompt_1, user_prompt_2], "judgement": [judgement_1, judgement_2]} mt_bench_metric = SampleLevelMetricGrouping( @@ -100,13 +101,13 @@ def mt_bench_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> dic corpus_level_fn={ "single_turn": np.mean, "multi_turn": np.mean, - }, + } ) -# STORE YOUR EVALS +## STORE YOUR EVALS _TASKS = [task] -# MODULE LOGIC +## MODULE LOGIC # You should not need to touch this # Convert to dict for lighteval TASKS_TABLE = [task.as_dict() for task in _TASKS] diff --git a/extended_tasks/mt_bench/model_adapter.py b/extended_tasks/mt_bench/model_adapter.py new file mode 100644 index 000000000..e653efb6e --- /dev/null +++ b/extended_tasks/mt_bench/model_adapter.py @@ -0,0 +1,406 @@ +import base64 +import dataclasses +import math +import os +import re +import sys +import warnings +from dataclasses import field +from enum import IntEnum, auto +from io import BytesIO +from typing import Dict, List, Optional, Tuple, Union + + +IMAGE_PLACEHOLDER_STR = "$$$$" + + +class SeparatorStyle(IntEnum): + """Separator styles.""" + + ADD_COLON_SINGLE = auto() + ADD_COLON_TWO = auto() + ADD_COLON_SPACE_SINGLE = auto() + NO_COLON_SINGLE = auto() + NO_COLON_TWO = auto() + ADD_NEW_LINE_SINGLE = auto() + LLAMA2 = auto() + CHATGLM = auto() + CHATML = auto() + CHATINTERN = auto() + DOLLY = auto() + RWKV = auto() + PHOENIX = auto() + 
ROBIN = auto() + FALCON_CHAT = auto() + CHATGLM3 = auto() + DEEPSEEK_CHAT = auto() + METAMATH = auto() + YUAN2 = auto() + +@dataclasses.dataclass +class Conversation: + """A class that manages prompt templates and keeps all conversation history.""" + + # The name of this template + name: str + # The template of the system prompt + system_template: str = "{system_message}" + # The system message + system_message: str = "" + # The names of two roles + roles: Tuple[str] = ("USER", "ASSISTANT") + # All messages. Each item is (role, message). + # Each message is either a string or a tuple of (string, List[image_url]). + messages: List[List[str]] = field(default_factory=list) + # The number of few shot examples + offset: int = 0 + # The separator style and configurations + sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE + sep: str = "\n" + sep2: str = None + # Stop criteria (the default one is EOS token) + stop_str: Union[str, List[str]] = None + # Stops generation if meeting any token in this list + stop_token_ids: List[int] = None + + def get_prompt(self) -> str: + """Get the prompt for generation.""" + system_prompt = self.system_template.format(system_message=self.system_message) + if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE: + ret = system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + ": " + message + self.sep + else: + ret += role + ":" + return ret + elif self.sep_style == SeparatorStyle.ADD_COLON_TWO: + seps = [self.sep, self.sep2] + ret = system_prompt + seps[0] + for i, (role, message) in enumerate(self.messages): + if message: + if type(message) is tuple: + message, images = message + message = IMAGE_PLACEHOLDER_STR * len(images) + message + ret += role + ": " + message + seps[i % 2] + else: + ret += role + ":" + return ret + elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE: + ret = system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + ": " + message + self.sep + else: + ret += role + ": " # must be end with a space + return ret + elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE: + ret = "" if system_prompt == "" else system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + "\n" + message + self.sep + else: + ret += role + "\n" + return ret + elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE: + ret = system_prompt + for role, message in self.messages: + if message: + ret += role + message + self.sep + else: + ret += role + return ret + elif self.sep_style == SeparatorStyle.NO_COLON_TWO: + seps = [self.sep, self.sep2] + ret = system_prompt + for i, (role, message) in enumerate(self.messages): + if message: + ret += role + message + seps[i % 2] + else: + ret += role + return ret + elif self.sep_style == SeparatorStyle.RWKV: + ret = system_prompt + for i, (role, message) in enumerate(self.messages): + if message: + ret += ( + role + + ": " + + message.replace("\r\n", "\n").replace("\n\n", "\n") + ) + ret += "\n\n" + else: + ret += role + ":" + return ret + elif self.sep_style == SeparatorStyle.LLAMA2: + seps = [self.sep, self.sep2] + if self.system_message: + ret = system_prompt + else: + ret = "[INST] " + for i, (role, message) in enumerate(self.messages): + tag = self.roles[i % 2] + if message: + if i == 0: + ret += message + " " + else: + ret += tag + " " + message + seps[i % 2] + else: + ret += tag + return ret + elif self.sep_style == SeparatorStyle.CHATGLM: + # source: 
https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308 + # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926 + round_add_n = 1 if self.name == "chatglm2" else 0 + if system_prompt: + ret = system_prompt + self.sep + else: + ret = "" + + for i, (role, message) in enumerate(self.messages): + if i % 2 == 0: + ret += f"[Round {i//2 + round_add_n}]{self.sep}" + + if message: + ret += f"{role}:{message}{self.sep}" + else: + ret += f"{role}:" + return ret + elif self.sep_style == SeparatorStyle.CHATML: + ret = "" if system_prompt == "" else system_prompt + self.sep + "\n" + for role, message in self.messages: + if message: + if type(message) is tuple: + message, images = message + message = IMAGE_PLACEHOLDER_STR * len(images) + message + ret += role + "\n" + message + self.sep + "\n" + else: + ret += role + "\n" + return ret + elif self.sep_style == SeparatorStyle.CHATGLM3: + ret = "" + if self.system_message: + ret += system_prompt + for role, message in self.messages: + if message: + ret += role + "\n" + message + else: + ret += role + return ret + elif self.sep_style == SeparatorStyle.CHATINTERN: + # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771 + seps = [self.sep, self.sep2] + ret = system_prompt + for i, (role, message) in enumerate(self.messages): + if i % 2 == 0: + ret += "" + if message: + ret += role + ":" + message + seps[i % 2] + "\n" + else: + ret += role + ":" + return ret + elif self.sep_style == SeparatorStyle.DOLLY: + seps = [self.sep, self.sep2] + ret = system_prompt + for i, (role, message) in enumerate(self.messages): + if message: + ret += role + ":\n" + message + seps[i % 2] + if i % 2 == 1: + ret += "\n\n" + else: + ret += role + ":\n" + return ret + elif self.sep_style == SeparatorStyle.PHOENIX: + ret = system_prompt + for role, message in self.messages: + if message: + ret += role + ": " + "" + message + "" + else: + ret += role + ": " + "" + return ret + elif self.sep_style == SeparatorStyle.ROBIN: + ret = system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + ":\n" + message + self.sep + else: + ret += role + ":\n" + return ret + elif self.sep_style == SeparatorStyle.FALCON_CHAT: + ret = "" + if self.system_message: + ret += system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + ": " + message + self.sep + else: + ret += role + ":" + return ret + elif self.sep_style == SeparatorStyle.METAMATH: + ret = "" if system_prompt == "" else system_prompt + self.sep + for i, (role, message) in enumerate(self.messages): + # For MetaMath, sep2 is used to prefix the message. 
+ starting_sep = ":\n" if i % 2 == 0 else ": " + self.sep2 + ending_sep = self.sep if i % 2 == 0 else "" + if message: + ret += role + starting_sep + message + ending_sep + else: + ret += role + starting_sep + return ret + elif self.sep_style == SeparatorStyle.DEEPSEEK_CHAT: + seps = [self.sep, self.sep2] + ret = system_prompt + for i, (role, message) in enumerate(self.messages): + if message: + ret += role + ": " + message + seps[i % 2] + else: + ret += role + ":" + return ret + elif self.sep_style == SeparatorStyle.YUAN2: + seps = [self.sep, self.sep2] + ret = "" + if self.system_message: + ret += system_prompt + seps[1] + for _, message in self.messages: + if message: + ret += message + "" + else: + ret += "" + ret = ret.rstrip("") + seps[0] + return ret + else: + raise ValueError(f"Invalid style: {self.sep_style}") + + def get_images(self): + images = [] + for i, (role, msg) in enumerate(self.messages[self.offset :]): + if i % 2 == 0: + if type(msg) is tuple: + for image in msg[1]: + images.append(image) + + return images + + def set_system_message(self, system_message: str): + """Set the system message.""" + self.system_message = system_message + + def append_message(self, role: str, message: str): + """Append a new message.""" + self.messages.append([role, message]) + + def update_last_message(self, message: str): + """Update the last output. + + The last message is typically set to be None when constructing the prompt, + so we need to update it in-place after getting the response from a model. + """ + self.messages[-1][1] = message + + def convert_image_to_base64(self, image): + """Given an image, return the base64 encoded image string.""" + import requests + from PIL import Image + + # Load image if it has not been loaded in yet + if type(image) == str: + if image.startswith("http://") or image.startswith("https://"): + response = requests.get(image) + image = Image.open(BytesIO(response.content)).convert("RGB") + elif "base64" in image: + # OpenAI format is: data:image/jpeg;base64,{base64_encoded_image_str} + return image.split(",")[1] + else: + image = Image.open(image).convert("RGB") + + max_hw, min_hw = max(image.size), min(image.size) + aspect_ratio = max_hw / min_hw + max_len, min_len = 2048, 2048 + shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw)) + longest_edge = int(shortest_edge * aspect_ratio) + W, H = image.size + if longest_edge != max(image.size): + if H > W: + H, W = longest_edge, shortest_edge + else: + H, W = shortest_edge, longest_edge + image = image.resize((W, H)) + + buffered = BytesIO() + image.save(buffered, format="PNG") + img_b64_str = base64.b64encode(buffered.getvalue()).decode() + + return img_b64_str + + def to_gradio_chatbot(self): + """Convert the conversation to gradio chatbot format.""" + ret = [] + for i, (role, msg) in enumerate(self.messages[self.offset :]): + if i % 2 == 0: + if type(msg) is tuple: + msg, image = msg + img_b64_str = image[0] # Only one image on gradio at one time + img_str = f'user upload image' + msg = img_str + msg.replace("\n", "").strip() + + ret.append([msg, None]) + else: + ret[-1][-1] = msg + return ret + + def to_openai_api_messages(self): + """Convert the conversation to OpenAI chat completion format.""" + if self.system_message == "": + ret = [] + else: + ret = [{"role": "system", "content": self.system_message}] + + for i, (_, msg) in enumerate(self.messages[self.offset :]): + if i % 2 == 0: + ret.append({"role": "user", "content": msg}) + else: + if msg is not None: + ret.append({"role": "assistant", 
"content": msg}) + return ret + + def extract_text_from_messages(self): + return [ + (role, message[0]) if type(message) is tuple else (role, message) + for role, message in self.messages + ] + + def copy(self): + return Conversation( + name=self.name, + system_template=self.system_template, + system_message=self.system_message, + roles=self.roles, + messages=[[x, y] for x, y in self.messages], + offset=self.offset, + sep_style=self.sep_style, + sep=self.sep, + sep2=self.sep2, + stop_str=self.stop_str, + stop_token_ids=self.stop_token_ids, + ) + + def dict(self): + return { + "template_name": self.name, + "system_message": self.system_message, + "roles": self.roles, + "messages": self.extract_text_from_messages(), + "offset": self.offset, + } + + +# A global registry for all conversation templates +conv_templates: Dict[str, Conversation] = { + "chatgpt": Conversation( + name="chatgpt", + system_message="You are a helpful assistant.", + roles=("user", "assistant"), + sep_style=None, + sep=None, + ) +} diff --git a/extended_tasks/tiny_benchmarks/main.py b/extended_tasks/tiny_benchmarks/main.py index dddf936c6..079ec706b 100644 --- a/extended_tasks/tiny_benchmarks/main.py +++ b/extended_tasks/tiny_benchmarks/main.py @@ -139,7 +139,7 @@ def aggregate(self, y_input): ind_scenario = ( self.number_of_examples * ([i for i, s in enumerate(scenarios_position.keys()) if s == self.task][0]) ) - seen_examples = seen_examples[ind_scenario: ind_scenario + self.number_of_examples] + seen_examples = seen_examples[ind_scenario : ind_scenario + self.number_of_examples] else: scenarios = list(scenarios_position.keys()) diff --git a/src/lighteval/data.py b/src/lighteval/data.py index 9df3559c0..33437175a 100644 --- a/src/lighteval/data.py +++ b/src/lighteval/data.py @@ -296,7 +296,7 @@ def __iter__(self) -> Iterator[T_co]: assert len(indices) == self.total_size # subsample - indices = indices[self.rank: self.total_size: self.num_replicas] + indices = indices[self.rank : self.total_size : self.num_replicas] assert len(indices) == self.num_samples return iter(indices) diff --git a/src/lighteval/evaluator.py b/src/lighteval/evaluator.py index 5070261cc..9449afef1 100644 --- a/src/lighteval/evaluator.py +++ b/src/lighteval/evaluator.py @@ -115,7 +115,7 @@ def evaluate( # noqa: C901 doc.instruction = "" # using a deep copy here because process results pops from the model responses - metrics = task.process_results(doc, copy.deepcopy(model_responses)) + metrics = task.process_results(doc, copy.deepcopy(model_responses), evaluation_tracker=evaluation_tracker) # Remove the user_prompt from the metrics in case of llm-as-judge metric if "user_prompt" in metrics: @@ -130,9 +130,7 @@ def evaluate( # noqa: C901 judgement = None evaluation_tracker.metrics_logger.log(task_example_id.task_name, metrics) - evaluation_tracker.details_logger.log( - task_example_id.task_name, task, doc, model_responses, metrics, (user_prompt, judgement) - ) + evaluation_tracker.details_logger.log(task_example_id.task_name, task, doc, model_responses, metrics, (user_prompt, judgement)) return evaluation_tracker diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index f65359d5b..046022601 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -305,15 +305,7 @@ class CompiledHash: compiled_details: dict[str, CompiledDetail] = collections.defaultdict(CompiledDetail) compiled_details_over_all_tasks: CompiledDetailOverAllTasks = CompiledDetailOverAllTasks() - def log( - 
self, - task_name: str, - task: LightevalTask, - doc: Doc, - outputs: list[ModelReturn], - metrics: dict, - llm_as_prompt_judgement: tuple[str, str], - ) -> None: + def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[ModelReturn], metrics: dict, llm_as_prompt_judgement: tuple[str, str]) -> None: """Stores the relevant information for one sample of one task to the total list of samples stored in the DetailsLogger. Args: diff --git a/src/lighteval/metrics/__init__.py b/src/lighteval/metrics/__init__.py index 3dfd0ca9a..3774fc273 100644 --- a/src/lighteval/metrics/__init__.py +++ b/src/lighteval/metrics/__init__.py @@ -147,7 +147,6 @@ def apply_multichoice_metric_one_token(results: list[ModelReturn], formatted_doc return results, outputs - def apply_generative_multi_turn_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[str]): outputs = {} predictions = results.pop(0).result diff --git a/src/lighteval/metrics/imports/bert_scorer.py b/src/lighteval/metrics/imports/bert_scorer.py index cd6d6d86a..dd8c0ee84 100644 --- a/src/lighteval/metrics/imports/bert_scorer.py +++ b/src/lighteval/metrics/imports/bert_scorer.py @@ -131,8 +131,8 @@ def get_bert_embedding( for i in range(0, len(all_sens), batch_size): batch_embedding = bert_encode( model, - padded_sens[i: i + batch_size], - attention_mask=mask[i: i + batch_size], + padded_sens[i : i + batch_size], + attention_mask=mask[i : i + batch_size], all_layers=all_layers, ) embeddings.append(batch_embedding) @@ -269,7 +269,7 @@ def dedup_and_sort(l_item): iter_range = range(0, len(sentences), batch_size) stats_dict = {} for batch_start in iter_range: - sen_batch = sentences[batch_start: batch_start + batch_size] + sen_batch = sentences[batch_start : batch_start + batch_size] embs, masks, padded_idf = get_bert_embedding( sen_batch, model, tokenizer, idf_dict, device=device, all_layers=all_layers ) @@ -305,8 +305,8 @@ def length_to_mask(lens): with torch.no_grad(): for batch_start in iter_range: - batch_refs = refs[batch_start: batch_start + batch_size] - batch_hyps = hyps[batch_start: batch_start + batch_size] + batch_refs = refs[batch_start : batch_start + batch_size] + batch_hyps = hyps[batch_start : batch_start + batch_size] ref_stats = pad_batch_stats(batch_refs, stats_dict, device) hyp_stats = pad_batch_stats(batch_hyps, stats_dict, device) diff --git a/src/lighteval/metrics/imports/data_stats_utils.py b/src/lighteval/metrics/imports/data_stats_utils.py index 77e279a81..708edee42 100644 --- a/src/lighteval/metrics/imports/data_stats_utils.py +++ b/src/lighteval/metrics/imports/data_stats_utils.py @@ -80,7 +80,7 @@ def strings(self, min_length=0, summary_base=True): # Generate strings, filtering out strings below the minimum length. - strings = [base[i: i + length] for i, j, length in self.overlaps() if length > min_length] + strings = [base[i : i + length] for i, j, length in self.overlaps() if length > min_length] # By default, we just return the tokenization being used. # But if they user wants a raw string, then we convert. 
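Note: the hunks above and several of the hunks that follow are whitespace-only changes to extended-slice expressions (for example `padded_sens[i: i + batch_size]` becoming `padded_sens[i : i + batch_size]`), presumably to match the formatter settings used in this repository. As a minimal sketch (not part of the patch, variable names invented), the two spellings produce identical slices:

    # Minimal sketch: slice spacing is purely cosmetic and does not change behaviour.
    batch_size = 3
    padded_sens = list(range(10))
    for i in range(0, len(padded_sens), batch_size):
        old_style = padded_sens[i: i + batch_size]   # spelling before this patch
        new_style = padded_sens[i : i + batch_size]  # spelling after this patch
        assert old_style == new_style                # identical results
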
diff --git a/src/lighteval/metrics/imports/summac.py b/src/lighteval/metrics/imports/summac.py index d5fa99a77..2803ba59e 100644 --- a/src/lighteval/metrics/imports/summac.py +++ b/src/lighteval/metrics/imports/summac.py @@ -146,7 +146,7 @@ def split_sentences(self, text): def split_2sents(self, text): sentences = nltk.tokenize.sent_tokenize(text) sentences = [sent for sent in sentences if len(sent) > 10] - two_sents = [" ".join(sentences[i: (i + 2)]) for i in range(len(sentences))] + two_sents = [" ".join(sentences[i : (i + 2)]) for i in range(len(sentences))] return two_sents def split_paragraphs(self, text): diff --git a/src/lighteval/metrics/normalizations.py b/src/lighteval/metrics/normalizations.py index 12c6357d2..2bf180007 100644 --- a/src/lighteval/metrics/normalizations.py +++ b/src/lighteval/metrics/normalizations.py @@ -98,14 +98,14 @@ def _remove_boxed(text: str) -> str: if "\\boxed " in text: left = "\\boxed " assert text[: len(left)] == left - return text[len(left):] + return text[len(left) :] left = "\\boxed{" assert text[: len(left)] == left assert text[-1] == "}" - return text[len(left): -1] + return text[len(left) : -1] def _last_boxed_only_string(text: str) -> str | None: """Extract the last \\boxed{...} or \\fbox{...} element from a string.""" @@ -131,7 +131,7 @@ def _last_boxed_only_string(text: str) -> str | None: if right_brace_idx is None: retval = None else: - retval = text[idx: right_brace_idx + 1] + retval = text[idx : right_brace_idx + 1] return retval @@ -222,7 +222,7 @@ def _fix_sqrt(text: str) -> str: else: indices = [pos for pos, char in enumerate(text) if char == "$"] if len(indices) > 1: - text = text[indices[0] + 1: indices[-1]] + text = text[indices[0] + 1 : indices[-1]] to_replace_1 = [ ("\n", ""), # linebreaks diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index 83e1581f0..81b4a8341 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -352,9 +352,7 @@ def greedy_until_with_logits( override_bs=override_bs, ) - def greedy_until_multi_turn( - self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None - ) -> GenerateMultiTurnReturn: + def greedy_until_multi_turn(self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None) -> GenerateMultiTurnReturn: for request in requests: request.stop_sequence = as_list(request.stop_sequence) + [self.tokenizer.eos_token] request.tokenized_context = self.tok_encode(request.context) @@ -435,15 +433,7 @@ def greedy_until_multi_turn( model_answers.append(cur_reponses[0].result) - results.append( - GenerateMultiTurnReturn( - result=model_answers, - input_tokens=[], - generated_tokens=[], - truncated_tokens_count=0, - padded_tokens_count=0, - ) - ) + results.append(GenerateMultiTurnReturn(result=model_answers, input_tokens=[], generated_tokens=[], truncated_tokens_count=0, padded_tokens_count=0)) return results @@ -591,7 +581,7 @@ def _generate( ) if returns_logits: logits = self.model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=True) - generations = outputs.sequences[:, batch.input_ids.size(1):] + generations = outputs.sequences[:, batch.input_ids.size(1) :] generations, len_gens = self.pad_and_gather(generations) batch.input_ids, len_ids = self.pad_and_gather(batch.input_ids) @@ -695,7 +685,7 @@ def _loglikelihood_tokens( max_context_continuation_size_allowed = len(context_enc + continuation_enc) else: # in normal mode, we left cut the context if needed 
max_context_continuation_size_allowed = len( - (context_enc + continuation_enc)[-(self.max_length + 1):][:-1] + (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1] ) batch_size = self._get_batch_size( @@ -732,7 +722,7 @@ def _loglikelihood_tokens( cont_toks = cont_toks[:inplen].unsqueeze(0).to(self.device) # [1, seq] else: cur_logits = ( - cur_logits[inplen - contlen: inplen].unsqueeze(0).to(self.device) + cur_logits[inplen - contlen : inplen].unsqueeze(0).to(self.device) ) # [1, seq, voc] cont_toks = cont_toks.unsqueeze(0).to(self.device) # [1, seq] @@ -912,7 +902,7 @@ def _loglikelihood_single_token( for split_start, split_end in tqdm(dataset.splits_start_end_iterator()): context_enc = dataset[0].tokenized_context - max_context = len(context_enc[-self.max_length:]) + max_context = len(context_enc[-self.max_length :]) batch_size = self._get_batch_size(override_bs=override_bs, max_input_length=max_context) starting_batch_size = batch_size * 2 @@ -1007,7 +997,7 @@ def __init__( def __call__(self, input_ids, scores, **kwargs) -> bool: # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence - lookback_ids_batch = input_ids[:, self.initial_decoder_input_length:][:, -self.sequence_id_len:] + lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :][:, -self.sequence_id_len :] lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch) diff --git a/src/lighteval/models/model_output.py b/src/lighteval/models/model_output.py index 510278585..e36b92432 100644 --- a/src/lighteval/models/model_output.py +++ b/src/lighteval/models/model_output.py @@ -65,7 +65,6 @@ class GenerateReturn(ModelReturn): def get_result_for_eval(self): return self.result if self.logits is None else (self.result, self.logits) - @dataclass class GenerateMultiTurnReturn(ModelReturn): result: list[str] = field(default_factory=list) diff --git a/src/lighteval/models/nanotron_model.py b/src/lighteval/models/nanotron_model.py index 7189e4d11..89c1b75c9 100644 --- a/src/lighteval/models/nanotron_model.py +++ b/src/lighteval/models/nanotron_model.py @@ -693,7 +693,7 @@ def _loglikelihood_single_token( # automatic (variable) batch size detection for vectorization # pull longest context sample from request context_enc = dataset[0].tokenized_context - max_context = len(context_enc[-self.max_length:]) + max_context = len(context_enc[-self.max_length :]) batch_size = self._get_batch_size( override_bs=override_bs, max_input_length=max_context, starting_batch_size=starting_batch_size ) @@ -925,7 +925,7 @@ def _loglikelihood_tokens( context_enc = dataset[0].tokenized_context continuation_enc = dataset[0].tokenized_continuation - max_context = len((context_enc + continuation_enc)[-(self.max_length + 1):][:-1]) + max_context = len((context_enc + continuation_enc)[-(self.max_length + 1) :][:-1]) batch_size = self._get_batch_size( override_bs=override_bs, max_input_length=max_context, starting_batch_size=starting_batch_size @@ -1011,7 +1011,7 @@ def _loglikelihood_tokens( # f"top_tokens: {top_toks_str}\ncont_tokens: {cont_toks_str}") cur_logits = ( - cur_logits[inplen - contlen: inplen].unsqueeze(0).to(self.device) + cur_logits[inplen - contlen : inplen].unsqueeze(0).to(self.device) ) # [1, seq, voc] cont_toks = cont_toks.unsqueeze(0).to(self.device) # [1, seq] @@ -1233,7 +1233,7 @@ def greedy_until( dist.barrier() # Got everyone to send their stuff outputs = list(outputs) - generations = torch.stack([o.generation_ids[o.input_ids.shape[0]:] for o in outputs]) + 
generations = torch.stack([o.generation_ids[o.input_ids.shape[0] :] for o in outputs]) batch_input_ids, len_ids = self.pad_and_gather(batch_model.input_ids) batch_generations, _ = self.pad_and_gather(generations) @@ -1336,7 +1336,7 @@ def __init__( def __call__(self, input_ids, scores, **kwargs) -> bool: # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence - lookback_ids_batch = input_ids[:, self.initial_decoder_input_length:][:, -self.sequence_id_len:] + lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :][:, -self.sequence_id_len :] lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch) diff --git a/src/lighteval/models/tgi_model.py b/src/lighteval/models/tgi_model.py index 694bb398b..5d519667b 100644 --- a/src/lighteval/models/tgi_model.py +++ b/src/lighteval/models/tgi_model.py @@ -42,7 +42,7 @@ def divide_chunks(array, n): # looping till length array for i in range(0, len(array), n): - yield array[i: i + n] + yield array[i : i + n] class ModelClient: diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index c4858a574..46679e948 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -258,7 +258,7 @@ def doc_to_text_without_instructions(self, doc: Doc) -> str: if doc.instruction is not None: if not doc.query.startswith(doc.instruction): raise ValueError(f"Prompt query {doc.query} is not starting with instruction {doc.instruction}") - return doc.query[len(doc.instruction):] + return doc.query[len(doc.instruction) :] return doc.query def doc_to_text_and_instructions(self, doc: Doc) -> Tuple[str, str]: @@ -277,7 +277,7 @@ def doc_to_text_and_instructions(self, doc: Doc) -> Tuple[str, str]: if doc.instruction is not None: if not doc.query.startswith(doc.instruction): raise ValueError(f"Prompt query {doc.query} is not starting with instruction {doc.instruction}") - return (doc.query[len(doc.instruction):], doc.instruction) + return (doc.query[len(doc.instruction) :], doc.instruction) return (doc.query, "") def get_first_possible_fewshot_splits(self, number_of_splits: int = 1) -> list[str]: diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py index c4c863359..8fb355e1e 100644 --- a/src/lighteval/tasks/requests.py +++ b/src/lighteval/tasks/requests.py @@ -120,7 +120,6 @@ class GreedyUntilRequest(Request): request_type = RequestType.GREEDY_UNTIL tokenized_context: list[int] = None - @dataclass class GreedyUntilMultiTurnRequest(Request): """ @@ -131,7 +130,6 @@ class GreedyUntilMultiTurnRequest(Request): generation_size (int): The maximum number of tokens to generate. request_type (RequestType): The type of the request, set to RequestType.GREEDY_UNTIL. 
""" - stop_sequence: str generation_size: int request_type = RequestType.GREEDY_UNTIL_MULTI_TURN diff --git a/src/lighteval/tasks/tasks_prompt_formatting.py b/src/lighteval/tasks/tasks_prompt_formatting.py index 3cf565ec1..08f088b29 100644 --- a/src/lighteval/tasks/tasks_prompt_formatting.py +++ b/src/lighteval/tasks/tasks_prompt_formatting.py @@ -2452,7 +2452,7 @@ def normalize(doc, option): return option.replace(pronoun, pronoun.lower()) return option - context, eos = line["text"][: line["pronoun_loc"]], line["text"][line["pronoun_loc"] + len(line["pronoun"]):] + context, eos = line["text"][: line["pronoun_loc"]], line["text"][line["pronoun_loc"] + len(line["pronoun"]) :] return Doc( task_name=task_name, From 804f41ae583f222f3a4aecb29f32e2e7a2d91749 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 21 Mar 2024 11:22:10 +0000 Subject: [PATCH 26/45] commit --- extended_tasks/mt_bench/judges.py | 222 ++++++++++-------------------- extended_tasks/mt_bench/main.py | 73 +++++----- 2 files changed, 109 insertions(+), 186 deletions(-) diff --git a/extended_tasks/mt_bench/judges.py b/extended_tasks/mt_bench/judges.py index c2a998997..e3620670a 100644 --- a/extended_tasks/mt_bench/judges.py +++ b/extended_tasks/mt_bench/judges.py @@ -1,173 +1,97 @@ import ast -import copy -import dataclasses import json -import os import re -import time -import openai +from openai import OpenAI -from extended_tasks.mt_bench.model_adapter import conv_templates - -openai.api_key = os.environ["OPENAI_API_KEY"] - - -# Extract scores from judgments -two_score_pattern = re.compile("\[\[(\d+\.?\d*),\s?(\d+\.?\d*)\]\]") -two_score_pattern_backup = re.compile("\[(\d+\.?\d*),\s?(\d+\.?\d*)\]") -one_score_pattern = re.compile("\[\[(\d+\.?\d*)\]\]") -one_score_pattern_backup = re.compile("\[(\d+\.?\d*)\]") - -OPENAI_MODEL_LIST = ( - "gpt-3.5-turbo", - "gpt-3.5-turbo-0301", - "gpt-3.5-turbo-0613", - "gpt-3.5-turbo-1106", - "gpt-3.5-turbo-0125", - "gpt-4", - "gpt-4-0314", - "gpt-4-0613", - "gpt-4-turbo", - "gpt-4-1106-preview", - "gpt-4-0125-preview", -) - -# API setting constants -API_MAX_RETRY = 16 -API_RETRY_SLEEP = 10 -API_ERROR_OUTPUT = "$ERROR$" - -# Categories that need reference answers -NEED_REF_CATS = ["math", "reasoning", "coding", "arena-hard-200"] - -@dataclasses.dataclass class Judge: - model_name: str - prompt_template: dict - ref_based: bool = False - multi_turn: bool = False + def evaluate_answer(answer, question, reference) -> tuple[str, list[dict[str, str]], str]: + pass -@dataclasses.dataclass -class MatchSingle: - question: dict - model: str - answer: dict - judge: Judge - ref_answer: dict = None - multi_turn: bool = False -def make_judge_single(judge_model, judge_prompts): - judges = {} - judges["default"] = Judge(judge_model, judge_prompts["single-v1"]) - judges["math"] = Judge(judge_model, judge_prompts["single-math-v1"], ref_based=True) - judges["default-mt"] = Judge( - judge_model, judge_prompts["single-v1-multi-turn"], multi_turn=True - ) - judges["math-mt"] = Judge( - judge_model, - judge_prompts["single-math-v1-multi-turn"], - ref_based=True, - multi_turn=True, - ) - return judges +class Judge_OpenAI(Judge): + def __init__(self, model: str, seed: int, temperature: float, templates_path: str): + self.client = OpenAI() + self.model = model + self.seed = seed + self.temperature = temperature + data = [] + with open(templates_path, "r") as f: + for line in f: + tmp = json.loads(line) + data.append(tmp) -def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None): - if api_dict 
is not None: - openai.api_base = api_dict["api_base"] - openai.api_key = api_dict["api_key"] - output = API_ERROR_OUTPUT - for _ in range(API_MAX_RETRY): - try: - messages = conv.to_openai_api_messages() - response = openai.ChatCompletion.create( - model=model, - messages=messages, - n=1, - temperature=temperature, - max_tokens=max_tokens, - ) - output = response["choices"][0]["message"]["content"] - break - except openai.error.OpenAIError as e: - print(type(e), e) - time.sleep(API_RETRY_SLEEP) - - return output - + self.templates = {d["name"]: d for d in data} -def load_judge_prompts(prompt_file: str): - """Load judge prompts. - - The return value is a python dict of type: - Dict[judge_name: str -> dict] - """ - prompts = {} - with open(prompt_file) as fin: - for line in fin: - line = json.loads(line) - prompts[line["name"]] = line - return prompts + self.one_score_pattern = re.compile(r"\[\[(\d+\.?\d*)\]\]") + self.one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]") + def evaluate_answer(self, questions, answers, references, single_turn: bool): + if single_turn: + score, messages, answer = self.__single_turn_evaluate( + questions[0], answers[0], references[0] if len(references) > 0 else None + ) + else: + score, messages, answer = self.__multi_turn_evaluate(questions, answers, references) + return score, messages, answer -def run_judge_single(question, answer, judge, ref_answer, multi_turn=False): - kwargs = {} - model = judge.model_name - if ref_answer is not None and len(ref_answer) > 0: - kwargs["ref_answer_1"] = ref_answer[0] - if multi_turn: - kwargs["ref_answer_2"] = ref_answer[1] + def __single_turn_evaluate(self, question, answer, reference): + if reference is None or len(reference) == 0: + system_prompt = {"role": "system", "content": self.templates["single-v1"]["system_prompt"]} + user_prompt_str = self.templates["single-v1"]["prompt_template"].format(question=question, answer=answer) + else: + system_prompt = {"role": "system", "content": self.templates["single-math-v1"]["system_prompt"]} + user_prompt_str = self.templates["single-math-v1"]["prompt_template"].format( + question=question, answer=answer, ref_answer_1=reference + ) - if multi_turn: - user_prompt = judge.prompt_template["prompt_template"].format( - question_1=question[0], - question_2=question[1], - answer_1=answer[0], - answer_2=answer[1], - **kwargs, + user_prompt = {"role": "user", "content": user_prompt_str} + messages = [system_prompt, user_prompt] + response = self.client.chat.completions.create( + model=self.model, + seed=self.seed, + temperature=self.temperature, + messages=messages, ) - else: - user_prompt = judge.prompt_template["prompt_template"].format( - question=question[0], - answer=answer[0], - **kwargs, + judgment = response.choices[0].message.content + return self.__process_judge_response(judgment), messages, judgment + + def __multi_turn_evaluate(self, questions, answers, references): + if references is None or len(references) == 0: + system_prompt = {"role": "system", "content": self.templates["single-v1-multi-turn"]["system_prompt"]} + user_prompt_str = self.templates["single-v1-multi-turn"]["prompt_template"].format( + question_1=questions[0], answer_1=answers[0], question_2=questions[1], answer_2=answers[1] + ) + else: + system_prompt = {"role": "system", "content": self.templates["single-math-v1-multi-turn"]["system_prompt"]} + user_prompt_str = self.templates["single-math-v1-multi-turn"]["prompt_template"].format( + question_1=questions[0], + answer_1=answers[0], + 
ref_answer_1=references[0], + question_2=questions[1], + answer_2=answers[1], + ref_answer_2=references[1], + ) + user_prompt = {"role": "user", "content": user_prompt_str} + messages = [system_prompt, user_prompt] + response = self.client.chat.completions.create( + model=self.model, + seed=self.seed, + temperature=self.temperature, + messages=messages, ) + judgment = response.choices[0].message.content + return self.__process_judge_response(judgment), messages, judgment - rating = -1 - - system_prompt = judge.prompt_template["system_prompt"] - conv = copy.deepcopy(conv_templates["chatgpt"]) - conv.set_system_message(system_prompt) - conv.append_message(conv.roles[0], user_prompt) - conv.append_message(conv.roles[1], None) - - if model in OPENAI_MODEL_LIST: - judgment = chat_completion_openai(model, conv, temperature=0, max_tokens=2048) - else: - raise ValueError(f"Invalid judge model name: {model}") - - if judge.prompt_template["output_format"] == "[[rating]]": - match = re.search(one_score_pattern, judgment) + def __process_judge_response(self, judgment: str) -> int: + match = re.search(self.one_score_pattern, judgment) if not match: - match = re.search(one_score_pattern_backup, judgment) - + match = re.search(self.one_score_pattern_backup, judgment) if match: rating = ast.literal_eval(match.groups()[0]) else: rating = -1 - else: - raise ValueError( - f"invalid output format: {judge.prompt_template['output_format']}" - ) - return rating, user_prompt, judgment - - -def play_a_match_single(question, answer, ref_answer, judge, multi_turn, output_file: str): - if judge.prompt_template["type"] == "single": - score, user_prompt, judgment = run_judge_single( - question, answer, judge, ref_answer, multi_turn=multi_turn - ) - return score, user_prompt, judgment + return rating diff --git a/extended_tasks/mt_bench/main.py b/extended_tasks/mt_bench/main.py index f44e47990..15f73ebe5 100644 --- a/extended_tasks/mt_bench/main.py +++ b/extended_tasks/mt_bench/main.py @@ -11,11 +11,7 @@ from aenum import extend_enum from transformers import AutoModelForCausalLM, AutoTokenizer -from extended_tasks.mt_bench.judges import ( - load_judge_prompts, - make_judge_single, - play_a_match_single, -) +from extended_tasks.mt_bench.judges import Judge_OpenAI from lighteval.metrics import Metrics from lighteval.metrics.utils import MetricCategory, MetricUseCase, SampleLevelMetric, SampleLevelMetricGrouping from lighteval.tasks.lighteval_task import LightevalTaskConfig @@ -23,9 +19,7 @@ from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES -NEED_REF_CATS = ["math", "reasoning", "coding", "arena-hard-200"] - -## EVAL WITH NO SUBSET ## +# EVAL WITH NO SUBSET ## # This is how you create a simple tasks (like hellaswag) which has one single subset # attached to it, and one evaluation possible. task = LightevalTaskConfig( @@ -44,7 +38,7 @@ ) -## DEFINE YOUR PROMPT FUNCTIONS +# DEFINE YOUR PROMPT FUNCTIONS # Define as many as you need for your different tasks def prompt_fn(line, task_name: str = None): """Defines how to go from a dataset line to a doc object. 
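Note: the `JudgeOpenAI.__process_judge_response` method introduced in this patch extracts a numeric score from the judge model's free-form judgment by looking for a `[[rating]]` marker, with a bare `[rating]` fallback. A self-contained sketch of that extraction logic follows; the sample judgment strings are invented for illustration and are not part of the patch:

    # Sketch of the score extraction performed by __process_judge_response above.
    import ast
    import re

    one_score_pattern = re.compile(r"\[\[(\d+\.?\d*)\]\]")
    one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]")

    def extract_rating(judgment):
        # Try the double-bracket form first, then the single-bracket fallback.
        match = re.search(one_score_pattern, judgment) or re.search(one_score_pattern_backup, judgment)
        return ast.literal_eval(match.groups()[0]) if match else -1

    print(extract_rating("Helpful and accurate. Rating: [[9]]"))  # 9
    print(extract_rating("Partially correct. [7.5]"))             # 7.5
    print(extract_rating("No explicit rating given."))            # -1
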
@@ -57,39 +51,44 @@ def prompt_fn(line, task_name: str = None): choices=None, instruction=None, gold_index=[], - specific={"reference": line["reference"], "category": line["category"], "multi_turn_queries": line["turns"], "id": line["question_id"]}, + specific={ + "reference": line["reference"], + "category": line["category"], + "multi_turn_queries": line["turns"], + "id": line["question_id"], + }, ) - - def mt_bench_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]: """Defines how to go from a list of predictions to a score. Follow examples in src/lighteval/metrics/metrics.py, or get more info about what this function should do in the README. """ - judge_model = "gpt-3.5-turbo" - judge_file = "extended_tasks/mt_bench/judge_prompts.jsonl" - judge_prompts = load_judge_prompts(judge_file) - judges = make_judge_single(judge_model, judge_prompts) - - question = formatted_doc.specific["multi_turn_queries"] - ref_answer = formatted_doc.specific["reference"] - category = formatted_doc.specific["category"] - - if category not in NEED_REF_CATS: - score, user_prompt_1, judgement_1 = play_a_match_single(question, predictions, ref_answer, judges["default"], multi_turn=False, output_file=None) - score_mt, user_prompt_2, judgement_2 = play_a_match_single(question, predictions, ref_answer, judges["default-mt"], multi_turn=True, output_file=None) - else: - try: - score, user_prompt_1, judgement_1 = play_a_match_single(question, predictions, ref_answer, judges["math"], multi_turn=False, output_file=None) - score_mt, user_prompt_2, judgement_2 = play_a_match_single(question, predictions, ref_answer, judges["math-mt"], multi_turn=True, output_file=None) - except KeyError: - print(f"Category {category} not found in judge prompts, using default judge") - score, user_prompt_1, judgement_1 = play_a_match_single(question, predictions, ref_answer, judges["default"], multi_turn=False, output_file=None) - score_mt, user_prompt_2, judgement_2 = play_a_match_single(question, predictions, ref_answer, judges["default-mt"], multi_turn=True, output_file=None) - - return {"single_turn": score, "multi_turn": score_mt, "user_prompt": [user_prompt_1, user_prompt_2], "judgement": [judgement_1, judgement_2]} + + judge = Judge_OpenAI( + model="gpt-3.5-turbo", + seed=42, + temperature=0.0, + templates_path="extended_tasks/mt_bench/judge_prompts.jsonl", + ) + + questions = formatted_doc.specific["multi_turn_queries"] + ref_answers = formatted_doc.specific["reference"] + + score, messages, judgement = judge.evaluate_answer(questions, predictions, ref_answers, single_turn=True) + score_mt, messages_mt, judgement_mt = judge.evaluate_answer(questions, predictions, ref_answers, single_turn=False) + + pprint(score) + pprint(messages) + pprint(judgement) + + return { + "single_turn": score, + "multi_turn": score_mt, + "user_prompt": [messages, messages_mt], + "judgement": [judgement, judgement_mt], + } mt_bench_metric = SampleLevelMetricGrouping( @@ -101,13 +100,13 @@ def mt_bench_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> dic corpus_level_fn={ "single_turn": np.mean, "multi_turn": np.mean, - } + }, ) -## STORE YOUR EVALS +# STORE YOUR EVALS _TASKS = [task] -## MODULE LOGIC +# MODULE LOGIC # You should not need to touch this # Convert to dict for lighteval TASKS_TABLE = [task.as_dict() for task in _TASKS] From 48b0fee55a142057979fad2464f67ac87ee949e6 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 21 Mar 2024 11:23:05 +0000 Subject: [PATCH 27/45] remove model adapter --- 
extended_tasks/mt_bench/model_adapter.py | 406 ----------------------- 1 file changed, 406 deletions(-) delete mode 100644 extended_tasks/mt_bench/model_adapter.py diff --git a/extended_tasks/mt_bench/model_adapter.py b/extended_tasks/mt_bench/model_adapter.py deleted file mode 100644 index e653efb6e..000000000 --- a/extended_tasks/mt_bench/model_adapter.py +++ /dev/null @@ -1,406 +0,0 @@ -import base64 -import dataclasses -import math -import os -import re -import sys -import warnings -from dataclasses import field -from enum import IntEnum, auto -from io import BytesIO -from typing import Dict, List, Optional, Tuple, Union - - -IMAGE_PLACEHOLDER_STR = "$$$$" - - -class SeparatorStyle(IntEnum): - """Separator styles.""" - - ADD_COLON_SINGLE = auto() - ADD_COLON_TWO = auto() - ADD_COLON_SPACE_SINGLE = auto() - NO_COLON_SINGLE = auto() - NO_COLON_TWO = auto() - ADD_NEW_LINE_SINGLE = auto() - LLAMA2 = auto() - CHATGLM = auto() - CHATML = auto() - CHATINTERN = auto() - DOLLY = auto() - RWKV = auto() - PHOENIX = auto() - ROBIN = auto() - FALCON_CHAT = auto() - CHATGLM3 = auto() - DEEPSEEK_CHAT = auto() - METAMATH = auto() - YUAN2 = auto() - -@dataclasses.dataclass -class Conversation: - """A class that manages prompt templates and keeps all conversation history.""" - - # The name of this template - name: str - # The template of the system prompt - system_template: str = "{system_message}" - # The system message - system_message: str = "" - # The names of two roles - roles: Tuple[str] = ("USER", "ASSISTANT") - # All messages. Each item is (role, message). - # Each message is either a string or a tuple of (string, List[image_url]). - messages: List[List[str]] = field(default_factory=list) - # The number of few shot examples - offset: int = 0 - # The separator style and configurations - sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE - sep: str = "\n" - sep2: str = None - # Stop criteria (the default one is EOS token) - stop_str: Union[str, List[str]] = None - # Stops generation if meeting any token in this list - stop_token_ids: List[int] = None - - def get_prompt(self) -> str: - """Get the prompt for generation.""" - system_prompt = self.system_template.format(system_message=self.system_message) - if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE: - ret = system_prompt + self.sep - for role, message in self.messages: - if message: - ret += role + ": " + message + self.sep - else: - ret += role + ":" - return ret - elif self.sep_style == SeparatorStyle.ADD_COLON_TWO: - seps = [self.sep, self.sep2] - ret = system_prompt + seps[0] - for i, (role, message) in enumerate(self.messages): - if message: - if type(message) is tuple: - message, images = message - message = IMAGE_PLACEHOLDER_STR * len(images) + message - ret += role + ": " + message + seps[i % 2] - else: - ret += role + ":" - return ret - elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE: - ret = system_prompt + self.sep - for role, message in self.messages: - if message: - ret += role + ": " + message + self.sep - else: - ret += role + ": " # must be end with a space - return ret - elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE: - ret = "" if system_prompt == "" else system_prompt + self.sep - for role, message in self.messages: - if message: - ret += role + "\n" + message + self.sep - else: - ret += role + "\n" - return ret - elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE: - ret = system_prompt - for role, message in self.messages: - if message: - ret += role + message + self.sep - else: - ret 
+= role - return ret - elif self.sep_style == SeparatorStyle.NO_COLON_TWO: - seps = [self.sep, self.sep2] - ret = system_prompt - for i, (role, message) in enumerate(self.messages): - if message: - ret += role + message + seps[i % 2] - else: - ret += role - return ret - elif self.sep_style == SeparatorStyle.RWKV: - ret = system_prompt - for i, (role, message) in enumerate(self.messages): - if message: - ret += ( - role - + ": " - + message.replace("\r\n", "\n").replace("\n\n", "\n") - ) - ret += "\n\n" - else: - ret += role + ":" - return ret - elif self.sep_style == SeparatorStyle.LLAMA2: - seps = [self.sep, self.sep2] - if self.system_message: - ret = system_prompt - else: - ret = "[INST] " - for i, (role, message) in enumerate(self.messages): - tag = self.roles[i % 2] - if message: - if i == 0: - ret += message + " " - else: - ret += tag + " " + message + seps[i % 2] - else: - ret += tag - return ret - elif self.sep_style == SeparatorStyle.CHATGLM: - # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308 - # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926 - round_add_n = 1 if self.name == "chatglm2" else 0 - if system_prompt: - ret = system_prompt + self.sep - else: - ret = "" - - for i, (role, message) in enumerate(self.messages): - if i % 2 == 0: - ret += f"[Round {i//2 + round_add_n}]{self.sep}" - - if message: - ret += f"{role}:{message}{self.sep}" - else: - ret += f"{role}:" - return ret - elif self.sep_style == SeparatorStyle.CHATML: - ret = "" if system_prompt == "" else system_prompt + self.sep + "\n" - for role, message in self.messages: - if message: - if type(message) is tuple: - message, images = message - message = IMAGE_PLACEHOLDER_STR * len(images) + message - ret += role + "\n" + message + self.sep + "\n" - else: - ret += role + "\n" - return ret - elif self.sep_style == SeparatorStyle.CHATGLM3: - ret = "" - if self.system_message: - ret += system_prompt - for role, message in self.messages: - if message: - ret += role + "\n" + message - else: - ret += role - return ret - elif self.sep_style == SeparatorStyle.CHATINTERN: - # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771 - seps = [self.sep, self.sep2] - ret = system_prompt - for i, (role, message) in enumerate(self.messages): - if i % 2 == 0: - ret += "" - if message: - ret += role + ":" + message + seps[i % 2] + "\n" - else: - ret += role + ":" - return ret - elif self.sep_style == SeparatorStyle.DOLLY: - seps = [self.sep, self.sep2] - ret = system_prompt - for i, (role, message) in enumerate(self.messages): - if message: - ret += role + ":\n" + message + seps[i % 2] - if i % 2 == 1: - ret += "\n\n" - else: - ret += role + ":\n" - return ret - elif self.sep_style == SeparatorStyle.PHOENIX: - ret = system_prompt - for role, message in self.messages: - if message: - ret += role + ": " + "" + message + "" - else: - ret += role + ": " + "" - return ret - elif self.sep_style == SeparatorStyle.ROBIN: - ret = system_prompt + self.sep - for role, message in self.messages: - if message: - ret += role + ":\n" + message + self.sep - else: - ret += role + ":\n" - return ret - elif self.sep_style == SeparatorStyle.FALCON_CHAT: - ret = "" - if self.system_message: - ret += system_prompt + self.sep - for role, message in self.messages: - if message: - ret += role + ": " + message + self.sep - else: - ret += 
role + ":" - return ret - elif self.sep_style == SeparatorStyle.METAMATH: - ret = "" if system_prompt == "" else system_prompt + self.sep - for i, (role, message) in enumerate(self.messages): - # For MetaMath, sep2 is used to prefix the message. - starting_sep = ":\n" if i % 2 == 0 else ": " + self.sep2 - ending_sep = self.sep if i % 2 == 0 else "" - if message: - ret += role + starting_sep + message + ending_sep - else: - ret += role + starting_sep - return ret - elif self.sep_style == SeparatorStyle.DEEPSEEK_CHAT: - seps = [self.sep, self.sep2] - ret = system_prompt - for i, (role, message) in enumerate(self.messages): - if message: - ret += role + ": " + message + seps[i % 2] - else: - ret += role + ":" - return ret - elif self.sep_style == SeparatorStyle.YUAN2: - seps = [self.sep, self.sep2] - ret = "" - if self.system_message: - ret += system_prompt + seps[1] - for _, message in self.messages: - if message: - ret += message + "" - else: - ret += "" - ret = ret.rstrip("") + seps[0] - return ret - else: - raise ValueError(f"Invalid style: {self.sep_style}") - - def get_images(self): - images = [] - for i, (role, msg) in enumerate(self.messages[self.offset :]): - if i % 2 == 0: - if type(msg) is tuple: - for image in msg[1]: - images.append(image) - - return images - - def set_system_message(self, system_message: str): - """Set the system message.""" - self.system_message = system_message - - def append_message(self, role: str, message: str): - """Append a new message.""" - self.messages.append([role, message]) - - def update_last_message(self, message: str): - """Update the last output. - - The last message is typically set to be None when constructing the prompt, - so we need to update it in-place after getting the response from a model. - """ - self.messages[-1][1] = message - - def convert_image_to_base64(self, image): - """Given an image, return the base64 encoded image string.""" - import requests - from PIL import Image - - # Load image if it has not been loaded in yet - if type(image) == str: - if image.startswith("http://") or image.startswith("https://"): - response = requests.get(image) - image = Image.open(BytesIO(response.content)).convert("RGB") - elif "base64" in image: - # OpenAI format is: data:image/jpeg;base64,{base64_encoded_image_str} - return image.split(",")[1] - else: - image = Image.open(image).convert("RGB") - - max_hw, min_hw = max(image.size), min(image.size) - aspect_ratio = max_hw / min_hw - max_len, min_len = 2048, 2048 - shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw)) - longest_edge = int(shortest_edge * aspect_ratio) - W, H = image.size - if longest_edge != max(image.size): - if H > W: - H, W = longest_edge, shortest_edge - else: - H, W = shortest_edge, longest_edge - image = image.resize((W, H)) - - buffered = BytesIO() - image.save(buffered, format="PNG") - img_b64_str = base64.b64encode(buffered.getvalue()).decode() - - return img_b64_str - - def to_gradio_chatbot(self): - """Convert the conversation to gradio chatbot format.""" - ret = [] - for i, (role, msg) in enumerate(self.messages[self.offset :]): - if i % 2 == 0: - if type(msg) is tuple: - msg, image = msg - img_b64_str = image[0] # Only one image on gradio at one time - img_str = f'user upload image' - msg = img_str + msg.replace("\n", "").strip() - - ret.append([msg, None]) - else: - ret[-1][-1] = msg - return ret - - def to_openai_api_messages(self): - """Convert the conversation to OpenAI chat completion format.""" - if self.system_message == "": - ret = [] - else: - ret = 
[{"role": "system", "content": self.system_message}] - - for i, (_, msg) in enumerate(self.messages[self.offset :]): - if i % 2 == 0: - ret.append({"role": "user", "content": msg}) - else: - if msg is not None: - ret.append({"role": "assistant", "content": msg}) - return ret - - def extract_text_from_messages(self): - return [ - (role, message[0]) if type(message) is tuple else (role, message) - for role, message in self.messages - ] - - def copy(self): - return Conversation( - name=self.name, - system_template=self.system_template, - system_message=self.system_message, - roles=self.roles, - messages=[[x, y] for x, y in self.messages], - offset=self.offset, - sep_style=self.sep_style, - sep=self.sep, - sep2=self.sep2, - stop_str=self.stop_str, - stop_token_ids=self.stop_token_ids, - ) - - def dict(self): - return { - "template_name": self.name, - "system_message": self.system_message, - "roles": self.roles, - "messages": self.extract_text_from_messages(), - "offset": self.offset, - } - - -# A global registry for all conversation templates -conv_templates: Dict[str, Conversation] = { - "chatgpt": Conversation( - name="chatgpt", - system_message="You are a helpful assistant.", - roles=("user", "assistant"), - sep_style=None, - sep=None, - ) -} From e5b6ea889f647cab644bc180f6f8f0a951e03aac Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 21 Mar 2024 11:25:53 +0000 Subject: [PATCH 28/45] commit --- extended_tasks/mt_bench/judges.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/extended_tasks/mt_bench/judges.py b/extended_tasks/mt_bench/judges.py index e3620670a..2ced66d8b 100644 --- a/extended_tasks/mt_bench/judges.py +++ b/extended_tasks/mt_bench/judges.py @@ -1,12 +1,14 @@ import ast import json import re +from abc import ABC from openai import OpenAI -class Judge: - def evaluate_answer(answer, question, reference) -> tuple[str, list[dict[str, str]], str]: +# Abstract class for a judge +class Judge(ABC): + def evaluate_answer(answers, questions, references) -> tuple[str, list[dict[str, str]], str]: pass From 0dcdb1efa49ec33bc046fbd0adc6d7308f794554 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 21 Mar 2024 11:26:30 +0000 Subject: [PATCH 29/45] update readme --- README.md | 7 ------- 1 file changed, 7 deletions(-) diff --git a/README.md b/README.md index 14ec5edd2..b862e6c65 100644 --- a/README.md +++ b/README.md @@ -286,13 +286,6 @@ You can then give your custom metric to lighteval by using `--custom-tasks path_ To see an example of a custom metric added along with a custom task, look at `tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py`. -Make sure you can launch your model with your new task using `--tasks lighteval|yournewtask|2|0`. - -#### Community evaluations -Copy the `community_tasks/_template.yml` to `community_tasks/yourevalname.py` and edit it to add your custom tasks (the parameters you can use are explained above). It contains an interesting mechanism if the dataset you are adding contains a lot of subsets. - -Make sure you can launch your model with your new task using `--tasks community|yournewtask|2|0 --custom_tasks community_tasks/yourevalname.py`. - ## Available metrics ### Metrics for multiple choice tasks These metrics use log-likelihood of the different possible targets. 
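Note: patches 26-27 replace the `Conversation` template machinery deleted above with OpenAI chat-completion message lists built directly inside `JudgeOpenAI`. For reference, the conversion that the removed `to_openai_api_messages` performed reduces to the following sketch; the function and variable names here are illustrative only:

    # Sketch of the role alternation implemented by the deleted to_openai_api_messages:
    # even positions in the history are user turns, odd positions are assistant turns,
    # and a trailing None assistant slot (the pending reply) is skipped.
    def to_openai_messages(system_message, history):
        messages = [{"role": "system", "content": system_message}] if system_message else []
        for i, (_, message) in enumerate(history):
            if i % 2 == 0:
                messages.append({"role": "user", "content": message})
            elif message is not None:
                messages.append({"role": "assistant", "content": message})
        return messages

    example = to_openai_messages(
        "You are a helpful assistant.",
        [("user", "Rate the answer below."), ("assistant", None)],
    )
    # -> system prompt followed by a single user message; the empty assistant slot is dropped.
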
From 703741be626dbdd0f868f7c17e6604e0f0b7753e Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 21 Mar 2024 13:24:17 +0000 Subject: [PATCH 30/45] commti --- extended_tasks/mt_bench/main.py | 6 ------ src/lighteval/models/base_model.py | 4 ---- 2 files changed, 10 deletions(-) diff --git a/extended_tasks/mt_bench/main.py b/extended_tasks/mt_bench/main.py index 15f73ebe5..1ca8c55be 100644 --- a/extended_tasks/mt_bench/main.py +++ b/extended_tasks/mt_bench/main.py @@ -5,8 +5,6 @@ Author: """ -from pprint import pprint - import numpy as np from aenum import extend_enum from transformers import AutoModelForCausalLM, AutoTokenizer @@ -79,10 +77,6 @@ def mt_bench_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> dic score, messages, judgement = judge.evaluate_answer(questions, predictions, ref_answers, single_turn=True) score_mt, messages_mt, judgement_mt = judge.evaluate_answer(questions, predictions, ref_answers, single_turn=False) - pprint(score) - pprint(messages) - pprint(judgement) - return { "single_turn": score, "multi_turn": score_mt, diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index 81b4a8341..b39fd910c 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -403,10 +403,6 @@ def greedy_until_multi_turn(self, requests: list[GreedyUntilMultiTurnRequest], o for i, multi_turn_context in enumerate(request.context[1:]): multi_turn_context = multi_turn_context.format(model_response=model_answers[0]) - # print("multi_turn_context ====== ") - # pprint(multi_turn_context) - # print("multi_turn_context ====== ") - tokenized = self.tokenizer( multi_turn_context, padding=True, From 6e8026fa2f87abbfe119a8137f52d557d468afc6 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Fri, 22 Mar 2024 11:37:10 +0000 Subject: [PATCH 31/45] commit --- extended_tasks/mt_bench/judges.py | 79 ++++++++++++++++++------------- extended_tasks/mt_bench/main.py | 4 +- pyproject.toml | 3 +- 3 files changed, 50 insertions(+), 36 deletions(-) diff --git a/extended_tasks/mt_bench/judges.py b/extended_tasks/mt_bench/judges.py index 2ced66d8b..29c842ffc 100644 --- a/extended_tasks/mt_bench/judges.py +++ b/extended_tasks/mt_bench/judges.py @@ -1,10 +1,13 @@ import ast import json import re +import time from abc import ABC from openai import OpenAI +from lighteval.logging.hierarchical_logger import hlog_warn + # Abstract class for a judge class Judge(ABC): @@ -12,7 +15,7 @@ def evaluate_answer(answers, questions, references) -> tuple[str, list[dict[str, pass -class Judge_OpenAI(Judge): +class JudgeOpenAI(Judge): def __init__(self, model: str, seed: int, temperature: float, templates_path: str): self.client = OpenAI() self.model = model @@ -30,37 +33,43 @@ def __init__(self, model: str, seed: int, temperature: float, templates_path: st self.one_score_pattern = re.compile(r"\[\[(\d+\.?\d*)\]\]") self.one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]") - def evaluate_answer(self, questions, answers, references, single_turn: bool): + self.API_MAX_RETRY = 16 + self.API_RETRY_SLEEP = 10 + self.max_tokens = 2048 + + def evaluate_answer( + self, questions: list[str], answers: list[str], references: list[str], single_turn: bool + ) -> tuple[int, list[dict[str, str]], str]: if single_turn: - score, messages, answer = self.__single_turn_evaluate( + prompts = self.__get_prompts_single_turn( questions[0], answers[0], references[0] if len(references) > 0 else None ) else: - score, messages, answer = self.__multi_turn_evaluate(questions, 
answers, references) - return score, messages, answer - - def __single_turn_evaluate(self, question, answer, reference): - if reference is None or len(reference) == 0: - system_prompt = {"role": "system", "content": self.templates["single-v1"]["system_prompt"]} - user_prompt_str = self.templates["single-v1"]["prompt_template"].format(question=question, answer=answer) - else: - system_prompt = {"role": "system", "content": self.templates["single-math-v1"]["system_prompt"]} - user_prompt_str = self.templates["single-math-v1"]["prompt_template"].format( - question=question, answer=answer, ref_answer_1=reference + prompts = self.__get_prompts_multi_turn( + questions[0], answers[0], references[0] if len(references) > 0 else None ) - user_prompt = {"role": "user", "content": user_prompt_str} - messages = [system_prompt, user_prompt] - response = self.client.chat.completions.create( - model=self.model, - seed=self.seed, - temperature=self.temperature, - messages=messages, - ) + for _ in range(self.API_MAX_RETRY): + try: + response = self.client.chat.completions.create( + model=self.model, + seed=self.seed, + temperature=self.temperature, + messages=prompts, + max_tokens=self.max_tokens, + n=1, + ) + break + except Exception as e: + hlog_warn(f"{type(e), e}") + time.sleep(self.API_RETRY_SLEEP) + judgment = response.choices[0].message.content - return self.__process_judge_response(judgment), messages, judgment + score = self.__process_judge_response(judgment) - def __multi_turn_evaluate(self, questions, answers, references): + return score, prompts, judgment + + def __get_prompts_multi_turn(self, questions, answers, references): if references is None or len(references) == 0: system_prompt = {"role": "system", "content": self.templates["single-v1-multi-turn"]["system_prompt"]} user_prompt_str = self.templates["single-v1-multi-turn"]["prompt_template"].format( @@ -77,15 +86,19 @@ def __multi_turn_evaluate(self, questions, answers, references): ref_answer_2=references[1], ) user_prompt = {"role": "user", "content": user_prompt_str} - messages = [system_prompt, user_prompt] - response = self.client.chat.completions.create( - model=self.model, - seed=self.seed, - temperature=self.temperature, - messages=messages, - ) - judgment = response.choices[0].message.content - return self.__process_judge_response(judgment), messages, judgment + return [system_prompt, user_prompt] + + def __get_prompts_single_turn(self, question, answer, reference): + if reference is None or len(reference) == 0: + system_prompt = {"role": "system", "content": self.templates["single-v1"]["system_prompt"]} + user_prompt_str = self.templates["single-v1"]["prompt_template"].format(question=question, answer=answer) + else: + system_prompt = {"role": "system", "content": self.templates["single-math-v1"]["system_prompt"]} + user_prompt_str = self.templates["single-math-v1"]["prompt_template"].format( + question=question, answer=answer, ref_answer_1=reference + ) + user_prompt = {"role": "user", "content": user_prompt_str} + return [system_prompt, user_prompt] def __process_judge_response(self, judgment: str) -> int: match = re.search(self.one_score_pattern, judgment) diff --git a/extended_tasks/mt_bench/main.py b/extended_tasks/mt_bench/main.py index 1ca8c55be..6668edca4 100644 --- a/extended_tasks/mt_bench/main.py +++ b/extended_tasks/mt_bench/main.py @@ -9,7 +9,7 @@ from aenum import extend_enum from transformers import AutoModelForCausalLM, AutoTokenizer -from extended_tasks.mt_bench.judges import Judge_OpenAI +from 
extended_tasks.mt_bench.judges import JudgeOpenAI from lighteval.metrics import Metrics from lighteval.metrics.utils import MetricCategory, MetricUseCase, SampleLevelMetric, SampleLevelMetricGrouping from lighteval.tasks.lighteval_task import LightevalTaskConfig @@ -64,7 +64,7 @@ def mt_bench_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> dic about what this function should do in the README. """ - judge = Judge_OpenAI( + judge = JudgeOpenAI( model="gpt-3.5-turbo", seed=42, temperature=0.0, diff --git a/pyproject.toml b/pyproject.toml index d8953d7d4..6eabe8a54 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -88,7 +88,8 @@ quality = ["ruff==v0.2.2","pre-commit"] tests = ["pytest==7.4.0"] dev = ["lighteval[accelerate,quality,tests]"] extended_tasks = [ - "langdetect", #ifeval + "langdetect", # ifeval + "openai", # mt-bench ] [project.urls] From 588fb2f9bb2640fa775786109e3984a64d20721f Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Fri, 22 Mar 2024 11:40:11 +0000 Subject: [PATCH 32/45] format --- extended_tasks/mt_bench/judges.py | 23 +++++++++++++++++++++++ extended_tasks/mt_bench/main.py | 23 +++++++++++++++++++++++ src/lighteval/evaluator.py | 4 +++- src/lighteval/logging/info_loggers.py | 10 +++++++++- src/lighteval/metrics/__init__.py | 1 + src/lighteval/models/base_model.py | 14 ++++++++++++-- src/lighteval/models/model_output.py | 1 + src/lighteval/tasks/requests.py | 2 ++ 8 files changed, 74 insertions(+), 4 deletions(-) diff --git a/extended_tasks/mt_bench/judges.py b/extended_tasks/mt_bench/judges.py index 29c842ffc..40b30ee0b 100644 --- a/extended_tasks/mt_bench/judges.py +++ b/extended_tasks/mt_bench/judges.py @@ -1,3 +1,26 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ + import ast import json import re diff --git a/extended_tasks/mt_bench/main.py b/extended_tasks/mt_bench/main.py index 6668edca4..cbdd91c36 100644 --- a/extended_tasks/mt_bench/main.py +++ b/extended_tasks/mt_bench/main.py @@ -1,3 +1,26 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + # ruff: noqa: F405, F403, F401 """ Custom evaluation tasks for lighteval. Copy this file and complete it with the info for your task. diff --git a/src/lighteval/evaluator.py b/src/lighteval/evaluator.py index 9449afef1..efb146d63 100644 --- a/src/lighteval/evaluator.py +++ b/src/lighteval/evaluator.py @@ -130,7 +130,9 @@ def evaluate( # noqa: C901 judgement = None evaluation_tracker.metrics_logger.log(task_example_id.task_name, metrics) - evaluation_tracker.details_logger.log(task_example_id.task_name, task, doc, model_responses, metrics, (user_prompt, judgement)) + evaluation_tracker.details_logger.log( + task_example_id.task_name, task, doc, model_responses, metrics, (user_prompt, judgement) + ) return evaluation_tracker diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index 046022601..f65359d5b 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -305,7 +305,15 @@ class CompiledHash: compiled_details: dict[str, CompiledDetail] = collections.defaultdict(CompiledDetail) compiled_details_over_all_tasks: CompiledDetailOverAllTasks = CompiledDetailOverAllTasks() - def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[ModelReturn], metrics: dict, llm_as_prompt_judgement: tuple[str, str]) -> None: + def log( + self, + task_name: str, + task: LightevalTask, + doc: Doc, + outputs: list[ModelReturn], + metrics: dict, + llm_as_prompt_judgement: tuple[str, str], + ) -> None: """Stores the relevant information for one sample of one task to the total list of samples stored in the DetailsLogger. 
Args: diff --git a/src/lighteval/metrics/__init__.py b/src/lighteval/metrics/__init__.py index 3774fc273..3dfd0ca9a 100644 --- a/src/lighteval/metrics/__init__.py +++ b/src/lighteval/metrics/__init__.py @@ -147,6 +147,7 @@ def apply_multichoice_metric_one_token(results: list[ModelReturn], formatted_doc return results, outputs + def apply_generative_multi_turn_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[str]): outputs = {} predictions = results.pop(0).result diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index b39fd910c..7b46861b3 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -352,7 +352,9 @@ def greedy_until_with_logits( override_bs=override_bs, ) - def greedy_until_multi_turn(self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None) -> GenerateMultiTurnReturn: + def greedy_until_multi_turn( + self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None + ) -> GenerateMultiTurnReturn: for request in requests: request.stop_sequence = as_list(request.stop_sequence) + [self.tokenizer.eos_token] request.tokenized_context = self.tok_encode(request.context) @@ -429,7 +431,15 @@ def greedy_until_multi_turn(self, requests: list[GreedyUntilMultiTurnRequest], o model_answers.append(cur_reponses[0].result) - results.append(GenerateMultiTurnReturn(result=model_answers, input_tokens=[], generated_tokens=[], truncated_tokens_count=0, padded_tokens_count=0)) + results.append( + GenerateMultiTurnReturn( + result=model_answers, + input_tokens=[], + generated_tokens=[], + truncated_tokens_count=0, + padded_tokens_count=0, + ) + ) return results diff --git a/src/lighteval/models/model_output.py b/src/lighteval/models/model_output.py index e36b92432..510278585 100644 --- a/src/lighteval/models/model_output.py +++ b/src/lighteval/models/model_output.py @@ -65,6 +65,7 @@ class GenerateReturn(ModelReturn): def get_result_for_eval(self): return self.result if self.logits is None else (self.result, self.logits) + @dataclass class GenerateMultiTurnReturn(ModelReturn): result: list[str] = field(default_factory=list) diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py index 8fb355e1e..c4c863359 100644 --- a/src/lighteval/tasks/requests.py +++ b/src/lighteval/tasks/requests.py @@ -120,6 +120,7 @@ class GreedyUntilRequest(Request): request_type = RequestType.GREEDY_UNTIL tokenized_context: list[int] = None + @dataclass class GreedyUntilMultiTurnRequest(Request): """ @@ -130,6 +131,7 @@ class GreedyUntilMultiTurnRequest(Request): generation_size (int): The maximum number of tokens to generate. request_type (RequestType): The type of the request, set to RequestType.GREEDY_UNTIL. 
""" + stop_sequence: str generation_size: int request_type = RequestType.GREEDY_UNTIL_MULTI_TURN From 8cb4894c36c3d710d4fab4173674be2c55da5b57 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Fri, 22 Mar 2024 11:47:02 +0000 Subject: [PATCH 33/45] format --- extended_tasks/mt_bench/judges.py | 15 ++++++++------- src/lighteval/models/base_model.py | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/extended_tasks/mt_bench/judges.py b/extended_tasks/mt_bench/judges.py index 40b30ee0b..908de2a03 100644 --- a/extended_tasks/mt_bench/judges.py +++ b/extended_tasks/mt_bench/judges.py @@ -26,6 +26,7 @@ import re import time from abc import ABC +from typing import Optional from openai import OpenAI @@ -68,9 +69,7 @@ def evaluate_answer( questions[0], answers[0], references[0] if len(references) > 0 else None ) else: - prompts = self.__get_prompts_multi_turn( - questions[0], answers[0], references[0] if len(references) > 0 else None - ) + prompts = self.__get_prompts_multi_turn(questions, answers, references if len(references) > 0 else None) for _ in range(self.API_MAX_RETRY): try: @@ -92,8 +91,10 @@ def evaluate_answer( return score, prompts, judgment - def __get_prompts_multi_turn(self, questions, answers, references): - if references is None or len(references) == 0: + def __get_prompts_multi_turn( + self, questions: list[str], answers: list[str], references: Optional[list[str]] + ) -> list[dict[str, str]]: + if references is None: system_prompt = {"role": "system", "content": self.templates["single-v1-multi-turn"]["system_prompt"]} user_prompt_str = self.templates["single-v1-multi-turn"]["prompt_template"].format( question_1=questions[0], answer_1=answers[0], question_2=questions[1], answer_2=answers[1] @@ -111,8 +112,8 @@ def __get_prompts_multi_turn(self, questions, answers, references): user_prompt = {"role": "user", "content": user_prompt_str} return [system_prompt, user_prompt] - def __get_prompts_single_turn(self, question, answer, reference): - if reference is None or len(reference) == 0: + def __get_prompts_single_turn(self, question: str, answer: str, reference: Optional[str]) -> list[dict[str, str]]: + if reference is None: system_prompt = {"role": "system", "content": self.templates["single-v1"]["system_prompt"]} user_prompt_str = self.templates["single-v1"]["prompt_template"].format(question=question, answer=answer) else: diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index 7b46861b3..bfbc4ae7e 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -583,7 +583,7 @@ def _generate( pad_token_id=self.tokenizer.pad_token_id if self.tokenizer.pad_token_id else self.tokenizer.eos_token_id, return_dict_in_generate=True, output_scores=True, - eos_token_id=self.tokenizer.eos_token_id, + # eos_token_id=self.tokenizer.eos_token_id, ) if returns_logits: logits = self.model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=True) From c08a8f6ca5d7ed6bcc2ecab26a88ddf96c756f8e Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Mon, 25 Mar 2024 10:30:57 +0000 Subject: [PATCH 34/45] commit --- extended_tasks/mt_bench/judges.py | 2 +- src/lighteval/evaluator.py | 2 +- src/lighteval/models/base_model.py | 156 ++++++++++++++++++++--------- 3 files changed, 108 insertions(+), 52 deletions(-) diff --git a/extended_tasks/mt_bench/judges.py b/extended_tasks/mt_bench/judges.py index 908de2a03..9549964e4 100644 --- a/extended_tasks/mt_bench/judges.py +++ b/extended_tasks/mt_bench/judges.py @@ 
-69,7 +69,7 @@ def evaluate_answer( questions[0], answers[0], references[0] if len(references) > 0 else None ) else: - prompts = self.__get_prompts_multi_turn(questions, answers, references if len(references) > 0 else None) + prompts = self.__get_prompts_multi_turn(questions, answers, references if len(references) > 1 else None) for _ in range(self.API_MAX_RETRY): try: diff --git a/src/lighteval/evaluator.py b/src/lighteval/evaluator.py index efb146d63..5070261cc 100644 --- a/src/lighteval/evaluator.py +++ b/src/lighteval/evaluator.py @@ -115,7 +115,7 @@ def evaluate( # noqa: C901 doc.instruction = "" # using a deep copy here because process results pops from the model responses - metrics = task.process_results(doc, copy.deepcopy(model_responses), evaluation_tracker=evaluation_tracker) + metrics = task.process_results(doc, copy.deepcopy(model_responses)) # Remove the user_prompt from the metrics in case of llm-as-judge metric if "user_prompt" in metrics: diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index bfbc4ae7e..476d3b6b7 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -21,6 +21,7 @@ # SOFTWARE. import os +from itertools import islice from typing import Optional, Tuple, Union import torch @@ -352,7 +353,15 @@ def greedy_until_with_logits( override_bs=override_bs, ) - def greedy_until_multi_turn( + def batched(self, iterable, n): + # batched('ABCDEFG', 3) → ABC DEF G + if n < 1: + raise ValueError("n must be at least one") + it = iter(iterable) + while batch := tuple(islice(it, n)): + yield batch + + def greedy_until_multi_turn( # noqa: C901 self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None ) -> GenerateMultiTurnReturn: for request in requests: @@ -361,22 +370,26 @@ def greedy_until_multi_turn( results = [] - # if self.accelerator: - # dataloader = self.accelerator.prepare(dataloader) + dataset = GenerativeTaskDataset(requests=requests, dataset_splits=1) + dataloader = DataLoader(dataset, batch_size=1, collate_fn=lambda batch: batch) + + if self.accelerator: + dataloader = self.accelerator.prepare(dataloader) # Always batch size 1 for multi-turn - for request in tqdm( - requests, desc="Greedy Multi Turn generation", position=1, leave=False, disable=self.disable_tqdm + for request_batch in tqdm( + dataloader, desc="Greedy Multi Turn generation", position=1, leave=False, disable=self.disable_tqdm ): # NOTE: we are assuming all items in a batch behave similarly (same # stop_tokens and max_tokens genrated) which is not necessarily # the case! 
Because of that we only use batch size of 1 + request = request_batch[0] stop_tokens = request.stop_sequence max_generated_tokens = request.generation_size context = request.context[0] max_context_size_allowed = self.max_length - max_generated_tokens - tokenized = self.tokenizer( + model_inputs = self.tokenizer( context, padding=True, truncation=True, @@ -385,27 +398,35 @@ def greedy_until_multi_turn( add_special_tokens=self.add_special_tokens, ).to(self.device) - prepared_batch = Batch( - input_ids=tokenized["input_ids"], - input_lengths=[len(item == 1) for item in tokenized["attention_mask"]], - input_mask=tokenized["attention_mask"], - truncated=[0] * len(tokenized["input_ids"]), - padded=[0] * len(tokenized["input_ids"]), + stopping_criteria = transformers.StoppingCriteriaList( + [ + *[ + MultiTokenEOSCriteria( + sequence, self.tokenizer, input_ids_shape=model_inputs["input_ids"].shape + ) + for sequence in stop_tokens + ], + ] ) - - cur_reponses = self._generate( - batch=prepared_batch, + model_outputs = self.model.generate( + **model_inputs, max_new_tokens=max_generated_tokens, - stop_tokens=stop_tokens, - returns_logits=False, + stopping_criteria=stopping_criteria, + do_sample=False, + pad_token_id=self.tokenizer.pad_token_id + if self.tokenizer.pad_token_id + else self.tokenizer.eos_token_id, ) - - model_answers = [cur_reponses[0].result] + model_outputs = model_outputs[0, model_inputs["input_ids"].size(1):] + model_generations = [model_outputs] + decoded_generation = self.tokenizer.decode(model_outputs) + for term in stop_tokens: + decoded_generation = decoded_generation.split(term)[0] for i, multi_turn_context in enumerate(request.context[1:]): - multi_turn_context = multi_turn_context.format(model_response=model_answers[0]) + multi_turn_context = multi_turn_context.format(model_response=decoded_generation) - tokenized = self.tokenizer( + model_inputs = self.tokenizer( multi_turn_context, padding=True, truncation=True, @@ -414,32 +435,59 @@ def greedy_until_multi_turn( add_special_tokens=self.add_special_tokens, ).to(self.device) - prepared_batch = Batch( - input_ids=tokenized["input_ids"], - input_lengths=[len(item == 1) for item in tokenized["attention_mask"]], - input_mask=tokenized["attention_mask"], - truncated=[0] * len(tokenized["input_ids"]), - padded=[0] * len(tokenized["input_ids"]), + stopping_criteria = transformers.StoppingCriteriaList( + [ + *[ + MultiTokenEOSCriteria( + sequence, self.tokenizer, input_ids_shape=model_inputs["input_ids"].shape + ) + for sequence in stop_tokens + ], + ] ) - cur_reponses = self._generate( - batch=prepared_batch, + model_outputs = self.model.generate( + input_ids=model_inputs["input_ids"], + attention_mask=model_inputs["attention_mask"], max_new_tokens=max_generated_tokens, - stop_tokens=stop_tokens, - returns_logits=False, + stopping_criteria=stopping_criteria, + do_sample=False, + pad_token_id=self.tokenizer.pad_token_id + if self.tokenizer.pad_token_id + else self.tokenizer.eos_token_id, ) + model_outputs = model_outputs[0, model_inputs["input_ids"].size(1):] # batch size 1 + model_generations.append(model_outputs) + decoded_generation = self.tokenizer.decode(model_outputs, skip_special_tokens=True) - model_answers.append(cur_reponses[0].result) + for term in stop_tokens: + decoded_generation = decoded_generation.split(term)[0] - results.append( - GenerateMultiTurnReturn( - result=model_answers, - input_tokens=[], - generated_tokens=[], - truncated_tokens_count=0, - padded_tokens_count=0, + if self.accelerator: + padding_size = 
max(gen.shape[0] for gen in model_generations) + for i, gen in enumerate(model_generations): + model_generations[i] = F.pad( + gen, (0, padding_size - gen.shape[0]), value=self.tokenizer.pad_token_id + ) + model_generations = torch.stack(model_generations, dim=0) + model_generations, lengths = self.pad_and_gather(model_generations, drop_last_samples=False) + + model_answers = [] + for generation, _ in zip(model_generations, lengths): + generation = generation.cpu().tolist() + decoded = self.tokenizer.decode(generation, skip_special_tokens=True) + model_answers.append(decoded) + + for answers in self.batched(model_answers, len(request.context)): + results.append( + GenerateMultiTurnReturn( + result=answers, + input_tokens=[], + generated_tokens=[], + truncated_tokens_count=0, + padded_tokens_count=0, + ) ) - ) return results @@ -587,7 +635,7 @@ def _generate( ) if returns_logits: logits = self.model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=True) - generations = outputs.sequences[:, batch.input_ids.size(1) :] + generations = outputs.sequences[:, batch.input_ids.size(1):] generations, len_gens = self.pad_and_gather(generations) batch.input_ids, len_ids = self.pad_and_gather(batch.input_ids) @@ -691,7 +739,7 @@ def _loglikelihood_tokens( max_context_continuation_size_allowed = len(context_enc + continuation_enc) else: # in normal mode, we left cut the context if needed max_context_continuation_size_allowed = len( - (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1] + (context_enc + continuation_enc)[-(self.max_length + 1):][:-1] ) batch_size = self._get_batch_size( @@ -728,7 +776,7 @@ def _loglikelihood_tokens( cont_toks = cont_toks[:inplen].unsqueeze(0).to(self.device) # [1, seq] else: cur_logits = ( - cur_logits[inplen - contlen : inplen].unsqueeze(0).to(self.device) + cur_logits[inplen - contlen: inplen].unsqueeze(0).to(self.device) ) # [1, seq, voc] cont_toks = cont_toks.unsqueeze(0).to(self.device) # [1, seq] @@ -851,7 +899,7 @@ def prepare_batch_logprob( padded=padded, ) - def pad_and_gather(self, output_tensor: torch.Tensor) -> torch.Tensor: + def pad_and_gather(self, output_tensor: torch.Tensor, drop_last_samples: bool = True) -> torch.Tensor: """Gather together tensors of (possibly) various size spread on separate GPUs (first exchange the lengths and then pad and gather)""" # Create a tensor of size batch_size, [output_length] * batch_size, for each each process length_tensor = torch.tensor([output_tensor.shape[1]] * output_tensor.shape[0], device=self.device) @@ -864,7 +912,10 @@ def pad_and_gather(self, output_tensor: torch.Tensor) -> torch.Tensor: output_tensor, (0, max_length - output_tensor.shape[1], 0, 0), value=self.tokenizer.pad_token_id ) if self.accelerator: - output_tensor = self.accelerator.gather_for_metrics(output_tensor) + if drop_last_samples: + output_tensor = self.accelerator.gather_for_metrics(output_tensor) + else: + output_tensor = self.accelerator.gather(output_tensor) return output_tensor, length_tensor def loglikelihood_single_token( @@ -908,7 +959,7 @@ def _loglikelihood_single_token( for split_start, split_end in tqdm(dataset.splits_start_end_iterator()): context_enc = dataset[0].tokenized_context - max_context = len(context_enc[-self.max_length :]) + max_context = len(context_enc[-self.max_length:]) batch_size = self._get_batch_size(override_bs=override_bs, max_input_length=max_context) starting_batch_size = batch_size * 2 @@ -989,10 +1040,15 @@ def __init__( self, sequence: str, tokenizer: 
transformers.PreTrainedTokenizer, - batch: Batch, + batch: Batch = None, + input_ids_shape: Tuple[int, int] = None, ): - initial_decoder_input_length = batch.input_ids.shape[1] - batch_size = batch.input_ids.shape[0] + if batch is not None: + initial_decoder_input_length = batch.input_ids.shape[1] + batch_size = batch.input_ids.shape[0] + else: + initial_decoder_input_length = input_ids_shape[1] + batch_size = input_ids_shape[0] self.initial_decoder_input_length = initial_decoder_input_length self.done_tracker = [False] * batch_size @@ -1003,7 +1059,7 @@ def __init__( def __call__(self, input_ids, scores, **kwargs) -> bool: # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence - lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :][:, -self.sequence_id_len :] + lookback_ids_batch = input_ids[:, self.initial_decoder_input_length:][:, -self.sequence_id_len:] lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch) From 64ceee512f15a74134f71d9a21bfc574a2135978 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 27 Mar 2024 14:46:27 +0000 Subject: [PATCH 35/45] fixes for review --- extended_tasks/mt_bench/judges.py | 86 ++++++++++++++++++++++++++ extended_tasks/mt_bench/main.py | 2 +- src/lighteval/data.py | 3 +- src/lighteval/few_shot_manager.py | 43 ++++++++++++- src/lighteval/logging/info_loggers.py | 2 + src/lighteval/models/abstract_model.py | 14 ++++- src/lighteval/models/base_model.py | 26 +++----- src/lighteval/tasks/lighteval_task.py | 43 +------------ 8 files changed, 157 insertions(+), 62 deletions(-) diff --git a/extended_tasks/mt_bench/judges.py b/extended_tasks/mt_bench/judges.py index 9549964e4..582f50c35 100644 --- a/extended_tasks/mt_bench/judges.py +++ b/extended_tasks/mt_bench/judges.py @@ -20,6 +20,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +# Inspired by the FastChat Codebase: https://github.com/lm-sys/FastChat/blob/main/fastchat/llm_judge/README.md + import ast import json @@ -40,6 +42,34 @@ def evaluate_answer(answers, questions, references) -> tuple[str, list[dict[str, class JudgeOpenAI(Judge): + """ + A class representing a judge for evaluating answers using the OpenAI API. + + Args: + model (str): The name of the OpenAI model to use. + seed (int): The seed value for generating random responses. + temperature (float): The temperature value for controlling the randomness of the responses. + templates_path (str): The path to the JSON file containing the templates for prompts. + + Attributes: + client: An instance of the OpenAI client. + model (str): The name of the OpenAI model. + seed (int): The seed value, passed to the API when generating responses. + temperature (float): The temperature value, passed to the API when generating responses. + templates (dict): A dictionary containing the templates for prompts. + one_score_pattern (re.Pattern): A regular expression pattern for extracting scores from the response. + one_score_pattern_backup (re.Pattern): A backup regular expression pattern for extracting scores. + API_MAX_RETRY (int): The maximum number of API retries. + API_RETRY_SLEEP (int): The sleep time between API retries. + max_tokens (int): The maximum number of tokens allowed in the response. + + Methods: + evaluate_answer: Evaluates an answer using the OpenAI API. + __get_prompts_multi_turn: Generates prompts for multi-turn conversations. + __get_prompts_single_turn: Generates prompts for single-turn conversations. 
+ __process_judge_response: Processes the judge's response and extracts the score. + """ + def __init__(self, model: str, seed: int, temperature: float, templates_path: str): self.client = OpenAI() self.model = model @@ -54,6 +84,9 @@ def __init__(self, model: str, seed: int, temperature: float, templates_path: st self.templates = {d["name"]: d for d in data} + # Patterns for extracting scores from the response + # The first pattern is for the default case: [[score]], + # the second is for the backup case: [score] self.one_score_pattern = re.compile(r"\[\[(\d+\.?\d*)\]\]") self.one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]") @@ -64,6 +97,21 @@ def __init__(self, model: str, seed: int, temperature: float, templates_path: st def evaluate_answer( self, questions: list[str], answers: list[str], references: list[str], single_turn: bool ) -> tuple[int, list[dict[str, str]], str]: + """ + Evaluates an answer using the OpenAI API. + + Args: + questions (list[str]): A list of questions (can be a list because of multi-turn conversations) + answers (list[str]): A list of answers, one for each question. + references (list[str]): A list of reference answers, one for each question (sometimes not available) + single_turn (bool): Indicates whether the conversation is single-turn or multi-turn. + + Returns: + A tuple containing the score, prompts, and judgment. + + Raises: + Exception: If an error occurs during the API call. + """ if single_turn: prompts = self.__get_prompts_single_turn( questions[0], answers[0], references[0] if len(references) > 0 else None @@ -85,6 +133,10 @@ def evaluate_answer( except Exception as e: hlog_warn(f"{type(e), e}") time.sleep(self.API_RETRY_SLEEP) + response = None + + if response is None: + raise Exception("Failed to get response from the API") judgment = response.choices[0].message.content score = self.__process_judge_response(judgment) @@ -94,6 +146,18 @@ def evaluate_answer( def __get_prompts_multi_turn( self, questions: list[str], answers: list[str], references: Optional[list[str]] ) -> list[dict[str, str]]: + """ + Generates prompts for multi-turn conversations. The prompts are generated based on the templates. + The prompt is different for the case where reference answers are available. + + Args: + questions (list[str]): A list of questions. + answers (list[str]): A list of answers. + references (Optional[list[str]]): A list of reference answers. + + Returns: + A list of prompts. + """ if references is None: system_prompt = {"role": "system", "content": self.templates["single-v1-multi-turn"]["system_prompt"]} user_prompt_str = self.templates["single-v1-multi-turn"]["prompt_template"].format( @@ -113,6 +177,18 @@ def __get_prompts_multi_turn( return [system_prompt, user_prompt] def __get_prompts_single_turn(self, question: str, answer: str, reference: Optional[str]) -> list[dict[str, str]]: + """ + Generates prompts for single-turn conversations. The prompts are generated based on the templates. + The prompt is different for the case where a reference answer is available. + + Args: + question (str): The question. + answer (str): The answer. + reference (Optional[str]): The reference answer. + + Returns: + A list of prompts. 
+ """ if reference is None: system_prompt = {"role": "system", "content": self.templates["single-v1"]["system_prompt"]} user_prompt_str = self.templates["single-v1"]["prompt_template"].format(question=question, answer=answer) @@ -125,6 +201,16 @@ def __get_prompts_single_turn(self, question: str, answer: str, reference: Optio return [system_prompt, user_prompt] def __process_judge_response(self, judgment: str) -> int: + """ + Processes the judge's response and extracts the score. + Returns -1 if the score cannot be extracted. + + Args: + judgment (str): The judge's response. + + Returns: + The extracted score. + """ match = re.search(self.one_score_pattern, judgment) if not match: match = re.search(self.one_score_pattern_backup, judgment) diff --git a/extended_tasks/mt_bench/main.py b/extended_tasks/mt_bench/main.py index cbdd91c36..830d5b566 100644 --- a/extended_tasks/mt_bench/main.py +++ b/extended_tasks/mt_bench/main.py @@ -47,7 +47,7 @@ name="mt_bench", prompt_function="prompt_fn", # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py suite=["extended"], - hf_repo="SaylorTwift/mt-bench", + hf_repo="lighteval/mt-bench", hf_subset="default", hf_avail_splits=["train"], evaluation_splits=["train"], diff --git a/src/lighteval/data.py b/src/lighteval/data.py index 33437175a..56ada93b4 100644 --- a/src/lighteval/data.py +++ b/src/lighteval/data.py @@ -215,7 +215,8 @@ def _sorting_criteria(self, request: GreedyUntilRequest | GreedyUntilWithLogitsR Returns: Any: The collated data. """ - toks = request.tokenized_context + print(request.tokenized_context) + toks = sum([len(context) for context in request.tokenized_context]) gen_length = request.generation_size # The generative task has no limit except the model context if gen_length is None: diff --git a/src/lighteval/few_shot_manager.py b/src/lighteval/few_shot_manager.py index 984685f3d..183c2c00f 100644 --- a/src/lighteval/few_shot_manager.py +++ b/src/lighteval/few_shot_manager.py @@ -27,7 +27,7 @@ from itertools import cycle from typing import TYPE_CHECKING, Optional -from transformers import AutoTokenizer +from transformers import AutoTokenizer, PreTrainedTokenizer from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.tasks.requests import Doc @@ -219,6 +219,47 @@ def get_examples( ) return instruction + labeled_examples + example + + def create_multi_turn_contexts(self, + doc: Doc, use_chat_template: bool, system_prompt: Optional[str], tokenizer: PreTrainedTokenizer + ) -> list[str]: + """Creates N contexts (depending on the number of turn) for a tasks. + Multi turn tasks need use chat templating. + + Args: + doc (Doc): Formated document. + use_chat_template (bool): wether or not to use chat template. Will fail if false. + system_prompt (Optional[str]): The system prompt to use + tokenizer (PreTrainedTokenizer): The tokenizer used for the chat template + + Raises: + ValueError: If use_chat_template is set to false. 
+ + Returns: + list[str]: contexts for every turn + """ + if not use_chat_template: + raise ValueError("You need to use the chat template to create multi turn contexts") + + role_content_list = [] + if system_prompt is not None: + role_content_list.append({"role": "system", "content": system_prompt}) + + for i in doc.specific["multi_turn_queries"]: + role_content_list.append({"role": "user", "content": i}) + role_content_list.append({"role": "assistant", "content": "{model_response}"}) + role_content_list.pop(-1) + + contexts = [] + offset = 2 if system_prompt is not None else 1 + for i in range(0, len(role_content_list), offset + 1): + c = tokenizer.apply_chat_template( + role_content_list[: i + offset], add_generation_prompt=True, tokenize=False, add_special_tokens=False + ) + contexts.append(c) + + return contexts, 0 + def fewshot_context( self, task: "LightevalTask", diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index f65359d5b..45aedfc1e 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -322,6 +322,8 @@ def log( doc (Doc): Current sample that we want to store. outputs (list[ModelReturn]): Model outputs for the current sample metrics (_type_): Model scores for said sample on the current task's metrics. + llm_as_prompt_judgement (tuple[str, str]): Tuple containing the + prompt passed to the judge and the judgement for the current sample when using llm-as-judge metric. """ detail = self.Detail() detail.example = doc.query diff --git a/src/lighteval/models/abstract_model.py b/src/lighteval/models/abstract_model.py index be88a6b28..ccc49146c 100644 --- a/src/lighteval/models/abstract_model.py +++ b/src/lighteval/models/abstract_model.py @@ -27,8 +27,14 @@ from transformers import BatchEncoding from lighteval.models.model_config import EnvConfig -from lighteval.models.model_output import GenerateReturn, LoglikelihoodReturn, LoglikelihoodSingleTokenReturn +from lighteval.models.model_output import ( + GenerateMultiTurnReturn, + GenerateReturn, + LoglikelihoodReturn, + LoglikelihoodSingleTokenReturn, +) from lighteval.tasks.requests import ( + GreedyUntilMultiTurnRequest, GreedyUntilRequest, GreedyUntilWithLogitsRequest, LoglikelihoodRequest, @@ -102,6 +108,12 @@ def greedy_until_with_logits( returns_logits=True, ) + def greedy_until_multi_turn( # noqa: C901 + self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None + ) -> GenerateMultiTurnReturn: + """Generates responses using a greedy decoding strategy until certain ending conditions are met.""" + return NotImplemented + @abstractmethod def greedy_until( self, diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index 476d3b6b7..f10d96d56 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -376,13 +376,11 @@ def greedy_until_multi_turn( # noqa: C901 if self.accelerator: dataloader = self.accelerator.prepare(dataloader) - # Always batch size 1 for multi-turn + hlog_warn("Running greedy multi turn generation, the batch size is set to 1 for this task.") + for request_batch in tqdm( dataloader, desc="Greedy Multi Turn generation", position=1, leave=False, disable=self.disable_tqdm ): - # NOTE: we are assuming all items in a batch behave similarly (same - # stop_tokens and max_tokens genrated) which is not necessarily - # the case! 
Because of that we only use batch size of 1 request = request_batch[0] stop_tokens = request.stop_sequence max_generated_tokens = request.generation_size @@ -401,9 +399,7 @@ def greedy_until_multi_turn( # noqa: C901 stopping_criteria = transformers.StoppingCriteriaList( [ *[ - MultiTokenEOSCriteria( - sequence, self.tokenizer, input_ids_shape=model_inputs["input_ids"].shape - ) + MultiTokenEOSCriteria(sequence, self.tokenizer, input_ids_shape=model_inputs["input_ids"].shape) for sequence in stop_tokens ], ] @@ -438,9 +434,7 @@ def greedy_until_multi_turn( # noqa: C901 stopping_criteria = transformers.StoppingCriteriaList( [ *[ - MultiTokenEOSCriteria( - sequence, self.tokenizer, input_ids_shape=model_inputs["input_ids"].shape - ) + MultiTokenEOSCriteria(sequence, self.tokenizer, input_ids_shape=model_inputs["input_ids"].shape) for sequence in stop_tokens ], ] @@ -456,7 +450,7 @@ def greedy_until_multi_turn( # noqa: C901 if self.tokenizer.pad_token_id else self.tokenizer.eos_token_id, ) - model_outputs = model_outputs[0, model_inputs["input_ids"].size(1):] # batch size 1 + model_outputs = model_outputs[0, model_inputs["input_ids"].size(1):] model_generations.append(model_outputs) decoded_generation = self.tokenizer.decode(model_outputs, skip_special_tokens=True) @@ -635,7 +629,7 @@ def _generate( ) if returns_logits: logits = self.model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=True) - generations = outputs.sequences[:, batch.input_ids.size(1):] + generations = outputs.sequences[:, batch.input_ids.size(1) :] generations, len_gens = self.pad_and_gather(generations) batch.input_ids, len_ids = self.pad_and_gather(batch.input_ids) @@ -739,7 +733,7 @@ def _loglikelihood_tokens( max_context_continuation_size_allowed = len(context_enc + continuation_enc) else: # in normal mode, we left cut the context if needed max_context_continuation_size_allowed = len( - (context_enc + continuation_enc)[-(self.max_length + 1):][:-1] + (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1] ) batch_size = self._get_batch_size( @@ -776,7 +770,7 @@ def _loglikelihood_tokens( cont_toks = cont_toks[:inplen].unsqueeze(0).to(self.device) # [1, seq] else: cur_logits = ( - cur_logits[inplen - contlen: inplen].unsqueeze(0).to(self.device) + cur_logits[inplen - contlen : inplen].unsqueeze(0).to(self.device) ) # [1, seq, voc] cont_toks = cont_toks.unsqueeze(0).to(self.device) # [1, seq] @@ -959,7 +953,7 @@ def _loglikelihood_single_token( for split_start, split_end in tqdm(dataset.splits_start_end_iterator()): context_enc = dataset[0].tokenized_context - max_context = len(context_enc[-self.max_length:]) + max_context = len(context_enc[-self.max_length :]) batch_size = self._get_batch_size(override_bs=override_bs, max_input_length=max_context) starting_batch_size = batch_size * 2 @@ -1059,7 +1053,7 @@ def __init__( def __call__(self, input_ids, scores, **kwargs) -> bool: # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence - lookback_ids_batch = input_ids[:, self.initial_decoder_input_length:][:, -self.sequence_id_len:] + lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :][:, -self.sequence_id_len :] lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 46679e948..469b12534 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -623,47 +623,6 @@ def 
download_dataset_worker(args): return dataset -def create_multi_turn_contexts( - doc: Doc, use_chat_template: bool, system_prompt: Optional[str], tokenizer: PreTrainedTokenizer -) -> list[str]: - """Creates N contexts (depending on the number of turn) for a tasks. - Multi turn tasks need use chat templating. - - Args: - doc (Doc): Formated document. - use_chat_template (bool): wether or not to use chat template. Will fail if false. - system_prompt (Optional[str]): The system prompt to use - tokenizer (PreTrainedTokenizer): The tokenizer used for the chat template - - Raises: - ValueError: If use_chat_template is set to false. - - Returns: - list[str]: contexts for every turn - """ - if not use_chat_template: - raise ValueError("You need to use the chat template to create multi turn contexts") - - role_content_list = [] - if system_prompt is not None: - role_content_list.append({"role": "system", "content": system_prompt}) - - for i in doc.specific["multi_turn_queries"]: - role_content_list.append({"role": "user", "content": i}) - role_content_list.append({"role": "assistant", "content": "{model_response}"}) - role_content_list.pop(-1) - - contexts = [] - offset = 2 if system_prompt is not None else 1 - for i in range(0, len(role_content_list), offset + 1): - c = tokenizer.apply_chat_template( - role_content_list[: i + offset], add_generation_prompt=True, tokenize=False, add_special_tokens=False - ) - contexts.append(c) - - return contexts, 0 - - def create_requests_from_tasks( # noqa: C901 task_dict: dict[str, LightevalTask], fewshot_dict: dict[str, list[Tuple[int, bool]]], @@ -748,7 +707,7 @@ def create_requests_from_tasks( # noqa: C901 system_prompt=system_prompt, ) else: - ctx, num_effective_few_shots = create_multi_turn_contexts( + ctx, num_effective_few_shots = task.fewshot_sampler.create_multi_turn_contexts( doc, use_chat_template, system_prompt, lm.tokenizer ) doc.specific["multi_turn_queries_context"] = ctx From 46d7dd8da44f2b41baec0d7b70c8b415b63c6f16 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 27 Mar 2024 14:47:22 +0000 Subject: [PATCH 36/45] make style --- src/lighteval/few_shot_manager.py | 5 ++--- src/lighteval/models/base_model.py | 12 ++++++++---- src/lighteval/tasks/lighteval_task.py | 1 - 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/lighteval/few_shot_manager.py b/src/lighteval/few_shot_manager.py index 183c2c00f..081703a8a 100644 --- a/src/lighteval/few_shot_manager.py +++ b/src/lighteval/few_shot_manager.py @@ -219,9 +219,8 @@ def get_examples( ) return instruction + labeled_examples + example - - def create_multi_turn_contexts(self, - doc: Doc, use_chat_template: bool, system_prompt: Optional[str], tokenizer: PreTrainedTokenizer + def create_multi_turn_contexts( + self, doc: Doc, use_chat_template: bool, system_prompt: Optional[str], tokenizer: PreTrainedTokenizer ) -> list[str]: """Creates N contexts (depending on the number of turn) for a tasks. Multi turn tasks need use chat templating. 
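One clarification on the create_multi_turn_contexts helper that has just moved onto FewShotSampler: for an N-turn document it returns one chat-templated context per turn, and every context after the first carries a literal "{model_response}" placeholder that greedy_until_multi_turn later fills with the decoded previous answer via str.format. A rough illustration of the turn-1 and turn-2 contexts it builds, under the assumption that any chat model with a template is available (the checkpoint name below is arbitrary, not something this PR prescribes):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")  # any chat-templated model
    turns = ["Write a short poem about the sea.", "Now turn it into a haiku."]

    # Turn 1: only the first user message, ready for generation.
    ctx_turn_1 = tokenizer.apply_chat_template(
        [{"role": "user", "content": turns[0]}],
        add_generation_prompt=True,
        tokenize=False,
    )

    # Turn 2: first question, a placeholder assistant reply, then the second question.
    # "{model_response}" is substituted with the turn-1 generation at inference time.
    ctx_turn_2 = tokenizer.apply_chat_template(
        [
            {"role": "user", "content": turns[0]},
            {"role": "assistant", "content": "{model_response}"},
            {"role": "user", "content": turns[1]},
        ],
        add_generation_prompt=True,
        tokenize=False,
    )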
diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index f10d96d56..aa284f6f2 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -399,7 +399,9 @@ def greedy_until_multi_turn( # noqa: C901 stopping_criteria = transformers.StoppingCriteriaList( [ *[ - MultiTokenEOSCriteria(sequence, self.tokenizer, input_ids_shape=model_inputs["input_ids"].shape) + MultiTokenEOSCriteria( + sequence, self.tokenizer, input_ids_shape=model_inputs["input_ids"].shape + ) for sequence in stop_tokens ], ] @@ -413,7 +415,7 @@ def greedy_until_multi_turn( # noqa: C901 if self.tokenizer.pad_token_id else self.tokenizer.eos_token_id, ) - model_outputs = model_outputs[0, model_inputs["input_ids"].size(1):] + model_outputs = model_outputs[0, model_inputs["input_ids"].size(1) :] model_generations = [model_outputs] decoded_generation = self.tokenizer.decode(model_outputs) for term in stop_tokens: @@ -434,7 +436,9 @@ def greedy_until_multi_turn( # noqa: C901 stopping_criteria = transformers.StoppingCriteriaList( [ *[ - MultiTokenEOSCriteria(sequence, self.tokenizer, input_ids_shape=model_inputs["input_ids"].shape) + MultiTokenEOSCriteria( + sequence, self.tokenizer, input_ids_shape=model_inputs["input_ids"].shape + ) for sequence in stop_tokens ], ] @@ -450,7 +454,7 @@ def greedy_until_multi_turn( # noqa: C901 if self.tokenizer.pad_token_id else self.tokenizer.eos_token_id, ) - model_outputs = model_outputs[0, model_inputs["input_ids"].size(1):] + model_outputs = model_outputs[0, model_inputs["input_ids"].size(1) :] model_generations.append(model_outputs) decoded_generation = self.tokenizer.decode(model_outputs, skip_special_tokens=True) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 469b12534..a213076b6 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -28,7 +28,6 @@ from typing import TYPE_CHECKING, List, Optional, Tuple, Union from datasets import load_dataset -from transformers import PreTrainedTokenizer from lighteval.few_shot_manager import FewShotSampler from lighteval.logging.hierarchical_logger import hlog, hlog_warn From e2f7fa8ec44ee754a60da5dd72b044443395d5d3 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 27 Mar 2024 15:07:11 +0000 Subject: [PATCH 37/45] fix --- src/lighteval/data.py | 19 +------------------ src/lighteval/models/base_model.py | 2 +- 2 files changed, 2 insertions(+), 19 deletions(-) diff --git a/src/lighteval/data.py b/src/lighteval/data.py index 56ada93b4..711b0749b 100644 --- a/src/lighteval/data.py +++ b/src/lighteval/data.py @@ -215,8 +215,7 @@ def _sorting_criteria(self, request: GreedyUntilRequest | GreedyUntilWithLogitsR Returns: Any: The collated data. """ - print(request.tokenized_context) - toks = sum([len(context) for context in request.tokenized_context]) + toks = request.tokenized_context gen_length = request.generation_size # The generative task has no limit except the model context if gen_length is None: @@ -224,22 +223,6 @@ def _sorting_criteria(self, request: GreedyUntilRequest | GreedyUntilWithLogitsR return -(len(toks) + gen_length) -class GenerativeTaskMultiTurnDataset(DynamicBatchDataset): - def _sorting_criteria(self, request: GreedyUntilRequest | GreedyUntilWithLogitsRequest) -> int: - """ - Collate function for generating batches. - - Args: - x (Any): The input data. - - Returns: - Any: The collated data. 
- """ - toks = sum([len(r) for r in request.tokenized_contexts]) - gen_length = request.generation_size - return -(len(toks) + gen_length) - - class GenerativeTaskDatasetNanotron(DynamicBatchDataset): def __getitem__(self, index) -> Request: """ diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index aa284f6f2..e95b2228f 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -366,7 +366,7 @@ def greedy_until_multi_turn( # noqa: C901 ) -> GenerateMultiTurnReturn: for request in requests: request.stop_sequence = as_list(request.stop_sequence) + [self.tokenizer.eos_token] - request.tokenized_context = self.tok_encode(request.context) + request.tokenized_context = self.tok_encode(request.context)["input_ids"] results = [] From 3260147c678715ead175ea4e5be3a9070a28647b Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 27 Mar 2024 15:39:22 +0000 Subject: [PATCH 38/45] revert generate_response in base model --- src/lighteval/models/base_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index e95b2228f..c53d4888e 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -629,7 +629,7 @@ def _generate( pad_token_id=self.tokenizer.pad_token_id if self.tokenizer.pad_token_id else self.tokenizer.eos_token_id, return_dict_in_generate=True, output_scores=True, - # eos_token_id=self.tokenizer.eos_token_id, + eos_token_id=self.tokenizer.eos_token_id, ) if returns_logits: logits = self.model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=True) From 33eb252043f194e59feeaf2cff6d2056e33482c5 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 27 Mar 2024 15:40:27 +0000 Subject: [PATCH 39/45] merge --- .../lighteval/tasks/extended}/mt_bench/judge_prompts.jsonl | 0 .../lighteval/tasks/extended}/mt_bench/judges.py | 0 {extended_tasks => src/lighteval/tasks/extended}/mt_bench/main.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename {extended_tasks => src/lighteval/tasks/extended}/mt_bench/judge_prompts.jsonl (100%) rename {extended_tasks => src/lighteval/tasks/extended}/mt_bench/judges.py (100%) rename {extended_tasks => src/lighteval/tasks/extended}/mt_bench/main.py (100%) diff --git a/extended_tasks/mt_bench/judge_prompts.jsonl b/src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl similarity index 100% rename from extended_tasks/mt_bench/judge_prompts.jsonl rename to src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl diff --git a/extended_tasks/mt_bench/judges.py b/src/lighteval/tasks/extended/mt_bench/judges.py similarity index 100% rename from extended_tasks/mt_bench/judges.py rename to src/lighteval/tasks/extended/mt_bench/judges.py diff --git a/extended_tasks/mt_bench/main.py b/src/lighteval/tasks/extended/mt_bench/main.py similarity index 100% rename from extended_tasks/mt_bench/main.py rename to src/lighteval/tasks/extended/mt_bench/main.py From b2e5895f16194ec5cd06c58fa4aa8316076c8aa0 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 27 Mar 2024 15:50:37 +0000 Subject: [PATCH 40/45] fix tests --- src/lighteval/tasks/lighteval_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index a213076b6..3b8204d0c 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -690,7 +690,7 @@ def 
create_requests_from_tasks( # noqa: C901 # to fix!! cur_task_name = f"{task_name}|{num_fewshot}" doc = task_docs[doc_id] - is_multi_turn = len(doc.specific.get("multi_turn_queries", [])) > 0 + is_multi_turn = doc.specific is not None and len(doc.specific.get("multi_turn_queries", [])) > 0 if not is_multi_turn: ctx, num_effective_few_shots = task.fewshot_sampler.fewshot_context( From c42e65d3f32d74f131a90aaee5628d965b5e5990 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 27 Mar 2024 16:57:46 +0000 Subject: [PATCH 41/45] fix format --- src/lighteval/tasks/extended/mt_bench/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/tasks/extended/mt_bench/main.py b/src/lighteval/tasks/extended/mt_bench/main.py index 830d5b566..59b8051fc 100644 --- a/src/lighteval/tasks/extended/mt_bench/main.py +++ b/src/lighteval/tasks/extended/mt_bench/main.py @@ -21,7 +21,7 @@ # SOFTWARE. -# ruff: noqa: F405, F403, F401 +# ruff: noqa: F405, F403, F401, I001 """ Custom evaluation tasks for lighteval. Copy this file and complete it with the info for your task. This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval. From aa6c6f88a82ad01aaa9b61a2b144c8ec903235f4 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Fri, 29 Mar 2024 12:44:41 +0000 Subject: [PATCH 42/45] commit --- src/lighteval/logging/info_loggers.py | 4 ++-- src/lighteval/models/base_model.py | 18 ++++++++++++++++-- src/lighteval/tasks/extended/__init__.py | 3 ++- .../tasks/extended/mt_bench/judges.py | 4 ++-- src/lighteval/tasks/extended/mt_bench/main.py | 12 ++++++++++-- 5 files changed, 32 insertions(+), 9 deletions(-) diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index 45aedfc1e..b11c124c9 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -24,7 +24,7 @@ import os import time from dataclasses import asdict, dataclass, field -from typing import Union +from typing import Optional, Union import git import numpy as np @@ -312,7 +312,7 @@ def log( doc: Doc, outputs: list[ModelReturn], metrics: dict, - llm_as_prompt_judgement: tuple[str, str], + llm_as_prompt_judgement: Optional[tuple[str, str]] = None, ) -> None: """Stores the relevant information for one sample of one task to the total list of samples stored in the DetailsLogger. 
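For completeness, the is_multi_turn branch above is driven purely by doc.specific: a task enters the multi-turn path when its prompt function stores the per-turn questions under "multi_turn_queries". A minimal prompt function of that shape could look like the sketch below; the dataset column names (turns, reference, question_id) are assumptions about the mt-bench data, not something fixed by this diff, and the choices/gold_index values are placeholders for a judged generative task.

    from lighteval.tasks.requests import Doc

    def prompt_fn(line, task_name: str = None):
        # Putting the turn list under "multi_turn_queries" is what makes
        # create_requests_from_tasks build multi-turn contexts for this doc.
        return Doc(
            task_name=task_name,
            query=line["turns"][0],
            choices=[],    # generative, judge-scored task: no fixed choices
            gold_index=0,  # placeholder, unused by the llm-as-judge metric
            specific={
                "multi_turn_queries": line["turns"],
                "reference": line.get("reference", []),
                "id": line["question_id"],
            },
        )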
diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index c53d4888e..2cb96bc97 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -421,6 +421,8 @@ def greedy_until_multi_turn( # noqa: C901 for term in stop_tokens: decoded_generation = decoded_generation.split(term)[0] + input_tokens = [model_inputs["input_ids"]] + for i, multi_turn_context in enumerate(request.context[1:]): multi_turn_context = multi_turn_context.format(model_response=decoded_generation) @@ -457,6 +459,7 @@ def greedy_until_multi_turn( # noqa: C901 model_outputs = model_outputs[0, model_inputs["input_ids"].size(1) :] model_generations.append(model_outputs) decoded_generation = self.tokenizer.decode(model_outputs, skip_special_tokens=True) + input_tokens.append(model_inputs["input_ids"]) for term in stop_tokens: decoded_generation = decoded_generation.split(term)[0] @@ -898,8 +901,19 @@ def prepare_batch_logprob( ) def pad_and_gather(self, output_tensor: torch.Tensor, drop_last_samples: bool = True) -> torch.Tensor: - """Gather together tensors of (possibly) various size spread on separate GPUs (first exchange the lengths and then pad and gather)""" - # Create a tensor of size batch_size, [output_length] * batch_size, for each each process + """ + Pads the `output_tensor` to the maximum length and gathers the lengths across processes. + + Args: + output_tensor (torch.Tensor): The output tensor to be padded. + drop_last_samples (bool, optional): Whether to drop the last samples during gathering. + Last samples are dropped when the number of samples is not divisible by the number of processes. + Defaults to True. + + Returns: + torch.Tensor: The padded output tensor and the gathered length tensor. + """ + # Create a tensor of size batch_size, [output_length] * batch_size, for each process length_tensor = torch.tensor([output_tensor.shape[1]] * output_tensor.shape[0], device=self.device) if self.accelerator is not None: # Gather all the lengths, we end up with a tensor of size num_processes [output_length_1, output_length_2, ...] diff --git a/src/lighteval/tasks/extended/__init__.py b/src/lighteval/tasks/extended/__init__.py index 201c8c4de..81919c0af 100644 --- a/src/lighteval/tasks/extended/__init__.py +++ b/src/lighteval/tasks/extended/__init__.py @@ -25,9 +25,10 @@ if can_load_extended_tasks(): import lighteval.tasks.extended.ifeval.main as ifeval + import lighteval.tasks.extended.mt_bench.main as mt_bench import lighteval.tasks.extended.tiny_benchmarks.main as tiny_benchmarks - AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks] + AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench] else: AVAILABLE_EXTENDED_TASKS_MODULES = [] diff --git a/src/lighteval/tasks/extended/mt_bench/judges.py b/src/lighteval/tasks/extended/mt_bench/judges.py index 582f50c35..a75d4eacc 100644 --- a/src/lighteval/tasks/extended/mt_bench/judges.py +++ b/src/lighteval/tasks/extended/mt_bench/judges.py @@ -70,8 +70,8 @@ class JudgeOpenAI(Judge): __process_judge_response: Processes the judge's response and extracts the score. 
""" - def __init__(self, model: str, seed: int, temperature: float, templates_path: str): - self.client = OpenAI() + def __init__(self, model: str, seed: int, temperature: float, templates_path: str, openai_api_key: str): + self.client = OpenAI(api_key=openai_api_key) self.model = model self.seed = seed self.temperature = temperature diff --git a/src/lighteval/tasks/extended/mt_bench/main.py b/src/lighteval/tasks/extended/mt_bench/main.py index 59b8051fc..8d4024676 100644 --- a/src/lighteval/tasks/extended/mt_bench/main.py +++ b/src/lighteval/tasks/extended/mt_bench/main.py @@ -32,13 +32,20 @@ from aenum import extend_enum from transformers import AutoModelForCausalLM, AutoTokenizer -from extended_tasks.mt_bench.judges import JudgeOpenAI +from lighteval.tasks.extended.mt_bench.judges import JudgeOpenAI from lighteval.metrics import Metrics from lighteval.metrics.utils import MetricCategory, MetricUseCase, SampleLevelMetric, SampleLevelMetricGrouping from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES +from colorama import Fore, Style +import os +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") + +if OPENAI_API_KEY is None: + # Using print here because hlog_warn is not yet available in this context + print(Fore.YELLOW + "No OpenAI API key found. If you are using the OpenAI judge, please set the OPENAI_API_KEY environment variable." + Style.RESET_ALL) # EVAL WITH NO SUBSET ## # This is how you create a simple tasks (like hellaswag) which has one single subset @@ -91,7 +98,8 @@ def mt_bench_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> dic model="gpt-3.5-turbo", seed=42, temperature=0.0, - templates_path="extended_tasks/mt_bench/judge_prompts.jsonl", + templates_path="src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl", + openai_api_key=OPENAI_API_KEY ) questions = formatted_doc.specific["multi_turn_queries"] From bb4b1333fddbb453b15efd3a656abc78092e9be9 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Fri, 29 Mar 2024 12:44:58 +0000 Subject: [PATCH 43/45] make style --- src/lighteval/tasks/extended/mt_bench/main.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/lighteval/tasks/extended/mt_bench/main.py b/src/lighteval/tasks/extended/mt_bench/main.py index 8d4024676..c3508903a 100644 --- a/src/lighteval/tasks/extended/mt_bench/main.py +++ b/src/lighteval/tasks/extended/mt_bench/main.py @@ -45,7 +45,11 @@ if OPENAI_API_KEY is None: # Using print here because hlog_warn is not yet available in this context - print(Fore.YELLOW + "No OpenAI API key found. If you are using the OpenAI judge, please set the OPENAI_API_KEY environment variable." + Style.RESET_ALL) + print( + Fore.YELLOW + + "No OpenAI API key found. If you are using the OpenAI judge, please set the OPENAI_API_KEY environment variable." 
+ + Style.RESET_ALL + ) # EVAL WITH NO SUBSET ## # This is how you create a simple tasks (like hellaswag) which has one single subset @@ -99,7 +103,7 @@ def mt_bench_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> dic seed=42, temperature=0.0, templates_path="src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl", - openai_api_key=OPENAI_API_KEY + openai_api_key=OPENAI_API_KEY, ) questions = formatted_doc.specific["multi_turn_queries"] From 2d3a04ca2898f626d9736828fea6a3a426da6af0 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Fri, 29 Mar 2024 14:23:36 +0000 Subject: [PATCH 44/45] fix from review --- src/lighteval/models/base_model.py | 13 ++----------- src/lighteval/models/utils.py | 10 ++++++++++ src/lighteval/tasks/extended/mt_bench/main.py | 15 --------------- src/lighteval/tasks/lighteval_task.py | 12 ++++++------ 4 files changed, 18 insertions(+), 32 deletions(-) diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index 2cb96bc97..e5545c36b 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -21,7 +21,6 @@ # SOFTWARE. import os -from itertools import islice from typing import Optional, Tuple, Union import torch @@ -43,7 +42,7 @@ LoglikelihoodReturn, LoglikelihoodSingleTokenReturn, ) -from lighteval.models.utils import _get_dtype, _get_precision, _simplify_name +from lighteval.models.utils import _get_dtype, _get_precision, _simplify_name, batched from lighteval.tasks.requests import ( GreedyUntilMultiTurnRequest, GreedyUntilRequest, @@ -353,14 +352,6 @@ def greedy_until_with_logits( override_bs=override_bs, ) - def batched(self, iterable, n): - # batched('ABCDEFG', 3) → ABC DEF G - if n < 1: - raise ValueError("n must be at least one") - it = iter(iterable) - while batch := tuple(islice(it, n)): - yield batch - def greedy_until_multi_turn( # noqa: C901 self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None ) -> GenerateMultiTurnReturn: @@ -479,7 +470,7 @@ def greedy_until_multi_turn( # noqa: C901 decoded = self.tokenizer.decode(generation, skip_special_tokens=True) model_answers.append(decoded) - for answers in self.batched(model_answers, len(request.context)): + for answers in batched(model_answers, len(request.context)): results.append( GenerateMultiTurnReturn( result=answers, diff --git a/src/lighteval/models/utils.py b/src/lighteval/models/utils.py index 9c66f29f0..2b245a742 100644 --- a/src/lighteval/models/utils.py +++ b/src/lighteval/models/utils.py @@ -21,6 +21,7 @@ # SOFTWARE. import os +from itertools import islice from typing import TYPE_CHECKING, Optional, Union import torch @@ -113,3 +114,12 @@ def _get_model_sha(repo_id: str, revision: str): return model_info.sha except Exception: return "" + + +def batched(self, iterable, n): + # batched('ABCDEFG', 3) → ABC DEF G + if n < 1: + raise ValueError("n must be at least one") + it = iter(iterable) + while batch := tuple(islice(it, n)): + yield batch diff --git a/src/lighteval/tasks/extended/mt_bench/main.py b/src/lighteval/tasks/extended/mt_bench/main.py index c3508903a..e2c595119 100644 --- a/src/lighteval/tasks/extended/mt_bench/main.py +++ b/src/lighteval/tasks/extended/mt_bench/main.py @@ -20,13 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. - # ruff: noqa: F405, F403, F401, I001 -""" -Custom evaluation tasks for lighteval. Copy this file and complete it with the info for your task. 
-This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval. -Author: -""" import numpy as np from aenum import extend_enum @@ -51,9 +45,6 @@ + Style.RESET_ALL ) -# EVAL WITH NO SUBSET ## -# This is how you create a simple tasks (like hellaswag) which has one single subset -# attached to it, and one evaluation possible. task = LightevalTaskConfig( name="mt_bench", prompt_function="prompt_fn", # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py @@ -70,8 +61,6 @@ ) -# DEFINE YOUR PROMPT FUNCTIONS -# Define as many as you need for your different tasks def prompt_fn(line, task_name: str = None): """Defines how to go from a dataset line to a doc object. Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info @@ -132,12 +121,8 @@ def mt_bench_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> dic }, ) -# STORE YOUR EVALS _TASKS = [task] -# MODULE LOGIC -# You should not need to touch this -# Convert to dict for lighteval TASKS_TABLE = [task.as_dict() for task in _TASKS] extend_enum( Metrics, diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 3b8204d0c..c0208cd4b 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -692,7 +692,12 @@ def create_requests_from_tasks( # noqa: C901 doc = task_docs[doc_id] is_multi_turn = doc.specific is not None and len(doc.specific.get("multi_turn_queries", [])) > 0 - if not is_multi_turn: + if is_multi_turn: + ctx, num_effective_few_shots = task.fewshot_sampler.create_multi_turn_contexts( + doc, use_chat_template, system_prompt, lm.tokenizer + ) + doc.specific["multi_turn_queries_context"] = ctx + else: ctx, num_effective_few_shots = task.fewshot_sampler.fewshot_context( task=task, doc=doc, @@ -705,11 +710,6 @@ def create_requests_from_tasks( # noqa: C901 use_chat_template=use_chat_template, system_prompt=system_prompt, ) - else: - ctx, num_effective_few_shots = task.fewshot_sampler.create_multi_turn_contexts( - doc, use_chat_template, system_prompt, lm.tokenizer - ) - doc.specific["multi_turn_queries_context"] = ctx doc.num_effective_few_shots = num_effective_few_shots doc.num_asked_few_shots = num_fewshot From 0819ac75940dd825c53eb7b506e26bbd0f4636b0 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Fri, 29 Mar 2024 14:34:50 +0000 Subject: [PATCH 45/45] fix --- src/lighteval/models/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/models/utils.py b/src/lighteval/models/utils.py index 2b245a742..ba9681515 100644 --- a/src/lighteval/models/utils.py +++ b/src/lighteval/models/utils.py @@ -116,7 +116,7 @@ def _get_model_sha(repo_id: str, revision: str): return "" -def batched(self, iterable, n): +def batched(iterable, n): # batched('ABCDEFG', 3) → ABC DEF G if n < 1: raise ValueError("n must be at least one")
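A closing note on the last two commits: batched is a plain iterator-chunking helper (essentially the recipe that later became itertools.batched in Python 3.12), and dropping the stray self parameter in the final fix is what makes it callable as the module-level function that base_model now imports from lighteval.models.utils. A quick usage sketch, with illustrative data:

    from itertools import islice

    def batched(iterable, n):
        # batched('ABCDEFG', 3) -> ('A', 'B', 'C'), ('D', 'E', 'F'), ('G',)
        if n < 1:
            raise ValueError("n must be at least one")
        it = iter(iterable)
        while batch := tuple(islice(it, n)):
            yield batch

    # greedy_until_multi_turn uses it to cut the flat list of gathered, decoded answers
    # into groups of len(request.context), i.e. one answer per turn.
    answers = ["q1 turn-1 answer", "q1 turn-2 answer", "q2 turn-1 answer", "q2 turn-2 answer"]
    print(list(batched(answers, 2)))
    # -> [('q1 turn-1 answer', 'q1 turn-2 answer'), ('q2 turn-1 answer', 'q2 turn-2 answer')]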