From b06e18e18b6068241904dc392087ae103feacf5e Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Tue, 16 Apr 2024 13:36:35 +0000 Subject: [PATCH 01/19] init --- src/lighteval/evaluator.py | 6 +- src/lighteval/metrics/__init__.py | 7 +- src/lighteval/metrics/metrics.py | 17 +++++ src/lighteval/metrics/metrics_sample.py | 87 +++++++++++++++++++++++++ src/lighteval/metrics/utils.py | 3 +- src/lighteval/models/base_model.py | 37 ++++++++++- src/lighteval/tasks/lighteval_task.py | 25 ++++++- src/lighteval/tasks/requests.py | 20 ++++++ 8 files changed, 190 insertions(+), 12 deletions(-) diff --git a/src/lighteval/evaluator.py b/src/lighteval/evaluator.py index c77c3889..5fa511ed 100644 --- a/src/lighteval/evaluator.py +++ b/src/lighteval/evaluator.py @@ -82,12 +82,14 @@ def evaluate( # noqa: C901 full_resps = lm.loglikelihood(requests, override_bs=override_bs) elif request_type == RequestType.LOGLIKELIHOOD_SINGLE_TOKEN: full_resps = lm.loglikelihood_single_token(requests, override_bs=override_bs) + elif request_type == RequestType.LOGLIKELIHOOD_ROLLING: + full_resps = lm.loglikelihood_rolling(requests, override_bs=override_bs) elif request_type == RequestType.GREEDY_UNTIL: full_resps = lm.greedy_until(requests, override_bs=override_bs) elif request_type == RequestType.GREEDY_UNTIL_WITH_LOGITS: full_resps = lm.greedy_until_with_logits(requests, override_bs=override_bs) - elif request_type == RequestType.LOGLIKELIHOOD_ROLLING: - full_resps = lm.loglikelihood_rolling(requests, override_bs=override_bs) + elif request_type == RequestType.GREEDY_UNTIL_WITH_SAMPLING: + full_resps = lm.greedy_until_with_sampling(requests, override_bs=override_bs) elif request_type == RequestType.GREEDY_UNTIL_MULTI_TURN: full_resps = lm.greedy_until_multi_turn(requests, override_bs=override_bs) else: diff --git a/src/lighteval/metrics/__init__.py b/src/lighteval/metrics/__init__.py index 7ef77aef..68727b8e 100644 --- a/src/lighteval/metrics/__init__.py +++ b/src/lighteval/metrics/__init__.py @@ -91,7 +91,7 @@ def apply_generative_metric(results: list[ModelReturn], formatted_doc: Doc, metr golds = [formatted_doc.specific["label_to_choices"][g] for g in golds] for metric in metrics: - if Metrics[metric].value.category == MetricCategory.GENERATIVE: + if Metrics[metric].value.category in [MetricCategory.GENERATIVE, MetricCategory.GENERATIVE_SAMPLING]: outputs.update(Metrics[metric].value.compute(golds=golds, predictions=pred, formatted_doc=formatted_doc)) return results, outputs @@ -153,10 +153,7 @@ def apply_llm_as_judge_metric(results: list[ModelReturn], formatted_doc: Doc, me predictions = results.pop(0).result for metric in metrics: - if ( - Metrics[metric].value.category == MetricCategory.LLM_AS_JUDGE_MULTI_TURN - or Metrics[metric].value.category == MetricCategory.LLM_AS_JUDGE - ): + if Metrics[metric].value.category in [MetricCategory.LLM_AS_JUDGE_MULTI_TURN, MetricCategory.LLM_AS_JUDGE]: outputs.update(Metrics[metric].value.compute(predictions=predictions, formatted_doc=formatted_doc)) return results, outputs diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index 4a0e367d..ef2e3ee0 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -41,6 +41,7 @@ F1_score, JudgeLLM, LoglikelihoodAcc, + MajAtK, Recall, StringDistance, acc_golds_likelihood, @@ -326,6 +327,22 @@ class Metrics(Enum): corpus_level_fn=matthews_corrcoef, higher_is_better=True, ) + maj_at_5 = SampleLevelMetric( + metric="maj@5", + sample_level_fn=MajAtK(k=5).compute, + 
category=MetricCategory.GENERATIVE_, + use_case=MetricUseCase.ACCURACY, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + maj_at_8 = SampleLevelMetric( + metric="maj@8", + sample_level_fn=MajAtK(k=8).compute, + category=MetricCategory.GENERATIVE_, + use_case=MetricUseCase.ACCURACY, + corpus_level_fn=np.mean, + higher_is_better=True, + ) mrr = SampleLevelMetric( metric="mrr", sample_level_fn=MRR().compute, diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index a3809adb..2d17c4af 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -675,3 +675,90 @@ def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[ "user_prompt": messages[0], "judgement": judgements[0], } + + +class MajAtK: + def __init__( + self, + k: int, + normalize_gold: callable = None, + normalize_pred: callable = None, + strip_strings: bool = False, + type_exact_match: str = "full", + ): + """An exact match class. + + Args: + normalize_gold (callable, optional): Function to use to normalize the reference strings. + Defaults to None if no normalization is applied. + normalize_pred (callable, optional): Function to use to normalize the predicted strings. + Defaults to None if no normalization is applied. + strip_strings (bool, optional): Whether to strip both reference and predictions. Defaults to False. + type_exact_match (str, optional): Defines what type of match to apply (post normalization if present). + Can be any of `prefix`, `suffix` or `full`. Defaults to "full". + `prefix` checks if the prediction starts with the gold, + `suffix` if the prediction ends with the gold, + `full` if the prediction and gold are equal + """ + self.k = k + self.normalize_gold = normalize_gold + self.normalize_pred = normalize_pred + self.strip_strings = strip_strings + + if type_exact_match not in ["prefix", "suffix", "full"]: + # todo: we could add a set exact match + raise ValueError( + f"type_exact_match (used in parametrized_exact_match) must be one of prefix, suffix, or full. Was {type_exact_match} instead." + ) + self.type_exact_match = type_exact_match + + def compute(self, golds: list[str], predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]: + """Computes the metric over a list of golds and predictions for one single sample. + + Args: + golds (list[str]): Reference targets + predictions (list[str]): Predicted strings + + Returns: + float: Aggregated score over the current sample's items. + """ + if len(golds) > 0: + raise Exception("Cannot compute maj@k with several golds") + + gold = golds[0] + all_answers = [] + for pred in predictions[: self.k]: + all_answers.append(self.compute_one_item(gold=gold, pred=pred)) + return 1 if sum(all_answers) / len(all_answers) >= 0.5 else 0 + + def compute_one_item( + self, + gold: str, + pred: str, + ) -> float: + """Compares two strings only. + + Args: + gold (str): One of the possible references + pred (str): One of the possible predictions + + Returns: + float: The exact match score. Will be 1 for a match, 0 otherwise. 
+ """ + if not pred: + return 0 + + if self.strip_strings: + gold = gold.strip() + pred = pred.strip() + + if self.normalize_gold: + gold = self.normalize_gold(gold) + if self.normalize_pred: + pred = self.normalize_pred(pred) + + if self.type_exact_match == "prefix": + return 1 if pred.startswith(gold) else 0 + if self.type_exact_match == "suffix": + return 1 if pred.endswith(gold) else 0 + return 1 if gold == pred else 0 diff --git a/src/lighteval/metrics/utils.py b/src/lighteval/metrics/utils.py index 6c79871e..e5ceaeb0 100644 --- a/src/lighteval/metrics/utils.py +++ b/src/lighteval/metrics/utils.py @@ -28,9 +28,10 @@ class MetricCategory(Enum): TARGET_PERPLEXITY = auto() PERPLEXITY = auto() GENERATIVE = auto() + GENERATIVE_LOGPROB = auto() + GENERATIVE_SAMPLING = auto() LLM_AS_JUDGE_MULTI_TURN = auto() LLM_AS_JUDGE = auto() - GENERATIVE_LOGPROB = auto() MULTICHOICE = auto() MULTICHOICE_ONE_TOKEN = auto() IGNORED = auto() diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index 5dbaa750..d31c9ebd 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -47,6 +47,7 @@ GreedyUntilMultiTurnRequest, GreedyUntilRequest, GreedyUntilWithLogitsRequest, + GreedyUntilWithSamplingRequest, LoglikelihoodRequest, LoglikelihoodRollingRequest, LoglikelihoodSingleTokenRequest, @@ -336,7 +337,7 @@ def greedy_until_with_logits( returning both the generated sequences and the logits. Args: - requests (list[tuple[str, dict]]): A list of input requests, + requests (list[GreedyUntilWithLogitsRequest]): A list of input requests, where each request is a tuple containing a prompt string and a dictionary of additional parameters. override_bs (Optional[int], optional): Overrides the batch size for generation. Defaults to None. @@ -352,6 +353,34 @@ def greedy_until_with_logits( override_bs=override_bs, ) + def greedy_until_with_sampling( + self, + requests: list[GreedyUntilWithSamplingRequest], + num_samples: int, + override_bs: Optional[int] = None, + ) -> list[GenerateReturn]: + """ + Generates sequences greedily until a stopping condition is met, + returning both the generated sequences and the logits. + + Args: + requests (list[GreedyUntilWithSamplingRequest]): A list of input requests, + where each request is a tuple containing a prompt string and a dictionary of additional parameters. + override_bs (Optional[int], optional): Overrides the batch size for generation. Defaults to None. + + Returns: + list[GenerateReturn]: A list of GenerateReturn objects, + where each object contains the generated sequence and the corresponding logits. + """ + + return self.greedy_until( + requests, + returns_logits=False, + disable_tqdm=self.disable_tqdm, + override_bs=override_bs, + num_samples=num_samples, + ) + def greedy_until_multi_turn( # noqa: C901 self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None ) -> GenerateMultiTurnReturn: @@ -488,6 +517,7 @@ def greedy_until( requests: list[GreedyUntilRequest], returns_logits: bool = False, override_bs: Optional[int] = None, + num_samples: Optional[int] = None, ) -> list[GenerateReturn]: """ Generates responses using a greedy decoding strategy until certain ending conditions are met. 
@@ -596,6 +626,7 @@ def greedy_until( max_new_tokens=max_new_tokens, stop_tokens=stop_tokens, returns_logits=returns_logits, + num_samples=num_samples, ) results.extend(cur_reponses) @@ -607,6 +638,7 @@ def _generate( max_new_tokens: int, stop_tokens: list[str], returns_logits: Optional[bool] = False, + num_samples: Optional[int] = None, ) -> list[GenerateReturn]: """Contains the actual logic of the generation. First computes the stop sequences, then generates the predictions, then converts the outputs to GenerateReturn. @@ -619,11 +651,12 @@ def _generate( attention_mask=batch.input_mask, max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria, - do_sample=False, pad_token_id=self.tokenizer.pad_token_id if self.tokenizer.pad_token_id else self.tokenizer.eos_token_id, return_dict_in_generate=True, output_scores=True, eos_token_id=self.tokenizer.eos_token_id, + do_sample=num_samples is not None, + num_return_sequences=num_samples, ) if returns_logits: logits = self.model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=True) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 64ba9f39..4ab6c7a6 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -48,6 +48,7 @@ GreedyUntilMultiTurnRequest, GreedyUntilRequest, GreedyUntilWithLogitsRequest, + GreedyUntilWithSamplingRequest, LoglikelihoodRequest, LoglikelihoodRollingRequest, LoglikelihoodSingleTokenRequest, @@ -101,6 +102,7 @@ class LightevalTaskConfig: generation_size: int = None stop_sequence: Optional[Tuple[str]] = None output_regex: Optional[str] = None + num_samples: Optional[list[int]] = None frozen: bool = False suite: Optional[Tuple[str]] = None @@ -201,6 +203,8 @@ def __init__( # noqa: C901 hlog_warn(f"[WARNING] Not implemented yet: ignoring the metric {' ,'.join(ignored)} for task {self.name}.") current_categories = [Metrics[metric].value.category for metric in self.metrics] self.has_metric_category = {category: (category in current_categories) for category in MetricCategory} + # Sub-optimal system - we might want to store metric parametrisation in a yaml conf for example + self.num_samples = [int(metric.split("_")[-1]) for metric in self.metrics if "maj_at_" in metric] # Data processing # to use once prompt formatting is managed as a module @@ -394,7 +398,7 @@ def doc_to_target(self, formatted_doc: Doc, few_shot: bool = False) -> str: return as_list(formatted_doc.get_golds(few_shot=few_shot))[0] # Requests - def get_request_type(self) -> list[RequestType]: + def get_request_type(self) -> list[RequestType]: # noqa C901 """ Returns the request types for the task. 
@@ -418,6 +422,8 @@ def get_request_type(self) -> list[RequestType]: request_types.append(RequestType.GREEDY_UNTIL) if self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB]: request_types.append(RequestType.GREEDY_UNTIL_WITH_LOGITS) + if self.has_metric_category[MetricCategory.GENERATIVE_SAMPLING]: + request_types.append(RequestType.GREEDY_UNTIL_WITH_SAMPLING) if self.has_metric_category[MetricCategory.MULTICHOICE]: request_types.append(RequestType.LOGLIKELIHOOD) if self.has_metric_category[MetricCategory.MULTICHOICE_ONE_TOKEN]: @@ -474,6 +480,18 @@ def construct_requests( generation_size=self.generation_size, ) ] + if self.has_metric_category[MetricCategory.GENERATIVE_SAMPLING]: + requests[RequestType.GREEDY_UNTIL_WITH_SAMPLING] += [ + GreedyUntilWithSamplingRequest( + task_name=current_task_name, + example_index=document_id_seed, + request_index=0, + context=context, + stop_sequence=self.stop_sequence, + generation_size=self.generation_size, + num_samples=max(self.num_samples), # If we have several samplings to apply, we use the max + ) + ] if self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB]: requests[RequestType.GREEDY_UNTIL_WITH_LOGITS] += [ GreedyUntilWithLogitsRequest( @@ -543,7 +561,10 @@ def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dic results=results, formatted_doc=formatted_doc, metrics=self.metrics ) outputs.update(cur_outputs) - if self.has_metric_category[MetricCategory.GENERATIVE]: + if ( + self.has_metric_category[MetricCategory.GENERATIVE] + or self.has_metric_category[MetricCategory.GENERATIVE_SAMPLING] + ): results, cur_outputs = apply_generative_metric( results=results, formatted_doc=formatted_doc, metrics=self.metrics, output_regex=self.output_regex ) diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py index c4c86335..ead236c3 100644 --- a/src/lighteval/tasks/requests.py +++ b/src/lighteval/tasks/requests.py @@ -35,6 +35,7 @@ class RequestType(Enum): GREEDY_UNTIL = auto() GREEDY_UNTIL_MULTI_TURN = auto() GREEDY_UNTIL_WITH_LOGITS = auto() + GREEDY_UNTIL_WITH_SAMPLING = auto() @dataclass @@ -155,6 +156,25 @@ class GreedyUntilWithLogitsRequest(Request): tokenized_context: list[int] = None +@dataclass +class GreedyUntilWithSamplingRequest(Request): + """ + Represents a request for generating text using the Greedy-Until strategy but + returning the logits. + + Attributes: + stop_sequence (str): The sequence of tokens that indicates when to stop generating text. + generation_size (int): The maximum number of tokens to generate. + request_type (RequestType): The type of the request (GREEDY_UNTIL_WITH_LOGITS). + """ + + stop_sequence: Union[str, tuple[str], list[str]] + generation_size: int + request_type = RequestType.GREEDY_UNTIL_WITH_SAMPLES + tokenized_context: list[int] = None + num_samples: int = None + + class TaskExampleId(NamedTuple): """ Represents the identifier for an example in a task. 
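[Editor's note] Taken together, this first patch threads a sampling-based request type (`GREEDY_UNTIL_WITH_SAMPLING`) through the evaluator and introduces `maj_at_k` metrics, i.e. scoring a majority vote over k sampled generations against a single gold answer (the exact aggregation logic is refined in the later patches of this series). A standalone sketch of the general maj@k idea follows; the helper name, the plain exact-match comparison and the example values are illustrative assumptions, not lighteval's API.

```python
from collections import Counter
from typing import Callable, Sequence


def maj_at_k(
    golds: Sequence[str],
    predictions: Sequence[str],
    k: int,
    normalize: Callable[[str], str] = str.strip,
) -> int:
    """Toy maj@k: majority-vote the first k normalized predictions, then exact-match against the gold."""
    if len(golds) != 1:
        raise ValueError("maj@k assumes a single gold answer per sample")
    gold = normalize(golds[0])
    votes = Counter(normalize(p) for p in predictions[:k])
    majority, _ = votes.most_common(1)[0]  # most frequent normalized answer
    return int(majority == gold)


# Example: 3 of 5 sampled generations agree on the correct answer "42"
print(maj_at_k(["42"], ["42 ", "41", "42", "7", "42"], k=5))  # -> 1
```
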
From cfdc6ee59a6f1a11ecfbce8bb51091eb8dc42358 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Tue, 16 Apr 2024 14:12:04 +0000 Subject: [PATCH 02/19] wip --- src/lighteval/evaluator.py | 2 +- src/lighteval/metrics/metrics.py | 14 ++++++++++++-- src/lighteval/models/base_model.py | 6 +++--- src/lighteval/tasks/lighteval_task.py | 4 +++- src/lighteval/tasks/requests.py | 2 +- src/lighteval/tasks/tasks_table.jsonl | 2 +- 6 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/lighteval/evaluator.py b/src/lighteval/evaluator.py index 5fa511ed..c4ccc2e7 100644 --- a/src/lighteval/evaluator.py +++ b/src/lighteval/evaluator.py @@ -89,7 +89,7 @@ def evaluate( # noqa: C901 elif request_type == RequestType.GREEDY_UNTIL_WITH_LOGITS: full_resps = lm.greedy_until_with_logits(requests, override_bs=override_bs) elif request_type == RequestType.GREEDY_UNTIL_WITH_SAMPLING: - full_resps = lm.greedy_until_with_sampling(requests, override_bs=override_bs) + full_resps = lm.greedy_until_with_sampling(requests, override_bs=override_bs) # , num_samples=self) elif request_type == RequestType.GREEDY_UNTIL_MULTI_TURN: full_resps = lm.greedy_until_multi_turn(requests, override_bs=override_bs) else: diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index ef2e3ee0..d1d85a03 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -330,7 +330,7 @@ class Metrics(Enum): maj_at_5 = SampleLevelMetric( metric="maj@5", sample_level_fn=MajAtK(k=5).compute, - category=MetricCategory.GENERATIVE_, + category=MetricCategory.GENERATIVE_SAMPLING, use_case=MetricUseCase.ACCURACY, corpus_level_fn=np.mean, higher_is_better=True, @@ -338,11 +338,21 @@ class Metrics(Enum): maj_at_8 = SampleLevelMetric( metric="maj@8", sample_level_fn=MajAtK(k=8).compute, - category=MetricCategory.GENERATIVE_, + category=MetricCategory.GENERATIVE_SAMPLING, use_case=MetricUseCase.ACCURACY, corpus_level_fn=np.mean, higher_is_better=True, ) + maj_at_8_gsm8k = SampleLevelMetric( + metric="qem", + sample_level_fn=MajAtK( + k=8, strip_strings=True, normalize_pred=gsm8k_normalizer, normalize_gold=gsm8k_normalizer + ).compute, + category=MetricCategory.GENERATIVE_SAMPLING, + use_case=MetricUseCase.MATH, + corpus_level_fn=np.mean, + higher_is_better=True, + ) mrr = SampleLevelMetric( metric="mrr", sample_level_fn=MRR().compute, diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index d31c9ebd..c51b6b0d 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -517,7 +517,7 @@ def greedy_until( requests: list[GreedyUntilRequest], returns_logits: bool = False, override_bs: Optional[int] = None, - num_samples: Optional[int] = None, + num_samples: Optional[int] = 1, ) -> list[GenerateReturn]: """ Generates responses using a greedy decoding strategy until certain ending conditions are met. @@ -638,7 +638,7 @@ def _generate( max_new_tokens: int, stop_tokens: list[str], returns_logits: Optional[bool] = False, - num_samples: Optional[int] = None, + num_samples: Optional[int] = 1, ) -> list[GenerateReturn]: """Contains the actual logic of the generation. First computes the stop sequences, then generates the predictions, then converts the outputs to GenerateReturn. 
@@ -655,7 +655,7 @@ def _generate( return_dict_in_generate=True, output_scores=True, eos_token_id=self.tokenizer.eos_token_id, - do_sample=num_samples is not None, + do_sample=num_samples > 1, num_return_sequences=num_samples, ) if returns_logits: diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 4ab6c7a6..015fe71a 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -204,7 +204,9 @@ def __init__( # noqa: C901 current_categories = [Metrics[metric].value.category for metric in self.metrics] self.has_metric_category = {category: (category in current_categories) for category in MetricCategory} # Sub-optimal system - we might want to store metric parametrisation in a yaml conf for example - self.num_samples = [int(metric.split("_")[-1]) for metric in self.metrics if "maj_at_" in metric] + self.num_samples = [ + int(metric.replace("maj_at_", "").split("_")[0]) for metric in self.metrics if "maj_at_" in metric + ] # Data processing # to use once prompt formatting is managed as a module diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py index ead236c3..de9413b7 100644 --- a/src/lighteval/tasks/requests.py +++ b/src/lighteval/tasks/requests.py @@ -170,7 +170,7 @@ class GreedyUntilWithSamplingRequest(Request): stop_sequence: Union[str, tuple[str], list[str]] generation_size: int - request_type = RequestType.GREEDY_UNTIL_WITH_SAMPLES + request_type = RequestType.GREEDY_UNTIL_WITH_SAMPLING tokenized_context: list[int] = None num_samples: int = None diff --git a/src/lighteval/tasks/tasks_table.jsonl b/src/lighteval/tasks/tasks_table.jsonl index 12e70f38..cd7f7c8e 100644 --- a/src/lighteval/tasks/tasks_table.jsonl +++ b/src/lighteval/tasks/tasks_table.jsonl @@ -442,7 +442,7 @@ {"name":"gpqa","suite":["lighteval"],"prompt_function":"gpqa","hf_repo":"Idavidrein/gpqa","hf_subset":"gpqa_main","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"gre_reading_comprehension","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"gre_reading_comprehension","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"gsm8k","suite":["leaderboard"],"prompt_function":"gsm8k","hf_repo":"gsm8k","hf_subset":"main","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":256,"metric":["quasi_exact_match_gsm8k"],"stop_sequence":["Question:","Question",":"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"gsm8k","suite":["lighteval"],"prompt_function":"gsm8k","hf_repo":"gsm8k","hf_subset":"main","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":256,"metric":["quasi_exact_match_gsm8k"],"stop_sequence":["Question:"],"output_regex":null,"frozen":false, "trust_dataset": true} 
+{"name":"gsm8k","suite":["lighteval"],"prompt_function":"gsm8k","hf_repo":"gsm8k","hf_subset":"main","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":256,"metric":["quasi_exact_match_gsm8k","maj_at_8_gsm8k"],"stop_sequence":["Question:"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"headqa:en","suite":["lighteval","headqa"],"prompt_function":"headqa","hf_repo":"lighteval/headqa_harness","hf_subset":"en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"headqa:es","suite":["lighteval","headqa"],"prompt_function":"headqa","hf_repo":"lighteval/headqa_harness","hf_subset":"es","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"hellaswag","suite":["leaderboard"],"prompt_function":"hellaswag_harness","hf_repo":"hellaswag","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} From 98c1c12a3a7844a9ed1a12922af36004cd307922 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 17 Apr 2024 17:02:19 +0000 Subject: [PATCH 03/19] testing how to pad and gather with an added dimension for the num_samples --- src/lighteval/evaluator.py | 2 +- src/lighteval/metrics/metrics_sample.py | 2 +- src/lighteval/models/base_model.py | 74 +++++++++++-------------- src/lighteval/tasks/requests.py | 12 +--- 4 files changed, 37 insertions(+), 53 deletions(-) diff --git a/src/lighteval/evaluator.py b/src/lighteval/evaluator.py index c4ccc2e7..cfbfc7d0 100644 --- a/src/lighteval/evaluator.py +++ b/src/lighteval/evaluator.py @@ -89,7 +89,7 @@ def evaluate( # noqa: C901 elif request_type == RequestType.GREEDY_UNTIL_WITH_LOGITS: full_resps = lm.greedy_until_with_logits(requests, override_bs=override_bs) elif request_type == RequestType.GREEDY_UNTIL_WITH_SAMPLING: - full_resps = lm.greedy_until_with_sampling(requests, override_bs=override_bs) # , num_samples=self) + full_resps = lm.greedy_until(requests, override_bs=override_bs) elif request_type == RequestType.GREEDY_UNTIL_MULTI_TURN: full_resps = lm.greedy_until_multi_turn(requests, override_bs=override_bs) else: diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 2d17c4af..d55b7f83 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -722,7 +722,7 @@ def compute(self, golds: list[str], predictions: list[str], formatted_doc: Doc, Returns: float: Aggregated score over the current sample's items. 
""" - if len(golds) > 0: + if len(golds) > 1: raise Exception("Cannot compute maj@k with several golds") gold = golds[0] diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index c51b6b0d..7cea6047 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -353,34 +353,6 @@ def greedy_until_with_logits( override_bs=override_bs, ) - def greedy_until_with_sampling( - self, - requests: list[GreedyUntilWithSamplingRequest], - num_samples: int, - override_bs: Optional[int] = None, - ) -> list[GenerateReturn]: - """ - Generates sequences greedily until a stopping condition is met, - returning both the generated sequences and the logits. - - Args: - requests (list[GreedyUntilWithSamplingRequest]): A list of input requests, - where each request is a tuple containing a prompt string and a dictionary of additional parameters. - override_bs (Optional[int], optional): Overrides the batch size for generation. Defaults to None. - - Returns: - list[GenerateReturn]: A list of GenerateReturn objects, - where each object contains the generated sequence and the corresponding logits. - """ - - return self.greedy_until( - requests, - returns_logits=False, - disable_tqdm=self.disable_tqdm, - override_bs=override_bs, - num_samples=num_samples, - ) - def greedy_until_multi_turn( # noqa: C901 self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None ) -> GenerateMultiTurnReturn: @@ -517,7 +489,6 @@ def greedy_until( requests: list[GreedyUntilRequest], returns_logits: bool = False, override_bs: Optional[int] = None, - num_samples: Optional[int] = 1, ) -> list[GenerateReturn]: """ Generates responses using a greedy decoding strategy until certain ending conditions are met. @@ -576,6 +547,7 @@ def greedy_until( # the case! Because of that we only use batch size of 1 stop_tokens = batch[0].stop_sequence max_new_tokens = batch[0].generation_size + num_samples = batch[0].num_samples if isinstance(batch[0], GreedyUntilWithSamplingRequest) else 1 # The main question for this step is the following: # Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk @@ -644,6 +616,7 @@ def _generate( First computes the stop sequences, then generates the predictions, then converts the outputs to GenerateReturn. """ stopping_criteria = stop_sequences_criteria(self.tokenizer, stop_sequences=stop_tokens, batch=batch) + batch_size, _ = batch.input_ids.shape # Compute model generation outputs = self.model.generate( @@ -661,7 +634,8 @@ def _generate( if returns_logits: logits = self.model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=True) generations = outputs.sequences[:, batch.input_ids.size(1) :] - generations, len_gens = self.pad_and_gather(generations) + generations = torch.reshape(generations, (batch_size, num_samples, -1)) + generations, len_gens = self.pad_and_gather(generations, num_samples=num_samples) batch.input_ids, len_ids = self.pad_and_gather(batch.input_ids) logits, len_logits = None, None @@ -679,20 +653,30 @@ def _generate( # We convert to GenerateReturn outputs all_responses = [] - for ix, (generation, batched_input, trunc, padded) in enumerate( + for ix, (batched_generations, batched_input, trunc, padded) in enumerate( zip(generations, batch.input_ids, batch.truncated, batch.padded) ): + result_generations = [] + decoded_generations = [] # Ensure the generated responses do not contain the stop sequences. 
- generation = generation[: len_gens[ix]] - decoded_generation = self.tok_decode([generation])[0] + for generation in batched_generations: + generation = generation[: len_gens[ix]] + result_generations.append(generation) + decoded_generation = self.tok_decode([generation])[0] - for term in stop_tokens: - decoded_generation = decoded_generation.split(term)[0] + for term in stop_tokens: + decoded_generation = decoded_generation.split(term)[0] + + decoded_generations.append(decoded_generation) + + if num_samples == 1: # We only return one item + result_generations = result_generations[0] + decoded_generations = decoded_generations[0] cur_response = GenerateReturn( - result=decoded_generation, + result=decoded_generations, logits=logits[ix][: len_logits[ix]] if returns_logits else None, - generated_tokens=generation, + generated_tokens=result_generations, input_tokens=batched_input[: len_ids[ix]], truncated_tokens_count=trunc.cpu().item(), padded_tokens_count=padded.cpu().item(), @@ -924,7 +908,9 @@ def prepare_batch_logprob( padded=padded, ) - def pad_and_gather(self, output_tensor: torch.Tensor, drop_last_samples: bool = True) -> torch.Tensor: + def pad_and_gather( + self, output_tensor: torch.Tensor, drop_last_samples: bool = True, num_samples: int = 1 + ) -> torch.Tensor: """ Pads the `output_tensor` to the maximum length and gathers the lengths across processes. @@ -938,15 +924,21 @@ def pad_and_gather(self, output_tensor: torch.Tensor, drop_last_samples: bool = torch.Tensor: The padded output tensor and the gathered length tensor. """ # Create a tensor of size batch_size, [output_length] * batch_size, for each process - length_tensor = torch.tensor([output_tensor.shape[1]] * output_tensor.shape[0], device=self.device) + # length_tensor = torch.tensor([output_tensor.shape[1]] * output_tensor.shape[0], device=self.device) + length_tensor = torch.zeros( + [output_tensor.shape[-1]] * num_samples * output_tensor.shape[0], device=self.device + ) if self.accelerator is not None: # Gather all the lengths, we end up with a tensor of size num_processes [output_length_1, output_length_2, ...] length_tensor = self.accelerator.gather(length_tensor) # We pad the output_tensor to the max length max_length = length_tensor.max().item() - output_tensor = F.pad( - output_tensor, (0, max_length - output_tensor.shape[1], 0, 0), value=self.tokenizer.pad_token_id + padding = ( + (0, max_length - output_tensor.shape[1], 0, 0, 0, 0) + if num_samples > 1 + else (0, max_length - output_tensor.shape[1], 0, 0) ) + output_tensor = F.pad(output_tensor, padding, value=self.tokenizer.pad_token_id) if self.accelerator: if drop_last_samples: output_tensor = self.accelerator.gather_for_metrics(output_tensor) diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py index de9413b7..952fd430 100644 --- a/src/lighteval/tasks/requests.py +++ b/src/lighteval/tasks/requests.py @@ -139,25 +139,20 @@ class GreedyUntilMultiTurnRequest(Request): @dataclass -class GreedyUntilWithLogitsRequest(Request): +class GreedyUntilWithLogitsRequest(GreedyUntilRequest): """ Represents a request for generating text using the Greedy-Until strategy but returning the logits. Attributes: - stop_sequence (str): The sequence of tokens that indicates when to stop generating text. - generation_size (int): The maximum number of tokens to generate. request_type (RequestType): The type of the request (GREEDY_UNTIL_WITH_LOGITS). 
""" - stop_sequence: Union[str, tuple[str], list[str]] - generation_size: int request_type = RequestType.GREEDY_UNTIL_WITH_LOGITS - tokenized_context: list[int] = None @dataclass -class GreedyUntilWithSamplingRequest(Request): +class GreedyUntilWithSamplingRequest(GreedyUntilRequest): """ Represents a request for generating text using the Greedy-Until strategy but returning the logits. @@ -168,10 +163,7 @@ class GreedyUntilWithSamplingRequest(Request): request_type (RequestType): The type of the request (GREEDY_UNTIL_WITH_LOGITS). """ - stop_sequence: Union[str, tuple[str], list[str]] - generation_size: int request_type = RequestType.GREEDY_UNTIL_WITH_SAMPLING - tokenized_context: list[int] = None num_samples: int = None From 1549fda239dba7402dafa1ebda18c55a09f7d0d8 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Fri, 19 Apr 2024 12:42:07 +0000 Subject: [PATCH 04/19] now working, need to check why the metric is not displayed --- src/lighteval/logging/info_loggers.py | 5 ++++- src/lighteval/models/base_model.py | 14 ++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index 83fe981e..c211d2e4 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -350,7 +350,10 @@ def log( ): pred_saved = True pass # should we log something? - if task.has_metric_category[MetricCategory.GENERATIVE]: + if ( + task.has_metric_category[MetricCategory.GENERATIVE] + or task.has_metric_category[MetricCategory.GENERATIVE_SAMPLING] + ): detail.gold = doc.get_golds() pred_saved = True if task.has_metric_category[MetricCategory.GENERATIVE_LOGPROB]: diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index 7cea6047..d9acf1a4 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -909,7 +909,7 @@ def prepare_batch_logprob( ) def pad_and_gather( - self, output_tensor: torch.Tensor, drop_last_samples: bool = True, num_samples: int = 1 + self, output_tensor: torch.Tensor, drop_last_samples: bool = True, num_samples: int = None ) -> torch.Tensor: """ Pads the `output_tensor` to the maximum length and gathers the lengths across processes. @@ -924,19 +924,17 @@ def pad_and_gather( torch.Tensor: The padded output tensor and the gathered length tensor. """ # Create a tensor of size batch_size, [output_length] * batch_size, for each process - # length_tensor = torch.tensor([output_tensor.shape[1]] * output_tensor.shape[0], device=self.device) - length_tensor = torch.zeros( - [output_tensor.shape[-1]] * num_samples * output_tensor.shape[0], device=self.device - ) + # output_tensor can be of size: batch_size * num_samples * length_item or just batch_size * length_item + length_tensor = torch.tensor([output_tensor.shape[-1]] * output_tensor.shape[0], device=self.device) if self.accelerator is not None: # Gather all the lengths, we end up with a tensor of size num_processes [output_length_1, output_length_2, ...] 
length_tensor = self.accelerator.gather(length_tensor) # We pad the output_tensor to the max length max_length = length_tensor.max().item() padding = ( - (0, max_length - output_tensor.shape[1], 0, 0, 0, 0) - if num_samples > 1 - else (0, max_length - output_tensor.shape[1], 0, 0) + (0, max_length - output_tensor.shape[-1], 0, 0, 0, 0) + if num_samples is not None + else (0, max_length - output_tensor.shape[-1], 0, 0) ) output_tensor = F.pad(output_tensor, padding, value=self.tokenizer.pad_token_id) if self.accelerator: From 8adfc0772e184555aa5ce3095102fd1fde5d3012 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Fri, 19 Apr 2024 14:00:44 +0000 Subject: [PATCH 05/19] seems to be working! --- src/lighteval/metrics/__init__.py | 50 ++++++++++++++++++++----- src/lighteval/metrics/metrics.py | 2 +- src/lighteval/metrics/metrics_sample.py | 35 ++++++++--------- src/lighteval/tasks/lighteval_task.py | 12 ++++-- 4 files changed, 66 insertions(+), 33 deletions(-) diff --git a/src/lighteval/metrics/__init__.py b/src/lighteval/metrics/__init__.py index 68727b8e..2458496b 100644 --- a/src/lighteval/metrics/__init__.py +++ b/src/lighteval/metrics/__init__.py @@ -70,12 +70,15 @@ def apply_generative_metric(results: list[ModelReturn], formatted_doc: Doc, metr outputs = {} # Post processing prediction - pred_raw = results.pop(0).result - if output_regex is not None: - pred = next(iter(re.findall(output_regex, pred_raw)), "") - else: - pred = pred_raw - pred = as_list(pred) + preds_raw = as_list(results.pop(0).result) + preds = [] + + for pred_raw in preds_raw: + if output_regex is not None: + pred = next(iter(re.findall(output_regex, pred_raw)), "") + else: + pred = pred_raw + preds.append(pred) # Extracting gold try: @@ -87,12 +90,41 @@ def apply_generative_metric(results: list[ModelReturn], formatted_doc: Doc, metr # if "label_to_choices" in formatted_doc: if formatted_doc.specific is not None and "label_to_choices" in formatted_doc.specific: # Helm predicts on labels keys (A/B/C/D), but computes metrics on choices - pred = [formatted_doc.specific["label_to_choices"].get(p) for p in pred] + preds = [formatted_doc.specific["label_to_choices"].get(p) for p in preds] golds = [formatted_doc.specific["label_to_choices"][g] for g in golds] for metric in metrics: - if Metrics[metric].value.category in [MetricCategory.GENERATIVE, MetricCategory.GENERATIVE_SAMPLING]: - outputs.update(Metrics[metric].value.compute(golds=golds, predictions=pred, formatted_doc=formatted_doc)) + if Metrics[metric].value.category == MetricCategory.GENERATIVE: + outputs.update(Metrics[metric].value.compute(golds=golds, predictions=preds, formatted_doc=formatted_doc)) + + return results, outputs + + +def apply_generative_sampling_metric( + results: list[ModelReturn], formatted_doc: Doc, metrics: list[str], output_regex=None +): + outputs = {} + + # Post processing prediction + preds_raw = as_list(results.pop(0).result) + preds = [] + + for pred_raw in preds_raw: + if output_regex is not None: + pred = next(iter(re.findall(output_regex, pred_raw)), "") + else: + pred = pred_raw + preds.append(pred) + + # Extracting gold + try: + golds = formatted_doc.get_golds() + except (KeyError, IndexError): + golds = None + + for metric in metrics: + if Metrics[metric].value.category == MetricCategory.GENERATIVE_SAMPLING: + outputs.update(Metrics[metric].value.compute(golds=golds, predictions=preds, formatted_doc=formatted_doc)) return results, outputs diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py 
index d1d85a03..f9430afd 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -344,7 +344,7 @@ class Metrics(Enum): higher_is_better=True, ) maj_at_8_gsm8k = SampleLevelMetric( - metric="qem", + metric="maj@8", sample_level_fn=MajAtK( k=8, strip_strings=True, normalize_pred=gsm8k_normalizer, normalize_gold=gsm8k_normalizer ).compute, diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index d55b7f83..5b286afb 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -725,38 +725,35 @@ def compute(self, golds: list[str], predictions: list[str], formatted_doc: Doc, if len(golds) > 1: raise Exception("Cannot compute maj@k with several golds") - gold = golds[0] + gold = self.get_processed_gold(golds[0]) all_answers = [] for pred in predictions[: self.k]: - all_answers.append(self.compute_one_item(gold=gold, pred=pred)) - return 1 if sum(all_answers) / len(all_answers) >= 0.5 else 0 + all_answers.append(self.get_processed_pred(pred=pred)) + majority_prediction = max(all_answers, key=all_answers.count) + return self.compute_score(majority_prediction, gold) - def compute_one_item( - self, - gold: str, - pred: str, - ) -> float: - """Compares two strings only. + def get_processed_gold(self, gold: str) -> float: + if self.strip_strings: + gold = gold.strip() - Args: - gold (str): One of the possible references - pred (str): One of the possible predictions + if self.normalize_gold: + gold = self.normalize_gold(gold) - Returns: - float: The exact match score. Will be 1 for a match, 0 otherwise. - """ + return gold + + def get_processed_pred(self, pred: str) -> float: if not pred: - return 0 + return "" if self.strip_strings: - gold = gold.strip() pred = pred.strip() - if self.normalize_gold: - gold = self.normalize_gold(gold) if self.normalize_pred: pred = self.normalize_pred(pred) + return pred + + def compute_score(self, pred: str, gold: str): if self.type_exact_match == "prefix": return 1 if pred.startswith(gold) else 0 if self.type_exact_match == "suffix": diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 015fe71a..832f5367 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -34,6 +34,7 @@ from lighteval.metrics import ( apply_generative_logprob_metric, apply_generative_metric, + apply_generative_sampling_metric, apply_llm_as_judge_metric, apply_multichoice_metric, apply_multichoice_metric_one_token, @@ -563,14 +564,17 @@ def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dic results=results, formatted_doc=formatted_doc, metrics=self.metrics ) outputs.update(cur_outputs) - if ( - self.has_metric_category[MetricCategory.GENERATIVE] - or self.has_metric_category[MetricCategory.GENERATIVE_SAMPLING] - ): + if self.has_metric_category[MetricCategory.GENERATIVE]: results, cur_outputs = apply_generative_metric( results=results, formatted_doc=formatted_doc, metrics=self.metrics, output_regex=self.output_regex ) outputs.update(cur_outputs) + if self.has_metric_category[MetricCategory.GENERATIVE_SAMPLING]: + results, cur_outputs = apply_generative_sampling_metric( + results=results, formatted_doc=formatted_doc, metrics=self.metrics, output_regex=self.output_regex + ) + outputs.update(cur_outputs) + if self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB]: results, cur_outputs = apply_generative_logprob_metric( results=results, formatted_doc=formatted_doc, 
metrics=self.metrics From 24d46922a5acc8e9c23e08cbc02a202774dee2ac Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Fri, 19 Apr 2024 14:15:35 +0000 Subject: [PATCH 06/19] add maj at 4 for math with preprocessing --- src/lighteval/metrics/metrics.py | 10 ++++++++++ src/lighteval/tasks/tasks_table.jsonl | 14 +++++++------- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index f9430afd..07d5c918 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -327,6 +327,16 @@ class Metrics(Enum): corpus_level_fn=matthews_corrcoef, higher_is_better=True, ) + maj_at_4_math = SampleLevelMetric( + metric="maj@4", + sample_level_fn=MajAtK( + k=4, strip_strings=True, normalize_pred=math_normalizer, normalize_gold=math_normalizer_gold + ).compute, + category=MetricCategory.GENERATIVE_SAMPLING, + use_case=MetricUseCase.MATH, + corpus_level_fn=np.mean, + higher_is_better=True, + ) maj_at_5 = SampleLevelMetric( metric="maj@5", sample_level_fn=MajAtK(k=5).compute, diff --git a/src/lighteval/tasks/tasks_table.jsonl b/src/lighteval/tasks/tasks_table.jsonl index cd7f7c8e..ecaeff04 100644 --- a/src/lighteval/tasks/tasks_table.jsonl +++ b/src/lighteval/tasks/tasks_table.jsonl @@ -540,13 +540,13 @@ {"name":"lsat_qa:grouping","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"grouping","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"lsat_qa:miscellaneous","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"miscellaneous","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"lsat_qa:ordering","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"ordering","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"math:algebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"math:counting_and_probability","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"counting_and_probability","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"math:geometry","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"geometry","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"math:intermediate_algebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"intermediate_algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"math:number_theory","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"number_theory","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"math:prealgebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"prealgebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"math:precalculus","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"precalculus","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"math:algebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"math:counting_and_probability","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"counting_and_probability","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
+{"name":"math:geometry","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"geometry","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"math:intermediate_algebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"intermediate_algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"math:number_theory","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"number_theory","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"math:prealgebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"prealgebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"math:precalculus","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"precalculus","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mathematical_induction","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"mathematical_induction","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mathqa","suite":["lighteval"],"prompt_function":"mathqa","hf_repo":"math_qa","hf_subset":"default","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"matrixshapes","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"matrixshapes","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} From 3a1c1c4d32d9ab744e8b6c75ea31623b2a3e3d63 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Mon, 22 Apr 2024 09:38:05 +0000 Subject: [PATCH 07/19] Uses a homogeneized system for all greedy evaluations - we can do evals in 
one single step --- src/lighteval/data.py | 3 +- src/lighteval/evaluator.py | 4 -- src/lighteval/metrics/__init__.py | 40 +------------ src/lighteval/models/abstract_model.py | 28 ---------- src/lighteval/models/base_model.py | 35 +----------- src/lighteval/models/endpoint_model.py | 33 ++--------- src/lighteval/models/nanotron_model.py | 17 +----- src/lighteval/tasks/lighteval_task.py | 77 +++++++++----------------- src/lighteval/tasks/requests.py | 33 +---------- tests/test_unit_harness_metrics.py | 12 ++-- 10 files changed, 44 insertions(+), 238 deletions(-) diff --git a/src/lighteval/data.py b/src/lighteval/data.py index 711b0749..247cff04 100644 --- a/src/lighteval/data.py +++ b/src/lighteval/data.py @@ -29,7 +29,6 @@ from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.tasks.requests import ( GreedyUntilRequest, - GreedyUntilWithLogitsRequest, LoglikelihoodRequest, LoglikelihoodRollingRequest, LoglikelihoodSingleTokenRequest, @@ -205,7 +204,7 @@ def _sorting_criteria(self, request: LoglikelihoodSingleTokenRequest) -> int: class GenerativeTaskDataset(DynamicBatchDataset): - def _sorting_criteria(self, request: GreedyUntilRequest | GreedyUntilWithLogitsRequest) -> int: + def _sorting_criteria(self, request: GreedyUntilRequest) -> int: """ Collate function for generating batches. diff --git a/src/lighteval/evaluator.py b/src/lighteval/evaluator.py index cfbfc7d0..e837b922 100644 --- a/src/lighteval/evaluator.py +++ b/src/lighteval/evaluator.py @@ -86,10 +86,6 @@ def evaluate( # noqa: C901 full_resps = lm.loglikelihood_rolling(requests, override_bs=override_bs) elif request_type == RequestType.GREEDY_UNTIL: full_resps = lm.greedy_until(requests, override_bs=override_bs) - elif request_type == RequestType.GREEDY_UNTIL_WITH_LOGITS: - full_resps = lm.greedy_until_with_logits(requests, override_bs=override_bs) - elif request_type == RequestType.GREEDY_UNTIL_WITH_SAMPLING: - full_resps = lm.greedy_until(requests, override_bs=override_bs) elif request_type == RequestType.GREEDY_UNTIL_MULTI_TURN: full_resps = lm.greedy_until_multi_turn(requests, override_bs=override_bs) else: diff --git a/src/lighteval/metrics/__init__.py b/src/lighteval/metrics/__init__.py index 2458496b..7577c771 100644 --- a/src/lighteval/metrics/__init__.py +++ b/src/lighteval/metrics/__init__.py @@ -96,46 +96,10 @@ def apply_generative_metric(results: list[ModelReturn], formatted_doc: Doc, metr for metric in metrics: if Metrics[metric].value.category == MetricCategory.GENERATIVE: outputs.update(Metrics[metric].value.compute(golds=golds, predictions=preds, formatted_doc=formatted_doc)) - - return results, outputs - - -def apply_generative_sampling_metric( - results: list[ModelReturn], formatted_doc: Doc, metrics: list[str], output_regex=None -): - outputs = {} - - # Post processing prediction - preds_raw = as_list(results.pop(0).result) - preds = [] - - for pred_raw in preds_raw: - if output_regex is not None: - pred = next(iter(re.findall(output_regex, pred_raw)), "") - else: - pred = pred_raw - preds.append(pred) - - # Extracting gold - try: - golds = formatted_doc.get_golds() - except (KeyError, IndexError): - golds = None - - for metric in metrics: - if Metrics[metric].value.category == MetricCategory.GENERATIVE_SAMPLING: - outputs.update(Metrics[metric].value.compute(golds=golds, predictions=preds, formatted_doc=formatted_doc)) - - return results, outputs - - -def apply_generative_logprob_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[str]): - # Applied to no metric atm, but 
we have the model side logic - outputs = {} - - for metric in metrics: if Metrics[metric].value.category == MetricCategory.GENERATIVE_LOGPROB: outputs.update(Metrics[metric].value.compute(results=results, formatted_doc=formatted_doc)) + if Metrics[metric].value.category == MetricCategory.GENERATIVE_SAMPLING: + outputs.update(Metrics[metric].value.compute(golds=golds, predictions=preds, formatted_doc=formatted_doc)) return results, outputs diff --git a/src/lighteval/models/abstract_model.py b/src/lighteval/models/abstract_model.py index ccc49146..754a6144 100644 --- a/src/lighteval/models/abstract_model.py +++ b/src/lighteval/models/abstract_model.py @@ -36,7 +36,6 @@ from lighteval.tasks.requests import ( GreedyUntilMultiTurnRequest, GreedyUntilRequest, - GreedyUntilWithLogitsRequest, LoglikelihoodRequest, LoglikelihoodRollingRequest, LoglikelihoodSingleTokenRequest, @@ -83,31 +82,6 @@ def max_length(self) -> int: def disable_tqdm(self) -> bool: raise NotImplementedError - def greedy_until_with_logits( - self, - requests: list[GreedyUntilWithLogitsRequest], - override_bs: Optional[int] = None, - ) -> list[GenerateReturn]: - """ - Generates sequences greedily until a stopping condition is met, - returning both the generated sequences and the logits. - - Args: - requests (list[tuple[str, dict]]): A list of input requests, - where each request is a tuple containing a prompt string and a dictionary of additional parameters. - disable_tqdm (bool, optional): Whether to disable the tqdm progress bar. Defaults to False. - override_bs (Optional[int], optional): Overrides the batch size for generation. Defaults to None. - - Returns: - list[GenerateReturn]: A list of GenerateReturn objects, - where each object contains the generated sequence and the corresponding logits. - """ - return self.greedy_until( - requests=requests, - override_bs=override_bs, - returns_logits=True, - ) - def greedy_until_multi_turn( # noqa: C901 self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None ) -> GenerateMultiTurnReturn: @@ -118,7 +92,6 @@ def greedy_until_multi_turn( # noqa: C901 def greedy_until( self, requests: list[GreedyUntilRequest], - returns_logits: bool = False, override_bs: Optional[int] = None, ) -> list[GenerateReturn]: """ @@ -126,7 +99,6 @@ def greedy_until( Args: requests (list[Request]): list of requests containing the context and ending conditions. - returns_logits (bool, optional): Whether to return the logits of the generated responses. Defaults to False. disable_tqdm (bool, optional): Whether to disable the progress bar. Defaults to False. override_bs (int, optional): Override the batch size for generation. Defaults to None. 
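[Editor's note] To make the consolidation in this patch concrete: instead of one request class and one model method per generation flavour, a single greedy request carries `use_logits` and `num_samples`, and the remaining `greedy_until` path reads them per batch (see the `base_model.py` diff that follows). Below is a minimal sketch of that dispatch idea; the request class and helper are illustrative stand-ins, not lighteval's actual classes.

```python
from dataclasses import dataclass, field


@dataclass
class GreedyGenerationRequest:
    """Illustrative stand-in for the consolidated greedy request (not lighteval's real class)."""

    context: str
    stop_sequence: list[str] = field(default_factory=list)
    generation_size: int = 256
    use_logits: bool = False  # replaces the dedicated *_WITH_LOGITS request type
    num_samples: int = 1      # replaces the dedicated *_WITH_SAMPLING request type


def generation_kwargs(request: GreedyGenerationRequest) -> dict:
    """Map request fields to generate() arguments, mirroring the per-batch reads in the diff."""
    return {
        "max_new_tokens": request.generation_size,
        # Greedy decoding when a single output is enough, sampling when several are requested.
        "do_sample": request.num_samples > 1,
        "num_return_sequences": request.num_samples,
    }


print(generation_kwargs(GreedyGenerationRequest(context="Question: 2 + 2 = ?", num_samples=8)))
# {'max_new_tokens': 256, 'do_sample': True, 'num_return_sequences': 8}
```
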
diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index d9acf1a4..7d9bd8d2 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -46,8 +46,6 @@ from lighteval.tasks.requests import ( GreedyUntilMultiTurnRequest, GreedyUntilRequest, - GreedyUntilWithLogitsRequest, - GreedyUntilWithSamplingRequest, LoglikelihoodRequest, LoglikelihoodRollingRequest, LoglikelihoodSingleTokenRequest, @@ -327,32 +325,6 @@ def forward_batch(batch_size): hlog(f"Determined largest batch size: {batch_size}") return batch_size - def greedy_until_with_logits( - self, - requests: list[GreedyUntilWithLogitsRequest], - override_bs: Optional[int] = None, - ) -> list[GenerateReturn]: - """ - Generates sequences greedily until a stopping condition is met, - returning both the generated sequences and the logits. - - Args: - requests (list[GreedyUntilWithLogitsRequest]): A list of input requests, - where each request is a tuple containing a prompt string and a dictionary of additional parameters. - override_bs (Optional[int], optional): Overrides the batch size for generation. Defaults to None. - - Returns: - list[GenerateReturn]: A list of GenerateReturn objects, - where each object contains the generated sequence and the corresponding logits. - """ - - return self.greedy_until( - requests, - returns_logits=True, - disable_tqdm=self.disable_tqdm, - override_bs=override_bs, - ) - def greedy_until_multi_turn( # noqa: C901 self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None ) -> GenerateMultiTurnReturn: @@ -487,7 +459,6 @@ def greedy_until_multi_turn( # noqa: C901 def greedy_until( self, requests: list[GreedyUntilRequest], - returns_logits: bool = False, override_bs: Optional[int] = None, ) -> list[GenerateReturn]: """ @@ -495,7 +466,6 @@ def greedy_until( Args: requests (list[Request]): list of requests containing the context and ending conditions. - returns_logits (bool, optional): Whether to return the logits of the generated responses. Defaults to False. override_bs (int, optional): Override the batch size for generation. Defaults to None. Returns: @@ -543,11 +513,12 @@ def greedy_until( dataloader, desc="Greedy generation", position=1, leave=False, disable=self.disable_tqdm ): # NOTE: we are assuming all items in a batch behave similarly (same - # stop_tokens and max_tokens genrated) which is not necessarily + # stop_tokens and max_tokens generated) which is not necessarily # the case! 
Because of that we only use batch size of 1 stop_tokens = batch[0].stop_sequence max_new_tokens = batch[0].generation_size - num_samples = batch[0].num_samples if isinstance(batch[0], GreedyUntilWithSamplingRequest) else 1 + returns_logits = batch[0].use_logits + num_samples = batch[0].num_samples # The main question for this step is the following: # Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk diff --git a/src/lighteval/models/endpoint_model.py b/src/lighteval/models/endpoint_model.py index b118a93b..03e184bc 100644 --- a/src/lighteval/models/endpoint_model.py +++ b/src/lighteval/models/endpoint_model.py @@ -44,7 +44,6 @@ from lighteval.models.model_output import GenerateReturn, LoglikelihoodReturn, LoglikelihoodSingleTokenReturn from lighteval.tasks.requests import ( GreedyUntilRequest, - GreedyUntilWithLogitsRequest, LoglikelihoodRequest, LoglikelihoodRollingRequest, LoglikelihoodSingleTokenRequest, @@ -182,7 +181,7 @@ def __process_request(self, context: str, stop_tokens: list[str], max_tokens: in async def __async_process_batch_generate( self, - requests: list[GreedyUntilRequest | GreedyUntilWithLogitsRequest], + requests: list[GreedyUntilRequest], ) -> list[TextGenerationOutput]: return await asyncio.gather( *[ @@ -197,7 +196,7 @@ async def __async_process_batch_generate( def __process_batch_generate( self, - requests: list[GreedyUntilRequest | GreedyUntilWithLogitsRequest], + requests: list[GreedyUntilRequest], ) -> list[TextGenerationOutput]: return [ self.__process_request( @@ -234,35 +233,9 @@ def __process_batch_logprob( for request in requests ] - def greedy_until_with_logits( - self, - requests: list[GreedyUntilWithLogitsRequest], - override_bs: Optional[int] = None, - ) -> list[GenerateReturn]: - """ - Generates sequences greedily until a stopping condition is met, - returning both the generated sequences and the logits. - - Args: - requests (list[tuple[str, dict]]): A list of input requests, - where each request is a tuple containing a prompt string and a dictionary of additional parameters. - override_bs (Optional[int], optional): Overrides the batch size for generation. Defaults to None. - - Returns: - list[GenerateReturn]: A list of GenerateReturn objects, - where each object contains the generated sequence and the corresponding logits. - """ - - return self.greedy_until( - requests, - returns_logits=True, - override_bs=override_bs, - ) - def greedy_until( self, requests: List[GreedyUntilRequest], - returns_logits: bool = False, override_bs: Optional[int] = None, ) -> List[GenerateReturn]: for request in requests: @@ -286,6 +259,8 @@ def greedy_until( dataloader, desc="Greedy generation", position=1, leave=False, disable=self.disable_tqdm ): # the `returns_logits` flag is only used to filter the results, we always request the full details. 
+ returns_logits = batch[0].use_logits + if self.use_async: responses = asyncio.run(self.__async_process_batch_generate(batch)) else: diff --git a/src/lighteval/models/nanotron_model.py b/src/lighteval/models/nanotron_model.py index 69ad420f..eecd18fb 100644 --- a/src/lighteval/models/nanotron_model.py +++ b/src/lighteval/models/nanotron_model.py @@ -351,21 +351,6 @@ def tok_decode(self, tokens: torch.LongTensor) -> List[str]: def _model_call(self, inputs: torch.Tensor) -> torch.Tensor: return self.model(inputs) - def greedy_until_with_logits( - self, - requests: list[tuple[str, dict]], - disable_tqdm: bool = False, - override_bs=None, - dataset_splits: int = 4, - ) -> list[GenerateReturn]: - return self.greedy_until( - requests, - returns_logits=True, - disable_tqdm=disable_tqdm, - override_bs=override_bs, - dataset_splits=dataset_splits, - ) - def _encode_pair(self, context, continuation): n_spaces = len(context) - len(context.rstrip()) if n_spaces > 0: @@ -1130,7 +1115,6 @@ def _loglikelihood_tokens( def greedy_until( self, requests: List[GreedyUntilRequest], - returns_logits=False, disable_tqdm: bool = False, override_bs=None, dataset_splits: int = 1, @@ -1216,6 +1200,7 @@ def greedy_until( # the maximum allowed generation size for the batch, unless we want to force truncation # need to pass them somewhere ! stop_tokens = batch[0].stop_sequence max_new_tokens = batch[0].generation_size + returns_logits = batch[0].use_logits # The main question for this step is the following: # Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 832f5367..cef8d5d0 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -32,9 +32,7 @@ from lighteval.few_shot_manager import FewShotSampler from lighteval.logging.hierarchical_logger import hlog, hlog_warn from lighteval.metrics import ( - apply_generative_logprob_metric, apply_generative_metric, - apply_generative_sampling_metric, apply_llm_as_judge_metric, apply_multichoice_metric, apply_multichoice_metric_one_token, @@ -48,8 +46,6 @@ Doc, GreedyUntilMultiTurnRequest, GreedyUntilRequest, - GreedyUntilWithLogitsRequest, - GreedyUntilWithSamplingRequest, LoglikelihoodRequest, LoglikelihoodRollingRequest, LoglikelihoodSingleTokenRequest, @@ -205,7 +201,8 @@ def __init__( # noqa: C901 current_categories = [Metrics[metric].value.category for metric in self.metrics] self.has_metric_category = {category: (category in current_categories) for category in MetricCategory} # Sub-optimal system - we might want to store metric parametrisation in a yaml conf for example - self.num_samples = [ + # We assume num_samples always contains 1 (for base generative evals) + self.num_samples = [1] + [ int(metric.replace("maj_at_", "").split("_")[0]) for metric in self.metrics if "maj_at_" in metric ] @@ -415,27 +412,27 @@ def get_request_type(self) -> list[RequestType]: # noqa C901 request_types = [] if self.has_metric_category[MetricCategory.TARGET_PERPLEXITY]: request_types.append(RequestType.LOGLIKELIHOOD) + if self.has_metric_category[MetricCategory.MULTICHOICE]: + request_types.append(RequestType.LOGLIKELIHOOD) + if self.has_metric_category[MetricCategory.MULTICHOICE_ONE_TOKEN]: + request_types.append(RequestType.LOGLIKELIHOOD_SINGLE_TOKEN) if self.has_metric_category[MetricCategory.PERPLEXITY]: request_types.append(RequestType.LOGLIKELIHOOD_ROLLING) if 
self.has_metric_category[MetricCategory.GENERATIVE]: request_types.append(RequestType.GREEDY_UNTIL) - if self.has_metric_category[MetricCategory.LLM_AS_JUDGE_MULTI_TURN]: - request_types.append(RequestType.GREEDY_UNTIL_MULTI_TURN) - if self.has_metric_category[MetricCategory.LLM_AS_JUDGE]: - request_types.append(RequestType.GREEDY_UNTIL) if self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB]: - request_types.append(RequestType.GREEDY_UNTIL_WITH_LOGITS) + request_types.append(RequestType.GREEDY_UNTIL) if self.has_metric_category[MetricCategory.GENERATIVE_SAMPLING]: - request_types.append(RequestType.GREEDY_UNTIL_WITH_SAMPLING) - if self.has_metric_category[MetricCategory.MULTICHOICE]: - request_types.append(RequestType.LOGLIKELIHOOD) - if self.has_metric_category[MetricCategory.MULTICHOICE_ONE_TOKEN]: - request_types.append(RequestType.LOGLIKELIHOOD_SINGLE_TOKEN) + request_types.append(RequestType.GREEDY_UNTIL) + if self.has_metric_category[MetricCategory.LLM_AS_JUDGE]: + request_types.append(RequestType.GREEDY_UNTIL) + if self.has_metric_category[MetricCategory.LLM_AS_JUDGE_MULTI_TURN]: + request_types.append(RequestType.GREEDY_UNTIL_MULTI_TURN) if len(request_types) == 0: raise NotImplementedError(f"Request type not implemented for task {self.name}") - return request_types + return list(set(request_types)) def construct_requests( self, formatted_doc: Doc, context: str, document_id_seed: str, current_task_name: str @@ -472,7 +469,13 @@ def construct_requests( task_name=current_task_name, example_index=document_id_seed, request_index=0, context=context ) ] - if self.has_metric_category[MetricCategory.GENERATIVE]: + if ( + self.has_metric_category[MetricCategory.GENERATIVE_SAMPLING] + or self.has_metric_category[MetricCategory.GENERATIVE] + or self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB] + ): + # All these tasks require the same generation process - we can do them in one step + use_logits = self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB] requests[RequestType.GREEDY_UNTIL] += [ GreedyUntilRequest( task_name=current_task_name, @@ -481,29 +484,8 @@ def construct_requests( context=context, stop_sequence=self.stop_sequence, generation_size=self.generation_size, - ) - ] - if self.has_metric_category[MetricCategory.GENERATIVE_SAMPLING]: - requests[RequestType.GREEDY_UNTIL_WITH_SAMPLING] += [ - GreedyUntilWithSamplingRequest( - task_name=current_task_name, - example_index=document_id_seed, - request_index=0, - context=context, - stop_sequence=self.stop_sequence, - generation_size=self.generation_size, num_samples=max(self.num_samples), # If we have several samplings to apply, we use the max - ) - ] - if self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB]: - requests[RequestType.GREEDY_UNTIL_WITH_LOGITS] += [ - GreedyUntilWithLogitsRequest( - task_name=current_task_name, - example_index=document_id_seed, - request_index=0, - context=context, - stop_sequence=self.stop_sequence, - generation_size=self.generation_size, + use_logits=use_logits, ) ] if self.has_metric_category[MetricCategory.MULTICHOICE]: @@ -564,22 +546,15 @@ def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dic results=results, formatted_doc=formatted_doc, metrics=self.metrics ) outputs.update(cur_outputs) - if self.has_metric_category[MetricCategory.GENERATIVE]: + if ( + self.has_metric_category[MetricCategory.GENERATIVE] + or self.has_metric_category[MetricCategory.GENERATIVE_SAMPLING] + or self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB] + ): results, 
cur_outputs = apply_generative_metric( results=results, formatted_doc=formatted_doc, metrics=self.metrics, output_regex=self.output_regex ) outputs.update(cur_outputs) - if self.has_metric_category[MetricCategory.GENERATIVE_SAMPLING]: - results, cur_outputs = apply_generative_sampling_metric( - results=results, formatted_doc=formatted_doc, metrics=self.metrics, output_regex=self.output_regex - ) - outputs.update(cur_outputs) - - if self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB]: - results, cur_outputs = apply_generative_logprob_metric( - results=results, formatted_doc=formatted_doc, metrics=self.metrics - ) - outputs.update(cur_outputs) if self.has_metric_category[MetricCategory.MULTICHOICE]: results, cur_outputs = apply_multichoice_metric( results=results, formatted_doc=formatted_doc, metrics=self.metrics diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py index 952fd430..51abf61d 100644 --- a/src/lighteval/tasks/requests.py +++ b/src/lighteval/tasks/requests.py @@ -34,8 +34,6 @@ class RequestType(Enum): LOGLIKELIHOOD_ROLLING = auto() GREEDY_UNTIL = auto() GREEDY_UNTIL_MULTI_TURN = auto() - GREEDY_UNTIL_WITH_LOGITS = auto() - GREEDY_UNTIL_WITH_SAMPLING = auto() @dataclass @@ -120,6 +118,8 @@ class GreedyUntilRequest(Request): generation_size: int request_type = RequestType.GREEDY_UNTIL tokenized_context: list[int] = None + num_samples: int = None + use_logits: bool = False @dataclass @@ -138,35 +138,6 @@ class GreedyUntilMultiTurnRequest(Request): request_type = RequestType.GREEDY_UNTIL_MULTI_TURN -@dataclass -class GreedyUntilWithLogitsRequest(GreedyUntilRequest): - """ - Represents a request for generating text using the Greedy-Until strategy but - returning the logits. - - Attributes: - request_type (RequestType): The type of the request (GREEDY_UNTIL_WITH_LOGITS). - """ - - request_type = RequestType.GREEDY_UNTIL_WITH_LOGITS - - -@dataclass -class GreedyUntilWithSamplingRequest(GreedyUntilRequest): - """ - Represents a request for generating text using the Greedy-Until strategy but - returning the logits. - - Attributes: - stop_sequence (str): The sequence of tokens that indicates when to stop generating text. - generation_size (int): The maximum number of tokens to generate. - request_type (RequestType): The type of the request (GREEDY_UNTIL_WITH_LOGITS). - """ - - request_type = RequestType.GREEDY_UNTIL_WITH_SAMPLING - num_samples: int = None - - class TaskExampleId(NamedTuple): """ Represents the identifier for an example in a task. 
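(Illustrative note, not part of the patch.) With `num_samples` now carried on `GreedyUntilRequest`, the sampling metrics introduced in this series (e.g. `maj_at_5`, `maj_at_8`) can score several generations per document. The snippet below is a minimal, self-contained sketch of that majority-vote idea over plain string answers; it is not the exact `MajAtK` implementation, which additionally supports normalisation and prefix/suffix matching.

```python
# Hedged sketch of maj@k scoring: take k sampled generations, pick the most
# frequent answer, and compare that single answer to the gold.
from collections import Counter

def majority_vote_accuracy(gold: str, sampled_predictions: list[str], k: int) -> int:
    # Keep only the first k samples, mirroring the num_samples requested above.
    votes = Counter(pred.strip() for pred in sampled_predictions[:k])
    majority_answer, _ = votes.most_common(1)[0]
    return int(majority_answer == gold.strip())

# Example: maj@5 on a GSM8K-style numeric answer.
print(majority_vote_accuracy("42", ["42", "41", "42", "42", "7"], k=5))  # -> 1
```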
diff --git a/tests/test_unit_harness_metrics.py b/tests/test_unit_harness_metrics.py index 35f6634f..d8a6503a 100644 --- a/tests/test_unit_harness_metrics.py +++ b/tests/test_unit_harness_metrics.py @@ -26,7 +26,6 @@ import pytest from lighteval.metrics import ( - apply_generative_logprob_metric, apply_generative_metric, apply_multichoice_metric, apply_multichoice_metric_one_token, @@ -129,14 +128,13 @@ def apply_metric(metric, results, formatted_doc: Doc): if Metrics[metric].value.category == MetricCategory.PERPLEXITY: _, cur_outputs = apply_perplexity_metric(results=results, formatted_doc=formatted_doc, metrics=[metric]) return cur_outputs - if Metrics[metric].value.category == MetricCategory.GENERATIVE: + if Metrics[metric].value.category in [ + MetricCategory.GENERATIVE, + MetricCategory.GENERATIVE_LOGPROB, + MetricCategory.GENERATIVE_SAMPLING, + ]: _, cur_outputs = apply_generative_metric(results=results, formatted_doc=formatted_doc, metrics=[metric]) return cur_outputs - if Metrics[metric].value.category == MetricCategory.GENERATIVE_LOGPROB: - _, cur_outputs = apply_generative_logprob_metric( - results=results, formatted_doc=formatted_doc, metrics=[metric] - ) - return cur_outputs if Metrics[metric].value.category == MetricCategory.MULTICHOICE: _, cur_outputs = apply_multichoice_metric(results=results, formatted_doc=formatted_doc, metrics=[metric]) return cur_outputs From 71aa2b88608dc4fb9b90198b2a82e1fa9284f543 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Mon, 22 Apr 2024 09:51:54 +0000 Subject: [PATCH 08/19] edit to prevent sampling for providing too many answers to some metrics --- src/lighteval/metrics/__init__.py | 16 +++++++++++++--- src/lighteval/tasks/lighteval_task.py | 6 +++++- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/lighteval/metrics/__init__.py b/src/lighteval/metrics/__init__.py index 7577c771..c4e44c03 100644 --- a/src/lighteval/metrics/__init__.py +++ b/src/lighteval/metrics/__init__.py @@ -66,7 +66,9 @@ def apply_perplexity_metric(results: list[ModelReturn], formatted_doc: Doc, metr return results, outputs -def apply_generative_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[str], output_regex=None): +def apply_generative_metric( + results: list[ModelReturn], formatted_doc: Doc, metrics: list[str], output_regex=None, max_num_samples=1 +): outputs = {} # Post processing prediction @@ -93,11 +95,19 @@ def apply_generative_metric(results: list[ModelReturn], formatted_doc: Doc, metr preds = [formatted_doc.specific["label_to_choices"].get(p) for p in preds] golds = [formatted_doc.specific["label_to_choices"][g] for g in golds] + preds_no_sampling = preds + if max_num_samples > 1: # We want to run our evaluation on only one sample for base generative evals + preds_no_sampling = as_list(preds[0]) + for metric in metrics: if Metrics[metric].value.category == MetricCategory.GENERATIVE: - outputs.update(Metrics[metric].value.compute(golds=golds, predictions=preds, formatted_doc=formatted_doc)) + outputs.update( + Metrics[metric].value.compute(golds=golds, predictions=preds_no_sampling, formatted_doc=formatted_doc) + ) if Metrics[metric].value.category == MetricCategory.GENERATIVE_LOGPROB: - outputs.update(Metrics[metric].value.compute(results=results, formatted_doc=formatted_doc)) + outputs.update( + Metrics[metric].value.compute(golds=golds, predictions=preds_no_sampling, formatted_doc=formatted_doc) + ) if Metrics[metric].value.category == MetricCategory.GENERATIVE_SAMPLING: 
outputs.update(Metrics[metric].value.compute(golds=golds, predictions=preds, formatted_doc=formatted_doc)) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index cef8d5d0..f9df6fdd 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -552,7 +552,11 @@ def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dic or self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB] ): results, cur_outputs = apply_generative_metric( - results=results, formatted_doc=formatted_doc, metrics=self.metrics, output_regex=self.output_regex + results=results, + formatted_doc=formatted_doc, + metrics=self.metrics, + output_regex=self.output_regex, + max_num_samples=max(self.num_samples), ) outputs.update(cur_outputs) if self.has_metric_category[MetricCategory.MULTICHOICE]: From e36d6e091458a6e4171317bf97a9b7d7db1911c4 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Mon, 22 Apr 2024 09:58:49 +0000 Subject: [PATCH 09/19] added some doc --- src/lighteval/metrics/metrics_sample.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 5b286afb..3a64f5d0 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -712,12 +712,14 @@ def __init__( ) self.type_exact_match = type_exact_match - def compute(self, golds: list[str], predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]: + def compute(self, golds: list[str], predictions: list[str], **kwargs) -> dict[str, float]: """Computes the metric over a list of golds and predictions for one single sample. + It applies normalisation (if needed) to model prediction and gold, and takes the most frequent answer of all the available ones, + then compares it to the gold. Args: golds (list[str]): Reference targets - predictions (list[str]): Predicted strings + predictions (list[str]): k predicted strings Returns: float: Aggregated score over the current sample's items. From 4ef4c99779eb4a40b4b3cefcaf58c10a73d3dfe7 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Mon, 22 Apr 2024 13:06:12 +0000 Subject: [PATCH 10/19] neither nanotron nor endpoints models cover sampling atm --- src/lighteval/models/endpoint_model.py | 5 +++++ src/lighteval/models/nanotron_model.py | 7 ++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/lighteval/models/endpoint_model.py b/src/lighteval/models/endpoint_model.py index 03e184bc..d79e0f91 100644 --- a/src/lighteval/models/endpoint_model.py +++ b/src/lighteval/models/endpoint_model.py @@ -260,6 +260,11 @@ def greedy_until( ): # the `returns_logits` flag is only used to filter the results, we always request the full details. 
returns_logits = batch[0].use_logits + num_samples = batch[0].num_samples + if num_samples > 1: + hlog_err( + "Inference endpoints does not allow sampling evaluations - this is likely to fail or provide problematic results" + ) if self.use_async: responses = asyncio.run(self.__async_process_batch_generate(batch)) diff --git a/src/lighteval/models/nanotron_model.py b/src/lighteval/models/nanotron_model.py index eecd18fb..977b2b19 100644 --- a/src/lighteval/models/nanotron_model.py +++ b/src/lighteval/models/nanotron_model.py @@ -54,7 +54,7 @@ LoglikelihoodDataset, LoglikelihoodSingleTokenDataset, ) -from lighteval.logging.hierarchical_logger import hlog_warn +from lighteval.logging.hierarchical_logger import hlog_err, hlog_warn from lighteval.models.base_model import LightevalModel from lighteval.models.model_config import EnvConfig from lighteval.models.model_output import Batch, GenerateReturn, LoglikelihoodReturn, LoglikelihoodSingleTokenReturn @@ -1201,6 +1201,11 @@ def greedy_until( # need to pass them somewhere ! stop_tokens = batch[0].stop_sequence max_new_tokens = batch[0].generation_size returns_logits = batch[0].use_logits + num_samples = batch[0].num_samples + if num_samples > 1: + hlog_err( + "Nonotron models does not allow sampling evaluations - this is likely to fail or provide problematic results" + ) # The main question for this step is the following: # Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk From 26c7868d580d35610e4fb68dd905f2e19570443c Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Mon, 22 Apr 2024 13:56:13 +0000 Subject: [PATCH 11/19] add readme --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index e7c54928..0596061f 100644 --- a/README.md +++ b/README.md @@ -350,6 +350,7 @@ These metrics need the model to generate an output. They are therefore slower. - `f1_score`: Average F1 score in terms of word overlap between the model output and gold without normalisation - `f1_score_macro`: Corpus level macro F1 score - `f1_score_macro`: Corpus level micro F1 score + - `maj_at_5` and `maj_at_8`: Model majority vote. Takes n (5 or 8) generations from the model and assumes the most frequent is the actual prediction. - Summarization: - `rouge` (Harness): Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) - `rouge1` (HELM): Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap. @@ -376,7 +377,9 @@ These metrics need the model to generate an output. They are therefore slower. - `edit_similarity`: average Levenshtein edit similarity (normalized by length of longer sequence) between model generation and reference. - Math: - `quasi_exact_match_math` (HELM): Fraction of instances where the normalized prediction matches the normalized gold (normalization done for math, where latex symbols, units, etc are removed) + - `maj_at_4_math` (Lighteval): Majority choice evaluation, using the math normalisation for the predictions and gold - `quasi_exact_match_gsm8k` (Harness): Fraction of instances where the normalized prediction matches the normalized gold (normalization done for gsm8k, where latex symbols, units, etc are removed) + - `maj_at_8_gsm8k` (Lighteval): Majority choice evaluation, using the gsm8k normalisation for the predictions and gold ### Metrics for specific tasks To keep compatibility with the Harness for some specific tasks, we ported their evaluations more or less as such. 
They include `drop` (for the DROP dataset) and `truthfulqa_mc_metrics` (for TruthfulQA). In general, except for tasks where the dataset has a very different formatting than usual (an other language, programming language, math, ...), we want to use standard implementations of the above metrics. It makes little sense to have 10 different versions of an exact match depending on the task. However, most of the above metrics are parametrizable so that you can change the normalization applied easily for experimental purposes. From edcdc7299064439caca2e46c7b24444281e90722 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Mon, 22 Apr 2024 16:18:56 +0000 Subject: [PATCH 12/19] Now, tasks are sorted by conditions. Need to cut the batch size lower to fit user conditions if needed --- src/lighteval/data.py | 78 ++++++++++++++++---------- src/lighteval/models/base_model.py | 8 +-- src/lighteval/models/endpoint_model.py | 6 +- src/lighteval/models/nanotron_model.py | 20 +++---- tests/test_unit_reorder.py | 4 +- 5 files changed, 68 insertions(+), 48 deletions(-) diff --git a/src/lighteval/data.py b/src/lighteval/data.py index 247cff04..f87db670 100644 --- a/src/lighteval/data.py +++ b/src/lighteval/data.py @@ -40,12 +40,12 @@ class DynamicBatchDataset(Dataset): def __init__( self, requests: list, - dataset_splits: int, + num_dataset_splits: int, ): """ This dataset class uses dynamic batching to speed up the generation. Each request is sorted by the length of the prompt + the length of the - continuation. Then, the dataset is split into dataset_splits splits. + continuation. Then, the dataset is split into num_dataset_splits splits. The first split will contain the longest requests, the second split will contain the second longest requests, etc. This allows us to use dynamic batching by starting with a small batch size and doubling it for each @@ -54,7 +54,7 @@ def __init__( Args: requests (List): A list of requests. - dataset_splits (int): The number of dataset splits. + num_dataset_splits (int): The number of dataset splits. """ # We make sure the requests contain the tokenized versions of their values if any(r.tokenized_context is None for r in requests): @@ -69,16 +69,24 @@ def __init__( self.total_size = len(self.sorted_data) - if dataset_splits >= self.total_size: + self.num_dataset_splits, self.splits = self.init_split_limits(num_dataset_splits) + + self.split_start = self.splits[0][0] + self.split_end = self.splits[0][1] + + def init_split_limits(self, num_dataset_splits): + if num_dataset_splits >= self.total_size: hlog_warn( - f"dataset_splits ({dataset_splits}) >= total_size ({self.total_size}), setting dataset_splits to 1" + f"num_dataset_splits ({num_dataset_splits}) >= total_size ({self.total_size}), setting num_dataset_splits to 1" ) - dataset_splits = 1 + num_dataset_splits = 1 + + split_size = self.total_size // self.num_dataset_splits + 1 + splits_indices = [ + (ix * split_size, min((ix + 1) * split_size, self.total_size)) for ix in range(self.num_dataset_splits) + ] - self.dataset_splits = dataset_splits - self.split_size = self.total_size // self.dataset_splits + 1 - self.split_start = 0 - self.split_end = min(self.split_start + self.split_size, self.total_size) + return num_dataset_splits, splits_indices def get_original_order(self, new_arr: list) -> list: """ @@ -113,8 +121,8 @@ def get_split_start_end(self, split_id: int) -> tuple[int, int]: Returns: tuple: A tuple containing the start and end indices of the split. 
""" - self.split_start = split_id * self.split_size - self.split_end = min(self.split_start + self.split_size, self.total_size) + self.split_start = self.splits[split_id][0] + self.split_end = self.splits[split_id][1] return self.split_start, self.split_end def splits_start_end_iterator(self) -> tuple[int, int]: @@ -126,7 +134,7 @@ def splits_start_end_iterator(self) -> tuple[int, int]: Yields: tuple: A tuple containing the start and end indices of a split. """ - for split_id in range(self.dataset_splits): + for split_id in range(self.num_dataset_splits): yield self.get_split_start_end(split_id) def __getitem__(self, index) -> Request: @@ -204,6 +212,32 @@ def _sorting_criteria(self, request: LoglikelihoodSingleTokenRequest) -> int: class GenerativeTaskDataset(DynamicBatchDataset): + def init_split_limits(self, num_dataset_splits): + if num_dataset_splits is not None: + hlog_warn( + "You cannot select the number of dataset splits for a generative evaluation at the moment. Automatically inferring." + ) + + all_sorting_criterion = [self._sorting_criteria(self.sorted_data[0])[:2]] + splits_indices = [(-1, -1)] + for ix, req in enumerate(self.sorted_data[1:]): + current_sorting_criteria = self._sorting_criteria(req) + current_key = current_sorting_criteria[:2] + if current_key not in all_sorting_criterion: + all_sorting_criterion.append(current_key) + splits_indices.append((splits_indices[-1][1] + 1, ix)) + + # We add the last split + if splits_indices[-1][1] != self.total_size: + splits_indices.append((splits_indices[-1][1] + 1, self.total_size)) + + # We remove the fake first index + splits_indices = splits_indices[1:] + + num_dataset_splits = len(splits_indices) + + return num_dataset_splits, splits_indices + def _sorting_criteria(self, request: GreedyUntilRequest) -> int: """ Collate function for generating batches. @@ -219,10 +253,10 @@ def _sorting_criteria(self, request: GreedyUntilRequest) -> int: # The generative task has no limit except the model context if gen_length is None: gen_length = 0 - return -(len(toks) + gen_length) + return request.use_logits, request.stop_sequence, -(len(toks) + gen_length) -class GenerativeTaskDatasetNanotron(DynamicBatchDataset): +class GenerativeTaskDatasetNanotron(GenerativeTaskDataset): def __getitem__(self, index) -> Request: """ Get an item from the dataset depending on the split we are currently in. @@ -238,20 +272,6 @@ def __getitem__(self, index) -> Request: """ return index, self.sorted_data[index + self.split_start] - def _sorting_criteria(self, request) -> int: - """ - Collate function for generating batches. - - Args: - x (Any): The input data. - - Returns: - Any: The collated data. 
- """ - toks = request.tokenized_context - gen_length = request.generation_size - return -(len(toks) + gen_length) - class GenDistributedSampler(DistributedSampler): """A distributed sampler that copy the last element only when drop_last is False so we keep a small padding in the batches diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index 7d9bd8d2..4db412b4 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -334,7 +334,7 @@ def greedy_until_multi_turn( # noqa: C901 results = [] - dataset = GenerativeTaskDataset(requests=requests, dataset_splits=1) + dataset = GenerativeTaskDataset(requests=requests, num_dataset_splits=1) dataloader = DataLoader(dataset, batch_size=1, collate_fn=lambda batch: batch) if self.accelerator: @@ -475,7 +475,7 @@ def greedy_until( request.stop_sequence = as_list(request.stop_sequence) + [self.tokenizer.eos_token] request.tokenized_context = self.tok_encode(request.context) - dataset = GenerativeTaskDataset(requests=requests, dataset_splits=self.DATASET_SPLITS) + dataset = GenerativeTaskDataset(requests=requests, num_dataset_splits=self.DATASET_SPLITS) starting_batch_size = STARTING_BATCH_SIZE results = [] @@ -708,7 +708,7 @@ def _loglikelihood_tokens( return_bool_score: bool = True, rolling: bool = False, ) -> list[LoglikelihoodReturn]: - dataset = LoglikelihoodDataset(requests=requests, dataset_splits=self.DATASET_SPLITS) + dataset = LoglikelihoodDataset(requests=requests, num_dataset_splits=self.DATASET_SPLITS) starting_batch_size = STARTING_BATCH_SIZE res = [] @@ -950,7 +950,7 @@ def loglikelihood_single_token( def _loglikelihood_single_token( self, requests: list[LoglikelihoodSingleTokenRequest], override_bs: int = -1 ) -> list[LoglikelihoodSingleTokenReturn]: - dataset = LoglikelihoodSingleTokenDataset(requests=requests, dataset_splits=self.DATASET_SPLITS) + dataset = LoglikelihoodSingleTokenDataset(requests=requests, num_dataset_splits=self.DATASET_SPLITS) starting_batch_size = STARTING_BATCH_SIZE res = [] diff --git a/src/lighteval/models/endpoint_model.py b/src/lighteval/models/endpoint_model.py index d79e0f91..320bc010 100644 --- a/src/lighteval/models/endpoint_model.py +++ b/src/lighteval/models/endpoint_model.py @@ -242,7 +242,7 @@ def greedy_until( request.tokenized_context = self.tok_encode(request.context) request.stop_sequence = as_list(request.stop_sequence) + [self.tokenizer.eos_token] - dataset = GenerativeTaskDataset(requests=requests, dataset_splits=self.DATASET_SPLITS) + dataset = GenerativeTaskDataset(requests=requests, num_dataset_splits=self.DATASET_SPLITS) batch_size = override_bs if override_bs is not None else BATCH_SIZE results: List[str] = [] @@ -288,7 +288,7 @@ def loglikelihood( for request in requests: request.tokenized_context = self.tok_encode(request.context) request.tokenized_continuation = self.tok_encode(request.choice) - dataset = LoglikelihoodDataset(requests=requests, dataset_splits=self.DATASET_SPLITS) + dataset = LoglikelihoodDataset(requests=requests, num_dataset_splits=self.DATASET_SPLITS) batch_size = override_bs if override_bs is not None else BATCH_SIZE results: List[str] = [] @@ -334,7 +334,7 @@ def loglikelihood_rolling( request.tokenized_context = [self.tokenizer.eos_token_id] request.tokenized_continuation = self.tok_encode(request.context) - dataset = LoglikelihoodDataset(requests=requests, dataset_splits=self.DATASET_SPLITS) + dataset = LoglikelihoodDataset(requests=requests, num_dataset_splits=self.DATASET_SPLITS) batch_size = 
override_bs if override_bs is not None else BATCH_SIZE results: List[str] = [] diff --git a/src/lighteval/models/nanotron_model.py b/src/lighteval/models/nanotron_model.py index 977b2b19..b0546e79 100644 --- a/src/lighteval/models/nanotron_model.py +++ b/src/lighteval/models/nanotron_model.py @@ -643,9 +643,9 @@ def pad_and_gather(self, output_tensor: torch.Tensor) -> Tuple[torch.Tensor, tor return gathered_outputs, gathered_length - def _get_subsets(self, dataset, dataset_splits): + def _get_subsets(self, dataset, num_dataset_splits): total_length = len(dataset) - subset_length = int(float(total_length) / float(dataset_splits)) + 1 + subset_length = int(float(total_length) / float(num_dataset_splits)) + 1 if subset_length < self.parallel_context.dp_pg.size(): # We need at least one subset sample per DP process subset_length = self.parallel_context.dp_pg.size() @@ -653,7 +653,7 @@ def _get_subsets(self, dataset, dataset_splits): @torch.inference_mode() def _loglikelihood_single_token( - self, requests, disable_tqdm: bool = False, override_bs: int = -1, dataset_splits: int = 1 + self, requests, disable_tqdm: bool = False, override_bs: int = -1, num_dataset_splits: int = 1 ) -> List[LoglikelihoodSingleTokenReturn]: dataset = LoglikelihoodSingleTokenDataset(requests=requests) res = [] @@ -663,7 +663,7 @@ def _loglikelihood_single_token( printed_error = False starting_batch_size = 512 - total_length, subset_length = self._get_subsets(dataset, dataset_splits) + total_length, subset_length = self._get_subsets(dataset, num_dataset_splits) for s, subset_start in enumerate( tqdm( @@ -883,17 +883,17 @@ def _loglikelihood_tokens( requests, disable_tqdm: bool = False, override_bs: int = -1, - dataset_splits: int = 1, + num_dataset_splits: int = 1, return_bool_score: bool = True, ) -> List[LoglikelihoodReturn]: - dataset = LoglikelihoodDataset(requests=requests, dataset_splits=dataset_splits) + dataset = LoglikelihoodDataset(requests=requests, num_dataset_splits=num_dataset_splits) res = [] # Dataset is sorted in descending size. # every 20-25% of the dataset we try to double the batch size for speed up starting_batch_size = 512 - total_length, subset_length = self._get_subsets(dataset, dataset_splits) + total_length, subset_length = self._get_subsets(dataset, num_dataset_splits) for s, subset_start in enumerate( tqdm( @@ -1117,7 +1117,7 @@ def greedy_until( requests: List[GreedyUntilRequest], disable_tqdm: bool = False, override_bs=None, - dataset_splits: int = 1, + num_dataset_splits: int = 1, ) -> List[GenerateReturn]: """Greedy generation until a stop token is generated.""" # automatic (variable) batch size detection for vectorization @@ -1126,14 +1126,14 @@ def greedy_until( request.stop_sequence = as_list(request.stop_sequence) + [self.tokenizer.eos_token] request.tokenized_context = self.tok_encode(request.context) - dataset = GenerativeTaskDatasetNanotron(requests=requests, dataset_splits=dataset_splits) + dataset = GenerativeTaskDatasetNanotron(requests=requests, num_dataset_splits=num_dataset_splits) res = [] # Dataset is sorted in descending size. 
# every 20-25% of the dataset we try to double the batch size for speed up starting_batch_size = 512 - total_length, subset_length = self._get_subsets(dataset, dataset_splits) + total_length, subset_length = self._get_subsets(dataset, num_dataset_splits) for s, subset_start in enumerate( tqdm( diff --git a/tests/test_unit_reorder.py b/tests/test_unit_reorder.py index 1936bcc5..6487cd93 100644 --- a/tests/test_unit_reorder.py +++ b/tests/test_unit_reorder.py @@ -77,7 +77,7 @@ class TestReorderGenerativeTaskDataset: def test_dataset_needs_tokenization(self): with pytest.raises(ValueError): - GenerativeTaskDataset(requests=TEST_DATA, dataset_splits=DATASET_SPLITS) + GenerativeTaskDataset(requests=TEST_DATA, num_dataset_splits=DATASET_SPLITS) def test_reorder_dataset(self): tokenizer = AutoTokenizer.from_pretrained("gpt2") @@ -85,7 +85,7 @@ def test_reorder_dataset(self): for request in data: request.tokenized_context = tokenizer.encode(request.context) - dataset = GenerativeTaskDataset(requests=data, dataset_splits=DATASET_SPLITS) + dataset = GenerativeTaskDataset(requests=data, num_dataset_splits=DATASET_SPLITS) sorted_data = dataset.sorted_data original_data = dataset.get_original_order(sorted_data) From 17607303e9145259af44990950e231ff1562c8ef Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Mon, 22 Apr 2024 18:45:08 +0000 Subject: [PATCH 13/19] add mini fix --- src/lighteval/data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lighteval/data.py b/src/lighteval/data.py index f87db670..0e10299c 100644 --- a/src/lighteval/data.py +++ b/src/lighteval/data.py @@ -81,9 +81,9 @@ def init_split_limits(self, num_dataset_splits): ) num_dataset_splits = 1 - split_size = self.total_size // self.num_dataset_splits + 1 + split_size = self.total_size // num_dataset_splits + 1 splits_indices = [ - (ix * split_size, min((ix + 1) * split_size, self.total_size)) for ix in range(self.num_dataset_splits) + (ix * split_size, min((ix + 1) * split_size, self.total_size)) for ix in range(num_dataset_splits) ] return num_dataset_splits, splits_indices @@ -228,8 +228,8 @@ def init_split_limits(self, num_dataset_splits): splits_indices.append((splits_indices[-1][1] + 1, ix)) # We add the last split - if splits_indices[-1][1] != self.total_size: - splits_indices.append((splits_indices[-1][1] + 1, self.total_size)) + if splits_indices[-1][1] != self.total_size + 1: + splits_indices.append((splits_indices[-1][1], self.total_size + 1)) # We remove the fake first index splits_indices = splits_indices[1:] From a480fc0b334fce4f42db835961b924fd01c73b48 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Tue, 23 Apr 2024 09:02:38 +0000 Subject: [PATCH 14/19] fix! 
indices were incorrect --- src/lighteval/data.py | 13 +++++-------- src/lighteval/models/base_model.py | 2 +- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/lighteval/data.py b/src/lighteval/data.py index 0e10299c..54063200 100644 --- a/src/lighteval/data.py +++ b/src/lighteval/data.py @@ -219,20 +219,17 @@ def init_split_limits(self, num_dataset_splits): ) all_sorting_criterion = [self._sorting_criteria(self.sorted_data[0])[:2]] - splits_indices = [(-1, -1)] - for ix, req in enumerate(self.sorted_data[1:]): + splits_indices = [[0, None]] + for ix, req in enumerate(self.sorted_data): current_sorting_criteria = self._sorting_criteria(req) current_key = current_sorting_criteria[:2] if current_key not in all_sorting_criterion: all_sorting_criterion.append(current_key) - splits_indices.append((splits_indices[-1][1] + 1, ix)) + splits_indices[-1][1] = ix + splits_indices.append([ix, None]) # We add the last split - if splits_indices[-1][1] != self.total_size + 1: - splits_indices.append((splits_indices[-1][1], self.total_size + 1)) - - # We remove the fake first index - splits_indices = splits_indices[1:] + splits_indices[-1][1] = self.total_size num_dataset_splits = len(splits_indices) diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index 4db412b4..62bdaf2c 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -481,7 +481,7 @@ def greedy_until( for split_start, split_end in tqdm( dataset.splits_start_end_iterator(), - total=self.DATASET_SPLITS, + total=dataset.num_dataset_splits, desc="Splits", position=0, disable=self.disable_tqdm, From 9b35ff41a61152b460ec805c2d3172ab4c7c5af4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Fourrier?= <22726840+clefourrier@users.noreply.github.com> Date: Fri, 3 May 2024 11:27:47 +0200 Subject: [PATCH 15/19] Update src/lighteval/data.py Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> --- src/lighteval/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/data.py b/src/lighteval/data.py index 54063200..73bca8ae 100644 --- a/src/lighteval/data.py +++ b/src/lighteval/data.py @@ -232,7 +232,7 @@ def init_split_limits(self, num_dataset_splits): splits_indices[-1][1] = self.total_size num_dataset_splits = len(splits_indices) - + split_indices = [tuple(e) for e in split_indices] return num_dataset_splits, splits_indices def _sorting_criteria(self, request: GreedyUntilRequest) -> int: From 3995494398d7aceedea9ba874d0e5682a01945d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Fourrier?= <22726840+clefourrier@users.noreply.github.com> Date: Tue, 9 Jul 2024 15:39:07 +0200 Subject: [PATCH 16/19] Apply suggestions from code review Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> --- src/lighteval/data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lighteval/data.py b/src/lighteval/data.py index 73bca8ae..68baea82 100644 --- a/src/lighteval/data.py +++ b/src/lighteval/data.py @@ -71,7 +71,7 @@ def __init__( self.num_dataset_splits, self.splits = self.init_split_limits(num_dataset_splits) - self.split_start = self.splits[0][0] + self.split_start, self.split_end = self.splits[0] self.split_end = self.splits[0][1] def init_split_limits(self, num_dataset_splits): @@ -121,7 +121,7 @@ def get_split_start_end(self, split_id: int) -> tuple[int, int]: Returns: tuple: A tuple containing the start and end indices of the split. 
""" - self.split_start = self.splits[split_id][0] + self.split_start, self.split_end = self.splits[split_id] self.split_end = self.splits[split_id][1] return self.split_start, self.split_end From 5db69e8feae83d3b9de8590dad8fec4cab6514ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Fourrier?= <22726840+clefourrier@users.noreply.github.com> Date: Wed, 10 Jul 2024 16:50:58 +0200 Subject: [PATCH 17/19] Apply suggestions from code review --- src/lighteval/data.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/lighteval/data.py b/src/lighteval/data.py index bb0a3ded..5211f745 100644 --- a/src/lighteval/data.py +++ b/src/lighteval/data.py @@ -72,7 +72,6 @@ def __init__( self.num_dataset_splits, self.splits = self.init_split_limits(num_dataset_splits) self.split_start, self.split_end = self.splits[0] - self.split_end = self.splits[0][1] def init_split_limits(self, num_dataset_splits): if num_dataset_splits >= self.total_size: @@ -122,7 +121,6 @@ def get_split_start_end(self, split_id: int) -> tuple[int, int]: tuple: A tuple containing the start and end indices of the split. """ self.split_start, self.split_end = self.splits[split_id] - self.split_end = self.splits[split_id][1] return self.split_start, self.split_end def splits_start_end_iterator(self) -> tuple[int, int]: From fadf2bb0ae411826eef857496d2df220bca4b7a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine?= Date: Thu, 11 Jul 2024 10:04:05 +0200 Subject: [PATCH 18/19] revert change --- src/lighteval/metrics/__init__.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/lighteval/metrics/__init__.py b/src/lighteval/metrics/__init__.py index c4e44c03..b7f9183a 100644 --- a/src/lighteval/metrics/__init__.py +++ b/src/lighteval/metrics/__init__.py @@ -95,18 +95,22 @@ def apply_generative_metric( preds = [formatted_doc.specific["label_to_choices"].get(p) for p in preds] golds = [formatted_doc.specific["label_to_choices"][g] for g in golds] - preds_no_sampling = preds - if max_num_samples > 1: # We want to run our evaluation on only one sample for base generative evals - preds_no_sampling = as_list(preds[0]) - for metric in metrics: if Metrics[metric].value.category == MetricCategory.GENERATIVE: outputs.update( - Metrics[metric].value.compute(golds=golds, predictions=preds_no_sampling, formatted_doc=formatted_doc) + Metrics[metric].value.compute( + golds=golds, + predictions=as_list(preds[0]) if max_num_samples > 1 else preds, + formatted_doc=formatted_doc, + ) ) if Metrics[metric].value.category == MetricCategory.GENERATIVE_LOGPROB: outputs.update( - Metrics[metric].value.compute(golds=golds, predictions=preds_no_sampling, formatted_doc=formatted_doc) + Metrics[metric].value.compute( + golds=golds, + predictions=as_list(preds[0]) if max_num_samples > 1 else preds, + formatted_doc=formatted_doc, + ) ) if Metrics[metric].value.category == MetricCategory.GENERATIVE_SAMPLING: outputs.update(Metrics[metric].value.compute(golds=golds, predictions=preds, formatted_doc=formatted_doc)) From b6b85b5e6cbc749622cea91411a88531080777f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine?= Date: Thu, 11 Jul 2024 10:15:11 +0200 Subject: [PATCH 19/19] added doc --- src/lighteval/data.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/lighteval/data.py b/src/lighteval/data.py index 5211f745..22b68bd6 100644 --- a/src/lighteval/data.py +++ b/src/lighteval/data.py @@ -211,6 +211,23 @@ def _sorting_criteria(self, request: 
LoglikelihoodSingleTokenRequest) -> int: class GenerativeTaskDataset(DynamicBatchDataset): def init_split_limits(self, num_dataset_splits): + """Initialises the split limits based on generation parameters. + The splits are used to estimate time remaining when evaluating, and in the case of generative evaluations, to group similar samples together. + + For generative tasks, self._sorting_criteria outputs: + - a boolean (whether the generation task uses logits) + - a list (the stop sequences) + - the item length (the actual size sorting factor). + + In the current function, we create evaluation groups by generation parameters (logits and eos), so that samples with similar properties get batched together afterwards. + The samples will then be further organised by length in each split. + + Args: + num_dataset_splits (_type_): _description_ + + Returns: + _type_: _description_ + """ if num_dataset_splits is not None: hlog_warn( "You cannot select the number of dataset splits for a generative evaluation at the moment. Automatically inferring." @@ -233,7 +250,7 @@ def init_split_limits(self, num_dataset_splits): splits_indices = [tuple(e) for e in splits_indices] return num_dataset_splits, splits_indices - def _sorting_criteria(self, request: GreedyUntilRequest) -> int: + def _sorting_criteria(self, request: GreedyUntilRequest) -> tuple[bool, list, int]: """ Collate function for generating batches.
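(Illustrative note, not part of the patch.) To make the grouping described in the docstring above concrete, here is a hedged, standalone sketch of how contiguous runs of requests sharing the same generation parameters become splits. The helper name and the simplified key are illustrative only and are not part of lighteval's API; the real implementation also sorts each run by item length afterwards.

```python
# Hedged sketch: group already-sorted requests into splits keyed by their
# generation parameters (use_logits, stop_sequence), one split per run.
from itertools import groupby

def illustrative_split_limits(sorted_requests) -> list[tuple[int, int]]:
    splits, start = [], 0
    for _, group in groupby(
        sorted_requests,
        key=lambda r: (r.use_logits, tuple(r.stop_sequence)),
    ):
        run_length = len(list(group))
        splits.append((start, start + run_length))
        start += run_length
    return splits
```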