Merge remote-tracking branch 'origin/main' into fix-brrr

thomwolf committed Feb 6, 2024
2 parents ac4a64a + 059e100 commit fbb2321
Showing 17 changed files with 460 additions and 165 deletions.
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -82,8 +82,8 @@ optimum = ["optimum==1.12.0"]
quantization = ["bitsandbytes>=0.41.0", "auto-gptq>=0.4.2"]
adapters = ["peft==0.3.0"]
nanotron = [
"nanotron@git+https://github.com/huggingface/nanotron@main",
"brrr@git+https://github.com/huggingface/brrr@fix-lighteval",
"nanotron@git+https://github.com/huggingface/nanotron@8c1a49588d0745a6404644a86547c2dd6a63640e",
"brrr@git+https://github.com/huggingface/brrr@e8a503e2ec08b34eed7522d331aec3bee8cdd29b",
"tensorboardX"
]

6 changes: 3 additions & 3 deletions src/lighteval/data.py
@@ -72,7 +72,7 @@ def get_original_order(self, new_arr: list) -> list:

return original_order

def get_set_split_start_end(self, split_id: int) -> tuple[int, int]:
def get_split_start_end(self, split_id: int) -> tuple[int, int]:
"""
Get the start and end indices of a dataset split.
@@ -96,7 +96,7 @@ def splits_start_end_iterator(self) -> tuple[int, int]:
tuple: A tuple containing the start and end indices of a split.
"""
for split_id in range(self.dataset_splits):
yield self.get_set_split_start_end(split_id)
yield self.get_split_start_end(split_id)

def __getitem__(self, index) -> Request:
"""
@@ -195,7 +195,7 @@ def _sorting_criteria(self, x) -> int:
return -(len(toks) + gen_length)


class GenerativeTaskDatasetBrrr(DynamicBatchDataset):
class GenerativeTaskDatasetNanotron(DynamicBatchDataset):
def _sorting_criteria(self, x) -> int:
"""
Collate function for generating batches.
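For context on the get_set_split_start_end -> get_split_start_end rename above, a minimal standalone sketch of how such a split iterator behaves; the attributes (total_size, dataset_splits) and the even-split logic are illustrative assumptions, not a copy of lighteval's DynamicBatchDataset.

class SplitSketch:
    def __init__(self, total_size: int, dataset_splits: int):
        self.total_size = total_size
        self.dataset_splits = dataset_splits

    def get_split_start_end(self, split_id: int) -> tuple[int, int]:
        # Contiguous, roughly equal chunks; the last split absorbs the remainder.
        split_size = self.total_size // self.dataset_splits
        start = split_id * split_size
        end = self.total_size if split_id == self.dataset_splits - 1 else start + split_size
        return start, end

    def splits_start_end_iterator(self):
        # Yields one (start, end) pair per split, as in the renamed method above.
        for split_id in range(self.dataset_splits):
            yield self.get_split_start_end(split_id)


print(list(SplitSketch(total_size=10, dataset_splits=3).splits_start_end_iterator()))
# [(0, 3), (3, 6), (6, 10)]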
32 changes: 23 additions & 9 deletions src/lighteval/logging/evaluation_tracker.py
@@ -28,7 +28,8 @@


class EnhancedJSONEncoder(json.JSONEncoder):
"""Provides a proper json encoding for the loggers and trackers json dumps.
"""
Provides a proper json encoding for the loggers and trackers json dumps.
Notably manages the json encoding of dataclasses.
"""

@@ -39,10 +40,16 @@ def default(self, o):


class EvaluationTracker:
"""Keeps track of the overall evaluation process and relevant informations.
The [`EvaluationTracker`] contains specific loggers for experiments details ([`DetailsLogger`]), metrics ([`MetricsLogger`]), task versions ([`VersionsLogger`]) as well as for the general configurations of both the specific task ([`TaskConfigLogger`]) and overall evaluation run ([`GeneralConfigLogger`]).
It compiles the data from these loggers and writes it to files, which can be published to the Hugging Face hub if requested.
"""
Keeps track of the overall evaluation process and relevant informations.
The [`EvaluationTracker`] contains specific loggers for experiments details
([`DetailsLogger`]), metrics ([`MetricsLogger`]), task versions
([`VersionsLogger`]) as well as for the general configurations of both the
specific task ([`TaskConfigLogger`]) and overall evaluation run
([`GeneralConfigLogger`]). It compiles the data from these loggers and
writes it to files, which can be published to the Hugging Face hub if
requested.
"""

details_logger: DetailsLogger
@@ -53,11 +60,15 @@ class EvaluationTracker:
hub_results_org: str

def __init__(self, hub_results_org: str = "", token: str = "") -> None:
"""Creates all the necessary loggers for evaluation tracking.
"""
Creates all the necessary loggers for evaluation tracking.
Args:
hub_results_org (str): The organisation to push the results to. See more details about the datasets organisation in [`EvaluationTracker.save`]
token (str): Token to use when pushing to the hub. This token should have write access to `hub_results_org`.
hub_results_org (str): The organisation to push the results to. See
more details about the datasets organisation in
[`EvaluationTracker.save`]
token (str): Token to use when pushing to the hub. This token should
have write access to `hub_results_org`.
"""
self.details_logger = DetailsLogger()
self.metrics_logger = MetricsLogger()
Expand All @@ -79,7 +90,8 @@ def save(
) -> None:
"""Saves the experiment information and results to files, and to the hub if requested.
Note: In case of save failure, this function will only print a warning, with the error message.
Note:
In case of save failure, this function will only print a warning, with the error message.
Args:
output_dir (str): Local folder path where you want results to be saved
@@ -204,6 +216,7 @@ def details_to_hub(
details_folder_path (str or Path): Local path of the current experiment's details folder.
The details folder (created by [`EvaluationTracker.save`]) should contain one parquet file per task used during the evaluation run of the current model.
push_as_public (bool, optional): If True, the results will be pushed publicly, else the datasets will be private.
"""
results_file_path = str(results_file_path)
details_folder_path = str(details_folder_path)
@@ -255,6 +268,7 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None:
Args:
repo_id (str): Details dataset repository path on the hub (`org/dataset`)
model_name (str): Name of the currently evaluated model.
"""
# Add a nice dataset card and the configuration YAML
files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset")
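A hedged usage sketch of the tracker described above. Only EvaluationTracker(hub_results_org=..., token=...) and a save(output_dir=...) argument are confirmed by this diff; the organisation name, token value, and the assumption that save can be called with just output_dir are placeholders.

from lighteval.logging.evaluation_tracker import EvaluationTracker

# "my-org" and the token are placeholders; the token must have write access to the org.
tracker = EvaluationTracker(hub_results_org="my-org", token="hf_xxx")

# ... run the evaluation; the tracker's loggers (details, metrics, versions,
# task config, general config) accumulate data along the way ...

tracker.save(output_dir="./eval_results")  # writes local files, pushes to the hub only if requested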
49 changes: 44 additions & 5 deletions src/lighteval/logging/info_loggers.py
@@ -2,6 +2,7 @@
import os
import time
from dataclasses import asdict, dataclass, field
from typing import Union

import git
import numpy as np
@@ -38,7 +39,7 @@ class GeneralConfigLogger:
job_id (int): If the evaluation suite is launched as a slurm job, stores the current job id.
Purely informative parameter used to retrieve scheduler logs.
start_time (float): Start time of the experiment. Logged at class init.
end_time (float): Start time of the experiment. Logged when calling [`GeneralConfigLogger.log_end_time`]
end_time (float): End time of the experiment. Logged when calling [`GeneralConfigLogger.log_end_time`]
total_evaluation_time_secondes (str): Inferred total evaluation time in seconds (from the start and end times).
model_name (str): Name of the currently evaluated model.
model_sha (str): Commit hash of the currently evaluated model on the hub if available.
@@ -72,14 +73,44 @@ def __init__(self) -> None:
self.lighteval_sha = repo.git.rev_parse("HEAD")
self.start_time = time.perf_counter()

def log_args_info(self, num_fewshot_seeds, override_batch_size, max_samples, job_id, config=None) -> None:
def log_args_info(
self,
num_fewshot_seeds: int,
override_batch_size: Union[None, int],
max_samples: Union[None, int],
job_id: str,
config: "BrrrConfig" = None,
) -> None:
"""
Logs the information about the arguments passed to the method.
Args:
num_fewshot_seeds (int): number of few-shot seeds.
override_batch_size (Union[None, int]): overridden batch size.
If strictly positive, its value is used as the batch size for all experiments.
Else, the batch size is automatically inferred depending on what fits in memory.
max_samples (Union[None, int]): maximum number of samples, if None, use all the samples available.
job_id (str): job ID, used to retrieve logs.
config (optional): BrrrConfig
Returns:
None
"""
self.num_fewshot_seeds = num_fewshot_seeds
self.override_batch_size = override_batch_size
self.max_samples = max_samples
self.job_id = job_id
self.config = config

def log_model_info(self, model_info: ModelInfo) -> None:
"""
Logs the model information.
Args:
model_info (ModelInfo): Model information to be logged.
"""
self.model_name = model_info.model_name
self.model_sha = model_info.model_sha
self.model_dtype = model_info.model_dtype
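An illustrative call matching the new log_args_info signature above; the argument values are placeholders, and config would be a BrrrConfig only when running under the nanotron/brrr backend.

from lighteval.logging.info_loggers import GeneralConfigLogger

logger = GeneralConfigLogger()
logger.log_args_info(
    num_fewshot_seeds=1,
    override_batch_size=None,  # None -> batch size inferred from what fits in memory
    max_samples=100,           # None would mean "use every available sample"
    job_id="4242",             # slurm job id, kept purely for log retrieval
    config=None,               # optionally a BrrrConfig
)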
Expand All @@ -102,6 +133,7 @@ class DetailsLogger:
Example: winogrande: [sample1_details, sample2_details, ...]
compiled_details (dict[str, `CompiledDetail`]): Maps each task name to the list of its samples' compiled details.
compiled_details_over_all_tasks (CompiledDetailOverAllTasks): Aggregated details over all the tasks.
"""

@dataclass()
@@ -129,6 +161,7 @@ class Detail:
choices (list): List of the possible choices (for multichoice/loglikelihood evaluations)
gold_index (list): Indices of the gold targets among the [`choices`]
metrics (dict): Metric name to current example score
"""

example: str = ""
@@ -160,9 +193,10 @@ class CompiledDetail:
padded (int): Total number of samples which needed padding during the batching step for the current task.
non_padded (int): Total number of samples which did not need padding during the batching step for the current task.
effective_few_shots (float): Average effective few shots across all samples for the current task.
The effective few shot is the number of few shots actually used to fit the prompt in the model context
effective few shot is the number of few shots actually used to fit the prompt in the model context
length while allowing model generation of the expected size.
num_truncated_few_shots (int): Total number of samples which required truncated prompts to fit the model size for the current task.
"""

hashes: dict = field(default_factory=dict)
Expand All @@ -186,9 +220,10 @@ class CompiledDetailOverAllTasks:
padded (int): Number of samples which needed padding during the batching step across all tasks.
non_padded (int): Number of samples which did not need padding during the batching step across all tasks.
effective_few_shots (float): Average effective few shots across all samples across all tasks.
The effective few shot is the number of few shots actually used to fit the prompt in the model context
effective few shot is the number of few shots actually used to fit the prompt in the model context
length while allowing model generation of the expected size.
num_truncated_few_shots (int): Number of samples which required truncated prompts to fit the model size across all tasks.
"""

hashes: dict = field(default_factory=dict)
@@ -388,7 +423,8 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int =
Args:
task_dict (dict[str, LightevalTask]): used to determine what aggregation function to use for each metric
bootstrap_iters (int, optional): _description_. Defaults to 1000.
bootstrap_iters (int, optional): Number of runs used to run the statistical bootstrap. Defaults to 1000.
"""

for task_name, metrics in self.metrics_values.items():
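For readers unfamiliar with the bootstrap_iters argument documented above, a generic sketch of what a statistical bootstrap over per-sample scores looks like; illustrative only, not lighteval's aggregation code.

import random


def bootstrap_stderr(scores, aggregate=lambda xs: sum(xs) / len(xs), iters=1000):
    # Resample the per-sample scores with replacement `iters` times and measure
    # the spread of the aggregated metric across the resamples.
    estimates = []
    for _ in range(iters):
        resample = [random.choice(scores) for _ in scores]
        estimates.append(aggregate(resample))
    mean = sum(estimates) / iters
    return (sum((e - mean) ** 2 for e in estimates) / (iters - 1)) ** 0.5


print(bootstrap_stderr([1, 0, 1, 1, 0, 1], iters=200))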
@@ -440,6 +476,7 @@ class VersionsLogger:
Attributes:
version (dict[str, int]): Maps the task names with the task versions.
"""

# the versions dict will be a dict of task_name: task_version
@@ -455,6 +492,7 @@ class TaskConfigLogger:
Attributes:
tasks_config (dict[str, TaskConfig]): Maps each task to its associated [`TaskConfig`]
"""

@dataclass
@@ -479,6 +517,7 @@ class TaskConfig:
truncated_num_docs (bool): Whether less than the total number of documents were used
output_regex (str)
frozen (bool)
"""

name: str
19 changes: 4 additions & 15 deletions src/lighteval/metrics/__init__.py
@@ -7,21 +7,12 @@


def apply_target_perplexity_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[str]):
if len(formatted_doc.get_golds()) != 1:
raise ValueError("Target perplexity metric can only be used with one gold reference")
outputs = {}
reference_text = formatted_doc.get_golds()[0]
current_result = results.pop(0)
target_logprob = current_result.result[0]
target_acc = current_result.result[1]
current_results = [results.pop(0) for _ in range(len(formatted_doc.get_golds()))]

for metric in metrics:
if Metrics[metric].value.category == MetricCategory.TARGET_PERPLEXITY:
outputs.update(
Metrics[metric].value.compute(
logprobs=target_logprob, target_acc=target_acc, reference_text=reference_text
)
)
if Metrics[metric].value.category == MetricCategory.PERPLEXITY:
outputs.update(Metrics[metric].value.compute(results=current_results))

return results, outputs

@@ -39,9 +30,7 @@ def apply_perplexity_metric(results: list[ModelReturn], formatted_doc: Doc, metr

for metric in metrics:
if Metrics[metric].value.category == MetricCategory.PERPLEXITY:
outputs.update(
Metrics[metric].value.compute(logprobs=current_result.result, reference_text=reference_text)
)
outputs.update(Metrics[metric].value.compute(results=current_result, reference_text=reference_text))

return results, outputs

2 changes: 1 addition & 1 deletion src/lighteval/metrics/metrics.py
@@ -501,7 +501,7 @@ def higher_is_better():
return res

@staticmethod
def corpus_level_fns():
def corpus_level_fns() -> dict[str, callable]:
res = {}
for metric in Metrics:
if metric.value.category == MetricCategory.IGNORED:
7 changes: 4 additions & 3 deletions src/lighteval/metrics/metrics_sample.py
@@ -275,16 +275,17 @@ def compute(self, choices_logprob: list[float], gold_ixs: list[float], formatted
return 1.0 / (min(ranked_choices) + 1)


def acc_golds_likelihood(target_acc: list[int] | int, **kwargs) -> int:
def acc_golds_likelihood(results: list[tuple[float, int]], **kwargs) -> int:
"""Tests if at least one of predicted gold targets' log-likelihood is above 0.5.
Args:
target_acc (list[int]): List of scores indicating whether the predictions log-probabilities are above 0.5 aggregated.
results (list[int]): List of tuples containing, for each gold, the predictions log-probabilities associated with whether they are above 0.5 aggregated.
formatted_doc (Doc): _description_
Returns:
int: 1 if at least one of the possible golds had a log-likelihood above 0.5.
"""
return max([int(acc_ppl) for acc_ppl in as_list(target_acc)])
return max([int(acc_ppl) for _, acc_ppl in results])
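Worked example of the updated acc_golds_likelihood contract shown above (standalone copy for illustration): results holds one (logprob, above_0.5_flag) tuple per gold target, and the metric is 1 as soon as any gold passes the threshold.

def acc_golds_likelihood(results: list[tuple[float, int]], **kwargs) -> int:
    # Same shape as the function above: keep only the per-gold accuracy flag.
    return max([int(acc_ppl) for _, acc_ppl in results])


print(acc_golds_likelihood([(-1.7, 0), (-0.2, 1)]))  # 1 -> at least one gold above 0.5
print(acc_golds_likelihood([(-2.3, 0), (-1.9, 0)]))  # 0 -> no gold above 0.5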


class ROUGE:
6 changes: 3 additions & 3 deletions src/lighteval/metrics/sample_preparator.py
@@ -106,14 +106,14 @@ def count_units(self, text: str) -> int:
if self.units_type == "bytes":
return len(text.encode("utf-8"))

def prepare(self, logprobs: list[float] | float, reference_text: str, **kwargs):
def prepare(self, results, reference_text, **kwargs):
"""Prepares an individual perplexity example to the format expected by metrics computed at the corpus level (aggregated).
Args:
logprobs (list[float]): List of the logprobabilities computed for each item of the sequence or single aggregated logprob over the sequence
results (list[float]): List of the logprobabilities computed for each item
reference_text (str): Current reference text for which to compute the length in self.units_type
Returns:
PerplexityCorpusMetricInput: Stores the measured logprobs and associated text lengths, counted in the reference unit.
"""
return PerplexityCorpusMetricInput(logprobs=logprobs, weights=self.count_units(reference_text))
return PerplexityCorpusMetricInput(logprobs=results.result, weights=self.count_units(reference_text))
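To make the weighting concrete: prepare pairs the measured logprobs with a reference length counted in units_type units. Only the "bytes" branch of count_units appears in this diff; the "words" variant below is an assumption added for illustration.

def count_units(text: str, units_type: str = "words") -> int:
    # "words" is assumed here; only the "bytes" branch is shown in the diff above.
    if units_type == "words":
        return len(text.split())
    if units_type == "bytes":
        return len(text.encode("utf-8"))
    raise ValueError(f"Unsupported units_type: {units_type}")


print(count_units("the cat sat on the mat"))             # 6 words
print(count_units("héllo", units_type="bytes"))          # 6 bytes ("é" is 2 bytes in utf-8)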