
Commit

Merge branch 'fix-target-perplexity' of https://github.com/huggingface/lighteval into fix-target-perplexity
thomwolf committed Feb 5, 2024
2 parents 1e7eb76 + 50c18f9 commit 15d14c9
Showing 6 changed files with 25 additions and 18 deletions.
10 changes: 5 additions & 5 deletions README.md
@@ -8,7 +8,7 @@ LightEval is an evaluation suite which gathers a selection of features from wide

It is still an early, internal version - it should be nice to use but don't expect 100% stability!

In case of problems or questions, feel free to open an issue!

## How to install and use
### Requirements
@@ -50,11 +50,11 @@ Lastly, create a **line summary** of your evaluation, in `metadata_table.json`.
- `suite` (list), the suite(s) to which your evaluation should belong. This field allows us to compare different task implementations and is used as a task selection to differentiate the versions to launch. At the moment, you'll find the keywords ["helm", "bigbench", "original", "lighteval"]; you can also add new ones (for tests, we recommend using "custom").
- `prompt_function` (str), the name of the prompt function you defined in the step above
- `hf_repo` (str), the path to your evaluation dataset on the hub
- `hf_subset` (str), the specific subset you want to use for your evaluation (note: when the dataset has no subset, fill this field with `"default"`, not with `None` or `""`)
- `hf_avail_splits` (list), all the splits available for your dataset (train, valid or validation, test, other...)
- `evaluation_splits` (list), the splits you want to use for evaluation
- `few_shots_split` (str, can be `null`), the specific split from which you want to select samples for your few-shot examples. It should be different from the sets included in `evaluation_splits`
- `few_shots_select` (str, can be `null`), the method that you will use to select items for your few-shot examples. Can be `null`, or one of:
- `balanced` selects examples from the `few_shots_split` with balanced labels, to avoid skewing the few shot examples (hence the model generations) towards one specific label
- `random` selects examples at random from the `few_shots_split`
- `random_sampling` selects new examples at random from the `few_shots_split` for every new item, but if a sampled item is equal to the current one, it is removed from the available samples
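
To make the field list above concrete, here is a hedged sketch of what a single entry might look like. It is written as a Python dict for illustration (the real `metadata_table.json` holds the equivalent JSON objects), the dataset path, prompt function name, and splits are hypothetical, and actual entries carry additional fields not shown in this excerpt.

```python
# Hypothetical line summary, shown as a Python dict; in metadata_table.json this
# would be the equivalent JSON object (with null in place of None).
example_entry = {
    "prompt_function": "my_custom_prompt",    # name of the prompt function defined earlier (hypothetical)
    "suite": ["custom"],                      # "custom" is the suite recommended for tests
    "hf_repo": "my-org/my-eval-dataset",      # hypothetical dataset path on the hub
    "hf_subset": "default",                   # "default" when the dataset has no subset
    "hf_avail_splits": ["train", "validation", "test"],
    "evaluation_splits": ["test"],
    "few_shots_split": "validation",          # must differ from the evaluation splits
    "few_shots_select": "balanced",           # or "random", "random_sampling", ..., or None
}
```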
@@ -102,7 +102,7 @@ These metrics need the model to generate an output. They are therefore slower.
- `exact_match_indicator`: Exact match with some preceding context (before an indicator) removed
- `f1_score_quasi` (HELM): Average F1 score in terms of word overlap between the model output and gold, with both being normalized first
- `f1_score`: Average F1 score in terms of word overlap between the model output and gold without normalisation
- `f1_score_macro`: Corpus level macro F1 score
- `f1_score_micro`: Corpus level micro F1 score
- Summarization:
- `rouge` (Harness): Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/)
@@ -141,7 +141,7 @@ These metrics need both the generation and its logprob. They are not working at
- `prediction_perplexity` (HELM): Measure of the logprob of a given input.
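
As a rough, hedged illustration of how perplexity-style metrics combine per-sample logprobs with the length of the reference text (the same pairing that `PerplexityCorpusMetricInput` in `sample_preparator.py` below stores as `logprobs` and `weights`), here is a minimal corpus-level aggregation; the function and variable names are illustrative, not lighteval's API.

```python
import math

def corpus_perplexity(logprob_sums: list[float], unit_counts: list[int]) -> float:
    """Exponentiate the negative average logprob per unit (word, byte, ...) over the corpus."""
    total_logprob = sum(logprob_sums)  # one summed logprob per document
    total_units = sum(unit_counts)     # e.g. word or byte count of each reference text
    return math.exp(-total_logprob / total_units)

# Two documents with their summed logprobs and reference lengths in words.
print(corpus_perplexity([-12.3, -8.1], [10, 7]))  # ~3.32
```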

## Adding a new metric
If you want to add a new metric, first check if you can use one of the parametrized functions in `src.lighteval.metrics.metrics_corpus` or `metrics_sample`. If not, add it to either of these files depending on the level at which it is applied. Then, follow the example in `src.lighteval.metrics.metrics` to register your metric.
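
Below is a minimal sketch of what a sample-level metric function can look like, following the plain-function pattern of `acc_golds_likelihood` shown further down in this diff (keyword arguments plus `**kwargs`, returning a number). The name and logic are made up, and the registration step in `src.lighteval.metrics.metrics` is not reproduced here; follow the existing examples in that file.

```python
# Hypothetical sample-level metric: scores 1 if any prediction is non-empty.
# Registration in src.lighteval.metrics.metrics is not shown here.
def prediction_is_nonempty(predictions: list[str], **kwargs) -> int:
    """Returns 1 if at least one model prediction contains non-whitespace text."""
    return int(any(pred.strip() for pred in predictions))
```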

## Examples of scripts to launch lighteval on the cluster
### Evaluate a whole suite on one node, 8 GPUs
14 changes: 11 additions & 3 deletions src/lighteval/metrics/__init__.py
@@ -8,12 +8,18 @@

 def apply_target_perplexity_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[str]):
     outputs = {}
-    reference_text = formatted_doc.get_golds()[0]
     current_result = results.pop(0)
+    reference_text = formatted_doc.choices[formatted_doc.gold_index]
+    target_logprob = current_result.result[0]
+    target_acc = current_result.result[1]

     for metric in metrics:
         if Metrics[metric].value.category == MetricCategory.TARGET_PERPLEXITY:
-            outputs.update(Metrics[metric].value.compute(results=current_result, reference_text=reference_text))
+            outputs.update(
+                Metrics[metric].value.compute(
+                    logprobs=target_logprob, target_acc=target_acc, reference_text=reference_text
+                )
+            )

     return results, outputs
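
As a hedged reading of the change above: the model return for a target-perplexity request is assumed to carry the summed logprob of the gold continuation in `result[0]` and a 0/1 accuracy flag in `result[1]`, and these are now forwarded to the metric as explicit keyword arguments instead of the whole `ModelReturn`. A toy stand-in for such a metric's `compute`, not lighteval's actual implementation:

```python
# Toy TARGET_PERPLEXITY-style compute(); names and returned keys are illustrative only.
def compute(logprobs: float, target_acc: int, reference_text: str, **kwargs) -> dict:
    return {
        "target_logprob": logprobs,                   # summed logprob of the gold continuation
        "acc": target_acc,                            # 0/1 accuracy flag from the model return
        "target_words": len(reference_text.split()),  # e.g. to normalise by gold length
    }

print(compute(logprobs=-4.2, target_acc=1, reference_text="the gold answer"))
# {'target_logprob': -4.2, 'acc': 1, 'target_words': 3}
```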

@@ -31,7 +37,9 @@ def apply_perplexity_metric(results: list[ModelReturn], formatted_doc: Doc, metr

     for metric in metrics:
         if Metrics[metric].value.category == MetricCategory.PERPLEXITY:
-            outputs.update(Metrics[metric].value.compute(results=current_result, reference_text=reference_text))
+            outputs.update(
+                Metrics[metric].value.compute(logprobs=current_result.result, reference_text=reference_text)
+            )

     return results, outputs

7 changes: 3 additions & 4 deletions src/lighteval/metrics/metrics_sample.py
@@ -275,17 +275,16 @@ def compute(self, choices_logprob: list[float], gold_ixs: list[float], formatted
         return 1.0 / (min(ranked_choices) + 1)


-def acc_golds_likelihood(results: list[tuple[float, int]], **kwargs) -> int:
+def acc_golds_likelihood(target_acc: list[int] | int, **kwargs) -> int:
     """Tests if at least one of predicted gold targets' log-likelihood is above 0.5.

     Args:
-        results (list[int]): List of tuples containing, for each gold, the predictions log-probabilities associated with whether they are above 0.5 aggregated.
-        formatted_doc (Doc): _description_
+        target_acc (list[int]): List of scores indicating whether the predictions log-probabilities are above 0.5 aggregated.

     Returns:
         int: 1 if at least one of the possible golds had a log-likelihood above 0.5.
     """
-    return max([int(acc_ppl) for _, acc_ppl in results])
+    return max([int(acc_ppl) for acc_ppl in as_list(target_acc)])
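
A quick usage sketch of the new signature, assuming `as_list` wraps a scalar into a one-element list and passes lists through unchanged:

```python
acc_golds_likelihood(target_acc=[0, 1])  # -> 1: at least one gold scored above the 0.5 threshold
acc_golds_likelihood(target_acc=0)       # -> 0: the single gold did not
```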


class ROUGE:
6 changes: 3 additions & 3 deletions src/lighteval/metrics/sample_preparator.py
@@ -106,14 +106,14 @@ def count_units(self, text: str) -> int:
         if self.units_type == "bytes":
             return len(text.encode("utf-8"))

-    def prepare(self, results, reference_text, **kwargs):
+    def prepare(self, logprobs: list[float] | float, reference_text: str, **kwargs):
         """Prepares an individual perplexity example to the format expected by metrics computed at the corpus level (aggregated).

         Args:
-            results (list[float]): List of the logprobabilities computed for each item
+            logprobs (list[float]): List of the logprobabilities computed for each item of the sequence or single aggregated logprob over the sequence
             reference_text (str): Current reference text for which to compute the length in self.units_type

         Returns:
             PerplexityCorpusMetricInput: Stores the measured logprobs and associated text lengths, counted in the reference unit.
         """
-        return PerplexityCorpusMetricInput(logprobs=results.result, weights=self.count_units(reference_text))
+        return PerplexityCorpusMetricInput(logprobs=logprobs, weights=self.count_units(reference_text))
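
A hedged usage sketch of the updated `prepare` signature; the class name `PerplexityPreparator` and its constructor argument are assumptions, since only `prepare` and `count_units` appear in this excerpt.

```python
# Assumes the preparator can be built with units_type="bytes" (constructor not shown in this diff).
preparator = PerplexityPreparator(units_type="bytes")
preparator.prepare(logprobs=[-1.2, -0.8, -2.1], reference_text="the quick brown fox")
# -> PerplexityCorpusMetricInput(logprobs=[-1.2, -0.8, -2.1], weights=19)
#    19 = len("the quick brown fox".encode("utf-8")), per count_units above
```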
2 changes: 1 addition & 1 deletion tasks_examples/open_llm_leaderboard_tasks.txt
@@ -57,4 +57,4 @@ lighteval|mmlu:security_studies|5|0
lighteval|mmlu:sociology|5|0
lighteval|mmlu:us_foreign_policy|5|0
lighteval|mmlu:virology|5|0
lighteval|mmlu:world_religions|5|0
4 changes: 2 additions & 2 deletions tests/reference_scores/harness_metrics.json
Git LFS file not shown
