Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes wikitext prompts + some patches on tg models #64

Merged
merged 6 commits into from
Feb 28, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/lighteval/logging/info_loggers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import numpy as np
import xxhash

from lighteval.logging.hierarchical_logger import hlog, hlog_warn
from lighteval.logging.hierarchical_logger import hlog_warn
from lighteval.metrics import MetricCategory
from lighteval.metrics.stderr import get_stderr_function
from lighteval.models.model_loader import ModelInfo
Expand Down Expand Up @@ -440,7 +440,7 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int =
try:
metric_result = task.aggregation()[metric_name](metric_values)
except OverflowError:
hlog(f"{task_name} {metric_name} OVERFLOW ERROR")
hlog_warn(f"{task_name}, {metric_name} got an OVERFLOW ERROR when aggregating.")
metric_result = float("nan")

if isinstance(metric_result, dict): # in which cases do we get a dict here?
Expand Down
4 changes: 2 additions & 2 deletions src/lighteval/metrics/metrics_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,6 @@ def compute(self, items: list[PerplexityCorpusMetricInput]):
if self.metric_type == "perplexity":
return math.exp(-np.mean(logprobs))
if self.metric_type == "weighted_perplexity":
return math.exp(-np.average(logprobs, weights=weights))
return math.exp(-sum(logprobs) / sum(weights))
if self.metric_type == "bits_per_byte":
return -np.average(logprobs, weights=weights) / math.log(2)
return -sum(logprobs) / sum(weights) * 1 / math.log(2)
NathanHB marked this conversation as resolved.
Show resolved Hide resolved
8 changes: 4 additions & 4 deletions src/lighteval/models/endpoint_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,11 +288,10 @@ def loglikelihood(
responses = self.__process_batch_logprob(batch)
for ix, response in enumerate(responses):
len_choice = len(batch[ix].tokenized_continuation)
logits = [t.logprob for t in response.details.prefill[-len_choice:] if t.logprob is not None]
results.append(
LoglikelihoodReturn(
result=[
t.logprob for t in response.details.prefill[-len_choice:] if t.logprob is not None
],
result=sum(logits),
input_tokens=[t.id for t in response.details.prefill[:-len_choice]],
generated_tokens=[t.id for t in response.details.prefill[-len_choice:]],
truncated_tokens_count=-1,
Expand Down Expand Up @@ -329,9 +328,10 @@ def loglikelihood_rolling(
else:
responses = self.__process_batch_logprob(batch, rolling=True)
for response in responses:
logits = [t.logprob for t in response.details.tokens[:-1]]
results.append(
LoglikelihoodReturn(
result=[t.logprob for t in response.details.tokens[:-1]],
result=sum(logits),
input_tokens=[t.id for t in response.details.prefill],
generated_tokens=[t.id for t in response.details.tokens[:-1]],
truncated_tokens_count=-1,
Expand Down
2 changes: 1 addition & 1 deletion src/lighteval/models/model_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@


@dataclass
class ModelReturn: # @clefourrier: could probably an abstract class, but it might make the code too complex
class ModelReturn:
result: Union[tuple, list, str]
input_tokens: list[int] = field(default_factory=list) # model inputs
generated_tokens: list[int] = field(default_factory=list) # model generations
Expand Down
9 changes: 7 additions & 2 deletions src/lighteval/tasks/lighteval_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,10 @@ def _get_docs_from_split(self, splits: list[str], few_shots=False) -> list[Doc]:
# vs when it's used for the actual prompt. That's why we store whether we are currently using the
# doc for a fewshot sample (few_shots=True) or not, which then leads to the creation of a different Doc.
item["__few_shots"] = few_shots
docs.extend(as_list(self.formatter(item, self.name)))
cur_docs = self.formatter(item, self.name)
if cur_docs is None:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why would it be empty? Not sure that's expected behaviour.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did some experiments with the original wikitext, and it contains empty rows, which we want to skip — it should not happen often, though.

continue
docs.extend(as_list(cur_docs))
return docs

def fewshot_docs(self) -> list[Doc]:
Expand Down Expand Up @@ -375,7 +378,9 @@ def construct_requests(
]
if self.has_metric_category[MetricCategory.PERPLEXITY]:
requests[RequestType.LOGLIKELIHOOD_ROLLING] += [
LoglikelihoodRollingRequest(task_name=current_task_name, doc_id=document_id_seed, ctx=context)
LoglikelihoodRollingRequest(
task_name=current_task_name, example_index=document_id_seed, request_index=0, context=context
)
]
if self.has_metric_category[MetricCategory.GENERATIVE]:
requests[RequestType.GREEDY_UNTIL] += [
Expand Down
2 changes: 1 addition & 1 deletion src/lighteval/tasks/tasks_prompt_formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -2065,7 +2065,7 @@ def wikifact(line, task_name: str = None):


def wikitext_103(line, task_name: str = None):
return Doc(task_name=task_name, query=line["text"])
return Doc(task_name=task_name, choices=[""], gold_index=0, query=line["text"])


def winogrande(line, task_name: str = None):
Expand Down
Loading