Add metrics as functions (#214)
---------

Co-authored-by: Hynek Kydlíček <[email protected]>
Co-authored-by: Nathan Habib <[email protected]>
3 people authored Jul 17, 2024
1 parent 733257f commit aaf7e8a
Showing 15 changed files with 3,021 additions and 1,683 deletions.
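In practice, the change is that task configs stop naming metrics with strings and instead receive the metric objects themselves: members of the Metrics enum, or a custom SampleLevelMetric instance. A minimal before/after sketch (the task name, dataset, and prompt function below are placeholders, and the real LightevalTaskConfig may require more arguments than shown):

# Sketch only: the commented-out line is the pre-PR string style, the line
# below it is the new object style introduced by this commit.
from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig

task = LightevalTaskConfig(
    name="my_task",  # hypothetical task name
    prompt_function=lambda line, task_name=None: line,  # placeholder prompt function
    hf_repo="my_org/my_dataset",  # hypothetical dataset
    hf_subset="default",
    evaluation_splits=["test"],
    # metric=["loglikelihood_acc_norm"],      # old: metric referenced by string name
    metric=[Metrics.loglikelihood_acc_norm],  # new: metric passed as a Metrics member
)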
8 changes: 3 additions & 5 deletions community_tasks/_template.py
@@ -68,7 +68,7 @@ def prompt_fn(line, task_name: str = None):
evaluation_splits=[],
few_shots_split="",
few_shots_select="",
metric=[""],
metric=[], # select your metric in Metrics
)

# EVALS WITH SUBSET
@@ -91,7 +91,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=prompt_fn, # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
hf_repo="",
metric=[""],
metric=[custom_metric], # select your metric in Metrics or use your custom_metric
hf_avail_splits=[],
evaluation_splits=[],
few_shots_split="",
@@ -111,16 +111,14 @@ def __init__(

# CUSTOM METRIC IF NEEDED
custom_metric = SampleLevelMetric(
metric="my_custom_metric_name",
metric_name="my_custom_metric_name",
higher_is_better=True,
category=MetricCategory.IGNORED,
use_case=MetricUseCase.NONE,
sample_level_fn=lambda x: x, # how to compute score for one sample
corpus_level_fn=np.mean, # aggregation
)

- extend_enum(Metrics, "my_custom_metric_name", custom_metric)

# MODULE LOGIC
# You should not need to touch this
# Convert to dict for lighteval
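Taken together, the template changes above amount to a new custom-metric workflow: the constructor field is metric_name rather than metric, and the resulting object is passed straight into metric=[...], so the extend_enum(Metrics, ...) registration step is dropped. A sketch assembling those fragments (the import path for SampleLevelMetric, MetricCategory, and MetricUseCase is an assumption; it is not shown in this diff):

import numpy as np

# Import path assumed; adjust to wherever these classes live in your lighteval version.
from lighteval.metrics.utils import MetricCategory, MetricUseCase, SampleLevelMetric

custom_metric = SampleLevelMetric(
    metric_name="my_custom_metric_name",
    higher_is_better=True,
    category=MetricCategory.IGNORED,
    use_case=MetricUseCase.NONE,
    sample_level_fn=lambda x: x,  # how to score one sample
    corpus_level_fn=np.mean,      # how to aggregate sample scores
)

# The object is then used directly in a task config, e.g. metric=[custom_metric];
# no extend_enum(Metrics, ...) registration is needed anymore.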
3 changes: 2 additions & 1 deletion community_tasks/aimo_evals.py
@@ -25,6 +25,7 @@
Task to evaluate LLMs on the training set of the Kaggle AIMO competition: https://www.kaggle.com/competitions/ai-mathematical-olympiad-prize
"""

+ from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc

@@ -48,7 +49,7 @@ def aimo_prompt(line, task_name: str = None):
evaluation_splits=["train"],
few_shots_split="train",
few_shots_select="sequential",
metric=["quasi_exact_match_math"],
metric=[Metrics.quasi_exact_match_math],
generation_size=2048,
stop_sequence=None,
)
31 changes: 16 additions & 15 deletions community_tasks/arabic_evals.py
@@ -29,6 +29,7 @@
import random
import re

+ from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
@@ -86,7 +87,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=mmlu_arabic,
hf_repo="OALL/Arabic_MMLU",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
hf_avail_splits=["test", "dev"],
evaluation_splits=["test"],
few_shots_split="dev",
@@ -143,7 +144,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=acva,
hf_repo="OALL/ACVA",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
hf_avail_splits=["test", "validation"],
evaluation_splits=["test"],
few_shots_split="validation",
@@ -195,7 +196,7 @@ def arabic_exams(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -245,7 +246,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=alghafa_prompt,
hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
hf_avail_splits=["test", "validation"],
evaluation_splits=["test"],
few_shots_split="validation",
@@ -273,7 +274,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -290,7 +291,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -307,7 +308,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -324,7 +325,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -341,7 +342,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -358,7 +359,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -400,7 +401,7 @@ def boolq_prompt_arabic(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -436,7 +437,7 @@ def copa_prompt_arabic(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -481,7 +482,7 @@ def hellaswag_prompt_arabic(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -519,7 +520,7 @@ def toxigen_prompt_arabic(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -571,7 +572,7 @@ def sciq_prompt_arabic(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
9 changes: 5 additions & 4 deletions community_tasks/german_rag_evals.py
@@ -30,6 +30,7 @@
See: https://huggingface.co/datasets/deutsche-telekom/Ger-RAG-eval
"""

+ from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc

@@ -161,7 +162,7 @@ def prompt_fn_context_question_match(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="test",
few_shots_select="sequential",
metric=["loglikelihood_acc"],
metric=[Metrics.loglikelihood_acc],
version=1,
)

@@ -178,7 +179,7 @@ def prompt_fn_context_question_match(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="test",
few_shots_select="sequential",
metric=["loglikelihood_acc"],
metric=[Metrics.loglikelihood_acc],
version=1,
)

@@ -196,7 +197,7 @@ def prompt_fn_context_question_match(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="test",
few_shots_select="sequential",
metric=["loglikelihood_acc"],
metric=[Metrics.loglikelihood_acc],
version=1,
)

@@ -213,7 +214,7 @@ def prompt_fn_context_question_match(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="test",
few_shots_select="sequential",
metric=["loglikelihood_acc"],
metric=[Metrics.loglikelihood_acc],
version=1,
)

18 changes: 9 additions & 9 deletions examples/nanotron/custom_evaluation_tasks.py
@@ -88,7 +88,7 @@ def preprocess(text):
prompt_function=hellaswag_prompt,
hf_repo="hellaswag",
hf_subset="default",
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
trust_dataset=True,
stop_sequence=["\n"],
),
@@ -97,7 +97,7 @@ def preprocess(text):
prompt_function=prompt.winogrande,
hf_repo="winogrande",
hf_subset="winogrande_xl",
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
trust_dataset=True,
stop_sequence=["\n"],
),
@@ -106,7 +106,7 @@ def preprocess(text):
prompt_function=prompt.piqa_harness,
hf_repo="piqa",
hf_subset="plain_text",
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
trust_dataset=True,
stop_sequence=["\n"],
),
@@ -116,7 +116,7 @@ def preprocess(text):
hf_repo="lighteval/siqa",
hf_subset="default",
hf_avail_splits=["train", "validation"],
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
trust_dataset=True,
stop_sequence=["\n"],
),
@@ -125,7 +125,7 @@ def preprocess(text):
prompt_function=prompt.openbookqa,
hf_repo="openbookqa",
hf_subset="main",
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
trust_dataset=True,
stop_sequence=["\n"],
),
@@ -136,7 +136,7 @@ def preprocess(text):
hf_subset="ARC-Easy",
evaluation_splits=["test"],
generation_size=1,
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
trust_dataset=True,
stop_sequence=["\n"],
),
@@ -147,7 +147,7 @@ def preprocess(text):
hf_subset="ARC-Challenge",
evaluation_splits=["test"],
generation_size=1,
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
trust_dataset=True,
stop_sequence=["\n"],
),
@@ -156,7 +156,7 @@ def preprocess(text):
prompt_function=commonsense_qa_prompt,
hf_repo="commonsense_qa",
hf_subset="default",
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
trust_dataset=True,
stop_sequence=["\n"],
),
@@ -226,7 +226,7 @@ def boolq_prompt(line, task_name: str = None):
prompt_function=boolq_prompt,
hf_repo="super_glue",
hf_subset="boolq",
metric=["target_perplexity"],
metric=[Metrics.target_perplexity],
trust_dataset=True,
stop_sequence=["\n"],
),
5 changes: 3 additions & 2 deletions examples/nanotron/custom_task.py
@@ -20,6 +20,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

+ from lighteval.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig


@@ -79,7 +80,7 @@ def mmlu_anatomy(line):
few_shots_split="dev",
few_shots_select="sequential",
generation_size=5,
metric=["loglikelihood_acc_single_token"],
metric=[Metrics.loglikelihood_acc_single_token],
stop_sequence=["\n"],
output_regex=None,
frozen=False,
@@ -95,7 +96,7 @@ def mmlu_anatomy(line):
few_shots_split="dev",
few_shots_select="sequential",
generation_size=5,
metric=["loglikelihood_acc_single_token"],
metric=[Metrics.loglikelihood_acc_single_token],
stop_sequence=["\n"],
output_regex=None,
frozen=False,
3 changes: 3 additions & 0 deletions src/lighteval/logging/evaluation_tracker.py
@@ -27,6 +27,7 @@
import time
from dataclasses import asdict, is_dataclass
from datetime import datetime
+ from enum import Enum
from pathlib import Path

from datasets import Dataset, load_dataset
@@ -59,6 +60,8 @@ def default(self, o):
return asdict(o)
if callable(o):
return o.__name__
+ if isinstance(o, Enum):
+ return o.name
return super().default(o)


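The encoder change above is needed because serialized task configs can now contain Enum members (such as Metrics values) rather than plain strings, and the standard json module cannot encode those. A minimal, self-contained sketch of the added behaviour, using a stand-in enum and encoder rather than the tracker's actual classes:

import json
from enum import Enum


class ExampleMetric(Enum):  # hypothetical enum standing in for lighteval's Metrics
    loglikelihood_acc_norm = "loglikelihood_acc_norm"


class EnumAwareEncoder(json.JSONEncoder):
    def default(self, o):
        # Mirrors the added branch: Enum members are serialized by their name.
        if isinstance(o, Enum):
            return o.name
        return super().default(o)


print(json.dumps({"metric": ExampleMetric.loglikelihood_acc_norm}, cls=EnumAwareEncoder))
# prints: {"metric": "loglikelihood_acc_norm"}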