Add metrics as functions (#214)
---------

Co-authored-by: Hynek Kydlíček <[email protected]>
Co-authored-by: Nathan Habib <[email protected]>
3 people authored Jul 17, 2024
1 parent 733257f commit aaf7e8a
Showing 15 changed files with 3,021 additions and 1,683 deletions.
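In practice, the change is that task configs stop naming metrics with strings and instead receive the metric objects themselves: members of the Metrics enum, or a custom SampleLevelMetric instance. A minimal before/after sketch (the task name, dataset, and prompt function below are placeholders, and the real LightevalTaskConfig may require more arguments than shown):

# Sketch only: the commented-out line is the pre-PR string style, the line
# below it is the new object style introduced by this commit.
from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig

task = LightevalTaskConfig(
    name="my_task",  # hypothetical task name
    prompt_function=lambda line, task_name=None: line,  # placeholder prompt function
    hf_repo="my_org/my_dataset",  # hypothetical dataset
    hf_subset="default",
    evaluation_splits=["test"],
    # metric=["loglikelihood_acc_norm"],      # old: metric referenced by string name
    metric=[Metrics.loglikelihood_acc_norm],  # new: metric passed as a Metrics member
)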
8 changes: 3 additions & 5 deletions community_tasks/_template.py
@@ -68,7 +68,7 @@ def prompt_fn(line, task_name: str = None):
evaluation_splits=[],
few_shots_split="",
few_shots_select="",
metric=[""],
metric=[], # select your metric in Metrics
)

# EVALS WITH SUBSET
@@ -91,7 +91,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=prompt_fn, # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
hf_repo="",
metric=[""],
metric=[custom_metric], # select your metric in Metrics or use your custom_metric
hf_avail_splits=[],
evaluation_splits=[],
few_shots_split="",
@@ -111,16 +111,14 @@ def __init__(

# CUSTOM METRIC IF NEEDED
custom_metric = SampleLevelMetric(
metric="my_custom_metric_name",
metric_name="my_custom_metric_name",
higher_is_better=True,
category=MetricCategory.IGNORED,
use_case=MetricUseCase.NONE,
sample_level_fn=lambda x: x, # how to compute score for one sample
corpus_level_fn=np.mean, # aggregation
)

- extend_enum(Metrics, "my_custom_metric_name", custom_metric)

# MODULE LOGIC
# You should not need to touch this
# Convert to dict for lighteval
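Taken together, the template changes above amount to a new custom-metric workflow: the constructor field is metric_name rather than metric, and the resulting object is passed straight into metric=[...], so the extend_enum(Metrics, ...) registration step is dropped. A sketch assembling those fragments (the import path for SampleLevelMetric, MetricCategory, and MetricUseCase is an assumption; it is not shown in this diff):

import numpy as np

# Import path assumed; adjust to wherever these classes live in your lighteval version.
from lighteval.metrics.utils import MetricCategory, MetricUseCase, SampleLevelMetric

custom_metric = SampleLevelMetric(
    metric_name="my_custom_metric_name",
    higher_is_better=True,
    category=MetricCategory.IGNORED,
    use_case=MetricUseCase.NONE,
    sample_level_fn=lambda x: x,  # how to score one sample
    corpus_level_fn=np.mean,      # how to aggregate sample scores
)

# The object is then used directly in a task config, e.g. metric=[custom_metric];
# no extend_enum(Metrics, ...) registration is needed anymore.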
3 changes: 2 additions & 1 deletion community_tasks/aimo_evals.py
@@ -25,6 +25,7 @@
Task to evaluate LLMs on the training set of the Kaggle AIMO competition: https://www.kaggle.com/competitions/ai-mathematical-olympiad-prize
"""

+ from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc

@@ -48,7 +49,7 @@ def aimo_prompt(line, task_name: str = None):
evaluation_splits=["train"],
few_shots_split="train",
few_shots_select="sequential",
metric=["quasi_exact_match_math"],
metric=[Metrics.quasi_exact_match_math],
generation_size=2048,
stop_sequence=None,
)
31 changes: 16 additions & 15 deletions community_tasks/arabic_evals.py
@@ -29,6 +29,7 @@
import random
import re

+ from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
@@ -86,7 +87,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=mmlu_arabic,
hf_repo="OALL/Arabic_MMLU",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
hf_avail_splits=["test", "dev"],
evaluation_splits=["test"],
few_shots_split="dev",
@@ -143,7 +144,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=acva,
hf_repo="OALL/ACVA",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
hf_avail_splits=["test", "validation"],
evaluation_splits=["test"],
few_shots_split="validation",
@@ -195,7 +196,7 @@ def arabic_exams(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -245,7 +246,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=alghafa_prompt,
hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
hf_avail_splits=["test", "validation"],
evaluation_splits=["test"],
few_shots_split="validation",
@@ -273,7 +274,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -290,7 +291,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -307,7 +308,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -324,7 +325,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -341,7 +342,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -358,7 +359,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -400,7 +401,7 @@ def boolq_prompt_arabic(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -436,7 +437,7 @@ def copa_prompt_arabic(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -481,7 +482,7 @@ def hellaswag_prompt_arabic(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -519,7 +520,7 @@ def toxigen_prompt_arabic(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -571,7 +572,7 @@ def sciq_prompt_arabic(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
9 changes: 5 additions & 4 deletions community_tasks/german_rag_evals.py
@@ -30,6 +30,7 @@
See: https://huggingface.co/datasets/deutsche-telekom/Ger-RAG-eval
"""

+ from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc

@@ -161,7 +162,7 @@ def prompt_fn_context_question_match(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="test",
few_shots_select="sequential",
metric=["loglikelihood_acc"],
metric=[Metrics.loglikelihood_acc],
version=1,
)

@@ -178,7 +179,7 @@ def prompt_fn_context_question_match(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="test",
few_shots_select="sequential",
metric=["loglikelihood_acc"],
metric=[Metrics.loglikelihood_acc],
version=1,
)

@@ -196,7 +197,7 @@ def prompt_fn_context_question_match(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="test",
few_shots_select="sequential",
metric=["loglikelihood_acc"],
metric=[Metrics.loglikelihood_acc],
version=1,
)

@@ -213,7 +214,7 @@ def prompt_fn_context_question_match(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="test",
few_shots_select="sequential",
metric=["loglikelihood_acc"],
metric=[Metrics.loglikelihood_acc],
version=1,
)

18 changes: 9 additions & 9 deletions examples/nanotron/custom_evaluation_tasks.py
@@ -88,7 +88,7 @@ def preprocess(text):
prompt_function=hellaswag_prompt,
hf_repo="hellaswag",
hf_subset="default",
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
trust_dataset=True,
stop_sequence=["\n"],
),
@@ -97,7 +97,7 @@ def preprocess(text):
prompt_function=prompt.winogrande,
hf_repo="winogrande",
hf_subset="winogrande_xl",
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
trust_dataset=True,
stop_sequence=["\n"],
),
@@ -106,7 +106,7 @@ def preprocess(text):
prompt_function=prompt.piqa_harness,
hf_repo="piqa",
hf_subset="plain_text",
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
trust_dataset=True,
stop_sequence=["\n"],
),
@@ -116,7 +116,7 @@ def preprocess(text):
hf_repo="lighteval/siqa",
hf_subset="default",
hf_avail_splits=["train", "validation"],
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
trust_dataset=True,
stop_sequence=["\n"],
),
@@ -125,7 +125,7 @@ def preprocess(text):
prompt_function=prompt.openbookqa,
hf_repo="openbookqa",
hf_subset="main",
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
trust_dataset=True,
stop_sequence=["\n"],
),
@@ -136,7 +136,7 @@ def preprocess(text):
hf_subset="ARC-Easy",
evaluation_splits=["test"],
generation_size=1,
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
trust_dataset=True,
stop_sequence=["\n"],
),
@@ -147,7 +147,7 @@ def preprocess(text):
hf_subset="ARC-Challenge",
evaluation_splits=["test"],
generation_size=1,
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
trust_dataset=True,
stop_sequence=["\n"],
),
@@ -156,7 +156,7 @@ def preprocess(text):
prompt_function=commonsense_qa_prompt,
hf_repo="commonsense_qa",
hf_subset="default",
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
trust_dataset=True,
stop_sequence=["\n"],
),
@@ -226,7 +226,7 @@ def boolq_prompt(line, task_name: str = None):
prompt_function=boolq_prompt,
hf_repo="super_glue",
hf_subset="boolq",
metric=["target_perplexity"],
metric=[Metrics.target_perplexity],
trust_dataset=True,
stop_sequence=["\n"],
),
5 changes: 3 additions & 2 deletions examples/nanotron/custom_task.py
@@ -20,6 +20,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

+ from lighteval.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig


@@ -79,7 +80,7 @@ def mmlu_anatomy(line):
few_shots_split="dev",
few_shots_select="sequential",
generation_size=5,
metric=["loglikelihood_acc_single_token"],
metric=[Metrics.loglikelihood_acc_single_token],
stop_sequence=["\n"],
output_regex=None,
frozen=False,
@@ -95,7 +96,7 @@ def mmlu_anatomy(line):
few_shots_split="dev",
few_shots_select="sequential",
generation_size=5,
metric=["loglikelihood_acc_single_token"],
metric=[Metrics.loglikelihood_acc_single_token],
stop_sequence=["\n"],
output_regex=None,
frozen=False,
3 changes: 3 additions & 0 deletions src/lighteval/logging/evaluation_tracker.py
@@ -27,6 +27,7 @@
import time
from dataclasses import asdict, is_dataclass
from datetime import datetime
+ from enum import Enum
from pathlib import Path

from datasets import Dataset, load_dataset
@@ -59,6 +60,8 @@ def default(self, o):
return asdict(o)
if callable(o):
return o.__name__
+ if isinstance(o, Enum):
+ return o.name
return super().default(o)


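The encoder change above is needed because serialized task configs can now contain Enum members (such as Metrics values) rather than plain strings, and the standard json module cannot encode those. A minimal, self-contained sketch of the added behaviour, using a stand-in enum and encoder rather than the tracker's actual classes:

import json
from enum import Enum


class ExampleMetric(Enum):  # hypothetical enum standing in for lighteval's Metrics
    loglikelihood_acc_norm = "loglikelihood_acc_norm"


class EnumAwareEncoder(json.JSONEncoder):
    def default(self, o):
        # Mirrors the added branch: Enum members are serialized by their name.
        if isinstance(o, Enum):
            return o.name
        return super().default(o)


print(json.dumps({"metric": ExampleMetric.loglikelihood_acc_norm}, cls=EnumAwareEncoder))
# prints: {"metric": "loglikelihood_acc_norm"}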