diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 441ff70ad..7ec4856f8 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -37,5 +37,4 @@ repos:
     rev: 'v0.1.6'
     hooks:
       - id: ruff
-        args: ['--fix']
       - id: ruff-format
diff --git a/src/lighteval/logging/__init__.py b/src/lighteval/logging/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py
index e7628f845..0515af461 100644
--- a/src/lighteval/logging/evaluation_tracker.py
+++ b/src/lighteval/logging/evaluation_tracker.py
@@ -556,7 +556,7 @@ def push_results_to_tensorboard(  # noqa: C901

         tb_context.close()  # flushes the unfinished write operations
         time.sleep(5)
-        files = os.listdir(str(output_dir_tb))
+        files = os.listdir(output_dir_tb)
         for file in files:
             os.rename(os.path.join(output_dir_tb, file), os.path.join(output_dir_tb, f"{global_step:07d}_{file}"))

@@ -566,3 +566,5 @@
             f"Pushed to tensorboard at https://huggingface.co/tensorboard/{lighteval_config.logging.hub_repo_tensorboard}/"
             f" at {output_dir_tb} and global_step {global_step}"
         )
+        # except Exception as e:
+        #     logger.warning(f"Could not push to tensorboard\n{e}")
diff --git a/src/lighteval/models/__init__.py b/src/lighteval/models/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/src/lighteval/tasks/__init__.py b/src/lighteval/tasks/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index 76be14215..e16963a9c 100644
--- a/src/lighteval/tasks/lighteval_task.py
+++ b/src/lighteval/tasks/lighteval_task.py
@@ -1,6 +1,5 @@
 import collections
 import random
-from dataclasses import dataclass
 from multiprocessing import Pool
 from pathlib import Path
 from typing import TYPE_CHECKING, List, Optional, Tuple
@@ -40,42 +39,6 @@
 from lighteval.logging.evaluation_tracker import EvaluationTracker


-@dataclass
-class CustomEvaluationTaskConfig:
-    name: str
-    prompt_function: str
-    hf_repo: str
-    hf_subset: str
-    metric: Tuple[Metrics]
-    hf_avail_splits: Optional[Tuple[str]] = None
-    evaluation_splits: Optional[Tuple[str]] = None
-    few_shots_split: Optional[str] = None
-    few_shots_select: Optional[str] = None
-    generation_size: int = -1
-    stop_sequence: Optional[Tuple[str]] = None
-    output_regex: Optional[str] = None
-
-    frozen: bool = False
-    suite: Optional[Tuple[str]] = None  # we use this to know if we should use a custom lighteval or bigcode task
-
-    def __post_init__(self):
-        if self.suite is None:
-            self.suite = ["custom"]
-        if self.hf_avail_splits is None:
-            self.hf_avail_splits = ["train", "validation", "test"]
-        if self.evaluation_splits is None:
-            self.evaluation_splits = ["validation"]
-        if self.stop_sequence is None:
-            self.stop_sequence = ["\n"]
-
-        # Convert list to tuple for hashing
-        self.metric = tuple(self.metric)
-        self.hf_avail_splits = tuple(self.hf_avail_splits) if self.hf_avail_splits else None
-        self.evaluation_splits = tuple(self.evaluation_splits) if self.evaluation_splits else None
-        self.suite = tuple(self.suite) if self.suite else None
-        self.stop_sequence = tuple(self.stop_sequence) if self.stop_sequence else None
-
-
 class LightevalTask:
     def __init__(self, name: str, cfg: dict, cache_dir: Optional[str] = None, custom_tasks_module=None):
         """
diff --git a/tasks_examples/custom_tasks/custom_evaluation_tasks.py b/tasks_examples/custom_tasks/custom_evaluation_tasks.py
index 296db3720..b0dae200c 100644
--- a/tasks_examples/custom_tasks/custom_evaluation_tasks.py
+++ b/tasks_examples/custom_tasks/custom_evaluation_tasks.py
@@ -6,41 +6,44 @@
 """
 import re
 from dataclasses import asdict
-from typing import Dict, List, Tuple
+from typing import Dict, List

-from lighteval.metrics import MetricCategory, Metrics
-from lighteval.tasks.lighteval_task import CustomEvaluationTaskConfig
 from lighteval.tasks.requests import Doc
-from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES

+from .custom_evaluation_utils import *

-_TASKS_STRINGS: List[Tuple[CustomEvaluationTaskConfig, str]] = []
-_TASKS: List[CustomEvaluationTaskConfig] = []
+
+# fmt: off
+LETTER_INDICES = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]
+# fmt: on
+
+_TASKS_STRINGS: List[Tuple[CustomEvaluationTask, str]] = []
+_TASKS: List[CustomEvaluationTask] = []

 ## COMMON_SENSE_REASONING_TASKS ##
 COMMON_SENSE_REASONING_TASKS = [
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="hellaswag",
         prompt_function="hellaswag_prompt",
         hf_repo="hellaswag",
         hf_subset="default",
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="winogrande",
         prompt_function="winogrande",
         hf_repo="winogrande",
         hf_subset="winogrande_xl",
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="piqa",
         prompt_function="piqa_harness",
         hf_repo="piqa",
         hf_subset="plain_text",
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="siqa",
         prompt_function="siqa_prompt",
         hf_repo="lighteval/siqa",
@@ -48,14 +51,14 @@
         hf_avail_splits=["train", "validation"],
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="openbookqa",
         prompt_function="openbookqa",
         hf_repo="openbookqa",
         hf_subset="main",
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="arc:easy",
         prompt_function="arc",
         hf_repo="ai2_arc",
@@ -64,7 +67,7 @@
         generation_size=1,
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="arc:challenge",
         prompt_function="arc",
         hf_repo="ai2_arc",
@@ -73,7 +76,7 @@
         generation_size=1,
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="commonsense_qa",
         prompt_function="commonsense_qa_prompt",
         hf_repo="commonsense_qa",
@@ -131,7 +134,7 @@ def preprocess(text):

 ## WORLD_KNOWLEDGE_TASKS ##
 WORLD_KNOWLEDGE_TASKS = [
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="trivia_qa",
         prompt_function="triviaqa",
         hf_repo="trivia_qa",
@@ -140,7 +143,7 @@ def preprocess(text):
         generation_size=20,
         stop_sequence=["\n", ".", ","],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="natural_questions",
         prompt_function="natural_questions_prompt",
         hf_repo="lighteval/natural_questions_clean",
@@ -170,14 +173,14 @@ def natural_questions_prompt(line, task_name: str = None):

 ## Reading comprehension ##
 READING_COMP_TASKS = [
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="super_glue:boolq",
         prompt_function="boolq_prompt",
         hf_repo="super_glue",
         hf_subset="boolq",
         metric=["target_perplexity"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="quac",
         prompt_function="quac",
         hf_repo="lighteval/quac_helm",
@@ -204,7 +207,7 @@ def boolq_prompt(line, task_name: str = None):
 ## MATH ##


-class CustomMathEvaluationTask(CustomEvaluationTaskConfig):
+class CustomMathEvaluationTask(CustomEvaluationTask):
     """Custom class for math tasks with all the defaults set"""

     def __init__(
@@ -251,7 +254,7 @@ def __init__(
     CustomMathEvaluationTask(name="math:prealgebra", hf_subset="prealgebra"),
     CustomMathEvaluationTask(name="math:precalculus", hf_subset="precalculus"),
 ]
-GSM8K = CustomEvaluationTaskConfig(
+GSM8K = CustomEvaluationTask(
     name="gsm8k",
     prompt_function="gsm8k",
     hf_repo="gsm8k",
@@ -272,7 +275,7 @@ def __init__(
 ## MMLU ##


-class CustomMMLUEvaluationTask(CustomEvaluationTaskConfig):
+class CustomMMLUEvaluationTask(CustomEvaluationTask):
     def __init__(
         self,
         name,
@@ -415,7 +418,7 @@ def mmlu_prompt(line, task_name: str = None):
 ## BBH ##


-class CustomBBHEvaluationTask(CustomEvaluationTaskConfig):
+class CustomBBHEvaluationTask(CustomEvaluationTask):
     def __init__(
         self,
         name,
@@ -506,7 +509,7 @@ def bbh_prompt(line, task_name: str = None):
 ## AGI eval ##


-class CustomAGIEvalEvaluationTask(CustomEvaluationTaskConfig):
+class CustomAGIEvalEvaluationTask(CustomEvaluationTask):
     def __init__(
         self,
         name,
@@ -617,7 +620,7 @@ def agi_eval_prompt_no_letters(line, task_name: str = None):
 ## HUMAN EVAL ##


-# human_eval = CustomEvaluationTaskConfig(
+# human_eval = CustomEvaluationTask(
 #     name="human_eval",
 #     prompt_function="human_eval",
 #     hf_repo="lighteval/human_eval",
@@ -625,9 +628,9 @@ def agi_eval_prompt_no_letters(line, task_name: str = None):
 # ),


-def has_generative_metrics(task: CustomEvaluationTaskConfig) -> bool:
+def has_generative_metrics(task: CustomEvaluationTask) -> bool:
     for metric in task.metric:
-        if metric.category == MetricCategory.GENERATIVE:
+        if metric in NEEDS_GENERATION_ONLY:
             return True
     return False

diff --git a/tasks_examples/custom_tasks/custom_evaluation_utils.py b/tasks_examples/custom_tasks/custom_evaluation_utils.py
new file mode 100644
index 000000000..d3f005db1
--- /dev/null
+++ b/tasks_examples/custom_tasks/custom_evaluation_utils.py
@@ -0,0 +1,159 @@
+"""
+Custom evaluation tasks for lighteval
+"""
+from dataclasses import dataclass
+from enum import Enum, auto
+from typing import Optional, Tuple, Union
+
+
+class Metrics(Enum):
+    any_target_loglikelihood_acc = auto()
+    bert_score = auto()
+    bias = auto()
+    bits_per_byte = auto()
+    bleu = auto()
+    bleu_1 = auto()
+    bleu_4 = auto()
+    byte_perplexity = auto()
+    chrf = auto()
+    code_eval_APPS = auto()
+    code_eval_HE = auto()
+    copyright = auto()
+    disinformation = auto()
+    exact_match = auto()
+    exact_set_match = auto()
+    extractiveness = auto()
+    f1_from_bags = auto()
+    f1_quasi = auto()
+    f1_sequence = auto()
+    f1_set_match = auto()
+    faithfulness = auto()
+    iou_set_match = auto()
+    log_prob = auto()
+    loglikelihood_acc = auto()
+    loglikelihood_acc_norm = auto()
+    loglikelihood_acc_norm_nospace = auto()
+    loglikelihood_acc_norm_single_token = auto()
+    loglikelihood_acc_single_token = auto()
+    loglikelihood_f1 = auto()
+    loglikelihood_f1_single_token = auto()
+    math_quasi_exact_match = auto()
+    mc_taco = auto()
+    mcc = auto()
+    mcc_single_token = auto()
+    mrr = auto()
+    mrr_single_token = auto()
+    multi_fi_numeric = auto()
+    one_choice_loglikelihood_acc = auto()
+    perfect_exact_match = auto()
+    prediction_perplexity = auto()
+    prefix_exact_match = auto()
+    prefix_quasi_exact_match = auto()
+    quasi_exact_match = auto()
+    quasi_exact_match2 = auto()
+    ranking = auto()
+    recall_at_1_single_token = auto()
+    recall_at_2_single_token = auto()
+    recall_at_1 = auto()
+    recall_at_2 = auto()
+    rouge = auto()
+    rouge_1 = auto()
+    rouge_2 = auto()
+    rouge_l = auto()
+    target_perplexity = auto()
+    ter = auto()
+    toxicity = auto()
+    truthfulqa_mc_metrics = auto()
+    word_perplexity = auto()
+
+    def __str__(self):
+        return self.name.replace("_at_", "@")
+
+
+NEEDS_GENERATION_ONLY = [
+    "perfect_exact_match",
+    "exact_match",
+    "quasi_exact_match",
+    "quasi_exact_match2",
+    "prefix_exact_match",
+    "prefix_quasi_exact_match",
+    "math_quasi_exact_match",
+    "iou_set_match",
+    "exact_set_match",
+    "f1_sequence",
+    "f1_quasi",
+    "f1_set_match",
+    "f1_from_bags",
+    "chrf",
+    "ter",
+    "rouge",
+    "rouge_1",
+    "rouge_2",
+    "rouge_l",
+    "faithfulness",
+    "extractiveness",
+    "bert_score",
+    "bleu",
+    "bleu_1",
+    "bleu_4",
+    "bias",
+    "toxicity",
+    "code_eval_HE",
+    "code_eval_APPS",
+    "copyright",
+]
+
+
+@dataclass(unsafe_hash=True)
+class CustomEvaluationTask:
+    name: str
+    prompt_function: str
+    hf_repo: str
+    hf_subset: str
+    metric: Tuple[Union[str, Metrics]]
+    hf_avail_splits: Optional[Tuple[str]] = None
+    evaluation_splits: Optional[Tuple[str]] = None
+    few_shots_split: Optional[str] = None
+    few_shots_select: Optional[str] = None
+    generation_size: int = -1
+    stop_sequence: Optional[Tuple[str]] = None
+    output_regex: Optional[str] = None
+
+    frozen: bool = False
+    suite: Optional[Tuple[str]] = None  # we use this to know if we should use a custom lighteval or bigcode task
+
+    def __post_init__(self):
+        self.metric = [str(m) for m in self.metric]
+        if self.suite is None:
+            self.suite = ["custom"]
+        if self.hf_avail_splits is None:
+            self.hf_avail_splits = ["train", "validation", "test"]
+        if self.evaluation_splits is None:
+            self.evaluation_splits = ["validation"]
+        if self.stop_sequence is None:
+            self.stop_sequence = ["\n"]
+
+        # Convert list to tuple for hashing
+        self.metric = tuple(self.metric)
+        self.hf_avail_splits = tuple(self.hf_avail_splits) if self.hf_avail_splits else None
+        self.evaluation_splits = tuple(self.evaluation_splits) if self.evaluation_splits else None
+        self.suite = tuple(self.suite) if self.suite else None
+        self.stop_sequence = tuple(self.stop_sequence) if self.stop_sequence else None
+
+
+@dataclass(unsafe_hash=True)
+class BigCodeEvaluationTask:
+    name: str
+    bigcode_task: str
+    bigcode_task_kwargs: Optional[dict] = None
+    n_samples: int = 1
+    prefix: Optional[str] = None
+
+    suite: Tuple[str] = None
+
+    def __post_init__(self):
+        if self.suite is None:
+            self.suite = ("bigcode",)
+
+        # Convert list to tuple for hashing
+        self.suite = tuple(self.suite)
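
Below the patch, a minimal usage sketch (not part of the diff above) of the relocated CustomEvaluationTask dataclass and the NEEDS_GENERATION_ONLY list from tasks_examples/custom_tasks/custom_evaluation_utils.py. The task name, prompt function, and dataset id are hypothetical, and the import assumes tasks_examples/custom_tasks/ is on sys.path.

# Minimal sketch, assuming tasks_examples/custom_tasks/ is importable.
from custom_evaluation_utils import CustomEvaluationTask, NEEDS_GENERATION_ONLY

# Hypothetical task configuration, for illustration only.
example_task = CustomEvaluationTask(
    name="custom:example",
    prompt_function="example_prompt",    # assumed prompt function name
    hf_repo="someuser/example_dataset",  # assumed dataset id
    hf_subset="default",
    metric=["loglikelihood_acc", "quasi_exact_match"],
)

# __post_init__ fills defaults (suite=("custom",), evaluation_splits=("validation",), ...)
# and normalizes metric to a tuple of strings, so membership in NEEDS_GENERATION_ONLY can
# be tested the same way has_generative_metrics does in custom_evaluation_tasks.py.
needs_generation = any(m in NEEDS_GENERATION_ONLY for m in example_task.metric)
print(example_task.suite, example_task.metric, needs_generation)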