diff --git a/README.md b/README.md
index 920dc3302..ee7617be3 100644
--- a/README.md
+++ b/README.md
@@ -200,8 +200,7 @@ These metrics need the model to generate an output. They are therefore slower.
   - `bleu_4` (HELM): Average sample BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap - uses the nltk implementation.
   - `chrf` (Harness): Character n-gram matches f-score.
   - `ter` (Harness): Translation edit/error rate.
-- Bias, toxicity, copyright
-  - `bias` (HELM): Reports uneven association of test groups (race, gender, demographic) and target adjectives or professions, based on cooccurence statistics between the test terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+- Copyright
   - `copyright` (HELM): Reports:
     - `longest_common_prefix_length`: average length of longest common prefix between model generation and reference,
     - `edit_distance`: average Levenshtein edit distance between model generation and reference,
diff --git a/pyproject.toml b/pyproject.toml
index ef9dd427b..5add37c87 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -49,7 +49,7 @@ classifiers = [
 keywords = ["evaluation", "nlp", "llm"]
 dependencies = [
     # Base dependencies
-    "transformers>=4.36.0",
+    "transformers>=4.38.0",
     "huggingface_hub==0.20.3",
     "torch>=2.0",
     "GitPython==3.1.31", # for logging
diff --git a/src/lighteval/evaluator.py b/src/lighteval/evaluator.py
index 43443fdfc..4cee8ba73 100644
--- a/src/lighteval/evaluator.py
+++ b/src/lighteval/evaluator.py
@@ -108,8 +108,9 @@ def make_results_table(result_dict):
     values = []
 
-    for k, dic in result_dict["results"].items():
-        version = result_dict["versions"][k]
+    for k in sorted(result_dict["results"].keys()):
+        dic = result_dict["results"][k]
+        version = result_dict["versions"][k] if k in result_dict["versions"] else ""
         for m, v in dic.items():
             if m.endswith("_stderr"):
                 continue
diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py
index 1b5ecdbd6..23aaccbd3 100644
--- a/src/lighteval/logging/info_loggers.py
+++ b/src/lighteval/logging/info_loggers.py
@@ -459,18 +459,20 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int =
                     self.metric_aggregated[task_name][f"{metric_name}_stderr"] = float("nan")
                     hlog_warn(f"{task_name}, {metric_name} got an OVERFLOW ERROR when computing stderr.")
 
-        suite_average = {}
-        suite_nb = {}
-
-        for _, metrics in self.metric_aggregated.items():
-            for metric, value in metrics.items():
-                suite_average[metric] = suite_average.get(metric, 0) + value
-                suite_nb[metric] = suite_nb.get(metric, 0) + 1
-
-        for metric, value in suite_average.items():
-            suite_average[metric] = value / suite_nb[metric]
-
-        self.metric_aggregated["all"] = suite_average
+        # We group subtasks which belong to the same parent task, like MMLU, to compute an average on them
+        grouped_tasks = collections.defaultdict(list)
+        for k in self.metric_aggregated.keys():
+            if "|" in k:
+                suite, task, fewshot = k.split("|")
+                grouped_tasks[f"{suite}|{task.split(':')[0]}:_average|{fewshot}"].append(k)
+
+        for average_task, list_of_subtasks in grouped_tasks.items():
+            if len(list_of_subtasks) > 1:
+                metrics = list(self.metric_aggregated[list_of_subtasks[0]].keys())
+                self.metric_aggregated[average_task] = {
+                    metric: sum([self.metric_aggregated[k][metric] for k in list_of_subtasks]) / len(list_of_subtasks)
+                    for metric in metrics
+                }
 
 
 class VersionsLogger:
@@ -485,7 +487,7 @@ class VersionsLogger:
 
     # the versions dict will be a dict of task_name: task_version
    # {"winogrande|winogrande_xl": 0}
-    versions: dict[str, int] = {"all": 0}
+    versions: dict[str, int] = {}
 
     def log(self, task_name: str, task_version: int) -> None:
         self.versions[task_name] = task_version