From 349aea6264dfb03cb35d546f6bc6918909ef5e56 Mon Sep 17 00:00:00 2001
From: Baber
Date: Mon, 30 Dec 2024 00:23:08 +0000
Subject: [PATCH 1/2] update evaluate; update construct requests

---
 lm_eval/tasks/scrolls/task.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/lm_eval/tasks/scrolls/task.py b/lm_eval/tasks/scrolls/task.py
index ac2fed25ae..d57aa68a4d 100644
--- a/lm_eval/tasks/scrolls/task.py
+++ b/lm_eval/tasks/scrolls/task.py
@@ -4,7 +4,8 @@
 
 import numpy as np
 import transformers.data.metrics.squad_metrics as squad_metrics
-from datasets import Dataset, load_metric
+from datasets import Dataset
+from evaluate import load
 from transformers import AutoTokenizer
 
 from lm_eval.api.instance import Instance
@@ -48,7 +49,10 @@ def _download_metric():
     from huggingface_hub import hf_hub_download
 
     scrolls_metric_path = hf_hub_download(
-        repo_id="tau/scrolls", repo_type="dataset", filename="metrics/scrolls.py"
+        repo_id="tau/scrolls",
+        repo_type="dataset",
+        filename="metrics/scrolls.py",
+        revision="refs/pr/5",
     )
     updated_scrolls_metric_path = (
         os.path.dirname(scrolls_metric_path)
@@ -119,7 +123,7 @@ class _SCROLLSTask(ConfigurableTask):
     def __init__(self, config=None):
         super().__init__(config={"metadata": {"version": self.VERSION}})
         if self.DATASET_NAME is not None:
-            self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)
+            self.metric = load(_download_metric(), config_name=self.DATASET_NAME)
 
     def has_training_docs(self):
         return True
@@ -253,11 +257,14 @@ def process_results(self, doc, results):
         }
 
     def construct_requests(self, doc, ctx, **kwargs):
+        apply_chat_template = kwargs.pop("apply_chat_template", False)
         request_list = [
             Instance(
                 request_type="loglikelihood",
                 doc=doc,
-                arguments=(ctx, " {}".format(choice)),
+                arguments=(ctx, " {}".format(choice))
+                if not apply_chat_template
+                else (ctx, "{}".format(choice)),
                 idx=i,
                 **kwargs,
             )

From 77f5065e4a79d80ffe6d109a19310e2d4b30b55e Mon Sep 17 00:00:00 2001
From: Baber
Date: Mon, 30 Dec 2024 00:33:12 +0000
Subject: [PATCH 2/2] update construct requests to handle `apply_chat_template`
 kwarg

---
 lm_eval/tasks/scrolls/task.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/lm_eval/tasks/scrolls/task.py b/lm_eval/tasks/scrolls/task.py
index d57aa68a4d..87372d8ae1 100644
--- a/lm_eval/tasks/scrolls/task.py
+++ b/lm_eval/tasks/scrolls/task.py
@@ -292,6 +292,7 @@ def process_results(self, doc, results):
         }
 
     def construct_requests(self, doc, ctx, **kwargs):
+        kwargs.pop("apply_chat_template", False)
         return Instance(
             request_type="generate_until",
             doc=doc,
@@ -334,19 +335,22 @@ def process_results(self, doc, results):
         return {"f1": (prediction, doc["outputs"])}
 
     def construct_requests(self, doc, ctx, **kwargs):
+        apply_chat_template = kwargs.pop("apply_chat_template", False)
        if doc["is_yes_no"]:
            return [
                Instance(
                    request_type="loglikelihood",
                    doc=doc,
-                    arguments=(ctx, " yes"),
+                    arguments=(ctx, " yes")
+                    if not apply_chat_template
+                    else (ctx, "yes"),
                    idx=0,
                    **kwargs,
                ),
                Instance(
                    request_type="loglikelihood",
                    doc=doc,
-                    arguments=(ctx, " no"),
+                    arguments=(ctx, " no") if not apply_chat_template else (ctx, "no"),
                    idx=1,
                    **kwargs,
                ),
@@ -413,6 +417,7 @@ def process_results(self, doc, results):
        return {"f1": (results[0], doc["outputs"])}
 
     def construct_requests(self, doc, ctx, **kwargs):
+        kwargs.pop("apply_chat_template", False)
        return Instance(
            request_type="generate_until",
            doc=doc,
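
Note (not part of the patches above): a minimal sketch of the pattern both
commits apply; `target_for` is an illustrative name, not a helper in task.py.
For loglikelihood requests the continuation normally carries a leading space
so it tokenizes as a continuation of the raw context; once a chat template
wraps the context, the template delimits the assistant turn itself, so the
space is dropped. The generate_until tasks have no continuation string, so
they simply discard the kwarg before forwarding **kwargs to Instance.

    # Illustrative sketch of the conditional added to construct_requests.
    def target_for(choice: str, apply_chat_template: bool) -> str:
        # Completion-style prompt: " yes" continues the context after a space.
        # Chat-template prompt: the template supplies its own delimiters,
        # so a leading space would land inside the assistant turn.
        return choice if apply_chat_template else " {}".format(choice)

    assert target_for("yes", apply_chat_template=False) == " yes"
    assert target_for("yes", apply_chat_template=True) == "yes"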