diff --git a/lm_eval/tasks/scrolls/task.py b/lm_eval/tasks/scrolls/task.py
index ac2fed25ae..87372d8ae1 100644
--- a/lm_eval/tasks/scrolls/task.py
+++ b/lm_eval/tasks/scrolls/task.py
@@ -4,7 +4,8 @@
 
 import numpy as np
 import transformers.data.metrics.squad_metrics as squad_metrics
-from datasets import Dataset, load_metric
+from datasets import Dataset
+from evaluate import load
 from transformers import AutoTokenizer
 
 from lm_eval.api.instance import Instance
@@ -48,7 +49,10 @@ def _download_metric():
     from huggingface_hub import hf_hub_download
 
     scrolls_metric_path = hf_hub_download(
-        repo_id="tau/scrolls", repo_type="dataset", filename="metrics/scrolls.py"
+        repo_id="tau/scrolls",
+        repo_type="dataset",
+        filename="metrics/scrolls.py",
+        revision="refs/pr/5",
     )
     updated_scrolls_metric_path = (
         os.path.dirname(scrolls_metric_path)
@@ -119,7 +123,7 @@ class _SCROLLSTask(ConfigurableTask):
     def __init__(self, config=None):
         super().__init__(config={"metadata": {"version": self.VERSION}})
         if self.DATASET_NAME is not None:
-            self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)
+            self.metric = load(_download_metric(), config_name=self.DATASET_NAME)
 
     def has_training_docs(self):
         return True
@@ -253,11 +257,14 @@ def process_results(self, doc, results):
         }
 
     def construct_requests(self, doc, ctx, **kwargs):
+        apply_chat_template = kwargs.pop("apply_chat_template", False)
         request_list = [
             Instance(
                 request_type="loglikelihood",
                 doc=doc,
-                arguments=(ctx, " {}".format(choice)),
+                arguments=(ctx, " {}".format(choice))
+                if not apply_chat_template
+                else (ctx, "{}".format(choice)),
                 idx=i,
                 **kwargs,
             )
@@ -285,6 +292,7 @@ def process_results(self, doc, results):
         }
 
     def construct_requests(self, doc, ctx, **kwargs):
+        kwargs.pop("apply_chat_template", False)
         return Instance(
             request_type="generate_until",
             doc=doc,
@@ -327,19 +335,22 @@ def process_results(self, doc, results):
         return {"f1": (prediction, doc["outputs"])}
 
     def construct_requests(self, doc, ctx, **kwargs):
+        apply_chat_template = kwargs.pop("apply_chat_template", False)
         if doc["is_yes_no"]:
             return [
                 Instance(
                     request_type="loglikelihood",
                     doc=doc,
-                    arguments=(ctx, " yes"),
+                    arguments=(ctx, " yes")
+                    if not apply_chat_template
+                    else (ctx, "yes"),
                     idx=0,
                     **kwargs,
                 ),
                 Instance(
                     request_type="loglikelihood",
                     doc=doc,
-                    arguments=(ctx, " no"),
+                    arguments=(ctx, " no") if not apply_chat_template else (ctx, "no"),
                     idx=1,
                     **kwargs,
                 ),
@@ -406,6 +417,7 @@ def process_results(self, doc, results):
         return {"f1": (results[0], doc["outputs"])}
 
     def construct_requests(self, doc, ctx, **kwargs):
+        kwargs.pop("apply_chat_template", False)
         return Instance(
             request_type="generate_until",
             doc=doc,